gitextract_nmtq5ath/

├── .astylerc
├── .clang-format
├── .gitattributes
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug.md
│   │   ├── model-convert.md
│   │   ├── others.md
│   │   └── quantization.md
│   ├── dependabot.yml
│   ├── labeler.yml
│   └── workflows/
│       ├── android.yml
│       ├── code-format-msg.yml
│       ├── code-format.yml
│       ├── codeql-analysis.yml
│       ├── compare-binary-size-pr-comment.yml
│       ├── compare-binary-size.yml
│       ├── elf-riscv32.yml
│       ├── elf-riscv64.yml
│       ├── esp32.yml
│       ├── harmonyos.yml
│       ├── ios.yml
│       ├── labeler.yml
│       ├── linux-aarch64.yml
│       ├── linux-arm.yml
│       ├── linux-loongarch64.yml
│       ├── linux-mips.yml
│       ├── linux-mips64.yml
│       ├── linux-ppc64.yml
│       ├── linux-riscv32.yml
│       ├── linux-riscv64.yml
│       ├── linux-x64-cpu-clang.yml
│       ├── linux-x64-cpu-gcc-musl.yml
│       ├── linux-x64-cpu-gcc.yml
│       ├── linux-x64-gpu-clang.yml
│       ├── linux-x64-gpu-gcc.yml
│       ├── linux-x64-sde.yml
│       ├── linux-x86-cpu-clang.yml
│       ├── linux-x86-cpu-gcc.yml
│       ├── mac-catalyst.yml
│       ├── macos.yml
│       ├── pnnx.yml
│       ├── python.yml
│       ├── release-python.yml
│       ├── release.yml
│       ├── sync-wiki.yml
│       ├── test-coverage.yml
│       ├── tvos.yml
│       ├── visionos.yml
│       ├── watchos.yml
│       ├── web-assembly.yml
│       ├── windows-arm.yml
│       ├── windows-clang.yml
│       ├── windows-mingw.yml
│       ├── windows-xp.yml
│       └── windows.yml
├── .gitignore
├── .gitmodules
├── CITATION.cff
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Info.plist
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── benchmark/
│   ├── CMakeLists.txt
│   ├── FastestDet.param
│   ├── README.md
│   ├── RankCards/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── Rcards.h
│   │   └── main.cpp
│   ├── alexnet.param
│   ├── benchncnn.cpp
│   ├── benchncnn_param_data.h.in
│   ├── blazeface.param
│   ├── efficientnet_b0.param
│   ├── efficientnetv2_b0.param
│   ├── googlenet.param
│   ├── googlenet_int8.param
│   ├── mnasnet.param
│   ├── mobilenet.param
│   ├── mobilenet_int8.param
│   ├── mobilenet_ssd.param
│   ├── mobilenet_ssd_int8.param
│   ├── mobilenet_v2.param
│   ├── mobilenet_v3.param
│   ├── mobilenet_yolo.param
│   ├── mobilenetv2_yolov3.param
│   ├── nanodet_m.param
│   ├── proxylessnasnet.param
│   ├── regnety_400m.param
│   ├── resnet18.param
│   ├── resnet18_int8.param
│   ├── resnet50.param
│   ├── resnet50_int8.param
│   ├── shufflenet.param
│   ├── shufflenet_v2.param
│   ├── squeezenet.param
│   ├── squeezenet_int8.param
│   ├── squeezenet_ssd.param
│   ├── squeezenet_ssd_int8.param
│   ├── vgg16.param
│   ├── vgg16_int8.param
│   ├── vision_transformer.param
│   ├── yolo-fastest-1.1.param
│   ├── yolo-fastestv2.param
│   └── yolov4-tiny.param
├── build-android.cmd
├── build.sh
├── cmake/
│   ├── ncnnConfig.cmake.in
│   ├── ncnn_add_layer.cmake
│   ├── ncnn_add_param.cmake
│   ├── ncnn_add_shader.cmake
│   ├── ncnn_generate_avx512_source.cmake
│   ├── ncnn_generate_avx_source.cmake
│   ├── ncnn_generate_fma_source.cmake
│   ├── ncnn_generate_lasx_source.cmake
│   ├── ncnn_generate_lsx_source.cmake
│   ├── ncnn_generate_msa_source.cmake
│   ├── ncnn_generate_param_header.cmake
│   ├── ncnn_generate_rvv_source.cmake
│   ├── ncnn_generate_shader_comp_header.cmake
│   ├── ncnn_generate_xtheadvector_source.cmake
│   └── run_test.cmake
├── codeformat.sh
├── docs/
│   ├── Home.md
│   ├── application-with-ncnn-inside.md
│   ├── benchmark/
│   │   ├── the-benchmark-of-caffe-android-lib,-mini-caffe,-and-ncnn.md
│   │   └── vulkan-conformance-test.md
│   ├── developer-guide/
│   │   ├── aarch64-mix-assembly-and-intrinsic.md
│   │   ├── add-custom-layer.zh.md
│   │   ├── arm-a53-a55-dual-issue.md
│   │   ├── armv7-mix-assembly-and-intrinsic.md
│   │   ├── binaryop-broadcasting.md
│   │   ├── build-ncnn-on-windows-xp.zh.md
│   │   ├── custom-allocator.md
│   │   ├── element-packing.md
│   │   ├── expression.md
│   │   ├── glsl-extension.md
│   │   ├── glsl-extension.zh.md
│   │   ├── how-to-be-a-contributor.zh.md
│   │   ├── how-to-implement-custom-layer-step-by-step.md
│   │   ├── how-to-write-a-neon-optimized-op-kernel.md
│   │   ├── how-to-write-a-sse-optimized-op-kernel.zh.md
│   │   ├── kvcache.md
│   │   ├── layer-feat-mask.md
│   │   ├── layer-support-behavior.md
│   │   ├── low-level-operation-api.md
│   │   ├── ncnn-tips-and-tricks.zh.md
│   │   ├── new-model-load-api.md
│   │   ├── new-param-load-api.md
│   │   ├── operation-param-weight-table.md
│   │   ├── operators.md
│   │   ├── param-and-model-file-structure.md
│   │   ├── preload-practice.zh.md
│   │   ├── tensorflow-op-combination.md
│   │   └── vulkan-driver-loader.md
│   ├── faq.en.md
│   ├── faq.md
│   ├── how-to-build/
│   │   ├── build-mlir2ncnn.md
│   │   └── how-to-build.md
│   └── how-to-use-and-FAQ/
│       ├── FAQ-ncnn-produce-wrong-result.md
│       ├── FAQ-ncnn-protobuf-problem.zh.md
│       ├── FAQ-ncnn-throw-error.md
│       ├── FAQ-ncnn-vulkan.md
│       ├── build-minimal-library.md
│       ├── efficient-roi-resize-rotate.md
│       ├── ncnn-load-model.md
│       ├── openmp-best-practice.md
│       ├── openmp-best-practice.zh.md
│       ├── quantized-int8-inference.md
│       ├── use-ncnn-with-alexnet.md
│       ├── use-ncnn-with-alexnet.zh.md
│       ├── use-ncnn-with-opencv.md
│       ├── use-ncnn-with-own-project.md
│       ├── use-ncnn-with-pytorch-or-onnx.md
│       ├── use-ncnnoptimize-to-optimize-model.md
│       └── vulkan-notes.md
├── examples/
│   ├── CMakeLists.txt
│   ├── arcface.cpp
│   ├── fasterrcnn.cpp
│   ├── mobilenetssd.cpp
│   ├── mobilenetv2ssdlite.cpp
│   ├── mobilenetv3ssdlite.cpp
│   ├── nanodet.cpp
│   ├── nanodetplus_pnnx.cpp
│   ├── p2pnet.cpp
│   ├── peleenetssd_seg.cpp
│   ├── piper.cpp
│   ├── ppocrv5.cpp
│   ├── ppocrv5_dict.h
│   ├── retinaface.cpp
│   ├── rfcn.cpp
│   ├── rvm.cpp
│   ├── scrfd.cpp
│   ├── scrfd_crowdhuman.cpp
│   ├── shufflenetv2.cpp
│   ├── simplepose.cpp
│   ├── squeezencnn/
│   │   └── README.md
│   ├── squeezenet.cpp
│   ├── squeezenet_c_api.cpp
│   ├── squeezenet_v1.1.caffemodel
│   ├── squeezenet_v1.1.param
│   ├── squeezenet_v1.1.prototxt
│   ├── squeezenetssd.cpp
│   ├── synset_words.txt
│   ├── whisper.cpp
│   ├── yolact.cpp
│   ├── yolo11.cpp
│   ├── yolo11_cls.cpp
│   ├── yolo11_obb.cpp
│   ├── yolo11_pose.cpp
│   ├── yolo11_seg.cpp
│   ├── yolov2.cpp
│   ├── yolov3.cpp
│   ├── yolov4.cpp
│   ├── yolov5.cpp
│   ├── yolov5_pnnx.cpp
│   ├── yolov7.cpp
│   ├── yolov7_pnnx.cpp
│   ├── yolov8.cpp
│   ├── yolov8_cls.cpp
│   ├── yolov8_obb.cpp
│   ├── yolov8_pose.cpp
│   ├── yolov8_seg.cpp
│   ├── yoloworld.cpp
│   └── yolox.cpp
├── package.sh
├── pyproject.toml
├── python/
│   ├── CMakeLists.txt
│   ├── README.md
│   ├── examples/
│   │   ├── fasterrcnn.py
│   │   ├── mobilenetssd.py
│   │   ├── mobilenetv2ssdlite.py
│   │   ├── mobilenetv3ssdlite.py
│   │   ├── model_zoo.py
│   │   ├── nanodet.py
│   │   ├── peleenetssd.py
│   │   ├── retinaface.py
│   │   ├── rfcn.py
│   │   ├── shufflenetv2.py
│   │   ├── simplepose.py
│   │   ├── squeezenet.py
│   │   ├── squeezenetssd.py
│   │   ├── yolact.py
│   │   ├── yolov2.py
│   │   ├── yolov3.py
│   │   ├── yolov4.py
│   │   ├── yolov5.py
│   │   └── yolov8.py
│   ├── ncnn/
│   │   ├── __init__.py
│   │   ├── model_zoo/
│   │   │   ├── __init__.py
│   │   │   ├── fasterrcnn.py
│   │   │   ├── mobilenetssd.py
│   │   │   ├── mobilenetv2ssdlite.py
│   │   │   ├── mobilenetv3ssdlite.py
│   │   │   ├── model_store.py
│   │   │   ├── model_zoo.py
│   │   │   ├── nanodet.py
│   │   │   ├── peleenetssd.py
│   │   │   ├── retinaface.py
│   │   │   ├── rfcn.py
│   │   │   ├── shufflenetv2.py
│   │   │   ├── simplepose.py
│   │   │   ├── squeezenet.py
│   │   │   ├── squeezenetssd.py
│   │   │   ├── yolact.py
│   │   │   ├── yolov2.py
│   │   │   ├── yolov3.py
│   │   │   ├── yolov4.py
│   │   │   ├── yolov5.py
│   │   │   ├── yolov7.py
│   │   │   └── yolov8.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── download.py
│   │       ├── functional.py
│   │       ├── objects.py
│   │       └── visual.py
│   ├── requirements.txt
│   ├── setup.py.i
│   ├── src/
│   │   ├── main.cpp
│   │   ├── pybind11_allocator.h
│   │   ├── pybind11_bind.h
│   │   ├── pybind11_datareader.h
│   │   ├── pybind11_layer.h
│   │   ├── pybind11_mat.h
│   │   └── pybind11_modelbin.h
│   └── tests/
│       ├── benchmark.py
│       ├── custom_layer.param
│       ├── test.param
│       ├── test_allocator.py
│       ├── test_blob.py
│       ├── test_extractor.py
│       ├── test_mat.py
│       ├── test_net.py
│       ├── test_option.py
│       ├── test_paramdict.py
│       ├── test_vulkan_allocator.py
│       └── test_vulkan_device.py
├── setup.py
├── src/
│   ├── CMakeLists.txt
│   ├── allocator.cpp
│   ├── allocator.h
│   ├── benchmark.cpp
│   ├── benchmark.h
│   ├── blob.cpp
│   ├── blob.h
│   ├── c_api.cpp
│   ├── c_api.h
│   ├── command.cpp
│   ├── command.h
│   ├── convert_ycbcr.comp
│   ├── cpu.cpp
│   ├── cpu.h
│   ├── datareader.cpp
│   ├── datareader.h
│   ├── expression.cpp
│   ├── expression.h
│   ├── gpu.cpp
│   ├── gpu.h
│   ├── layer/
│   │   ├── absval.cpp
│   │   ├── absval.h
│   │   ├── argmax.cpp
│   │   ├── argmax.h
│   │   ├── arm/
│   │   │   ├── absval_arm.cpp
│   │   │   ├── absval_arm.h
│   │   │   ├── arm_activation.h
│   │   │   ├── arm_usability.h
│   │   │   ├── batchnorm_arm.cpp
│   │   │   ├── batchnorm_arm.h
│   │   │   ├── batchnorm_arm_asimdhp.cpp
│   │   │   ├── bias_arm.cpp
│   │   │   ├── bias_arm.h
│   │   │   ├── binaryop_arm.cpp
│   │   │   ├── binaryop_arm.h
│   │   │   ├── binaryop_arm_asimdhp.cpp
│   │   │   ├── cast_arm.cpp
│   │   │   ├── cast_arm.h
│   │   │   ├── cast_arm_bf16.cpp
│   │   │   ├── cast_arm_vfpv4.cpp
│   │   │   ├── cast_bf16.h
│   │   │   ├── cast_fp16.h
│   │   │   ├── clip_arm.cpp
│   │   │   ├── clip_arm.h
│   │   │   ├── clip_arm_asimdhp.cpp
│   │   │   ├── concat_arm.cpp
│   │   │   ├── concat_arm.h
│   │   │   ├── convolution1d_arm.cpp
│   │   │   ├── convolution1d_arm.h
│   │   │   ├── convolution1d_arm_asimdhp.cpp
│   │   │   ├── convolution1d_packed.h
│   │   │   ├── convolution1d_packed_bf16s.h
│   │   │   ├── convolution1d_packed_fp16s.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_2x2.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack1to4_bf16s.h
│   │   │   ├── convolution_3x3_pack1to4_fp16s.h
│   │   │   ├── convolution_3x3_pack1to8_fp16s.h
│   │   │   ├── convolution_3x3_pack4.h
│   │   │   ├── convolution_3x3_pack4_bf16s.h
│   │   │   ├── convolution_3x3_pack4_fp16s.h
│   │   │   ├── convolution_3x3_pack4to1.h
│   │   │   ├── convolution_3x3_pack8_fp16s.h
│   │   │   ├── convolution_3x3_winograd.h
│   │   │   ├── convolution_3x3_winograd_bf16s.h
│   │   │   ├── convolution_3x3_winograd_fp16s.h
│   │   │   ├── convolution_3x3_winograd_int8.h
│   │   │   ├── convolution_4x4.h
│   │   │   ├── convolution_5x5.h
│   │   │   ├── convolution_5x5_pack4.h
│   │   │   ├── convolution_5x5_pack4_bf16s.h
│   │   │   ├── convolution_5x5_pack8_fp16s.h
│   │   │   ├── convolution_7x7.h
│   │   │   ├── convolution_7x7_pack1to4.h
│   │   │   ├── convolution_7x7_pack1to4_bf16s.h
│   │   │   ├── convolution_7x7_pack1to8_fp16s.h
│   │   │   ├── convolution_arm.cpp
│   │   │   ├── convolution_arm.h
│   │   │   ├── convolution_arm_asimddp.cpp
│   │   │   ├── convolution_arm_asimdhp.cpp
│   │   │   ├── convolution_arm_i8mm.cpp
│   │   │   ├── convolution_im2col_gemm.h
│   │   │   ├── convolution_im2col_gemm_bf16s.h
│   │   │   ├── convolution_im2col_gemm_bf16s_fp16s.h
│   │   │   ├── convolution_im2col_gemm_fp16s.h
│   │   │   ├── convolution_im2col_gemm_int8.h
│   │   │   ├── convolution_packed.h
│   │   │   ├── convolution_packed_bf16s.h
│   │   │   ├── convolution_packed_fp16s.h
│   │   │   ├── convolution_packed_int8.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_fp16s.h
│   │   │   ├── convolutiondepthwise_3x3_int8.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_3x3_pack4_bf16s.h
│   │   │   ├── convolutiondepthwise_3x3_pack8_fp16s.h
│   │   │   ├── convolutiondepthwise_3x3_pack8_int8.h
│   │   │   ├── convolutiondepthwise_5x5.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack4_bf16s.h
│   │   │   ├── convolutiondepthwise_5x5_pack8_fp16s.h
│   │   │   ├── convolutiondepthwise_arm.cpp
│   │   │   ├── convolutiondepthwise_arm.h
│   │   │   ├── convolutiondepthwise_arm_asimdhp.cpp
│   │   │   ├── crop_arm.cpp
│   │   │   ├── crop_arm.h
│   │   │   ├── deconvolution_3x3.h
│   │   │   ├── deconvolution_4x4.h
│   │   │   ├── deconvolution_4x4_fp16s.h
│   │   │   ├── deconvolution_arm.cpp
│   │   │   ├── deconvolution_arm.h
│   │   │   ├── deconvolution_arm_asimdhp.cpp
│   │   │   ├── deconvolutiondepthwise_arm.cpp
│   │   │   ├── deconvolutiondepthwise_arm.h
│   │   │   ├── deconvolutiondepthwise_arm_asimdhp.cpp
│   │   │   ├── dequantize_arm.cpp
│   │   │   ├── dequantize_arm.h
│   │   │   ├── dequantize_arm_asimdhp.cpp
│   │   │   ├── dropout_arm.cpp
│   │   │   ├── dropout_arm.h
│   │   │   ├── eltwise_arm.cpp
│   │   │   ├── eltwise_arm.h
│   │   │   ├── eltwise_arm_asimdhp.cpp
│   │   │   ├── flatten_arm.cpp
│   │   │   ├── flatten_arm.h
│   │   │   ├── gelu_arm.cpp
│   │   │   ├── gelu_arm.h
│   │   │   ├── gelu_arm_asimdhp.cpp
│   │   │   ├── gemm_arm.cpp
│   │   │   ├── gemm_arm.h
│   │   │   ├── gemm_arm_asimddp.cpp
│   │   │   ├── gemm_arm_asimdfhm.cpp
│   │   │   ├── gemm_arm_asimdhp.cpp
│   │   │   ├── gemm_arm_i8mm.cpp
│   │   │   ├── gemm_arm_vfpv4.cpp
│   │   │   ├── gemm_bf16s.h
│   │   │   ├── gemm_bf16s_fp16s.h
│   │   │   ├── gemm_fp16s.h
│   │   │   ├── gemm_int8.h
│   │   │   ├── gemm_int8_bf16s.h
│   │   │   ├── gemm_int8_fp16s.h
│   │   │   ├── groupnorm_arm.cpp
│   │   │   ├── groupnorm_arm.h
│   │   │   ├── groupnorm_arm_asimdhp.cpp
│   │   │   ├── gru_arm.cpp
│   │   │   ├── gru_arm.h
│   │   │   ├── gru_arm_asimddp.cpp
│   │   │   ├── gru_arm_asimdhp.cpp
│   │   │   ├── gru_arm_vfpv4.cpp
│   │   │   ├── gru_int8.h
│   │   │   ├── hardsigmoid_arm.cpp
│   │   │   ├── hardsigmoid_arm.h
│   │   │   ├── hardsigmoid_arm_asimdhp.cpp
│   │   │   ├── hardswish_arm.cpp
│   │   │   ├── hardswish_arm.h
│   │   │   ├── hardswish_arm_asimdhp.cpp
│   │   │   ├── innerproduct_arm.cpp
│   │   │   ├── innerproduct_arm.h
│   │   │   ├── innerproduct_arm_asimdfhm.cpp
│   │   │   ├── innerproduct_arm_asimdhp.cpp
│   │   │   ├── innerproduct_arm_vfpv4.cpp
│   │   │   ├── innerproduct_fp16s.h
│   │   │   ├── innerproduct_gemm_fp16s.h
│   │   │   ├── instancenorm_arm.cpp
│   │   │   ├── instancenorm_arm.h
│   │   │   ├── instancenorm_arm_asimdhp.cpp
│   │   │   ├── interp_arm.cpp
│   │   │   ├── interp_arm.h
│   │   │   ├── interp_arm_asimdhp.cpp
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_bf16s.h
│   │   │   ├── interp_bicubic_fp16s.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bicubic_pack4_bf16s.h
│   │   │   ├── interp_bicubic_pack4_fp16s.h
│   │   │   ├── interp_bicubic_pack8_fp16s.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_bf16s.h
│   │   │   ├── interp_bilinear_fp16s.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_bilinear_pack4_bf16s.h
│   │   │   ├── interp_bilinear_pack4_fp16s.h
│   │   │   ├── interp_bilinear_pack8_fp16s.h
│   │   │   ├── layernorm_arm.cpp
│   │   │   ├── layernorm_arm.h
│   │   │   ├── layernorm_arm_asimdhp.cpp
│   │   │   ├── lrn_arm.cpp
│   │   │   ├── lrn_arm.h
│   │   │   ├── lstm_arm.cpp
│   │   │   ├── lstm_arm.h
│   │   │   ├── lstm_arm_asimddp.cpp
│   │   │   ├── lstm_arm_asimdhp.cpp
│   │   │   ├── lstm_arm_vfpv4.cpp
│   │   │   ├── lstm_int8.h
│   │   │   ├── matmul_arm.cpp
│   │   │   ├── matmul_arm.h
│   │   │   ├── mish_arm.cpp
│   │   │   ├── mish_arm.h
│   │   │   ├── mish_arm_asimdhp.cpp
│   │   │   ├── multiheadattention_arm.cpp
│   │   │   ├── multiheadattention_arm.h
│   │   │   ├── neon_mathfun.h
│   │   │   ├── neon_mathfun_fp16s.h
│   │   │   ├── neon_mathfun_tanh.h
│   │   │   ├── packing_arm.cpp
│   │   │   ├── packing_arm.h
│   │   │   ├── padding_arm.cpp
│   │   │   ├── padding_arm.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack4_bf16s_fp16s.h
│   │   │   ├── padding_pack8_fp16s.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── pixelshuffle_arm.cpp
│   │   │   ├── pixelshuffle_arm.h
│   │   │   ├── pooling_2x2.h
│   │   │   ├── pooling_2x2_pack4.h
│   │   │   ├── pooling_2x2_pack4_bf16s.h
│   │   │   ├── pooling_3x3.h
│   │   │   ├── pooling_3x3_pack4.h
│   │   │   ├── pooling_3x3_pack4_bf16s.h
│   │   │   ├── pooling_arm.cpp
│   │   │   ├── pooling_arm.h
│   │   │   ├── pooling_arm_asimdhp.cpp
│   │   │   ├── prelu_arm.cpp
│   │   │   ├── prelu_arm.h
│   │   │   ├── prelu_arm_asimdhp.cpp
│   │   │   ├── quantize_arm.cpp
│   │   │   ├── quantize_arm.h
│   │   │   ├── quantize_arm_asimdhp.cpp
│   │   │   ├── relu_arm.cpp
│   │   │   ├── relu_arm.h
│   │   │   ├── relu_arm_asimdhp.cpp
│   │   │   ├── requantize_arm.cpp
│   │   │   ├── requantize_arm.h
│   │   │   ├── reshape_arm.cpp
│   │   │   ├── reshape_arm.h
│   │   │   ├── rmsnorm_arm.cpp
│   │   │   ├── rmsnorm_arm.h
│   │   │   ├── rmsnorm_arm_asimdhp.cpp
│   │   │   ├── rnn_arm.cpp
│   │   │   ├── rnn_arm.h
│   │   │   ├── rnn_arm_asimddp.cpp
│   │   │   ├── rnn_arm_asimdhp.cpp
│   │   │   ├── rnn_arm_vfpv4.cpp
│   │   │   ├── rnn_int8.h
│   │   │   ├── scale_arm.cpp
│   │   │   ├── scale_arm.h
│   │   │   ├── selu_arm.cpp
│   │   │   ├── selu_arm.h
│   │   │   ├── shufflechannel_arm.cpp
│   │   │   ├── shufflechannel_arm.h
│   │   │   ├── sigmoid_arm.cpp
│   │   │   ├── sigmoid_arm.h
│   │   │   ├── sigmoid_arm_asimdhp.cpp
│   │   │   ├── slice_arm.cpp
│   │   │   ├── slice_arm.h
│   │   │   ├── softmax_arm.cpp
│   │   │   ├── softmax_arm.h
│   │   │   ├── softmax_arm_asimdhp.cpp
│   │   │   ├── swish_arm.cpp
│   │   │   ├── swish_arm.h
│   │   │   ├── swish_arm_asimdhp.cpp
│   │   │   ├── tanh_arm.cpp
│   │   │   ├── tanh_arm.h
│   │   │   ├── tanh_arm_asimdhp.cpp
│   │   │   ├── unaryop_arm.cpp
│   │   │   ├── unaryop_arm.h
│   │   │   └── unaryop_arm_asimdhp.cpp
│   │   ├── batchnorm.cpp
│   │   ├── batchnorm.h
│   │   ├── bias.cpp
│   │   ├── bias.h
│   │   ├── binaryop.cpp
│   │   ├── binaryop.h
│   │   ├── bnll.cpp
│   │   ├── bnll.h
│   │   ├── cast.cpp
│   │   ├── cast.h
│   │   ├── celu.cpp
│   │   ├── celu.h
│   │   ├── clip.cpp
│   │   ├── clip.h
│   │   ├── concat.cpp
│   │   ├── concat.h
│   │   ├── convolution.cpp
│   │   ├── convolution.h
│   │   ├── convolution1d.cpp
│   │   ├── convolution1d.h
│   │   ├── convolution3d.cpp
│   │   ├── convolution3d.h
│   │   ├── convolutiondepthwise.cpp
│   │   ├── convolutiondepthwise.h
│   │   ├── convolutiondepthwise1d.cpp
│   │   ├── convolutiondepthwise1d.h
│   │   ├── convolutiondepthwise3d.cpp
│   │   ├── convolutiondepthwise3d.h
│   │   ├── copyto.cpp
│   │   ├── copyto.h
│   │   ├── crop.cpp
│   │   ├── crop.h
│   │   ├── cumulativesum.cpp
│   │   ├── cumulativesum.h
│   │   ├── deconvolution.cpp
│   │   ├── deconvolution.h
│   │   ├── deconvolution1d.cpp
│   │   ├── deconvolution1d.h
│   │   ├── deconvolution3d.cpp
│   │   ├── deconvolution3d.h
│   │   ├── deconvolutiondepthwise.cpp
│   │   ├── deconvolutiondepthwise.h
│   │   ├── deconvolutiondepthwise1d.cpp
│   │   ├── deconvolutiondepthwise1d.h
│   │   ├── deconvolutiondepthwise3d.cpp
│   │   ├── deconvolutiondepthwise3d.h
│   │   ├── deepcopy.cpp
│   │   ├── deepcopy.h
│   │   ├── deformableconv2d.cpp
│   │   ├── deformableconv2d.h
│   │   ├── dequantize.cpp
│   │   ├── dequantize.h
│   │   ├── detectionoutput.cpp
│   │   ├── detectionoutput.h
│   │   ├── diag.cpp
│   │   ├── diag.h
│   │   ├── dropout.cpp
│   │   ├── dropout.h
│   │   ├── einsum.cpp
│   │   ├── einsum.h
│   │   ├── eltwise.cpp
│   │   ├── eltwise.h
│   │   ├── elu.cpp
│   │   ├── elu.h
│   │   ├── embed.cpp
│   │   ├── embed.h
│   │   ├── erf.cpp
│   │   ├── erf.h
│   │   ├── exp.cpp
│   │   ├── exp.h
│   │   ├── expanddims.cpp
│   │   ├── expanddims.h
│   │   ├── flatten.cpp
│   │   ├── flatten.h
│   │   ├── flip.cpp
│   │   ├── flip.h
│   │   ├── fold.cpp
│   │   ├── fold.h
│   │   ├── fused_activation.h
│   │   ├── gelu.cpp
│   │   ├── gelu.h
│   │   ├── gemm.cpp
│   │   ├── gemm.h
│   │   ├── glu.cpp
│   │   ├── glu.h
│   │   ├── gridsample.cpp
│   │   ├── gridsample.h
│   │   ├── groupnorm.cpp
│   │   ├── groupnorm.h
│   │   ├── gru.cpp
│   │   ├── gru.h
│   │   ├── hardsigmoid.cpp
│   │   ├── hardsigmoid.h
│   │   ├── hardswish.cpp
│   │   ├── hardswish.h
│   │   ├── innerproduct.cpp
│   │   ├── innerproduct.h
│   │   ├── input.cpp
│   │   ├── input.h
│   │   ├── instancenorm.cpp
│   │   ├── instancenorm.h
│   │   ├── interp.cpp
│   │   ├── interp.h
│   │   ├── inversespectrogram.cpp
│   │   ├── inversespectrogram.h
│   │   ├── layernorm.cpp
│   │   ├── layernorm.h
│   │   ├── log.cpp
│   │   ├── log.h
│   │   ├── loongarch/
│   │   │   ├── absval_loongarch.cpp
│   │   │   ├── absval_loongarch.h
│   │   │   ├── batchnorm_loongarch.cpp
│   │   │   ├── batchnorm_loongarch.h
│   │   │   ├── bias_loongarch.cpp
│   │   │   ├── bias_loongarch.h
│   │   │   ├── binaryop_loongarch.cpp
│   │   │   ├── binaryop_loongarch.h
│   │   │   ├── cast_loongarch.cpp
│   │   │   ├── cast_loongarch.h
│   │   │   ├── clip_loongarch.cpp
│   │   │   ├── clip_loongarch.h
│   │   │   ├── concat_loongarch.cpp
│   │   │   ├── concat_loongarch.h
│   │   │   ├── convolution1d_loongarch.cpp
│   │   │   ├── convolution1d_loongarch.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_1x1_int8.h
│   │   │   ├── convolution_1x1_pack1to4_int8.h
│   │   │   ├── convolution_1x1_pack4.h
│   │   │   ├── convolution_1x1_pack4to1.h
│   │   │   ├── convolution_1x1_pack8to1_int8.h
│   │   │   ├── convolution_1x1_pack8to4_int8.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack4.h
│   │   │   ├── convolution_3x3_pack8to1_int8.h
│   │   │   ├── convolution_3x3_pack8to4_int8.h
│   │   │   ├── convolution_7x7_pack1to4.h
│   │   │   ├── convolution_int8.h
│   │   │   ├── convolution_loongarch.cpp
│   │   │   ├── convolution_loongarch.h
│   │   │   ├── convolution_pack1to4.h
│   │   │   ├── convolution_pack1to4_int8.h
│   │   │   ├── convolution_pack4.h
│   │   │   ├── convolution_pack4to1.h
│   │   │   ├── convolution_pack8to1_int8.h
│   │   │   ├── convolution_pack8to4_int8.h
│   │   │   ├── convolution_sgemm.h
│   │   │   ├── convolution_sgemm_int8.h
│   │   │   ├── convolution_sgemm_pack1to4_int8.h
│   │   │   ├── convolution_sgemm_pack4.h
│   │   │   ├── convolution_sgemm_pack4to1.h
│   │   │   ├── convolution_sgemm_pack8to1_int8.h
│   │   │   ├── convolution_sgemm_pack8to4_int8.h
│   │   │   ├── convolution_winograd_dot.h
│   │   │   ├── convolution_winograd_dot_int8.h
│   │   │   ├── convolution_winograd_dot_pack4.h
│   │   │   ├── convolution_winograd_dot_pack8to1_int8.h
│   │   │   ├── convolution_winograd_dot_pack8to4_int8.h
│   │   │   ├── convolution_winograd_transform.h
│   │   │   ├── convolution_winograd_transform_int8.h
│   │   │   ├── convolution_winograd_transform_pack4.h
│   │   │   ├── convolution_winograd_transform_pack4_int8.h
│   │   │   ├── convolution_winograd_transform_pack8_int8.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_loongarch.cpp
│   │   │   ├── convolutiondepthwise_loongarch.h
│   │   │   ├── crop_loongarch.cpp
│   │   │   ├── crop_loongarch.h
│   │   │   ├── deconvolution_loongarch.cpp
│   │   │   ├── deconvolution_loongarch.h
│   │   │   ├── deconvolution_pack1to4.h
│   │   │   ├── deconvolution_pack4.h
│   │   │   ├── deconvolution_pack4to1.h
│   │   │   ├── deconvolutiondepthwise_loongarch.cpp
│   │   │   ├── deconvolutiondepthwise_loongarch.h
│   │   │   ├── dequantize_loongarch.cpp
│   │   │   ├── dequantize_loongarch.h
│   │   │   ├── dropout_loongarch.cpp
│   │   │   ├── dropout_loongarch.h
│   │   │   ├── eltwise_loongarch.cpp
│   │   │   ├── eltwise_loongarch.h
│   │   │   ├── flatten_loongarch.cpp
│   │   │   ├── flatten_loongarch.h
│   │   │   ├── hardsigmoid_loongarch.cpp
│   │   │   ├── hardsigmoid_loongarch.h
│   │   │   ├── hardswish_loongarch.cpp
│   │   │   ├── hardswish_loongarch.h
│   │   │   ├── innerproduct_loongarch.cpp
│   │   │   ├── innerproduct_loongarch.h
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_loongarch.cpp
│   │   │   ├── interp_loongarch.h
│   │   │   ├── lasx_mathfun.h
│   │   │   ├── loongarch_activation.h
│   │   │   ├── loongarch_usability.h
│   │   │   ├── lsx_mathfun.h
│   │   │   ├── mish_loongarch.cpp
│   │   │   ├── mish_loongarch.h
│   │   │   ├── packing_loongarch.cpp
│   │   │   ├── packing_loongarch.h
│   │   │   ├── padding_loongarch.cpp
│   │   │   ├── padding_loongarch.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── pooling_loongarch.cpp
│   │   │   ├── pooling_loongarch.h
│   │   │   ├── prelu_loongarch.cpp
│   │   │   ├── prelu_loongarch.h
│   │   │   ├── quantize_loongarch.cpp
│   │   │   ├── quantize_loongarch.h
│   │   │   ├── relu_loongarch.cpp
│   │   │   ├── relu_loongarch.h
│   │   │   ├── requantize_loongarch.cpp
│   │   │   ├── requantize_loongarch.h
│   │   │   ├── sigmoid_loongarch.cpp
│   │   │   ├── sigmoid_loongarch.h
│   │   │   ├── slice_loongarch.cpp
│   │   │   ├── slice_loongarch.h
│   │   │   ├── softmax_loongarch.cpp
│   │   │   ├── softmax_loongarch.h
│   │   │   ├── swish_loongarch.cpp
│   │   │   ├── swish_loongarch.h
│   │   │   ├── tanh_loongarch.cpp
│   │   │   ├── tanh_loongarch.h
│   │   │   ├── unaryop_loongarch.cpp
│   │   │   └── unaryop_loongarch.h
│   │   ├── lrn.cpp
│   │   ├── lrn.h
│   │   ├── lstm.cpp
│   │   ├── lstm.h
│   │   ├── matmul.cpp
│   │   ├── matmul.h
│   │   ├── memorydata.cpp
│   │   ├── memorydata.h
│   │   ├── mips/
│   │   │   ├── absval_mips.cpp
│   │   │   ├── absval_mips.h
│   │   │   ├── batchnorm_mips.cpp
│   │   │   ├── batchnorm_mips.h
│   │   │   ├── bias_mips.cpp
│   │   │   ├── bias_mips.h
│   │   │   ├── binaryop_mips.cpp
│   │   │   ├── binaryop_mips.h
│   │   │   ├── cast_mips.cpp
│   │   │   ├── cast_mips.h
│   │   │   ├── clip_mips.cpp
│   │   │   ├── clip_mips.h
│   │   │   ├── concat_mips.cpp
│   │   │   ├── concat_mips.h
│   │   │   ├── convolution1d_mips.cpp
│   │   │   ├── convolution1d_mips.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_1x1_int8.h
│   │   │   ├── convolution_1x1_pack1to4_int8.h
│   │   │   ├── convolution_1x1_pack4.h
│   │   │   ├── convolution_1x1_pack4to1.h
│   │   │   ├── convolution_1x1_pack8to1_int8.h
│   │   │   ├── convolution_1x1_pack8to4_int8.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack4.h
│   │   │   ├── convolution_3x3_pack8to1_int8.h
│   │   │   ├── convolution_3x3_pack8to4_int8.h
│   │   │   ├── convolution_7x7_pack1to4.h
│   │   │   ├── convolution_int8.h
│   │   │   ├── convolution_mips.cpp
│   │   │   ├── convolution_mips.h
│   │   │   ├── convolution_mips_mmi.cpp
│   │   │   ├── convolution_pack1to4.h
│   │   │   ├── convolution_pack1to4_int8.h
│   │   │   ├── convolution_pack4.h
│   │   │   ├── convolution_pack4to1.h
│   │   │   ├── convolution_pack8to1_int8.h
│   │   │   ├── convolution_pack8to4_int8.h
│   │   │   ├── convolution_sgemm.h
│   │   │   ├── convolution_sgemm_int8.h
│   │   │   ├── convolution_sgemm_pack1to4_int8.h
│   │   │   ├── convolution_sgemm_pack4.h
│   │   │   ├── convolution_sgemm_pack4to1.h
│   │   │   ├── convolution_sgemm_pack8to1_int8.h
│   │   │   ├── convolution_sgemm_pack8to4_int8.h
│   │   │   ├── convolution_winograd_dot.h
│   │   │   ├── convolution_winograd_dot_int8.h
│   │   │   ├── convolution_winograd_dot_pack4.h
│   │   │   ├── convolution_winograd_dot_pack8to1_int8.h
│   │   │   ├── convolution_winograd_dot_pack8to4_int8.h
│   │   │   ├── convolution_winograd_transform.h
│   │   │   ├── convolution_winograd_transform_int8.h
│   │   │   ├── convolution_winograd_transform_pack4.h
│   │   │   ├── convolution_winograd_transform_pack4_int8.h
│   │   │   ├── convolution_winograd_transform_pack8_int8.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_mips.cpp
│   │   │   ├── convolutiondepthwise_mips.h
│   │   │   ├── crop_mips.cpp
│   │   │   ├── crop_mips.h
│   │   │   ├── deconvolution_mips.cpp
│   │   │   ├── deconvolution_mips.h
│   │   │   ├── deconvolution_pack1to4.h
│   │   │   ├── deconvolution_pack4.h
│   │   │   ├── deconvolution_pack4to1.h
│   │   │   ├── deconvolutiondepthwise_mips.cpp
│   │   │   ├── deconvolutiondepthwise_mips.h
│   │   │   ├── dequantize_mips.cpp
│   │   │   ├── dequantize_mips.h
│   │   │   ├── dropout_mips.cpp
│   │   │   ├── dropout_mips.h
│   │   │   ├── eltwise_mips.cpp
│   │   │   ├── eltwise_mips.h
│   │   │   ├── elu_mips.cpp
│   │   │   ├── elu_mips.h
│   │   │   ├── erf_mips.cpp
│   │   │   ├── erf_mips.h
│   │   │   ├── flatten_mips.cpp
│   │   │   ├── flatten_mips.h
│   │   │   ├── gelu_mips.cpp
│   │   │   ├── gelu_mips.h
│   │   │   ├── hardsigmoid_mips.cpp
│   │   │   ├── hardsigmoid_mips.h
│   │   │   ├── hardswish_mips.cpp
│   │   │   ├── hardswish_mips.h
│   │   │   ├── innerproduct_mips.cpp
│   │   │   ├── innerproduct_mips.h
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_mips.cpp
│   │   │   ├── interp_mips.h
│   │   │   ├── loongson_mmi.h
│   │   │   ├── mips_activation.h
│   │   │   ├── mips_usability.h
│   │   │   ├── mish_mips.cpp
│   │   │   ├── mish_mips.h
│   │   │   ├── msa_mathfun.h
│   │   │   ├── packing_mips.cpp
│   │   │   ├── packing_mips.h
│   │   │   ├── padding_mips.cpp
│   │   │   ├── padding_mips.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── pooling_mips.cpp
│   │   │   ├── pooling_mips.h
│   │   │   ├── prelu_mips.cpp
│   │   │   ├── prelu_mips.h
│   │   │   ├── quantize_mips.cpp
│   │   │   ├── quantize_mips.h
│   │   │   ├── relu_mips.cpp
│   │   │   ├── relu_mips.h
│   │   │   ├── requantize_mips.cpp
│   │   │   ├── requantize_mips.h
│   │   │   ├── selu_mips.cpp
│   │   │   ├── selu_mips.h
│   │   │   ├── sigmoid_mips.cpp
│   │   │   ├── sigmoid_mips.h
│   │   │   ├── slice_mips.cpp
│   │   │   ├── slice_mips.h
│   │   │   ├── softmax_mips.cpp
│   │   │   ├── softmax_mips.h
│   │   │   ├── swish_mips.cpp
│   │   │   ├── swish_mips.h
│   │   │   ├── tanh_mips.cpp
│   │   │   ├── tanh_mips.h
│   │   │   ├── unaryop_mips.cpp
│   │   │   └── unaryop_mips.h
│   │   ├── mish.cpp
│   │   ├── mish.h
│   │   ├── multiheadattention.cpp
│   │   ├── multiheadattention.h
│   │   ├── mvn.cpp
│   │   ├── mvn.h
│   │   ├── noop.cpp
│   │   ├── noop.h
│   │   ├── normalize.cpp
│   │   ├── normalize.h
│   │   ├── packing.cpp
│   │   ├── packing.h
│   │   ├── padding.cpp
│   │   ├── padding.h
│   │   ├── permute.cpp
│   │   ├── permute.h
│   │   ├── pixelshuffle.cpp
│   │   ├── pixelshuffle.h
│   │   ├── pooling.cpp
│   │   ├── pooling.h
│   │   ├── pooling1d.cpp
│   │   ├── pooling1d.h
│   │   ├── pooling3d.cpp
│   │   ├── pooling3d.h
│   │   ├── power.cpp
│   │   ├── power.h
│   │   ├── prelu.cpp
│   │   ├── prelu.h
│   │   ├── priorbox.cpp
│   │   ├── priorbox.h
│   │   ├── proposal.cpp
│   │   ├── proposal.h
│   │   ├── psroipooling.cpp
│   │   ├── psroipooling.h
│   │   ├── quantize.cpp
│   │   ├── quantize.h
│   │   ├── reduction.cpp
│   │   ├── reduction.h
│   │   ├── relu.cpp
│   │   ├── relu.h
│   │   ├── reorg.cpp
│   │   ├── reorg.h
│   │   ├── requantize.cpp
│   │   ├── requantize.h
│   │   ├── reshape.cpp
│   │   ├── reshape.h
│   │   ├── riscv/
│   │   │   ├── absval_riscv.cpp
│   │   │   ├── absval_riscv.h
│   │   │   ├── absval_riscv_zfh.cpp
│   │   │   ├── batchnorm_riscv.cpp
│   │   │   ├── batchnorm_riscv.h
│   │   │   ├── batchnorm_riscv_zfh.cpp
│   │   │   ├── bias_riscv.cpp
│   │   │   ├── bias_riscv.h
│   │   │   ├── bias_riscv_zfh.cpp
│   │   │   ├── binaryop_riscv.cpp
│   │   │   ├── binaryop_riscv.h
│   │   │   ├── binaryop_riscv_zfh.cpp
│   │   │   ├── bnll_riscv.cpp
│   │   │   ├── bnll_riscv.h
│   │   │   ├── bnll_riscv_zfh.cpp
│   │   │   ├── cast_riscv.cpp
│   │   │   ├── cast_riscv.h
│   │   │   ├── cast_riscv_zfh.cpp
│   │   │   ├── celu_riscv.cpp
│   │   │   ├── celu_riscv.h
│   │   │   ├── celu_riscv_zfh.cpp
│   │   │   ├── clip_riscv.cpp
│   │   │   ├── clip_riscv.h
│   │   │   ├── clip_riscv_zfh.cpp
│   │   │   ├── concat_riscv.cpp
│   │   │   ├── concat_riscv.h
│   │   │   ├── convolution1d_riscv.cpp
│   │   │   ├── convolution1d_riscv.h
│   │   │   ├── convolution1d_riscv_zfh.cpp
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_1x1_fp16s.h
│   │   │   ├── convolution_1x1_pack1ton.h
│   │   │   ├── convolution_1x1_pack1ton_fp16s.h
│   │   │   ├── convolution_1x1_packn.h
│   │   │   ├── convolution_1x1_packn_fp16s.h
│   │   │   ├── convolution_1x1_packnto1.h
│   │   │   ├── convolution_1x1_packnto1_fp16s.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_pack1ton.h
│   │   │   ├── convolution_3x3_pack1ton_fp16s.h
│   │   │   ├── convolution_3x3_packn.h
│   │   │   ├── convolution_3x3_packn_fp16s.h
│   │   │   ├── convolution_7x7_pack1ton.h
│   │   │   ├── convolution_7x7_pack1ton_fp16s.h
│   │   │   ├── convolution_fp16s.h
│   │   │   ├── convolution_pack1ton.h
│   │   │   ├── convolution_pack1ton_fp16s.h
│   │   │   ├── convolution_packn.h
│   │   │   ├── convolution_packn_fp16s.h
│   │   │   ├── convolution_packnto1.h
│   │   │   ├── convolution_packnto1_fp16s.h
│   │   │   ├── convolution_riscv.cpp
│   │   │   ├── convolution_riscv.h
│   │   │   ├── convolution_riscv_zfh.cpp
│   │   │   ├── convolution_sgemm.h
│   │   │   ├── convolution_sgemm_fp16s.h
│   │   │   ├── convolution_sgemm_pack1ton.h
│   │   │   ├── convolution_sgemm_pack1ton_fp16s.h
│   │   │   ├── convolution_sgemm_packn.h
│   │   │   ├── convolution_sgemm_packn_fp16s.h
│   │   │   ├── convolution_sgemm_packnto1.h
│   │   │   ├── convolution_sgemm_packnto1_fp16s.h
│   │   │   ├── convolution_winograd_dot.h
│   │   │   ├── convolution_winograd_dot_packn.h
│   │   │   ├── convolution_winograd_dot_packn_fp16s.h
│   │   │   ├── convolution_winograd_transform.h
│   │   │   ├── convolution_winograd_transform_packn.h
│   │   │   ├── convolution_winograd_transform_packn_fp16s.h
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_packn.h
│   │   │   ├── convolutiondepthwise_3x3_packn_fp16s.h
│   │   │   ├── convolutiondepthwise_5x5_packn.h
│   │   │   ├── convolutiondepthwise_5x5_packn_fp16s.h
│   │   │   ├── convolutiondepthwise_riscv.cpp
│   │   │   ├── convolutiondepthwise_riscv.h
│   │   │   ├── convolutiondepthwise_riscv_zfh.cpp
│   │   │   ├── crop_riscv.cpp
│   │   │   ├── crop_riscv.h
│   │   │   ├── deconvolution_fp16s.h
│   │   │   ├── deconvolution_pack1ton.h
│   │   │   ├── deconvolution_pack1ton_fp16s.h
│   │   │   ├── deconvolution_packn.h
│   │   │   ├── deconvolution_packn_fp16s.h
│   │   │   ├── deconvolution_packnto1.h
│   │   │   ├── deconvolution_packnto1_fp16s.h
│   │   │   ├── deconvolution_riscv.cpp
│   │   │   ├── deconvolution_riscv.h
│   │   │   ├── deconvolution_riscv_zfh.cpp
│   │   │   ├── deconvolutiondepthwise_riscv.cpp
│   │   │   ├── deconvolutiondepthwise_riscv.h
│   │   │   ├── deconvolutiondepthwise_riscv_zfh.cpp
│   │   │   ├── deformableconv2d_pack1ton.h
│   │   │   ├── deformableconv2d_packn.h
│   │   │   ├── deformableconv2d_packnto1.h
│   │   │   ├── deformableconv2d_riscv.cpp
│   │   │   ├── deformableconv2d_riscv.h
│   │   │   ├── dropout_riscv.cpp
│   │   │   ├── dropout_riscv.h
│   │   │   ├── eltwise_riscv.cpp
│   │   │   ├── eltwise_riscv.h
│   │   │   ├── eltwise_riscv_zfh.cpp
│   │   │   ├── flatten_riscv.cpp
│   │   │   ├── flatten_riscv.h
│   │   │   ├── gelu_riscv.cpp
│   │   │   ├── gelu_riscv.h
│   │   │   ├── gemm_bf16s_fp16s.h
│   │   │   ├── gemm_fp16s.h
│   │   │   ├── gemm_riscv.cpp
│   │   │   ├── gemm_riscv.h
│   │   │   ├── gemm_riscv_zfh.cpp
│   │   │   ├── gru_riscv.cpp
│   │   │   ├── gru_riscv.h
│   │   │   ├── gru_riscv_zfh.cpp
│   │   │   ├── hardsigmoid_riscv.cpp
│   │   │   ├── hardsigmoid_riscv.h
│   │   │   ├── hardsigmoid_riscv_zfh.cpp
│   │   │   ├── hardswish_riscv.cpp
│   │   │   ├── hardswish_riscv.h
│   │   │   ├── hardswish_riscv_zfh.cpp
│   │   │   ├── innerproduct_riscv.cpp
│   │   │   ├── innerproduct_riscv.h
│   │   │   ├── innerproduct_riscv_zfh.cpp
│   │   │   ├── instancenorm_riscv.cpp
│   │   │   ├── instancenorm_riscv.h
│   │   │   ├── instancenorm_riscv_zfh.cpp
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_fp16s.h
│   │   │   ├── interp_bicubic_packn.h
│   │   │   ├── interp_bicubic_packn_fp16s.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_fp16s.h
│   │   │   ├── interp_bilinear_packn.h
│   │   │   ├── interp_bilinear_packn_fp16s.h
│   │   │   ├── interp_riscv.cpp
│   │   │   ├── interp_riscv.h
│   │   │   ├── interp_riscv_zfh.cpp
│   │   │   ├── layernorm_riscv.cpp
│   │   │   ├── layernorm_riscv.h
│   │   │   ├── layernorm_riscv_zfh.cpp
│   │   │   ├── mish_riscv.cpp
│   │   │   ├── mish_riscv.h
│   │   │   ├── mish_riscv_zfh.cpp
│   │   │   ├── packing_riscv.cpp
│   │   │   ├── packing_riscv.h
│   │   │   ├── padding_packn.h
│   │   │   ├── padding_riscv.cpp
│   │   │   ├── padding_riscv.h
│   │   │   ├── pooling_riscv.cpp
│   │   │   ├── pooling_riscv.h
│   │   │   ├── pooling_riscv_zfh.cpp
│   │   │   ├── prelu_riscv.cpp
│   │   │   ├── prelu_riscv.h
│   │   │   ├── prelu_riscv_zfh.cpp
│   │   │   ├── relu_riscv.cpp
│   │   │   ├── relu_riscv.h
│   │   │   ├── relu_riscv_zfh.cpp
│   │   │   ├── riscv_activation.h
│   │   │   ├── riscv_usability.h
│   │   │   ├── rvv_mathfun.h
│   │   │   ├── rvv_mathfun_fp16s.h
│   │   │   ├── selu_riscv.cpp
│   │   │   ├── selu_riscv.h
│   │   │   ├── shufflechannel_riscv.cpp
│   │   │   ├── shufflechannel_riscv.h
│   │   │   ├── sigmoid_riscv.cpp
│   │   │   ├── sigmoid_riscv.h
│   │   │   ├── sigmoid_riscv_zfh.cpp
│   │   │   ├── softmax_riscv.cpp
│   │   │   ├── softmax_riscv.h
│   │   │   ├── swish_riscv.cpp
│   │   │   ├── swish_riscv.h
│   │   │   ├── swish_riscv_zfh.cpp
│   │   │   ├── tanh_riscv.cpp
│   │   │   ├── tanh_riscv.h
│   │   │   ├── tanh_riscv_zfh.cpp
│   │   │   ├── unaryop_riscv.cpp
│   │   │   ├── unaryop_riscv.h
│   │   │   └── unaryop_riscv_zfh.cpp
│   │   ├── rmsnorm.cpp
│   │   ├── rmsnorm.h
│   │   ├── rnn.cpp
│   │   ├── rnn.h
│   │   ├── roialign.cpp
│   │   ├── roialign.h
│   │   ├── roipooling.cpp
│   │   ├── roipooling.h
│   │   ├── rotaryembed.cpp
│   │   ├── rotaryembed.h
│   │   ├── scale.cpp
│   │   ├── scale.h
│   │   ├── sdpa.cpp
│   │   ├── sdpa.h
│   │   ├── selu.cpp
│   │   ├── selu.h
│   │   ├── shrink.cpp
│   │   ├── shrink.h
│   │   ├── shufflechannel.cpp
│   │   ├── shufflechannel.h
│   │   ├── sigmoid.cpp
│   │   ├── sigmoid.h
│   │   ├── slice.cpp
│   │   ├── slice.h
│   │   ├── softmax.cpp
│   │   ├── softmax.h
│   │   ├── softplus.cpp
│   │   ├── softplus.h
│   │   ├── spectrogram.cpp
│   │   ├── spectrogram.h
│   │   ├── split.cpp
│   │   ├── split.h
│   │   ├── spp.cpp
│   │   ├── spp.h
│   │   ├── squeeze.cpp
│   │   ├── squeeze.h
│   │   ├── statisticspooling.cpp
│   │   ├── statisticspooling.h
│   │   ├── swish.cpp
│   │   ├── swish.h
│   │   ├── tanh.cpp
│   │   ├── tanh.h
│   │   ├── threshold.cpp
│   │   ├── threshold.h
│   │   ├── tile.cpp
│   │   ├── tile.h
│   │   ├── unaryop.cpp
│   │   ├── unaryop.h
│   │   ├── unfold.cpp
│   │   ├── unfold.h
│   │   ├── vulkan/
│   │   │   ├── absval_vulkan.cpp
│   │   │   ├── absval_vulkan.h
│   │   │   ├── batchnorm_vulkan.cpp
│   │   │   ├── batchnorm_vulkan.h
│   │   │   ├── binaryop_vulkan.cpp
│   │   │   ├── binaryop_vulkan.h
│   │   │   ├── cast_vulkan.cpp
│   │   │   ├── cast_vulkan.h
│   │   │   ├── celu_vulkan.cpp
│   │   │   ├── celu_vulkan.h
│   │   │   ├── clip_vulkan.cpp
│   │   │   ├── clip_vulkan.h
│   │   │   ├── concat_vulkan.cpp
│   │   │   ├── concat_vulkan.h
│   │   │   ├── convolution1d_vulkan.cpp
│   │   │   ├── convolution1d_vulkan.h
│   │   │   ├── convolution_vulkan.cpp
│   │   │   ├── convolution_vulkan.h
│   │   │   ├── convolutiondepthwise_vulkan.cpp
│   │   │   ├── convolutiondepthwise_vulkan.h
│   │   │   ├── crop_vulkan.cpp
│   │   │   ├── crop_vulkan.h
│   │   │   ├── deconvolution_vulkan.cpp
│   │   │   ├── deconvolution_vulkan.h
│   │   │   ├── deconvolutiondepthwise_vulkan.cpp
│   │   │   ├── deconvolutiondepthwise_vulkan.h
│   │   │   ├── deepcopy_vulkan.cpp
│   │   │   ├── deepcopy_vulkan.h
│   │   │   ├── dequantize_vulkan.cpp
│   │   │   ├── dequantize_vulkan.h
│   │   │   ├── dropout_vulkan.cpp
│   │   │   ├── dropout_vulkan.h
│   │   │   ├── eltwise_vulkan.cpp
│   │   │   ├── eltwise_vulkan.h
│   │   │   ├── elu_vulkan.cpp
│   │   │   ├── elu_vulkan.h
│   │   │   ├── erf_vulkan.cpp
│   │   │   ├── erf_vulkan.h
│   │   │   ├── flatten_vulkan.cpp
│   │   │   ├── flatten_vulkan.h
│   │   │   ├── gelu_vulkan.cpp
│   │   │   ├── gelu_vulkan.h
│   │   │   ├── gemm_vulkan.cpp
│   │   │   ├── gemm_vulkan.h
│   │   │   ├── groupnorm_vulkan.cpp
│   │   │   ├── groupnorm_vulkan.h
│   │   │   ├── hardsigmoid_vulkan.cpp
│   │   │   ├── hardsigmoid_vulkan.h
│   │   │   ├── hardswish_vulkan.cpp
│   │   │   ├── hardswish_vulkan.h
│   │   │   ├── innerproduct_vulkan.cpp
│   │   │   ├── innerproduct_vulkan.h
│   │   │   ├── instancenorm_vulkan.cpp
│   │   │   ├── instancenorm_vulkan.h
│   │   │   ├── interp_vulkan.cpp
│   │   │   ├── interp_vulkan.h
│   │   │   ├── layernorm_vulkan.cpp
│   │   │   ├── layernorm_vulkan.h
│   │   │   ├── lrn_vulkan.cpp
│   │   │   ├── lrn_vulkan.h
│   │   │   ├── memorydata_vulkan.cpp
│   │   │   ├── memorydata_vulkan.h
│   │   │   ├── mish_vulkan.cpp
│   │   │   ├── mish_vulkan.h
│   │   │   ├── multiheadattention_vulkan.cpp
│   │   │   ├── multiheadattention_vulkan.h
│   │   │   ├── noop_vulkan.cpp
│   │   │   ├── noop_vulkan.h
│   │   │   ├── normalize_vulkan.cpp
│   │   │   ├── normalize_vulkan.h
│   │   │   ├── packing_vulkan.cpp
│   │   │   ├── packing_vulkan.h
│   │   │   ├── padding_vulkan.cpp
│   │   │   ├── padding_vulkan.h
│   │   │   ├── permute_vulkan.cpp
│   │   │   ├── permute_vulkan.h
│   │   │   ├── pixelshuffle_vulkan.cpp
│   │   │   ├── pixelshuffle_vulkan.h
│   │   │   ├── pooling_vulkan.cpp
│   │   │   ├── pooling_vulkan.h
│   │   │   ├── prelu_vulkan.cpp
│   │   │   ├── prelu_vulkan.h
│   │   │   ├── priorbox_vulkan.cpp
│   │   │   ├── priorbox_vulkan.h
│   │   │   ├── quantize_vulkan.cpp
│   │   │   ├── quantize_vulkan.h
│   │   │   ├── reduction_vulkan.cpp
│   │   │   ├── reduction_vulkan.h
│   │   │   ├── relu_vulkan.cpp
│   │   │   ├── relu_vulkan.h
│   │   │   ├── reorg_vulkan.cpp
│   │   │   ├── reorg_vulkan.h
│   │   │   ├── requantize_vulkan.cpp
│   │   │   ├── requantize_vulkan.h
│   │   │   ├── reshape_vulkan.cpp
│   │   │   ├── reshape_vulkan.h
│   │   │   ├── rmsnorm_vulkan.cpp
│   │   │   ├── rmsnorm_vulkan.h
│   │   │   ├── rotaryembed_vulkan.cpp
│   │   │   ├── rotaryembed_vulkan.h
│   │   │   ├── scale_vulkan.cpp
│   │   │   ├── scale_vulkan.h
│   │   │   ├── sdpa_vulkan.cpp
│   │   │   ├── sdpa_vulkan.h
│   │   │   ├── selu_vulkan.cpp
│   │   │   ├── selu_vulkan.h
│   │   │   ├── shader/
│   │   │   │   ├── .clang-format
│   │   │   │   ├── absval.comp
│   │   │   │   ├── batchnorm.comp
│   │   │   │   ├── batchnorm_pack4.comp
│   │   │   │   ├── binaryop.comp
│   │   │   │   ├── binaryop_broadcast.comp
│   │   │   │   ├── binaryop_broadcast_pack1to4.comp
│   │   │   │   ├── binaryop_broadcast_pack4.comp
│   │   │   │   ├── binaryop_pack4.comp
│   │   │   │   ├── cast_fp16_to_fp32.comp
│   │   │   │   ├── cast_fp16_to_fp32_pack4.comp
│   │   │   │   ├── cast_fp32_to_fp16.comp
│   │   │   │   ├── cast_fp32_to_fp16_pack4.comp
│   │   │   │   ├── celu.comp
│   │   │   │   ├── clip.comp
│   │   │   │   ├── concat.comp
│   │   │   │   ├── concat_pack4.comp
│   │   │   │   ├── concat_pack4to1.comp
│   │   │   │   ├── convolution1d_packed.comp
│   │   │   │   ├── convolution_1x1s1d1_cm.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd23_transform_input.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd23_transform_output.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd43_transform_input.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd43_transform_output.comp
│   │   │   │   ├── convolution_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_gemm_cm.comp
│   │   │   │   ├── convolution_pack1to4_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd23_transform_input.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd23_transform_output.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd43_transform_input.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd43_transform_output.comp
│   │   │   │   ├── convolution_pack4_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_pack4to1_3x3s1d1_winograd_gemm.comp
│   │   │   │   ├── convolution_packed.comp
│   │   │   │   ├── convolution_packed_1x1s1d1.comp
│   │   │   │   ├── convolution_packed_gemm.comp
│   │   │   │   ├── convolution_winograd_gemm_cm.comp
│   │   │   │   ├── convolutiondepthwise.comp
│   │   │   │   ├── convolutiondepthwise_group.comp
│   │   │   │   ├── convolutiondepthwise_group_pack1to4.comp
│   │   │   │   ├── convolutiondepthwise_group_pack4.comp
│   │   │   │   ├── convolutiondepthwise_group_pack4to1.comp
│   │   │   │   ├── convolutiondepthwise_pack4.comp
│   │   │   │   ├── crop.comp
│   │   │   │   ├── crop_pack1to4.comp
│   │   │   │   ├── crop_pack4.comp
│   │   │   │   ├── crop_pack4to1.comp
│   │   │   │   ├── deconvolution_col2im.comp
│   │   │   │   ├── deconvolution_gemm_cm.comp
│   │   │   │   ├── deconvolution_gemm_packed.comp
│   │   │   │   ├── deconvolution_pack4_col2im.comp
│   │   │   │   ├── deconvolution_packed.comp
│   │   │   │   ├── deconvolutiondepthwise.comp
│   │   │   │   ├── deconvolutiondepthwise_group.comp
│   │   │   │   ├── deconvolutiondepthwise_group_pack1to4.comp
│   │   │   │   ├── deconvolutiondepthwise_group_pack4.comp
│   │   │   │   ├── deconvolutiondepthwise_group_pack4to1.comp
│   │   │   │   ├── deconvolutiondepthwise_pack4.comp
│   │   │   │   ├── deepcopy.comp
│   │   │   │   ├── deepcopy_pack4.comp
│   │   │   │   ├── dequantize.comp
│   │   │   │   ├── dequantize_pack4.comp
│   │   │   │   ├── dropout.comp
│   │   │   │   ├── eltwise.comp
│   │   │   │   ├── elu.comp
│   │   │   │   ├── erf.comp
│   │   │   │   ├── flatten.comp
│   │   │   │   ├── flatten_pack1to4.comp
│   │   │   │   ├── flatten_pack4.comp
│   │   │   │   ├── gelu.comp
│   │   │   │   ├── gemm.comp
│   │   │   │   ├── gemm_cm.comp
│   │   │   │   ├── gemm_sg.comp
│   │   │   │   ├── groupnorm_coeffs.comp
│   │   │   │   ├── groupnorm_coeffs_pack4.comp
│   │   │   │   ├── groupnorm_norm.comp
│   │   │   │   ├── groupnorm_norm_pack4.comp
│   │   │   │   ├── groupnorm_reduce_mean.comp
│   │   │   │   ├── groupnorm_reduce_mean_pack4.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp32.comp
│   │   │   │   ├── groupnorm_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── groupnorm_sub_mean_square.comp
│   │   │   │   ├── groupnorm_sub_mean_square_pack4.comp
│   │   │   │   ├── hardsigmoid.comp
│   │   │   │   ├── hardswish.comp
│   │   │   │   ├── innerproduct.comp
│   │   │   │   ├── innerproduct_gemm.comp
│   │   │   │   ├── innerproduct_gemm_wp1to4.comp
│   │   │   │   ├── innerproduct_gemm_wp4.comp
│   │   │   │   ├── innerproduct_gemm_wp4to1.comp
│   │   │   │   ├── innerproduct_pack1to4.comp
│   │   │   │   ├── innerproduct_pack4.comp
│   │   │   │   ├── innerproduct_pack4to1.comp
│   │   │   │   ├── innerproduct_reduce_sum8.comp
│   │   │   │   ├── innerproduct_reduce_sum8_pack4.comp
│   │   │   │   ├── innerproduct_sum8.comp
│   │   │   │   ├── innerproduct_sum8_pack1to4.comp
│   │   │   │   ├── innerproduct_sum8_pack4.comp
│   │   │   │   ├── innerproduct_sum8_pack4to1.comp
│   │   │   │   ├── instancenorm_coeffs.comp
│   │   │   │   ├── instancenorm_coeffs_pack4.comp
│   │   │   │   ├── instancenorm_norm.comp
│   │   │   │   ├── instancenorm_norm_pack4.comp
│   │   │   │   ├── instancenorm_reduce_mean.comp
│   │   │   │   ├── instancenorm_reduce_mean_pack4.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp32.comp
│   │   │   │   ├── instancenorm_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── instancenorm_sub_mean_square.comp
│   │   │   │   ├── instancenorm_sub_mean_square_pack4.comp
│   │   │   │   ├── interp.comp
│   │   │   │   ├── interp_bicubic.comp
│   │   │   │   ├── interp_bicubic_coeffs.comp
│   │   │   │   ├── interp_bicubic_pack4.comp
│   │   │   │   ├── interp_pack4.comp
│   │   │   │   ├── layernorm_coeffs.comp
│   │   │   │   ├── layernorm_coeffs_pack4.comp
│   │   │   │   ├── layernorm_norm.comp
│   │   │   │   ├── layernorm_norm_pack4.comp
│   │   │   │   ├── layernorm_reduce_mean.comp
│   │   │   │   ├── layernorm_reduce_mean_pack4.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp32.comp
│   │   │   │   ├── layernorm_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── layernorm_sub_mean_square.comp
│   │   │   │   ├── layernorm_sub_mean_square_pack4.comp
│   │   │   │   ├── lrn_norm.comp
│   │   │   │   ├── lrn_norm_across_channel_pack4.comp
│   │   │   │   ├── lrn_norm_within_channel_pack4.comp
│   │   │   │   ├── lrn_square_pad.comp
│   │   │   │   ├── lrn_square_pad_across_channel_pack4.comp
│   │   │   │   ├── lrn_square_pad_within_channel_pack4.comp
│   │   │   │   ├── mish.comp
│   │   │   │   ├── multiheadattention_qk_cross.comp
│   │   │   │   ├── multiheadattention_qk_cross_pack1to4.comp
│   │   │   │   ├── multiheadattention_qk_cross_pack4.comp
│   │   │   │   ├── multiheadattention_qk_cross_pack4to1.comp
│   │   │   │   ├── multiheadattention_qkv_cross.comp
│   │   │   │   ├── multiheadattention_qkv_cross_pack1to4.comp
│   │   │   │   ├── multiheadattention_qkv_cross_pack4.comp
│   │   │   │   ├── multiheadattention_qkv_cross_pack4to1.comp
│   │   │   │   ├── normalize_coeffs.comp
│   │   │   │   ├── normalize_coeffs_pack4.comp
│   │   │   │   ├── normalize_norm.comp
│   │   │   │   ├── normalize_norm_pack4.comp
│   │   │   │   ├── normalize_reduce_sum4_fp16_to_fp32.comp
│   │   │   │   ├── normalize_reduce_sum4_fp16_to_fp32_pack4.comp
│   │   │   │   ├── normalize_reduce_sum4_fp32.comp
│   │   │   │   ├── normalize_reduce_sum4_fp32_pack4.comp
│   │   │   │   ├── packing.comp
│   │   │   │   ├── packing_int8.comp
│   │   │   │   ├── packing_pack1to4.comp
│   │   │   │   ├── packing_pack1to4_int8.comp
│   │   │   │   ├── packing_pack4to1.comp
│   │   │   │   ├── packing_pack4to1_int8.comp
│   │   │   │   ├── padding.comp
│   │   │   │   ├── padding_3d.comp
│   │   │   │   ├── padding_3d_pack4.comp
│   │   │   │   ├── padding_pack1to4.comp
│   │   │   │   ├── padding_pack4.comp
│   │   │   │   ├── padding_pack4to1.comp
│   │   │   │   ├── permute.comp
│   │   │   │   ├── permute_pack1to4.comp
│   │   │   │   ├── permute_pack4.comp
│   │   │   │   ├── permute_pack4to1.comp
│   │   │   │   ├── pixelshuffle.comp
│   │   │   │   ├── pixelshuffle_pack4.comp
│   │   │   │   ├── pixelshuffle_pack4to1.comp
│   │   │   │   ├── pooling.comp
│   │   │   │   ├── pooling_adaptive.comp
│   │   │   │   ├── pooling_adaptive_pack4.comp
│   │   │   │   ├── pooling_global_reduce_max.comp
│   │   │   │   ├── pooling_global_reduce_max_first.comp
│   │   │   │   ├── pooling_global_reduce_max_first_pack4.comp
│   │   │   │   ├── pooling_global_reduce_max_last.comp
│   │   │   │   ├── pooling_global_reduce_max_last_pack4.comp
│   │   │   │   ├── pooling_global_reduce_max_pack4.comp
│   │   │   │   ├── pooling_global_reduce_sum.comp
│   │   │   │   ├── pooling_global_reduce_sum_first.comp
│   │   │   │   ├── pooling_global_reduce_sum_first_pack4.comp
│   │   │   │   ├── pooling_global_reduce_sum_last.comp
│   │   │   │   ├── pooling_global_reduce_sum_last_pack4.comp
│   │   │   │   ├── pooling_global_reduce_sum_pack4.comp
│   │   │   │   ├── pooling_pack4.comp
│   │   │   │   ├── prelu.comp
│   │   │   │   ├── prelu_pack4.comp
│   │   │   │   ├── priorbox.comp
│   │   │   │   ├── priorbox_mxnet.comp
│   │   │   │   ├── quantize.comp
│   │   │   │   ├── quantize_pack4.comp
│   │   │   │   ├── reduction.comp
│   │   │   │   ├── relu.comp
│   │   │   │   ├── reorg.comp
│   │   │   │   ├── reorg_pack1to4.comp
│   │   │   │   ├── reorg_pack4.comp
│   │   │   │   ├── requantize.comp
│   │   │   │   ├── requantize_pack4.comp
│   │   │   │   ├── reshape.comp
│   │   │   │   ├── reshape_pack1to4.comp
│   │   │   │   ├── reshape_pack4.comp
│   │   │   │   ├── reshape_pack4to1.comp
│   │   │   │   ├── rmsnorm_coeffs.comp
│   │   │   │   ├── rmsnorm_coeffs_pack4.comp
│   │   │   │   ├── rmsnorm_norm.comp
│   │   │   │   ├── rmsnorm_norm_pack4.comp
│   │   │   │   ├── rmsnorm_square.comp
│   │   │   │   ├── rmsnorm_square_pack4.comp
│   │   │   │   ├── rotaryembed.comp
│   │   │   │   ├── rotaryembed_pack4.comp
│   │   │   │   ├── scale.comp
│   │   │   │   ├── scale_pack4.comp
│   │   │   │   ├── sdpa_cross.comp
│   │   │   │   ├── sdpa_cross_cm.comp
│   │   │   │   ├── sdpa_fa.comp
│   │   │   │   ├── sdpa_fa_cm.comp
│   │   │   │   ├── selu.comp
│   │   │   │   ├── shrink.comp
│   │   │   │   ├── shufflechannel.comp
│   │   │   │   ├── shufflechannel_pack4.comp
│   │   │   │   ├── sigmoid.comp
│   │   │   │   ├── slice.comp
│   │   │   │   ├── slice_pack1to4.comp
│   │   │   │   ├── slice_pack4.comp
│   │   │   │   ├── softmax_div_sum.comp
│   │   │   │   ├── softmax_div_sum_pack4.comp
│   │   │   │   ├── softmax_exp_sub_max.comp
│   │   │   │   ├── softmax_exp_sub_max_pack4.comp
│   │   │   │   ├── softmax_reduce_max.comp
│   │   │   │   ├── softmax_reduce_max_pack4.comp
│   │   │   │   ├── softmax_reduce_sum.comp
│   │   │   │   ├── softmax_reduce_sum_pack4.comp
│   │   │   │   ├── softplus.comp
│   │   │   │   ├── swish.comp
│   │   │   │   ├── tanh.comp
│   │   │   │   ├── unaryop.comp
│   │   │   │   ├── unfold_im2col.comp
│   │   │   │   ├── unfold_im2col_pack1to4.comp
│   │   │   │   ├── unfold_im2col_pack4.comp
│   │   │   │   ├── unfold_im2col_pack4to1.comp
│   │   │   │   └── vulkan_activation.comp
│   │   │   ├── shrink_vulkan.cpp
│   │   │   ├── shrink_vulkan.h
│   │   │   ├── shufflechannel_vulkan.cpp
│   │   │   ├── shufflechannel_vulkan.h
│   │   │   ├── sigmoid_vulkan.cpp
│   │   │   ├── sigmoid_vulkan.h
│   │   │   ├── slice_vulkan.cpp
│   │   │   ├── slice_vulkan.h
│   │   │   ├── softmax_vulkan.cpp
│   │   │   ├── softmax_vulkan.h
│   │   │   ├── softplus_vulkan.cpp
│   │   │   ├── softplus_vulkan.h
│   │   │   ├── split_vulkan.cpp
│   │   │   ├── split_vulkan.h
│   │   │   ├── swish_vulkan.cpp
│   │   │   ├── swish_vulkan.h
│   │   │   ├── tanh_vulkan.cpp
│   │   │   ├── tanh_vulkan.h
│   │   │   ├── unaryop_vulkan.cpp
│   │   │   ├── unaryop_vulkan.h
│   │   │   ├── unfold_vulkan.cpp
│   │   │   └── unfold_vulkan.h
│   │   ├── x86/
│   │   │   ├── absval_x86.cpp
│   │   │   ├── absval_x86.h
│   │   │   ├── avx512_mathfun.h
│   │   │   ├── avx_mathfun.h
│   │   │   ├── batchnorm_bf16s.h
│   │   │   ├── batchnorm_x86.cpp
│   │   │   ├── batchnorm_x86.h
│   │   │   ├── batchnorm_x86_avx512bf16.cpp
│   │   │   ├── bias_x86.cpp
│   │   │   ├── bias_x86.h
│   │   │   ├── binaryop_bf16s.h
│   │   │   ├── binaryop_functor.h
│   │   │   ├── binaryop_x86.cpp
│   │   │   ├── binaryop_x86.h
│   │   │   ├── binaryop_x86_avx512bf16.cpp
│   │   │   ├── bnll_x86.cpp
│   │   │   ├── bnll_x86.h
│   │   │   ├── cast_bf16.h
│   │   │   ├── cast_fp16.h
│   │   │   ├── cast_x86.cpp
│   │   │   ├── cast_x86.h
│   │   │   ├── cast_x86_avx2.cpp
│   │   │   ├── cast_x86_avx512bf16.cpp
│   │   │   ├── cast_x86_f16c.cpp
│   │   │   ├── clip_bf16s.h
│   │   │   ├── clip_x86.cpp
│   │   │   ├── clip_x86.h
│   │   │   ├── clip_x86_avx512bf16.cpp
│   │   │   ├── concat_x86.cpp
│   │   │   ├── concat_x86.h
│   │   │   ├── convolution1d_packed.h
│   │   │   ├── convolution1d_x86.cpp
│   │   │   ├── convolution1d_x86.h
│   │   │   ├── convolution_1x1.h
│   │   │   ├── convolution_2x2_pack8.h
│   │   │   ├── convolution_3x3.h
│   │   │   ├── convolution_3x3_int8.h
│   │   │   ├── convolution_3x3_pack16to1.h
│   │   │   ├── convolution_3x3_pack1to4.h
│   │   │   ├── convolution_3x3_pack1to8.h
│   │   │   ├── convolution_3x3_pack8.h
│   │   │   ├── convolution_3x3_pack8to1.h
│   │   │   ├── convolution_3x3_winograd.h
│   │   │   ├── convolution_3x3_winograd_int8.h
│   │   │   ├── convolution_5x5.h
│   │   │   ├── convolution_im2col_gemm.h
│   │   │   ├── convolution_im2col_gemm_int8.h
│   │   │   ├── convolution_packed.h
│   │   │   ├── convolution_packed_int8.h
│   │   │   ├── convolution_x86.cpp
│   │   │   ├── convolution_x86.h
│   │   │   ├── convolution_x86_avx2.cpp
│   │   │   ├── convolution_x86_avx512vnni.cpp
│   │   │   ├── convolution_x86_avxvnni.cpp
│   │   │   ├── convolution_x86_avxvnniint8.cpp
│   │   │   ├── convolution_x86_xop.cpp
│   │   │   ├── convolutiondepthwise_3x3.h
│   │   │   ├── convolutiondepthwise_3x3_int8.h
│   │   │   ├── convolutiondepthwise_3x3_pack16.h
│   │   │   ├── convolutiondepthwise_3x3_pack4.h
│   │   │   ├── convolutiondepthwise_3x3_pack8.h
│   │   │   ├── convolutiondepthwise_5x5_pack16.h
│   │   │   ├── convolutiondepthwise_5x5_pack4.h
│   │   │   ├── convolutiondepthwise_5x5_pack8.h
│   │   │   ├── convolutiondepthwise_x86.cpp
│   │   │   ├── convolutiondepthwise_x86.h
│   │   │   ├── crop_x86.cpp
│   │   │   ├── crop_x86.h
│   │   │   ├── deconvolution_packed.h
│   │   │   ├── deconvolution_x86.cpp
│   │   │   ├── deconvolution_x86.h
│   │   │   ├── deconvolutiondepthwise_x86.cpp
│   │   │   ├── deconvolutiondepthwise_x86.h
│   │   │   ├── deformableconv2d_packed.h
│   │   │   ├── deformableconv2d_x86.cpp
│   │   │   ├── deformableconv2d_x86.h
│   │   │   ├── dequantize_x86.cpp
│   │   │   ├── dequantize_x86.h
│   │   │   ├── dropout_x86.cpp
│   │   │   ├── dropout_x86.h
│   │   │   ├── eltwise_x86.cpp
│   │   │   ├── eltwise_x86.h
│   │   │   ├── elu_x86.cpp
│   │   │   ├── elu_x86.h
│   │   │   ├── erf_x86.cpp
│   │   │   ├── erf_x86.h
│   │   │   ├── flatten_x86.cpp
│   │   │   ├── flatten_x86.h
│   │   │   ├── gelu_x86.cpp
│   │   │   ├── gelu_x86.h
│   │   │   ├── gemm_bf16s.h
│   │   │   ├── gemm_int8.h
│   │   │   ├── gemm_x86.cpp
│   │   │   ├── gemm_x86.h
│   │   │   ├── gemm_x86_avx2.cpp
│   │   │   ├── gemm_x86_avx512vnni.cpp
│   │   │   ├── gemm_x86_avxvnni.cpp
│   │   │   ├── gemm_x86_avxvnniint8.cpp
│   │   │   ├── gemm_x86_xop.cpp
│   │   │   ├── gridsample_bicubic_apply_interpolation.h
│   │   │   ├── gridsample_bicubic_compute_blob.h
│   │   │   ├── gridsample_bilinear_apply_interpolation.h
│   │   │   ├── gridsample_bilinear_compute_blob.h
│   │   │   ├── gridsample_compute_blob.h
│   │   │   ├── gridsample_nearest_apply_interpolation.h
│   │   │   ├── gridsample_nearest_compute_blob.h
│   │   │   ├── gridsample_x86.cpp
│   │   │   ├── gridsample_x86.h
│   │   │   ├── groupnorm_bf16s.h
│   │   │   ├── groupnorm_x86.cpp
│   │   │   ├── groupnorm_x86.h
│   │   │   ├── groupnorm_x86_avx512bf16.cpp
│   │   │   ├── hardsigmoid_x86.cpp
│   │   │   ├── hardsigmoid_x86.h
│   │   │   ├── hardswish_x86.cpp
│   │   │   ├── hardswish_x86.h
│   │   │   ├── innerproduct_fp.h
│   │   │   ├── innerproduct_gemm_fp.h
│   │   │   ├── innerproduct_x86.cpp
│   │   │   ├── innerproduct_x86.h
│   │   │   ├── innerproduct_x86_f16c.cpp
│   │   │   ├── instancenorm_bf16s.h
│   │   │   ├── instancenorm_x86.cpp
│   │   │   ├── instancenorm_x86.h
│   │   │   ├── instancenorm_x86_avx512bf16.cpp
│   │   │   ├── interp_bicubic.h
│   │   │   ├── interp_bicubic_pack16.h
│   │   │   ├── interp_bicubic_pack4.h
│   │   │   ├── interp_bicubic_pack8.h
│   │   │   ├── interp_bilinear.h
│   │   │   ├── interp_bilinear_pack16.h
│   │   │   ├── interp_bilinear_pack4.h
│   │   │   ├── interp_bilinear_pack8.h
│   │   │   ├── interp_x86.cpp
│   │   │   ├── interp_x86.h
│   │   │   ├── interp_x86_avx2.cpp
│   │   │   ├── layernorm_bf16s.h
│   │   │   ├── layernorm_x86.cpp
│   │   │   ├── layernorm_x86.h
│   │   │   ├── layernorm_x86_avx512bf16.cpp
│   │   │   ├── lrn_x86.cpp
│   │   │   ├── lrn_x86.h
│   │   │   ├── lstm_int8.h
│   │   │   ├── lstm_x86.cpp
│   │   │   ├── lstm_x86.h
│   │   │   ├── lstm_x86_avx2.cpp
│   │   │   ├── lstm_x86_avx512vnni.cpp
│   │   │   ├── lstm_x86_avxvnni.cpp
│   │   │   ├── lstm_x86_xop.cpp
│   │   │   ├── matmul_x86.cpp
│   │   │   ├── matmul_x86.h
│   │   │   ├── mish_x86.cpp
│   │   │   ├── mish_x86.h
│   │   │   ├── multiheadattention_x86.cpp
│   │   │   ├── multiheadattention_x86.h
│   │   │   ├── packing_x86.cpp
│   │   │   ├── packing_x86.h
│   │   │   ├── padding_pack16.h
│   │   │   ├── padding_pack16_bf16s_fp16s.h
│   │   │   ├── padding_pack4.h
│   │   │   ├── padding_pack4_bf16s_fp16s.h
│   │   │   ├── padding_pack8.h
│   │   │   ├── padding_pack8_bf16s_fp16s.h
│   │   │   ├── padding_pack8_int8.h
│   │   │   ├── padding_x86.cpp
│   │   │   ├── padding_x86.h
│   │   │   ├── pooling_2x2.h
│   │   │   ├── pooling_2x2_pack16.h
│   │   │   ├── pooling_2x2_pack4.h
│   │   │   ├── pooling_2x2_pack8.h
│   │   │   ├── pooling_3x3_pack16.h
│   │   │   ├── pooling_3x3_pack4.h
│   │   │   ├── pooling_3x3_pack8.h
│   │   │   ├── pooling_x86.cpp
│   │   │   ├── pooling_x86.h
│   │   │   ├── prelu_bf16s.h
│   │   │   ├── prelu_x86.cpp
│   │   │   ├── prelu_x86.h
│   │   │   ├── prelu_x86_avx512bf16.cpp
│   │   │   ├── quantize_x86.cpp
│   │   │   ├── quantize_x86.h
│   │   │   ├── relu_bf16s.h
│   │   │   ├── relu_x86.cpp
│   │   │   ├── relu_x86.h
│   │   │   ├── relu_x86_avx512bf16.cpp
│   │   │   ├── requantize_x86.cpp
│   │   │   ├── requantize_x86.h
│   │   │   ├── reshape_x86.cpp
│   │   │   ├── reshape_x86.h
│   │   │   ├── rmsnorm_bf16s.h
│   │   │   ├── rmsnorm_x86.cpp
│   │   │   ├── rmsnorm_x86.h
│   │   │   ├── rmsnorm_x86_avx512bf16.cpp
│   │   │   ├── roialign_x86.cpp
│   │   │   ├── roialign_x86.h
│   │   │   ├── rotaryembed_x86.cpp
│   │   │   ├── rotaryembed_x86.h
│   │   │   ├── scale_bf16s.h
│   │   │   ├── scale_x86.cpp
│   │   │   ├── scale_x86.h
│   │   │   ├── scale_x86_avx512bf16.cpp
│   │   │   ├── sdpa_x86.cpp
│   │   │   ├── sdpa_x86.h
│   │   │   ├── selu_x86.cpp
│   │   │   ├── selu_x86.h
│   │   │   ├── shufflechannel_x86.cpp
│   │   │   ├── shufflechannel_x86.h
│   │   │   ├── sigmoid_bf16s.h
│   │   │   ├── sigmoid_x86.cpp
│   │   │   ├── sigmoid_x86.h
│   │   │   ├── sigmoid_x86_avx512bf16.cpp
│   │   │   ├── slice_x86.cpp
│   │   │   ├── slice_x86.h
│   │   │   ├── softmax_bf16s.h
│   │   │   ├── softmax_x86.cpp
│   │   │   ├── softmax_x86.h
│   │   │   ├── softmax_x86_avx512bf16.cpp
│   │   │   ├── sse_mathfun.h
│   │   │   ├── swish_bf16s.h
│   │   │   ├── swish_x86.cpp
│   │   │   ├── swish_x86.h
│   │   │   ├── swish_x86_avx512bf16.cpp
│   │   │   ├── tanh_x86.cpp
│   │   │   ├── tanh_x86.h
│   │   │   ├── unaryop_bf16s.h
│   │   │   ├── unaryop_functor.h
│   │   │   ├── unaryop_x86.cpp
│   │   │   ├── unaryop_x86.h
│   │   │   ├── unaryop_x86_avx512bf16.cpp
│   │   │   ├── x86_activation.h
│   │   │   ├── x86_usability.h
│   │   │   ├── yolov3detectionoutput_x86.cpp
│   │   │   └── yolov3detectionoutput_x86.h
│   │   ├── yolodetectionoutput.cpp
│   │   ├── yolodetectionoutput.h
│   │   ├── yolov3detectionoutput.cpp
│   │   └── yolov3detectionoutput.h
│   ├── layer.cpp
│   ├── layer.h
│   ├── layer_declaration.h.in
│   ├── layer_registry.h.in
│   ├── layer_shader_registry.h.in
│   ├── layer_shader_spv_data.h.in
│   ├── layer_shader_type.h
│   ├── layer_shader_type_enum.h.in
│   ├── layer_type.h
│   ├── layer_type_enum.h.in
│   ├── mat.cpp
│   ├── mat.h
│   ├── mat_pixel.cpp
│   ├── mat_pixel_affine.cpp
│   ├── mat_pixel_android.cpp
│   ├── mat_pixel_drawing.cpp
│   ├── mat_pixel_drawing_font.h
│   ├── mat_pixel_resize.cpp
│   ├── mat_pixel_rotate.cpp
│   ├── modelbin.cpp
│   ├── modelbin.h
│   ├── ncnn.pc.in
│   ├── net.cpp
│   ├── net.h
│   ├── option.cpp
│   ├── option.h
│   ├── paramdict.cpp
│   ├── paramdict.h
│   ├── pipeline.cpp
│   ├── pipeline.h
│   ├── pipelinecache.cpp
│   ├── pipelinecache.h
│   ├── platform.h.in
│   ├── ruapu.h
│   ├── simplemath.cpp
│   ├── simplemath.h
│   ├── simpleocv.cpp
│   ├── simpleocv.h
│   ├── simpleomp.cpp
│   ├── simpleomp.h
│   ├── simplestl.cpp
│   ├── simplestl.h
│   ├── simplevk.cpp
│   ├── simplevk.h
│   ├── simplevk.tbd
│   ├── stb_image.h
│   ├── stb_image_write.h
│   └── vulkan_header_fix.h
├── tests/
│   ├── CMakeLists.txt
│   ├── perf/
│   │   ├── CMakeLists.txt
│   │   ├── perf_batchnorm.cpp
│   │   ├── perf_binaryop.cpp
│   │   ├── perf_concat.cpp
│   │   ├── perf_convolution.cpp
│   │   ├── perf_convolutiondepthwise.cpp
│   │   ├── perf_deconvolution.cpp
│   │   ├── perf_innerproduct.cpp
│   │   ├── perf_pooling.cpp
│   │   ├── perf_relu.cpp
│   │   ├── perf_sigmoid.cpp
│   │   ├── perf_softmax.cpp
│   │   ├── perfutil.cpp
│   │   └── perfutil.h
│   ├── prng.h
│   ├── test_absval.cpp
│   ├── test_batchnorm.cpp
│   ├── test_bias.cpp
│   ├── test_binaryop.cpp
│   ├── test_binaryop_1.cpp
│   ├── test_binaryop_2.cpp
│   ├── test_binaryop_3.cpp
│   ├── test_binaryop_4.cpp
│   ├── test_bnll.cpp
│   ├── test_c_api.cpp
│   ├── test_cast.cpp
│   ├── test_celu.cpp
│   ├── test_clip.cpp
│   ├── test_command.cpp
│   ├── test_concat.cpp
│   ├── test_concat_oom.cpp
│   ├── test_convolution.cpp
│   ├── test_convolution1d.cpp
│   ├── test_convolution3d.cpp
│   ├── test_convolution_1.cpp
│   ├── test_convolution_2.cpp
│   ├── test_convolution_3.cpp
│   ├── test_convolution_oom.cpp
│   ├── test_convolutiondepthwise.cpp
│   ├── test_convolutiondepthwise1d.cpp
│   ├── test_convolutiondepthwise3d.cpp
│   ├── test_convolutiondepthwise_1.cpp
│   ├── test_copyto.cpp
│   ├── test_copyto_1.cpp
│   ├── test_cpu.cpp
│   ├── test_crop.cpp
│   ├── test_crop_1.cpp
│   ├── test_crop_2.cpp
│   ├── test_crop_3.cpp
│   ├── test_crop_oom.cpp
│   ├── test_cumulativesum.cpp
│   ├── test_deconvolution.cpp
│   ├── test_deconvolution1d.cpp
│   ├── test_deconvolution3d.cpp
│   ├── test_deconvolutiondepthwise.cpp
│   ├── test_deconvolutiondepthwise1d.cpp
│   ├── test_deconvolutiondepthwise3d.cpp
│   ├── test_deconvolutiondepthwise_1.cpp
│   ├── test_deepcopy.cpp
│   ├── test_deformableconv2d.cpp
│   ├── test_deformableconv2d_1.cpp
│   ├── test_deformableconv2d_2.cpp
│   ├── test_deformableconv2d_3.cpp
│   ├── test_deformableconv2d_4.cpp
│   ├── test_dequantize.cpp
│   ├── test_diag.cpp
│   ├── test_dropout.cpp
│   ├── test_einsum.cpp
│   ├── test_eltwise.cpp
│   ├── test_elu.cpp
│   ├── test_embed.cpp
│   ├── test_erf.cpp
│   ├── test_expanddims.cpp
│   ├── test_expression.cpp
│   ├── test_flatten.cpp
│   ├── test_flip.cpp
│   ├── test_fold.cpp
│   ├── test_gelu.cpp
│   ├── test_gemm_0.h
│   ├── test_gemm_0a.cpp
│   ├── test_gemm_0b.cpp
│   ├── test_gemm_0c.cpp
│   ├── test_gemm_0d.cpp
│   ├── test_gemm_0e.cpp
│   ├── test_gemm_0f.cpp
│   ├── test_gemm_1.h
│   ├── test_gemm_1a.cpp
│   ├── test_gemm_1b.cpp
│   ├── test_gemm_2.h
│   ├── test_gemm_2a.cpp
│   ├── test_gemm_2b.cpp
│   ├── test_gemm_2c.cpp
│   ├── test_gemm_2d.cpp
│   ├── test_gemm_2e.cpp
│   ├── test_gemm_3.cpp
│   ├── test_gemm_4.cpp
│   ├── test_gemm_nt.cpp
│   ├── test_gemm_oom.cpp
│   ├── test_glu.cpp
│   ├── test_gridsample.cpp
│   ├── test_groupnorm.cpp
│   ├── test_gru.cpp
│   ├── test_hardsigmoid.cpp
│   ├── test_hardswish.cpp
│   ├── test_innerproduct.cpp
│   ├── test_instancenorm.cpp
│   ├── test_interp.cpp
│   ├── test_interp_1.cpp
│   ├── test_inversespectrogram.cpp
│   ├── test_layernorm.cpp
│   ├── test_lrn.cpp
│   ├── test_lstm.cpp
│   ├── test_mat_pixel.cpp
│   ├── test_mat_pixel_affine.cpp
│   ├── test_mat_pixel_drawing.cpp
│   ├── test_mat_pixel_resize.cpp
│   ├── test_mat_pixel_rotate.cpp
│   ├── test_matmul.cpp
│   ├── test_memorydata.cpp
│   ├── test_mish.cpp
│   ├── test_multiheadattention.cpp
│   ├── test_multiheadattention_1.cpp
│   ├── test_multiheadattention_kvcache.cpp
│   ├── test_multiheadattention_oom.cpp
│   ├── test_noop.cpp
│   ├── test_normalize.cpp
│   ├── test_packing.cpp
│   ├── test_padding.cpp
│   ├── test_paramdict.cpp
│   ├── test_permute.cpp
│   ├── test_pixelshuffle.cpp
│   ├── test_pooling.cpp
│   ├── test_pooling1d.cpp
│   ├── test_pooling3d.cpp
│   ├── test_power.cpp
│   ├── test_prelu.cpp
│   ├── test_priorbox.cpp
│   ├── test_quantize.cpp
│   ├── test_quantize_oom.cpp
│   ├── test_reduction.cpp
│   ├── test_relu.cpp
│   ├── test_reorg.cpp
│   ├── test_requantize.cpp
│   ├── test_requantize_oom.cpp
│   ├── test_reshape.cpp
│   ├── test_reshape_1.cpp
│   ├── test_reshape_oom.cpp
│   ├── test_rmsnorm.cpp
│   ├── test_rnn.cpp
│   ├── test_roialign.cpp
│   ├── test_roipooling.cpp
│   ├── test_rotaryembed.cpp
│   ├── test_rotaryembed_oom.cpp
│   ├── test_scale.cpp
│   ├── test_sdpa.cpp
│   ├── test_sdpa_kvcache.cpp
│   ├── test_sdpa_oom.cpp
│   ├── test_selu.cpp
│   ├── test_shrink.cpp
│   ├── test_shufflechannel.cpp
│   ├── test_sigmoid.cpp
│   ├── test_slice.cpp
│   ├── test_slice_oom.cpp
│   ├── test_softmax.cpp
│   ├── test_softmax_oom.cpp
│   ├── test_softplus.cpp
│   ├── test_spectrogram.cpp
│   ├── test_squeeze.cpp
│   ├── test_squeezenet.cpp
│   ├── test_swish.cpp
│   ├── test_tanh.cpp
│   ├── test_tile.cpp
│   ├── test_tile_oom.cpp
│   ├── test_unaryop.cpp
│   ├── test_unfold.cpp
│   ├── test_yolov3detectionoutput.cpp
│   ├── testutil.cpp
│   └── testutil.h
├── toolchains/
│   ├── aarch64-linux-gnu-c.toolchain.cmake
│   ├── aarch64-linux-gnu.toolchain.cmake
│   ├── aarch64-qnx.toolchain.cmake
│   ├── anykav500.toolchain.cmake
│   ├── arm-linux-gnueabi-c.toolchain.cmake
│   ├── arm-linux-gnueabi.toolchain.cmake
│   ├── arm-linux-gnueabihf-vfpv3-d16.toolchain.cmake
│   ├── arm-linux-gnueabihf.toolchain.cmake
│   ├── c906-v310.toolchain.cmake
│   ├── c907-rv32-v310.toolchain.cmake
│   ├── c907-v310.toolchain.cmake
│   ├── c908-v310.toolchain.cmake
│   ├── c910-v310.toolchain.cmake
│   ├── esp32.toolchain.cmake
│   ├── himix100.toolchain.cmake
│   ├── himix200.toolchain.cmake
│   ├── himix210.toolchain.cmake
│   ├── hisiv300.toolchain.cmake
│   ├── hisiv500.toolchain.cmake
│   ├── hisiv600.toolchain.cmake
│   ├── host-c.clang.toolchain.cmake
│   ├── host-c.gcc.toolchain.cmake
│   ├── host.clang-m32.toolchain.cmake
│   ├── host.gcc-c++03.toolchain.cmake
│   ├── host.gcc-m32.toolchain.cmake
│   ├── host.gcc.toolchain.cmake
│   ├── ingenic-x2000.toolchain.cmake
│   ├── ios.toolchain.cmake
│   ├── iossimxc-x64.toolchain.cmake
│   ├── iossimxc.toolchain.cmake
│   ├── iosxc-arm64.toolchain.cmake
│   ├── iosxc.toolchain.cmake
│   ├── jetson.toolchain.cmake
│   ├── k1.llvm.toolchain.cmake
│   ├── k1.toolchain.cmake
│   ├── loongarch64-linux-gnu.toolchain.cmake
│   ├── loongarch64-unknown-linux-gnu.toolchain.cmake
│   ├── loongson2f-linux-gnuabi64.toolchain.cmake
│   ├── mips-mti-linux-gnu.toolchain.cmake
│   ├── mips32r2-linux-gnu.toolchain.cmake
│   ├── mips64el-linux-gnuabi64.toolchain.cmake
│   ├── mipsel-linux-gnu.toolchain.cmake
│   ├── mipsisa32r6el-linux-gnu.toolchain.cmake
│   ├── mipsisa64r6el-linux-gnuabi64.toolchain.cmake
│   ├── pi3.toolchain.cmake
│   ├── power8le-linux-gnu-vsx.clang.toolchain.cmake
│   ├── power8le-linux-gnu-vsx.toolchain.cmake
│   ├── power9le-linux-gnu-vsx.clang.toolchain.cmake
│   ├── power9le-linux-gnu-vsx.toolchain.cmake
│   ├── powerpc-linux-gnu.toolchain.cmake
│   ├── powerpc64le-linux-gnu.toolchain.cmake
│   ├── riscv32-unknown-elf.toolchain.cmake
│   ├── riscv64-linux-gnu.toolchain.cmake
│   ├── riscv64-unknown-elf.toolchain.cmake
│   ├── riscv64-unknown-linux-gnu.llvm-toolchain.cmake
│   ├── riscv64-unknown-linux-gnu.toolchain.cmake
│   ├── v831.toolchain.cmake
│   ├── windows-xp-clang.toolchain.cmake
│   ├── windows-xp-mingw.toolchain.cmake
│   └── windows-xp-msvc.toolchain.cmake
└── tools/
    ├── CMakeLists.txt
    ├── caffe/
    │   ├── CMakeLists.txt
    │   ├── caffe.proto
    │   └── caffe2ncnn.cpp
    ├── darknet/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   └── darknet2ncnn.cpp
    ├── keras/
    │   └── readme.md
    ├── mlir/
    │   ├── CMakeLists.txt
    │   ├── fix_td.sh
    │   ├── mlir2ncnn.cpp
    │   ├── ncnn_dialect.cpp
    │   ├── ncnn_dialect.h
    │   ├── ncnn_ops.td
    │   ├── ncnn_rewriter.cpp
    │   ├── ncnn_rewriter.td
    │   ├── tf_attributes.cc
    │   ├── tf_attributes.h
    │   ├── tf_dialect.cpp
    │   ├── tf_dialect.h
    │   ├── tf_generated_ops.td
    │   ├── tf_op_base.td
    │   ├── tf_ops.td
    │   ├── tf_side_effects.h
    │   ├── tf_traits.h
    │   ├── tf_types.cc
    │   ├── tf_types.def
    │   └── tf_types.h
    ├── modelwriter.h
    ├── mxnet/
    │   ├── CMakeLists.txt
    │   └── mxnet2ncnn.cpp
    ├── ncnn2mem.cpp
    ├── ncnnmerge.cpp
    ├── ncnnoptimize.cpp
    ├── onnx/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── onnx.proto
    │   └── onnx2ncnn.cpp
    ├── plugin/
    │   ├── ImageWatchNCNN.natvis
    │   ├── ImageWatchNNIE.natvis
    │   └── README.md
    ├── pnnx/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── cmake/
    │   │   └── PNNXPyTorch.cmake
    │   ├── python/
    │   │   ├── README.md
    │   │   ├── examples/
    │   │   │   ├── convert.py
    │   │   │   └── export.py
    │   │   ├── pnnx/
    │   │   │   ├── __init__.py
    │   │   │   └── utils/
    │   │   │       ├── __init__.py
    │   │   │       ├── convert.py
    │   │   │       ├── export.py
    │   │   │       └── utils.py
    │   │   ├── requirements.txt
    │   │   ├── setup.py
    │   │   └── tests/
    │   │       ├── test_convert.py
    │   │       ├── test_dynamicinput_convert.py
    │   │       ├── test_dynamicinput_export.py
    │   │       ├── test_export.py
    │   │       ├── test_naiveinput_convert.py
    │   │       └── test_naiveinput_export.py
    │   ├── src/
    │   │   ├── CMakeLists.txt
    │   │   ├── ir.cpp
    │   │   ├── ir.h
    │   │   ├── load_onnx.cpp
    │   │   ├── load_onnx.h
    │   │   ├── load_tnn.cpp
    │   │   ├── load_tnn.h
    │   │   ├── load_torchscript.cpp
    │   │   ├── load_torchscript.h
    │   │   ├── main.cpp
    │   │   ├── onnx-data.proto
    │   │   ├── onnx-ml.proto
    │   │   ├── onnx-operators-ml.proto
    │   │   ├── pass_level0/
    │   │   │   ├── constant_unpooling.cpp
    │   │   │   ├── constant_unpooling.h
    │   │   │   ├── convert_half_to_float.cpp
    │   │   │   ├── convert_half_to_float.h
    │   │   │   ├── flatten_input.cpp
    │   │   │   ├── flatten_input.h
    │   │   │   ├── inline_block.cpp
    │   │   │   ├── inline_block.h
    │   │   │   ├── reset_device.cpp
    │   │   │   ├── reset_device.h
    │   │   │   ├── shape_inference.cpp
    │   │   │   └── shape_inference.h
    │   │   ├── pass_level0.cpp
    │   │   ├── pass_level0.h
    │   │   ├── pass_level1/
    │   │   │   ├── fuse_module_pass.cpp
    │   │   │   ├── fuse_module_pass.h
    │   │   │   ├── nn_AdaptiveAvgPool1d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool2d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool3d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool1d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool2d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool3d.cpp
    │   │   │   ├── nn_AlphaDropout.cpp
    │   │   │   ├── nn_AvgPool1d.cpp
    │   │   │   ├── nn_AvgPool2d.cpp
    │   │   │   ├── nn_AvgPool3d.cpp
    │   │   │   ├── nn_BatchNorm1d.cpp
    │   │   │   ├── nn_BatchNorm2d.cpp
    │   │   │   ├── nn_BatchNorm3d.cpp
    │   │   │   ├── nn_CELU.cpp
    │   │   │   ├── nn_ChannelShuffle.cpp
    │   │   │   ├── nn_ConstantPad1d.cpp
    │   │   │   ├── nn_ConstantPad2d.cpp
    │   │   │   ├── nn_ConstantPad3d.cpp
    │   │   │   ├── nn_Conv1d.cpp
    │   │   │   ├── nn_Conv2d.cpp
    │   │   │   ├── nn_Conv3d.cpp
    │   │   │   ├── nn_ConvTranspose1d.cpp
    │   │   │   ├── nn_ConvTranspose2d.cpp
    │   │   │   ├── nn_ConvTranspose3d.cpp
    │   │   │   ├── nn_Dropout.cpp
    │   │   │   ├── nn_Dropout2d.cpp
    │   │   │   ├── nn_Dropout3d.cpp
    │   │   │   ├── nn_ELU.cpp
    │   │   │   ├── nn_Embedding.cpp
    │   │   │   ├── nn_Fold.cpp
    │   │   │   ├── nn_GELU.cpp
    │   │   │   ├── nn_GLU.cpp
    │   │   │   ├── nn_GRU.cpp
    │   │   │   ├── nn_GroupNorm.cpp
    │   │   │   ├── nn_Hardshrink.cpp
    │   │   │   ├── nn_Hardsigmoid.cpp
    │   │   │   ├── nn_Hardswish.cpp
    │   │   │   ├── nn_Hardtanh.cpp
    │   │   │   ├── nn_InstanceNorm1d.cpp
    │   │   │   ├── nn_InstanceNorm2d.cpp
    │   │   │   ├── nn_InstanceNorm3d.cpp
    │   │   │   ├── nn_LPPool1d.cpp
    │   │   │   ├── nn_LPPool2d.cpp
    │   │   │   ├── nn_LSTM.cpp
    │   │   │   ├── nn_LayerNorm.cpp
    │   │   │   ├── nn_LeakyReLU.cpp
    │   │   │   ├── nn_Linear.cpp
    │   │   │   ├── nn_LocalResponseNorm.cpp
    │   │   │   ├── nn_LogSigmoid.cpp
    │   │   │   ├── nn_LogSoftmax.cpp
    │   │   │   ├── nn_MaxPool1d.cpp
    │   │   │   ├── nn_MaxPool2d.cpp
    │   │   │   ├── nn_MaxPool3d.cpp
    │   │   │   ├── nn_Mish.cpp
    │   │   │   ├── nn_MultiheadAttention.cpp
    │   │   │   ├── nn_PReLU.cpp
    │   │   │   ├── nn_PixelShuffle.cpp
    │   │   │   ├── nn_PixelUnshuffle.cpp
    │   │   │   ├── nn_RMSNorm.cpp
    │   │   │   ├── nn_RNN.cpp
    │   │   │   ├── nn_RReLU.cpp
    │   │   │   ├── nn_ReLU.cpp
    │   │   │   ├── nn_ReLU6.cpp
    │   │   │   ├── nn_ReflectionPad1d.cpp
    │   │   │   ├── nn_ReflectionPad2d.cpp
    │   │   │   ├── nn_ReplicationPad1d.cpp
    │   │   │   ├── nn_ReplicationPad2d.cpp
    │   │   │   ├── nn_ReplicationPad3d.cpp
    │   │   │   ├── nn_SELU.cpp
    │   │   │   ├── nn_SiLU.cpp
    │   │   │   ├── nn_Sigmoid.cpp
    │   │   │   ├── nn_Softmax.cpp
    │   │   │   ├── nn_Softmax2d.cpp
    │   │   │   ├── nn_Softmin.cpp
    │   │   │   ├── nn_Softplus.cpp
    │   │   │   ├── nn_Softshrink.cpp
    │   │   │   ├── nn_Softsign.cpp
    │   │   │   ├── nn_Tanh.cpp
    │   │   │   ├── nn_Tanhshrink.cpp
    │   │   │   ├── nn_Threshold.cpp
    │   │   │   ├── nn_Unfold.cpp
    │   │   │   ├── nn_Upsample.cpp
    │   │   │   ├── nn_UpsamplingBilinear2d.cpp
    │   │   │   ├── nn_UpsamplingNearest2d.cpp
    │   │   │   ├── nn_ZeroPad2d.cpp
    │   │   │   ├── nn_maxunpool2d.cpp
    │   │   │   ├── nn_quantized_Conv2d.cpp
    │   │   │   ├── nn_quantized_DeQuantize.cpp
    │   │   │   ├── nn_quantized_Linear.cpp
    │   │   │   ├── nn_quantized_Quantize.cpp
    │   │   │   ├── torchvision_DeformConv2d.cpp
    │   │   │   └── torchvision_RoIAlign.cpp
    │   │   ├── pass_level1.cpp
    │   │   ├── pass_level1.h
    │   │   ├── pass_level2/
    │   │   │   ├── F_adaptive_avg_pool1d.cpp
    │   │   │   ├── F_adaptive_avg_pool2d.cpp
    │   │   │   ├── F_adaptive_avg_pool3d.cpp
    │   │   │   ├── F_adaptive_max_pool1d.cpp
    │   │   │   ├── F_adaptive_max_pool2d.cpp
    │   │   │   ├── F_adaptive_max_pool3d.cpp
    │   │   │   ├── F_affine_grid.cpp
    │   │   │   ├── F_alpha_dropout.cpp
    │   │   │   ├── F_avg_pool1d.cpp
    │   │   │   ├── F_avg_pool2d.cpp
    │   │   │   ├── F_avg_pool3d.cpp
    │   │   │   ├── F_batch_norm.cpp
    │   │   │   ├── F_celu.cpp
    │   │   │   ├── F_conv1d.cpp
    │   │   │   ├── F_conv2d.cpp
    │   │   │   ├── F_conv3d.cpp
    │   │   │   ├── F_conv_transpose1d.cpp
    │   │   │   ├── F_conv_transpose2d.cpp
    │   │   │   ├── F_conv_transpose3d.cpp
    │   │   │   ├── F_dropout.cpp
    │   │   │   ├── F_dropout23d.cpp
    │   │   │   ├── F_elu.cpp
    │   │   │   ├── F_embedding.cpp
    │   │   │   ├── F_feature_alpha_dropout.cpp
    │   │   │   ├── F_fold.cpp
    │   │   │   ├── F_gelu.cpp
    │   │   │   ├── F_glu.cpp
    │   │   │   ├── F_grid_sample.cpp
    │   │   │   ├── F_group_norm.cpp
    │   │   │   ├── F_hardshrink.cpp
    │   │   │   ├── F_hardsigmoid.cpp
    │   │   │   ├── F_hardswish.cpp
    │   │   │   ├── F_hardtanh.cpp
    │   │   │   ├── F_instance_norm.cpp
    │   │   │   ├── F_interpolate.cpp
    │   │   │   ├── F_layer_norm.cpp
    │   │   │   ├── F_leaky_relu.cpp
    │   │   │   ├── F_linear.cpp
    │   │   │   ├── F_local_response_norm.cpp
    │   │   │   ├── F_log_softmax.cpp
    │   │   │   ├── F_logsigmoid.cpp
    │   │   │   ├── F_lp_pool1d.cpp
    │   │   │   ├── F_lp_pool2d.cpp
    │   │   │   ├── F_max_pool1d.cpp
    │   │   │   ├── F_max_pool2d.cpp
    │   │   │   ├── F_max_pool3d.cpp
    │   │   │   ├── F_mish.cpp
    │   │   │   ├── F_normalize.cpp
    │   │   │   ├── F_pad.cpp
    │   │   │   ├── F_pairwise_distance.cpp
    │   │   │   ├── F_pixel_shuffle.cpp
    │   │   │   ├── F_pixel_unshuffle.cpp
    │   │   │   ├── F_prelu.cpp
    │   │   │   ├── F_relu.cpp
    │   │   │   ├── F_relu6.cpp
    │   │   │   ├── F_rms_norm.cpp
    │   │   │   ├── F_rrelu.cpp
    │   │   │   ├── F_scaled_dot_product_attention.cpp
    │   │   │   ├── F_selu.cpp
    │   │   │   ├── F_sigmoid.cpp
    │   │   │   ├── F_silu.cpp
    │   │   │   ├── F_softmax.cpp
    │   │   │   ├── F_softmin.cpp
    │   │   │   ├── F_softplus.cpp
    │   │   │   ├── F_softshrink.cpp
    │   │   │   ├── F_softsign.cpp
    │   │   │   ├── F_tanh.cpp
    │   │   │   ├── F_tanhshrink.cpp
    │   │   │   ├── F_threshold.cpp
    │   │   │   ├── F_unfold.cpp
    │   │   │   ├── F_upsample.cpp
    │   │   │   ├── F_upsample_bilinear.cpp
    │   │   │   ├── F_upsample_nearest.cpp
    │   │   │   ├── README.md
    │   │   │   ├── Tensor_copy.cpp
    │   │   │   ├── Tensor_expand.cpp
    │   │   │   ├── Tensor_expand_as.cpp
    │   │   │   ├── Tensor_fill.cpp
    │   │   │   ├── Tensor_index.cpp
    │   │   │   ├── Tensor_index_put.cpp
    │   │   │   ├── Tensor_masked_fill.cpp
    │   │   │   ├── Tensor_new_empty.cpp
    │   │   │   ├── Tensor_new_ones.cpp
    │   │   │   ├── Tensor_new_zeros.cpp
    │   │   │   ├── Tensor_permute.cpp
    │   │   │   ├── Tensor_repeat.cpp
    │   │   │   ├── Tensor_reshape.cpp
    │   │   │   ├── Tensor_reshape_as.cpp
    │   │   │   ├── Tensor_select.cpp
    │   │   │   ├── Tensor_size.cpp
    │   │   │   ├── Tensor_slice.cpp
    │   │   │   ├── Tensor_to.cpp
    │   │   │   ├── Tensor_type_as.cpp
    │   │   │   ├── Tensor_unflatten.cpp
    │   │   │   ├── eliminate_contiguous.cpp
    │   │   │   ├── eliminate_contiguous.h
    │   │   │   ├── eliminate_size_numtotensor_int.cpp
    │   │   │   ├── eliminate_size_numtotensor_int.h
    │   │   │   ├── functionize.cpp
    │   │   │   ├── functionize.h
    │   │   │   ├── fuse_constantlist.cpp
    │   │   │   ├── fuse_constantlist.h
    │   │   │   ├── nn_GRU.cpp
    │   │   │   ├── nn_LSTM.cpp
    │   │   │   ├── nn_RNN.cpp
    │   │   │   ├── nn_quantized_FloatFunctional.cpp
    │   │   │   ├── torch_addmm.cpp
    │   │   │   ├── torch_amax.cpp
    │   │   │   ├── torch_amin.cpp
    │   │   │   ├── torch_arange.cpp
    │   │   │   ├── torch_argmax.cpp
    │   │   │   ├── torch_argmin.cpp
    │   │   │   ├── torch_as_strided.cpp
    │   │   │   ├── torch_baddbmm.cpp
    │   │   │   ├── torch_bitwise_and.cpp
    │   │   │   ├── torch_bitwise_left_shift.cpp
    │   │   │   ├── torch_bitwise_not.cpp
    │   │   │   ├── torch_bitwise_or.cpp
    │   │   │   ├── torch_bitwise_right_shift.cpp
    │   │   │   ├── torch_bitwise_xor.cpp
    │   │   │   ├── torch_bmm.cpp
    │   │   │   ├── torch_cat.cpp
    │   │   │   ├── torch_chunk.cpp
    │   │   │   ├── torch_clamp.cpp
    │   │   │   ├── torch_clone.cpp
    │   │   │   ├── torch_complex.cpp
    │   │   │   ├── torch_cross.cpp
    │   │   │   ├── torch_cumprod.cpp
    │   │   │   ├── torch_cumsum.cpp
    │   │   │   ├── torch_dequantize.cpp
    │   │   │   ├── torch_diag.cpp
    │   │   │   ├── torch_einsum.cpp
    │   │   │   ├── torch_empty.cpp
    │   │   │   ├── torch_empty_like.cpp
    │   │   │   ├── torch_eq.cpp
    │   │   │   ├── torch_fft_fft.cpp
    │   │   │   ├── torch_fft_fft2.cpp
    │   │   │   ├── torch_fft_fftn.cpp
    │   │   │   ├── torch_fft_hfft.cpp
    │   │   │   ├── torch_fft_hfft2.cpp
    │   │   │   ├── torch_fft_hfftn.cpp
    │   │   │   ├── torch_fft_ifft.cpp
    │   │   │   ├── torch_fft_ifft2.cpp
    │   │   │   ├── torch_fft_ifftn.cpp
    │   │   │   ├── torch_fft_ihfft.cpp
    │   │   │   ├── torch_fft_ihfft2.cpp
    │   │   │   ├── torch_fft_ihfftn.cpp
    │   │   │   ├── torch_fft_irfft.cpp
    │   │   │   ├── torch_fft_irfft2.cpp
    │   │   │   ├── torch_fft_irfftn.cpp
    │   │   │   ├── torch_fft_rfft.cpp
    │   │   │   ├── torch_fft_rfft2.cpp
    │   │   │   ├── torch_fft_rfftn.cpp
    │   │   │   ├── torch_flatten.cpp
    │   │   │   ├── torch_flip.cpp
    │   │   │   ├── torch_full.cpp
    │   │   │   ├── torch_full_like.cpp
    │   │   │   ├── torch_gather.cpp
    │   │   │   ├── torch_ge.cpp
    │   │   │   ├── torch_gt.cpp
    │   │   │   ├── torch_imag.cpp
    │   │   │   ├── torch_index_select.cpp
    │   │   │   ├── torch_istft.cpp
    │   │   │   ├── torch_le.cpp
    │   │   │   ├── torch_lgamma.cpp
    │   │   │   ├── torch_logical_and.cpp
    │   │   │   ├── torch_logical_not.cpp
    │   │   │   ├── torch_logical_or.cpp
    │   │   │   ├── torch_logical_xor.cpp
    │   │   │   ├── torch_logsumexp.cpp
    │   │   │   ├── torch_lt.cpp
    │   │   │   ├── torch_masked_select.cpp
    │   │   │   ├── torch_matmul.cpp
    │   │   │   ├── torch_max.cpp
    │   │   │   ├── torch_mean.cpp
    │   │   │   ├── torch_min.cpp
    │   │   │   ├── torch_mm.cpp
    │   │   │   ├── torch_mv.cpp
    │   │   │   ├── torch_narrow.cpp
    │   │   │   ├── torch_ne.cpp
    │   │   │   ├── torch_norm.cpp
    │   │   │   ├── torch_normal.cpp
    │   │   │   ├── torch_ones.cpp
    │   │   │   ├── torch_ones_like.cpp
    │   │   │   ├── torch_positive.cpp
    │   │   │   ├── torch_prod.cpp
    │   │   │   ├── torch_quantize_per_tensor.cpp
    │   │   │   ├── torch_randn.cpp
    │   │   │   ├── torch_randn_like.cpp
    │   │   │   ├── torch_real.cpp
    │   │   │   ├── torch_repeat_interleave.cpp
    │   │   │   ├── torch_roll.cpp
    │   │   │   ├── torch_scatter_add.cpp
    │   │   │   ├── torch_slice_scatter.cpp
    │   │   │   ├── torch_split.cpp
    │   │   │   ├── torch_squeeze.cpp
    │   │   │   ├── torch_stack.cpp
    │   │   │   ├── torch_std.cpp
    │   │   │   ├── torch_stft.cpp
    │   │   │   ├── torch_sum.cpp
    │   │   │   ├── torch_t.cpp
    │   │   │   ├── torch_tensor_split.cpp
    │   │   │   ├── torch_tile.cpp
    │   │   │   ├── torch_topk.cpp
    │   │   │   ├── torch_transpose.cpp
    │   │   │   ├── torch_unbind.cpp
    │   │   │   ├── torch_unsqueeze.cpp
    │   │   │   ├── torch_var.cpp
    │   │   │   ├── torch_view_as_complex.cpp
    │   │   │   ├── torch_view_as_real.cpp
    │   │   │   ├── torch_where.cpp
    │   │   │   ├── torch_zeros.cpp
    │   │   │   ├── torch_zeros_like.cpp
    │   │   │   ├── torchaudio_F_inverse_spectrogram.cpp
    │   │   │   └── torchaudio_F_spectrogram.cpp
    │   │   ├── pass_level2.cpp
    │   │   ├── pass_level2.h
    │   │   ├── pass_level3/
    │   │   │   ├── assign_unique_name.cpp
    │   │   │   ├── assign_unique_name.h
    │   │   │   ├── eliminate_noop_math.cpp
    │   │   │   ├── eliminate_noop_math.h
    │   │   │   ├── eliminate_squeeze_unsqueeze_pair.cpp
    │   │   │   ├── eliminate_squeeze_unsqueeze_pair.h
    │   │   │   ├── eliminate_tuple_pair.cpp
    │   │   │   ├── eliminate_tuple_pair.h
    │   │   │   ├── expand_quantization_modules.cpp
    │   │   │   ├── expand_quantization_modules.h
    │   │   │   ├── fuse_dynamic_adaptive_pool.cpp
    │   │   │   ├── fuse_dynamic_adaptive_pool.h
    │   │   │   ├── fuse_einsum_operands.cpp
    │   │   │   ├── fuse_einsum_operands.h
    │   │   │   ├── fuse_expression.cpp
    │   │   │   ├── fuse_expression.h
    │   │   │   ├── fuse_index_expression.cpp
    │   │   │   ├── fuse_index_expression.h
    │   │   │   ├── fuse_maxpool_unpack.cpp
    │   │   │   ├── fuse_maxpool_unpack.h
    │   │   │   ├── fuse_multiheadattention_unpack.cpp
    │   │   │   ├── fuse_multiheadattention_unpack.h
    │   │   │   ├── fuse_op1ton_unpack.cpp
    │   │   │   ├── fuse_op1ton_unpack.h
    │   │   │   ├── fuse_opnto1_tensors.cpp
    │   │   │   ├── fuse_opnto1_tensors.h
    │   │   │   ├── fuse_rnn_unpack.cpp
    │   │   │   ├── fuse_rnn_unpack.h
    │   │   │   ├── rename_F_dropoutnd.cpp
    │   │   │   └── rename_F_dropoutnd.h
    │   │   ├── pass_level3.cpp
    │   │   ├── pass_level3.h
    │   │   ├── pass_level4/
    │   │   │   ├── attribute_pooling.cpp
    │   │   │   ├── attribute_pooling.h
    │   │   │   ├── canonicalize.cpp
    │   │   │   ├── canonicalize.h
    │   │   │   ├── dead_code_elimination.cpp
    │   │   │   ├── dead_code_elimination.h
    │   │   │   ├── fuse_custom_op.cpp
    │   │   │   └── fuse_custom_op.h
    │   │   ├── pass_level4.cpp
    │   │   ├── pass_level4.h
    │   │   ├── pass_level5/
    │   │   │   ├── attribute_unpooling.cpp
    │   │   │   ├── attribute_unpooling.h
    │   │   │   ├── eliminate_dropout.cpp
    │   │   │   ├── eliminate_dropout.h
    │   │   │   ├── eliminate_identity_operator.cpp
    │   │   │   ├── eliminate_identity_operator.h
    │   │   │   ├── eliminate_maxpool_indices.cpp
    │   │   │   ├── eliminate_maxpool_indices.h
    │   │   │   ├── eliminate_noop_cat.cpp
    │   │   │   ├── eliminate_noop_cat.h
    │   │   │   ├── eliminate_noop_einsum.cpp
    │   │   │   ├── eliminate_noop_einsum.h
    │   │   │   ├── eliminate_noop_expand.cpp
    │   │   │   ├── eliminate_noop_expand.h
    │   │   │   ├── eliminate_noop_expression.cpp
    │   │   │   ├── eliminate_noop_expression.h
    │   │   │   ├── eliminate_noop_pad.cpp
    │   │   │   ├── eliminate_noop_pad.h
    │   │   │   ├── eliminate_noop_permute.cpp
    │   │   │   ├── eliminate_noop_permute.h
    │   │   │   ├── eliminate_noop_reshape.cpp
    │   │   │   ├── eliminate_noop_reshape.h
    │   │   │   ├── eliminate_noop_slice.cpp
    │   │   │   ├── eliminate_noop_slice.h
    │   │   │   ├── eliminate_noop_upsample.cpp
    │   │   │   ├── eliminate_noop_upsample.h
    │   │   │   ├── eliminate_reshape_shape_expression.cpp
    │   │   │   ├── eliminate_reshape_shape_expression.h
    │   │   │   ├── eliminate_type_as.cpp
    │   │   │   ├── eliminate_type_as.h
    │   │   │   ├── eval_expression.cpp
    │   │   │   ├── eval_expression.h
    │   │   │   ├── fold_constants.cpp
    │   │   │   ├── fold_constants.h
    │   │   │   ├── fuse_adjacent_permute.cpp
    │   │   │   ├── fuse_adjacent_permute.h
    │   │   │   ├── fuse_adjacent_reshape.cpp
    │   │   │   ├── fuse_adjacent_reshape.h
    │   │   │   ├── fuse_channel_shuffle.cpp
    │   │   │   ├── fuse_channel_shuffle.h
    │   │   │   ├── fuse_constant_expression.cpp
    │   │   │   ├── fuse_constant_expression.h
    │   │   │   ├── fuse_conv1d_batchnorm1d.cpp
    │   │   │   ├── fuse_conv1d_batchnorm1d.h
    │   │   │   ├── fuse_conv2d_batchnorm2d.cpp
    │   │   │   ├── fuse_conv2d_batchnorm2d.h
    │   │   │   ├── fuse_conv3d_batchnorm3d.cpp
    │   │   │   ├── fuse_conv3d_batchnorm3d.h
    │   │   │   ├── fuse_convtranspose1d_batchnorm1d.cpp
    │   │   │   ├── fuse_convtranspose1d_batchnorm1d.h
    │   │   │   ├── fuse_convtranspose2d_batchnorm2d.cpp
    │   │   │   ├── fuse_convtranspose2d_batchnorm2d.h
    │   │   │   ├── fuse_convtranspose3d_batchnorm3d.cpp
    │   │   │   ├── fuse_convtranspose3d_batchnorm3d.h
    │   │   │   ├── fuse_layernorm.cpp
    │   │   │   ├── fuse_layernorm.h
    │   │   │   ├── fuse_linear_batchnorm1d.cpp
    │   │   │   ├── fuse_linear_batchnorm1d.h
    │   │   │   ├── fuse_multiheadattention.cpp
    │   │   │   ├── fuse_multiheadattention.h
    │   │   │   ├── fuse_multiheadattention_sameqkv.cpp
    │   │   │   ├── fuse_multiheadattention_sameqkv.h
    │   │   │   ├── fuse_pad_conv1d.cpp
    │   │   │   ├── fuse_pad_conv1d.h
    │   │   │   ├── fuse_pad_conv2d.cpp
    │   │   │   ├── fuse_pad_conv2d.h
    │   │   │   ├── fuse_pixel_shuffle.cpp
    │   │   │   ├── fuse_pixel_shuffle.h
    │   │   │   ├── fuse_pixel_unshuffle.cpp
    │   │   │   ├── fuse_pixel_unshuffle.h
    │   │   │   ├── fuse_rmsnorm.cpp
    │   │   │   ├── fuse_rmsnorm.h
    │   │   │   ├── fuse_scaled_dot_product_attention.cpp
    │   │   │   ├── fuse_scaled_dot_product_attention.h
    │   │   │   ├── fuse_select_to_unbind.cpp
    │   │   │   ├── fuse_select_to_unbind.h
    │   │   │   ├── fuse_silu.cpp
    │   │   │   ├── fuse_silu.h
    │   │   │   ├── fuse_slice_copy.cpp
    │   │   │   ├── fuse_slice_copy.h
    │   │   │   ├── fuse_slice_indices.cpp
    │   │   │   ├── fuse_slice_indices.h
    │   │   │   ├── fuse_slice_squeeze_to_select.cpp
    │   │   │   ├── fuse_slice_squeeze_to_select.h
    │   │   │   ├── fuse_slice_to_tensor_split.cpp
    │   │   │   ├── fuse_slice_to_tensor_split.h
    │   │   │   ├── fuse_static_batchnorm.cpp
    │   │   │   ├── fuse_static_batchnorm.h
    │   │   │   ├── fuse_static_conv.cpp
    │   │   │   ├── fuse_static_conv.h
    │   │   │   ├── fuse_static_convtranspose.cpp
    │   │   │   ├── fuse_static_convtranspose.h
    │   │   │   ├── fuse_static_embedding.cpp
    │   │   │   ├── fuse_static_embedding.h
    │   │   │   ├── fuse_static_groupnorm.cpp
    │   │   │   ├── fuse_static_groupnorm.h
    │   │   │   ├── fuse_static_instancenorm.cpp
    │   │   │   ├── fuse_static_instancenorm.h
    │   │   │   ├── fuse_static_layernorm.cpp
    │   │   │   ├── fuse_static_layernorm.h
    │   │   │   ├── fuse_static_linear.cpp
    │   │   │   ├── fuse_static_linear.h
    │   │   │   ├── fuse_static_prelu.cpp
    │   │   │   ├── fuse_static_prelu.h
    │   │   │   ├── fuse_static_rmsnorm.cpp
    │   │   │   ├── fuse_static_rmsnorm.h
    │   │   │   ├── fuse_transformers_multiheadattention.cpp
    │   │   │   ├── fuse_transformers_multiheadattention.h
    │   │   │   ├── fuse_transformers_scaled_dot_product_attention.cpp
    │   │   │   ├── fuse_transformers_scaled_dot_product_attention.h
    │   │   │   ├── normalize_einsum_equation.cpp
    │   │   │   ├── normalize_einsum_equation.h
    │   │   │   ├── unroll_rnn_op.cpp
    │   │   │   └── unroll_rnn_op.h
    │   │   ├── pass_level5.cpp
    │   │   ├── pass_level5.h
    │   │   ├── pass_ncnn/
    │   │   │   ├── F_adaptive_avg_pool1d.cpp
    │   │   │   ├── F_adaptive_avg_pool2d.cpp
    │   │   │   ├── F_adaptive_avg_pool3d.cpp
    │   │   │   ├── F_adaptive_max_pool1d.cpp
    │   │   │   ├── F_adaptive_max_pool2d.cpp
    │   │   │   ├── F_adaptive_max_pool3d.cpp
    │   │   │   ├── F_avg_pool1d.cpp
    │   │   │   ├── F_avg_pool2d.cpp
    │   │   │   ├── F_avg_pool3d.cpp
    │   │   │   ├── F_batch_norm.cpp
    │   │   │   ├── F_celu.cpp
    │   │   │   ├── F_conv1d.cpp
    │   │   │   ├── F_conv2d.cpp
    │   │   │   ├── F_conv3d.cpp
    │   │   │   ├── F_conv_transpose1d.cpp
    │   │   │   ├── F_conv_transpose2d.cpp
    │   │   │   ├── F_conv_transpose3d.cpp
    │   │   │   ├── F_elu.cpp
    │   │   │   ├── F_embedding.cpp
    │   │   │   ├── F_fold.cpp
    │   │   │   ├── F_gelu.cpp
    │   │   │   ├── F_glu.cpp
    │   │   │   ├── F_grid_sample.cpp
    │   │   │   ├── F_group_norm.cpp
    │   │   │   ├── F_hardshrink.cpp
    │   │   │   ├── F_hardsigmoid.cpp
    │   │   │   ├── F_hardswish.cpp
    │   │   │   ├── F_hardtanh.cpp
    │   │   │   ├── F_instance_norm.cpp
    │   │   │   ├── F_interpolate.cpp
    │   │   │   ├── F_layer_norm.cpp
    │   │   │   ├── F_leaky_relu.cpp
    │   │   │   ├── F_linear.cpp
    │   │   │   ├── F_local_response_norm.cpp
    │   │   │   ├── F_log_softmax.cpp
    │   │   │   ├── F_logsigmoid.cpp
    │   │   │   ├── F_max_pool1d.cpp
    │   │   │   ├── F_max_pool2d.cpp
    │   │   │   ├── F_max_pool3d.cpp
    │   │   │   ├── F_mish.cpp
    │   │   │   ├── F_normalize.cpp
    │   │   │   ├── F_pad.cpp
    │   │   │   ├── F_pixel_shuffle.cpp
    │   │   │   ├── F_pixel_unshuffle.cpp
    │   │   │   ├── F_prelu.cpp
    │   │   │   ├── F_relu.cpp
    │   │   │   ├── F_relu6.cpp
    │   │   │   ├── F_rms_norm.cpp
    │   │   │   ├── F_scaled_dot_product_attention.cpp
    │   │   │   ├── F_selu.cpp
    │   │   │   ├── F_sigmoid.cpp
    │   │   │   ├── F_silu.cpp
    │   │   │   ├── F_softmax.cpp
    │   │   │   ├── F_softplus.cpp
    │   │   │   ├── F_softshrink.cpp
    │   │   │   ├── F_tanh.cpp
    │   │   │   ├── F_unfold.cpp
    │   │   │   ├── F_upsample.cpp
    │   │   │   ├── F_upsample_bilinear.cpp
    │   │   │   ├── F_upsample_nearest.cpp
    │   │   │   ├── Tensor_expand.cpp
    │   │   │   ├── Tensor_permute.cpp
    │   │   │   ├── Tensor_repeat.cpp
    │   │   │   ├── Tensor_reshape.cpp
    │   │   │   ├── Tensor_reshape_as.cpp
    │   │   │   ├── Tensor_unflatten.cpp
    │   │   │   ├── chain_multi_output.cpp
    │   │   │   ├── chain_multi_output.h
    │   │   │   ├── convert_Tensor_select.cpp
    │   │   │   ├── convert_Tensor_select.h
    │   │   │   ├── convert_Tensor_slice.cpp
    │   │   │   ├── convert_Tensor_slice.h
    │   │   │   ├── convert_Tensor_slice_copy.cpp
    │   │   │   ├── convert_Tensor_slice_copy.h
    │   │   │   ├── convert_attribute.cpp
    │   │   │   ├── convert_attribute.h
    │   │   │   ├── convert_custom_op.cpp
    │   │   │   ├── convert_custom_op.h
    │   │   │   ├── convert_half_to_float.cpp
    │   │   │   ├── convert_half_to_float.h
    │   │   │   ├── convert_input.cpp
    │   │   │   ├── convert_input.h
    │   │   │   ├── convert_module_op.cpp
    │   │   │   ├── convert_module_op.h
    │   │   │   ├── convert_reshape_interp_expression.cpp
    │   │   │   ├── convert_reshape_interp_expression.h
    │   │   │   ├── convert_slice_expression.cpp
    │   │   │   ├── convert_slice_expression.h
    │   │   │   ├── convert_torch_cat.cpp
    │   │   │   ├── convert_torch_cat.h
    │   │   │   ├── convert_torch_chunk.cpp
    │   │   │   ├── convert_torch_chunk.h
    │   │   │   ├── convert_torch_einsum.cpp
    │   │   │   ├── convert_torch_einsum.h
    │   │   │   ├── convert_torch_split.cpp
    │   │   │   ├── convert_torch_split.h
    │   │   │   ├── convert_torch_stack.cpp
    │   │   │   ├── convert_torch_stack.h
    │   │   │   ├── convert_torch_tensor_split.cpp
    │   │   │   ├── convert_torch_tensor_split.h
    │   │   │   ├── convert_torch_unbind.cpp
    │   │   │   ├── convert_torch_unbind.h
    │   │   │   ├── eliminate_noop.cpp
    │   │   │   ├── eliminate_noop.h
    │   │   │   ├── eliminate_output.cpp
    │   │   │   ├── eliminate_output.h
    │   │   │   ├── expand_expression.cpp
    │   │   │   ├── expand_expression.h
    │   │   │   ├── fuse_binaryop_eltwise.cpp
    │   │   │   ├── fuse_binaryop_eltwise.h
    │   │   │   ├── fuse_convert_rotaryembed.cpp
    │   │   │   ├── fuse_convert_rotaryembed.h
    │   │   │   ├── fuse_convert_shufflechannel_slice.cpp
    │   │   │   ├── fuse_convert_shufflechannel_slice.h
    │   │   │   ├── fuse_convolution1d_activation.cpp
    │   │   │   ├── fuse_convolution1d_activation.h
    │   │   │   ├── fuse_convolution_activation.cpp
    │   │   │   ├── fuse_convolution_activation.h
    │   │   │   ├── fuse_convolutiondepthwise1d_activation.cpp
    │   │   │   ├── fuse_convolutiondepthwise1d_activation.h
    │   │   │   ├── fuse_convolutiondepthwise_activation.cpp
    │   │   │   ├── fuse_convolutiondepthwise_activation.h
    │   │   │   ├── fuse_deconvolution_activation.cpp
    │   │   │   ├── fuse_deconvolution_activation.h
    │   │   │   ├── fuse_deconvolutiondepthwise_activation.cpp
    │   │   │   ├── fuse_deconvolutiondepthwise_activation.h
    │   │   │   ├── fuse_innerproduct_activation.cpp
    │   │   │   ├── fuse_innerproduct_activation.h
    │   │   │   ├── fuse_padding_convolution.cpp
    │   │   │   ├── fuse_padding_convolution.h
    │   │   │   ├── fuse_padding_convolutiondepthwise.cpp
    │   │   │   ├── fuse_padding_convolutiondepthwise.h
    │   │   │   ├── fuse_transpose_matmul.cpp
    │   │   │   ├── fuse_transpose_matmul.h
    │   │   │   ├── insert_reshape_global_pooling.cpp
    │   │   │   ├── insert_reshape_global_pooling.h
    │   │   │   ├── insert_reshape_linear.cpp
    │   │   │   ├── insert_reshape_linear.h
    │   │   │   ├── insert_reshape_numpy_binaryop_broadcast.cpp
    │   │   │   ├── insert_reshape_numpy_binaryop_broadcast.h
    │   │   │   ├── insert_reshape_pooling.cpp
    │   │   │   ├── insert_reshape_pooling.h
    │   │   │   ├── insert_split.cpp
    │   │   │   ├── insert_split.h
    │   │   │   ├── nn_AdaptiveAvgPool1d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool2d.cpp
    │   │   │   ├── nn_AdaptiveAvgPool3d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool1d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool2d.cpp
    │   │   │   ├── nn_AdaptiveMaxPool3d.cpp
    │   │   │   ├── nn_AvgPool1d.cpp
    │   │   │   ├── nn_AvgPool2d.cpp
    │   │   │   ├── nn_AvgPool3d.cpp
    │   │   │   ├── nn_BatchNorm1d.cpp
    │   │   │   ├── nn_BatchNorm2d.cpp
    │   │   │   ├── nn_BatchNorm3d.cpp
    │   │   │   ├── nn_CELU.cpp
    │   │   │   ├── nn_ChannelShuffle.cpp
    │   │   │   ├── nn_ConstantPad1d.cpp
    │   │   │   ├── nn_ConstantPad2d.cpp
    │   │   │   ├── nn_ConstantPad3d.cpp
    │   │   │   ├── nn_Conv1d.cpp
    │   │   │   ├── nn_Conv2d.cpp
    │   │   │   ├── nn_Conv3d.cpp
    │   │   │   ├── nn_ConvTranspose1d.cpp
    │   │   │   ├── nn_ConvTranspose2d.cpp
    │   │   │   ├── nn_ConvTranspose3d.cpp
    │   │   │   ├── nn_ELU.cpp
    │   │   │   ├── nn_Embedding.cpp
    │   │   │   ├── nn_Fold.cpp
    │   │   │   ├── nn_GELU.cpp
    │   │   │   ├── nn_GLU.cpp
    │   │   │   ├── nn_GRU.cpp
    │   │   │   ├── nn_GroupNorm.cpp
    │   │   │   ├── nn_Hardshrink.cpp
    │   │   │   ├── nn_Hardsigmoid.cpp
    │   │   │   ├── nn_Hardswish.cpp
    │   │   │   ├── nn_Hardtanh.cpp
    │   │   │   ├── nn_InstanceNorm2d.cpp
    │   │   │   ├── nn_LSTM.cpp
    │   │   │   ├── nn_LayerNorm.cpp
    │   │   │   ├── nn_LeakyReLU.cpp
    │   │   │   ├── nn_Linear.cpp
    │   │   │   ├── nn_LocalResponseNorm.cpp
    │   │   │   ├── nn_LogSigmoid.cpp
    │   │   │   ├── nn_LogSoftmax.cpp
    │   │   │   ├── nn_MaxPool1d.cpp
    │   │   │   ├── nn_MaxPool2d.cpp
    │   │   │   ├── nn_MaxPool3d.cpp
    │   │   │   ├── nn_Mish.cpp
    │   │   │   ├── nn_MultiheadAttention.cpp
    │   │   │   ├── nn_PReLU.cpp
    │   │   │   ├── nn_PixelShuffle.cpp
    │   │   │   ├── nn_PixelUnshuffle.cpp
    │   │   │   ├── nn_RMSNorm.cpp
    │   │   │   ├── nn_RNN.cpp
    │   │   │   ├── nn_ReLU.cpp
    │   │   │   ├── nn_ReLU6.cpp
    │   │   │   ├── nn_ReflectionPad1d.cpp
    │   │   │   ├── nn_ReflectionPad2d.cpp
    │   │   │   ├── nn_ReplicationPad1d.cpp
    │   │   │   ├── nn_ReplicationPad2d.cpp
    │   │   │   ├── nn_ReplicationPad3d.cpp
    │   │   │   ├── nn_SELU.cpp
    │   │   │   ├── nn_SiLU.cpp
    │   │   │   ├── nn_Sigmoid.cpp
    │   │   │   ├── nn_Softmax.cpp
    │   │   │   ├── nn_Softmax2d.cpp
    │   │   │   ├── nn_Softplus.cpp
    │   │   │   ├── nn_Softshrink.cpp
    │   │   │   ├── nn_Tanh.cpp
    │   │   │   ├── nn_Unfold.cpp
    │   │   │   ├── nn_Upsample.cpp
    │   │   │   ├── nn_UpsamplingBilinear2d.cpp
    │   │   │   ├── nn_UpsamplingNearest2d.cpp
    │   │   │   ├── nn_ZeroPad2d.cpp
    │   │   │   ├── solve_batch_index.cpp
    │   │   │   ├── solve_batch_index.h
    │   │   │   ├── torch_addmm.cpp
    │   │   │   ├── torch_amax.cpp
    │   │   │   ├── torch_amin.cpp
    │   │   │   ├── torch_bmm.cpp
    │   │   │   ├── torch_clamp.cpp
    │   │   │   ├── torch_clone.cpp
    │   │   │   ├── torch_cumsum.cpp
    │   │   │   ├── torch_diag.cpp
    │   │   │   ├── torch_flatten.cpp
    │   │   │   ├── torch_flip.cpp
    │   │   │   ├── torch_istft.cpp
    │   │   │   ├── torch_logsumexp.cpp
    │   │   │   ├── torch_matmul.cpp
    │   │   │   ├── torch_max.cpp
    │   │   │   ├── torch_mean.cpp
    │   │   │   ├── torch_min.cpp
    │   │   │   ├── torch_mm.cpp
    │   │   │   ├── torch_norm.cpp
    │   │   │   ├── torch_prod.cpp
    │   │   │   ├── torch_roll.cpp
    │   │   │   ├── torch_slice_scatter.cpp
    │   │   │   ├── torch_squeeze.cpp
    │   │   │   ├── torch_stft.cpp
    │   │   │   ├── torch_sum.cpp
    │   │   │   ├── torch_t.cpp
    │   │   │   ├── torch_transpose.cpp
    │   │   │   ├── torch_unsqueeze.cpp
    │   │   │   ├── torchaudio_F_inverse_spectrogram.cpp
    │   │   │   ├── torchaudio_F_spectrogram.cpp
    │   │   │   └── torchvision_DeformConv2d.cpp
    │   │   ├── pass_ncnn.cpp
    │   │   ├── pass_ncnn.h
    │   │   ├── pass_onnx/
    │   │   │   ├── canonicalize.cpp
    │   │   │   ├── canonicalize.h
    │   │   │   ├── dead_code_elimination.cpp
    │   │   │   ├── dead_code_elimination.h
    │   │   │   ├── eliminate_initializer_input.cpp
    │   │   │   ├── eliminate_initializer_input.h
    │   │   │   ├── eliminate_noop.cpp
    │   │   │   ├── eliminate_noop.h
    │   │   │   ├── fold_constants.cpp
    │   │   │   ├── fold_constants.h
    │   │   │   ├── fuse_constant_as_attribute.cpp
    │   │   │   ├── fuse_constant_as_attribute.h
    │   │   │   ├── inline_containers.cpp
    │   │   │   ├── inline_containers.h
    │   │   │   ├── inline_if_graph.cpp
    │   │   │   ├── inline_if_graph.h
    │   │   │   ├── model_stat.cpp
    │   │   │   ├── model_stat.h
    │   │   │   ├── shape_inference.cpp
    │   │   │   └── shape_inference.h
    │   │   ├── pass_onnx.cpp
    │   │   ├── pass_onnx.h
    │   │   ├── pass_tnn/
    │   │   │   ├── fuse_shape_list_construct.cpp
    │   │   │   ├── fuse_shape_list_construct.h
    │   │   │   ├── fuse_shape_size.cpp
    │   │   │   ├── fuse_shape_size.h
    │   │   │   ├── lower_concat.cpp
    │   │   │   ├── lower_concat.h
    │   │   │   ├── lower_convolution_activation.cpp
    │   │   │   ├── lower_convolution_activation.h
    │   │   │   ├── lower_power.cpp
    │   │   │   └── lower_power.h
    │   │   ├── save_ncnn.cpp
    │   │   ├── save_ncnn.h
    │   │   ├── save_onnx.cpp
    │   │   ├── save_onnx.h
    │   │   ├── storezip.cpp
    │   │   ├── storezip.h
    │   │   ├── utils.cpp
    │   │   └── utils.h
    │   └── tests/
    │       ├── CMakeLists.txt
    │       ├── ncnn/
    │       │   ├── CMakeLists.txt
    │       │   ├── test_F_adaptive_avg_pool1d.py
    │       │   ├── test_F_adaptive_avg_pool2d.py
    │       │   ├── test_F_adaptive_avg_pool3d.py
    │       │   ├── test_F_adaptive_max_pool1d.py
    │       │   ├── test_F_adaptive_max_pool2d.py
    │       │   ├── test_F_adaptive_max_pool3d.py
    │       │   ├── test_F_alpha_dropout.py
    │       │   ├── test_F_avg_pool1d.py
    │       │   ├── test_F_avg_pool2d.py
    │       │   ├── test_F_avg_pool3d.py
    │       │   ├── test_F_batch_norm.py
    │       │   ├── test_F_celu.py
    │       │   ├── test_F_conv1d.py
    │       │   ├── test_F_conv2d.py
    │       │   ├── test_F_conv3d.py
    │       │   ├── test_F_conv_transpose1d.py
    │       │   ├── test_F_conv_transpose2d.py
    │       │   ├── test_F_conv_transpose3d.py
    │       │   ├── test_F_dropout.py
    │       │   ├── test_F_dropout2d.py
    │       │   ├── test_F_dropout3d.py
    │       │   ├── test_F_elu.py
    │       │   ├── test_F_embedding.py
    │       │   ├── test_F_feature_alpha_dropout.py
    │       │   ├── test_F_fold.py
    │       │   ├── test_F_gelu.py
    │       │   ├── test_F_glu.py
    │       │   ├── test_F_grid_sample.py
    │       │   ├── test_F_group_norm.py
    │       │   ├── test_F_hardshrink.py
    │       │   ├── test_F_hardsigmoid.py
    │       │   ├── test_F_hardswish.py
    │       │   ├── test_F_hardtanh.py
    │       │   ├── test_F_interpolate.py
    │       │   ├── test_F_layer_norm.py
    │       │   ├── test_F_leaky_relu.py
    │       │   ├── test_F_local_response_norm.py
    │       │   ├── test_F_log_softmax.py
    │       │   ├── test_F_logsigmoid.py
    │       │   ├── test_F_max_pool1d.py
    │       │   ├── test_F_max_pool2d.py
    │       │   ├── test_F_max_pool3d.py
    │       │   ├── test_F_mish.py
    │       │   ├── test_F_normalize.py
    │       │   ├── test_F_pad.py
    │       │   ├── test_F_pixel_shuffle.py
    │       │   ├── test_F_pixel_unshuffle.py
    │       │   ├── test_F_prelu.py
    │       │   ├── test_F_relu.py
    │       │   ├── test_F_relu6.py
    │       │   ├── test_F_rms_norm.py
    │       │   ├── test_F_scaled_dot_product_attention.py
    │       │   ├── test_F_selu.py
    │       │   ├── test_F_sigmoid.py
    │       │   ├── test_F_silu.py
    │       │   ├── test_F_softmax.py
    │       │   ├── test_F_softshrink.py
    │       │   ├── test_F_tanh.py
    │       │   ├── test_F_unfold.py
    │       │   ├── test_F_upsample.py
    │       │   ├── test_F_upsample_bilinear.py
    │       │   ├── test_F_upsample_nearest.py
    │       │   ├── test_Tensor_expand.py
    │       │   ├── test_Tensor_permute.py
    │       │   ├── test_Tensor_repeat.py
    │       │   ├── test_Tensor_reshape.py
    │       │   ├── test_Tensor_reshape_as.py
    │       │   ├── test_Tensor_slice.py
    │       │   ├── test_Tensor_slice_copy.py
    │       │   ├── test_Tensor_unflatten.py
    │       │   ├── test_Tensor_view.py
    │       │   ├── test_convnext_tiny.py
    │       │   ├── test_mobilenet_v2.py
    │       │   ├── test_mobilenet_v3_small.py
    │       │   ├── test_ncnn_fuse_binaryop_eltwise.py
    │       │   ├── test_ncnn_fuse_pad_conv.py
    │       │   ├── test_ncnn_fuse_shufflechannel_slice.py
    │       │   ├── test_ncnn_fuse_transpose_matmul.py
    │       │   ├── test_ncnn_interp_expr.py
    │       │   ├── test_ncnn_numpy_binaryop_broadcast.py
    │       │   ├── test_ncnn_reshape_expr.py
    │       │   ├── test_ncnn_slice_expr.py
    │       │   ├── test_ncnn_solve_batch_index.py
    │       │   ├── test_nn_AdaptiveAvgPool1d.py
    │       │   ├── test_nn_AdaptiveAvgPool2d.py
    │       │   ├── test_nn_AdaptiveAvgPool3d.py
    │       │   ├── test_nn_AdaptiveMaxPool1d.py
    │       │   ├── test_nn_AdaptiveMaxPool2d.py
    │       │   ├── test_nn_AdaptiveMaxPool3d.py
    │       │   ├── test_nn_AlphaDropout.py
    │       │   ├── test_nn_AvgPool1d.py
    │       │   ├── test_nn_AvgPool2d.py
    │       │   ├── test_nn_AvgPool3d.py
    │       │   ├── test_nn_BatchNorm1d.py
    │       │   ├── test_nn_BatchNorm2d.py
    │       │   ├── test_nn_BatchNorm3d.py
    │       │   ├── test_nn_CELU.py
    │       │   ├── test_nn_ChannelShuffle.py
    │       │   ├── test_nn_ConstantPad1d.py
    │       │   ├── test_nn_ConstantPad2d.py
    │       │   ├── test_nn_ConstantPad3d.py
    │       │   ├── test_nn_Conv1d.py
    │       │   ├── test_nn_Conv2d.py
    │       │   ├── test_nn_Conv3d.py
    │       │   ├── test_nn_ConvTranspose1d.py
    │       │   ├── test_nn_ConvTranspose2d.py
    │       │   ├── test_nn_ConvTranspose3d.py
    │       │   ├── test_nn_Dropout.py
    │       │   ├── test_nn_Dropout2d.py
    │       │   ├── test_nn_Dropout3d.py
    │       │   ├── test_nn_ELU.py
    │       │   ├── test_nn_Embedding.py
    │       │   ├── test_nn_Fold.py
    │       │   ├── test_nn_GELU.py
    │       │   ├── test_nn_GLU.py
    │       │   ├── test_nn_GRU.py
    │       │   ├── test_nn_GroupNorm.py
    │       │   ├── test_nn_Hardshrink.py
    │       │   ├── test_nn_Hardsigmoid.py
    │       │   ├── test_nn_Hardswish.py
    │       │   ├── test_nn_Hardtanh.py
    │       │   ├── test_nn_Identity.py
    │       │   ├── test_nn_InstanceNorm2d.py
    │       │   ├── test_nn_LSTM.py
    │       │   ├── test_nn_LayerNorm.py
    │       │   ├── test_nn_LeakyReLU.py
    │       │   ├── test_nn_Linear.py
    │       │   ├── test_nn_LocalResponseNorm.py
    │       │   ├── test_nn_LogSigmoid.py
    │       │   ├── test_nn_LogSoftmax.py
    │       │   ├── test_nn_MaxPool1d.py
    │       │   ├── test_nn_MaxPool2d.py
    │       │   ├── test_nn_MaxPool3d.py
    │       │   ├── test_nn_Mish.py
    │       │   ├── test_nn_MultiheadAttention.py
    │       │   ├── test_nn_PReLU.py
    │       │   ├── test_nn_PixelShuffle.py
    │       │   ├── test_nn_PixelUnshuffle.py
    │       │   ├── test_nn_RMSNorm.py
    │       │   ├── test_nn_RNN.py
    │       │   ├── test_nn_ReLU.py
    │       │   ├── test_nn_ReLU6.py
    │       │   ├── test_nn_ReflectionPad1d.py
    │       │   ├── test_nn_ReflectionPad2d.py
    │       │   ├── test_nn_ReplicationPad1d.py
    │       │   ├── test_nn_ReplicationPad2d.py
    │       │   ├── test_nn_ReplicationPad3d.py
    │       │   ├── test_nn_SELU.py
    │       │   ├── test_nn_SiLU.py
    │       │   ├── test_nn_Sigmoid.py
    │       │   ├── test_nn_Softmax.py
    │       │   ├── test_nn_Softmax2d.py
    │       │   ├── test_nn_Softshrink.py
    │       │   ├── test_nn_Tanh.py
    │       │   ├── test_nn_Unfold.py
    │       │   ├── test_nn_Upsample.py
    │       │   ├── test_nn_UpsamplingBilinear2d.py
    │       │   ├── test_nn_UpsamplingNearest2d.py
    │       │   ├── test_nn_ZeroPad2d.py
    │       │   ├── test_resnet18.py
    │       │   ├── test_shufflenet_v2_x1_0.py
    │       │   ├── test_squeezenet1_1.py
    │       │   ├── test_torch_abs.py
    │       │   ├── test_torch_acos.py
    │       │   ├── test_torch_addmm.py
    │       │   ├── test_torch_amax.py
    │       │   ├── test_torch_amin.py
    │       │   ├── test_torch_asin.py
    │       │   ├── test_torch_atan.py
    │       │   ├── test_torch_atan2.py
    │       │   ├── test_torch_bmm.py
    │       │   ├── test_torch_cat.py
    │       │   ├── test_torch_ceil.py
    │       │   ├── test_torch_chunk.py
    │       │   ├── test_torch_clamp.py
    │       │   ├── test_torch_clone.py
    │       │   ├── test_torch_cos.py
    │       │   ├── test_torch_cumsum.py
    │       │   ├── test_torch_diag.py
    │       │   ├── test_torch_einsum.py
    │       │   ├── test_torch_exp.py
    │       │   ├── test_torch_flatten.py
    │       │   ├── test_torch_flip.py
    │       │   ├── test_torch_floor.py
    │       │   ├── test_torch_istft.py
    │       │   ├── test_torch_log.py
    │       │   ├── test_torch_log10.py
    │       │   ├── test_torch_logsumexp.py
    │       │   ├── test_torch_matmul.py
    │       │   ├── test_torch_max.py
    │       │   ├── test_torch_maximum.py
    │       │   ├── test_torch_mean.py
    │       │   ├── test_torch_min.py
    │       │   ├── test_torch_minimum.py
    │       │   ├── test_torch_mm.py
    │       │   ├── test_torch_neg.py
    │       │   ├── test_torch_norm.py
    │       │   ├── test_torch_pow.py
    │       │   ├── test_torch_prod.py
    │       │   ├── test_torch_reciprocal.py
    │       │   ├── test_torch_roll.py
    │       │   ├── test_torch_round.py
    │       │   ├── test_torch_rsqrt.py
    │       │   ├── test_torch_sin.py
    │       │   ├── test_torch_slice_scatter.py
    │       │   ├── test_torch_sqrt.py
    │       │   ├── test_torch_square.py
    │       │   ├── test_torch_squeeze.py
    │       │   ├── test_torch_stack.py
    │       │   ├── test_torch_stft.py
    │       │   ├── test_torch_sum.py
    │       │   ├── test_torch_t.py
    │       │   ├── test_torch_tan.py
    │       │   ├── test_torch_tanh.py
    │       │   ├── test_torch_tensor_split.py
    │       │   ├── test_torch_transpose.py
    │       │   ├── test_torch_trunc.py
    │       │   ├── test_torch_unbind.py
    │       │   ├── test_torch_unsqueeze.py
    │       │   ├── test_torchaudio_F_inverse_spectrogram.py
    │       │   ├── test_torchaudio_F_spectrogram.py
    │       │   ├── test_torchaudio_InverseSpectrogram.py
    │       │   ├── test_torchaudio_Spectrogram.py
    │       │   ├── test_torchvision_DeformConv2d.py
    │       │   ├── test_transformers_deepseek_v3_attention.py
    │       │   ├── test_transformers_qwen2_attention.py
    │       │   ├── test_transformers_qwen3_attention.py
    │       │   └── test_vit_b_32.py
    │       ├── onnx/
    │       │   ├── CMakeLists.txt
    │       │   ├── test_F_adaptive_avg_pool1d.py
    │       │   ├── test_F_adaptive_avg_pool2d.py
    │       │   ├── test_F_adaptive_avg_pool3d.py
    │       │   ├── test_F_adaptive_max_pool1d.py
    │       │   ├── test_F_adaptive_max_pool2d.py
    │       │   ├── test_F_adaptive_max_pool3d.py
    │       │   ├── test_F_avg_pool1d.py
    │       │   ├── test_F_avg_pool2d.py
    │       │   ├── test_F_avg_pool3d.py
    │       │   ├── test_F_batch_norm.py
    │       │   ├── test_F_celu.py
    │       │   ├── test_F_conv1d.py
    │       │   ├── test_F_conv2d.py
    │       │   ├── test_F_conv3d.py
    │       │   ├── test_F_conv_transpose1d.py
    │       │   ├── test_F_conv_transpose2d.py
    │       │   ├── test_F_conv_transpose3d.py
    │       │   ├── test_F_elu.py
    │       │   ├── test_F_gelu.py
    │       │   ├── test_F_group_norm.py
    │       │   ├── test_F_hardshrink.py
    │       │   ├── test_F_hardsigmoid.py
    │       │   ├── test_F_hardswish.py
    │       │   ├── test_F_hardtanh.py
    │       │   ├── test_F_interpolate.py
    │       │   ├── test_F_layer_norm.py
    │       │   ├── test_F_leaky_relu.py
    │       │   ├── test_F_linear.py
    │       │   ├── test_F_local_response_norm.py
    │       │   ├── test_F_log_softmax.py
    │       │   ├── test_F_logsigmoid.py
    │       │   ├── test_F_max_pool1d.py
    │       │   ├── test_F_max_pool2d.py
    │       │   ├── test_F_max_pool3d.py
    │       │   ├── test_F_mish.py
    │       │   ├── test_F_normalize.py
    │       │   ├── test_F_pad.py
    │       │   ├── test_F_pixel_shuffle.py
    │       │   ├── test_F_pixel_unshuffle.py
    │       │   ├── test_F_prelu.py
    │       │   ├── test_F_relu.py
    │       │   ├── test_F_relu6.py
    │       │   ├── test_F_scaled_dot_product_attention.py
    │       │   ├── test_F_selu.py
    │       │   ├── test_F_sigmoid.py
    │       │   ├── test_F_silu.py
    │       │   ├── test_F_softmax.py
    │       │   ├── test_F_softmin.py
    │       │   ├── test_F_softplus.py
    │       │   ├── test_F_softshrink.py
    │       │   ├── test_F_softsign.py
    │       │   ├── test_F_tanh.py
    │       │   ├── test_F_tanhshrink.py
    │       │   ├── test_F_upsample.py
    │       │   ├── test_F_upsample_bilinear.py
    │       │   ├── test_F_upsample_nearest.py
    │       │   ├── test_Tensor_expand.py
    │       │   ├── test_Tensor_permute.py
    │       │   ├── test_Tensor_repeat.py
    │       │   ├── test_Tensor_reshape.py
    │       │   ├── test_Tensor_reshape_as.py
    │       │   ├── test_Tensor_select.py
    │       │   ├── test_Tensor_slice.py
    │       │   ├── test_Tensor_unflatten.py
    │       │   ├── test_Tensor_view.py
    │       │   ├── test_convnext_tiny.py
    │       │   ├── test_mobilenet_v2.py
    │       │   ├── test_mobilenet_v3_small.py
    │       │   ├── test_nn_AdaptiveAvgPool1d.py
    │       │   ├── test_nn_AdaptiveAvgPool2d.py
    │       │   ├── test_nn_AdaptiveAvgPool3d.py
    │       │   ├── test_nn_AdaptiveMaxPool1d.py
    │       │   ├── test_nn_AdaptiveMaxPool2d.py
    │       │   ├── test_nn_AdaptiveMaxPool3d.py
    │       │   ├── test_nn_AvgPool1d.py
    │       │   ├── test_nn_AvgPool2d.py
    │       │   ├── test_nn_AvgPool3d.py
    │       │   ├── test_nn_BatchNorm1d.py
    │       │   ├── test_nn_BatchNorm2d.py
    │       │   ├── test_nn_BatchNorm3d.py
    │       │   ├── test_nn_CELU.py
    │       │   ├── test_nn_ConstantPad1d.py
    │       │   ├── test_nn_ConstantPad2d.py
    │       │   ├── test_nn_ConstantPad3d.py
    │       │   ├── test_nn_Conv1d.py
    │       │   ├── test_nn_Conv2d.py
    │       │   ├── test_nn_Conv3d.py
    │       │   ├── test_nn_ConvTranspose1d.py
    │       │   ├── test_nn_ConvTranspose2d.py
    │       │   ├── test_nn_ConvTranspose3d.py
    │       │   ├── test_nn_ELU.py
    │       │   ├── test_nn_GELU.py
    │       │   ├── test_nn_GRU.py
    │       │   ├── test_nn_GroupNorm.py
    │       │   ├── test_nn_Hardshrink.py
    │       │   ├── test_nn_Hardsigmoid.py
    │       │   ├── test_nn_Hardswish.py
    │       │   ├── test_nn_Hardtanh.py
    │       │   ├── test_nn_InstanceNorm1d.py
    │       │   ├── test_nn_InstanceNorm2d.py
    │       │   ├── test_nn_InstanceNorm3d.py
    │       │   ├── test_nn_LSTM.py
    │       │   ├── test_nn_LayerNorm.py
    │       │   ├── test_nn_LeakyReLU.py
    │       │   ├── test_nn_Linear.py
    │       │   ├── test_nn_LocalResponseNorm.py
    │       │   ├── test_nn_LogSigmoid.py
    │       │   ├── test_nn_LogSoftmax.py
    │       │   ├── test_nn_MaxPool1d.py
    │       │   ├── test_nn_MaxPool2d.py
    │       │   ├── test_nn_MaxPool3d.py
    │       │   ├── test_nn_Mish.py
    │       │   ├── test_nn_MultiheadAttention.py
    │       │   ├── test_nn_PReLU.py
    │       │   ├── test_nn_PixelShuffle.py
    │       │   ├── test_nn_PixelUnshuffle.py
    │       │   ├── test_nn_RNN.py
    │       │   ├── test_nn_ReLU.py
    │       │   ├── test_nn_ReLU6.py
    │       │   ├── test_nn_ReflectionPad1d.py
    │       │   ├── test_nn_ReflectionPad2d.py
    │       │   ├── test_nn_ReplicationPad1d.py
    │       │   ├── test_nn_ReplicationPad2d.py
    │       │   ├── test_nn_ReplicationPad3d.py
    │       │   ├── test_nn_SELU.py
    │       │   ├── test_nn_SiLU.py
    │       │   ├── test_nn_Sigmoid.py
    │       │   ├── test_nn_Softmax.py
    │       │   ├── test_nn_Softmin.py
    │       │   ├── test_nn_Softplus.py
    │       │   ├── test_nn_Softshrink.py
    │       │   ├── test_nn_Softsign.py
    │       │   ├── test_nn_Tanh.py
    │       │   ├── test_nn_Tanhshrink.py
    │       │   ├── test_nn_Upsample.py
    │       │   ├── test_nn_UpsamplingBilinear2d.py
    │       │   ├── test_nn_UpsamplingNearest2d.py
    │       │   ├── test_nn_ZeroPad2d.py
    │       │   ├── test_onnx_activation_ops.py
    │       │   ├── test_onnx_conv_ops.py
    │       │   ├── test_onnx_dense_ops.py
    │       │   ├── test_onnx_fuse_channel_shuffle.py
    │       │   ├── test_onnx_fuse_pixel_shuffle.py
    │       │   ├── test_onnx_fuse_pixel_unshuffle.py
    │       │   ├── test_onnx_layout_ops.py
    │       │   ├── test_onnx_math_ops.py
    │       │   ├── test_onnx_normalize_ops.py
    │       │   ├── test_onnx_opset21_ops.py
    │       │   ├── test_onnx_pool_ops.py
    │       │   ├── test_onnx_reduce_ops.py
    │       │   ├── test_onnx_rnn_ops.py
    │       │   ├── test_resnet18.py
    │       │   ├── test_shufflenet_v2_x1_0.py
    │       │   ├── test_squeezenet1_1.py
    │       │   ├── test_swin_t.py
    │       │   ├── test_torch_cat.py
    │       │   ├── test_torch_ceil.py
    │       │   ├── test_torch_chunk.py
    │       │   ├── test_torch_clamp.py
    │       │   ├── test_torch_flatten.py
    │       │   ├── test_torch_flip.py
    │       │   ├── test_torch_floor.py
    │       │   ├── test_torch_logical_and.py
    │       │   ├── test_torch_logical_not.py
    │       │   ├── test_torch_logical_or.py
    │       │   ├── test_torch_logical_xor.py
    │       │   ├── test_torch_max.py
    │       │   ├── test_torch_maximum.py
    │       │   ├── test_torch_mean.py
    │       │   ├── test_torch_min.py
    │       │   ├── test_torch_minimum.py
    │       │   ├── test_torch_norm.py
    │       │   ├── test_torch_prod.py
    │       │   ├── test_torch_roll.py
    │       │   ├── test_torch_split.py
    │       │   ├── test_torch_squeeze.py
    │       │   ├── test_torch_stack.py
    │       │   ├── test_torch_sum.py
    │       │   ├── test_torch_transpose.py
    │       │   ├── test_torch_unbind.py
    │       │   ├── test_torch_unsqueeze.py
    │       │   ├── test_transformers_albert_attention.py
    │       │   ├── test_transformers_bart_attention.py
    │       │   ├── test_transformers_bert_attention.py
    │       │   ├── test_transformers_bert_generation_attention.py
    │       │   ├── test_transformers_blenderbot_attention.py
    │       │   ├── test_transformers_camembert_attention.py
    │       │   ├── test_transformers_chinese_clip_attention.py
    │       │   ├── test_transformers_clip_attention.py
    │       │   ├── test_transformers_ctrl_attention.py
    │       │   ├── test_transformers_deberta_attention.py
    │       │   ├── test_transformers_distilbert_attention.py
    │       │   ├── test_transformers_electra_attention.py
    │       │   ├── test_transformers_flaubert_attention.py
    │       │   ├── test_transformers_fsmt_attention.py
    │       │   ├── test_transformers_funnel_attention.py
    │       │   ├── test_transformers_gpt2_attention.py
    │       │   ├── test_transformers_layoutlm_attention.py
    │       │   ├── test_transformers_lxmert_attention.py
    │       │   ├── test_transformers_m2m_100_attention.py
    │       │   ├── test_transformers_marian_attention.py
    │       │   ├── test_transformers_mbart_attention.py
    │       │   ├── test_transformers_mobilebert_attention.py
    │       │   ├── test_transformers_mt5_attention.py
    │       │   ├── test_transformers_openai_attention.py
    │       │   ├── test_transformers_pegasus_attention.py
    │       │   ├── test_transformers_prophetnet_attention.py
    │       │   ├── test_transformers_reformer_attention.py
    │       │   ├── test_transformers_roberta_attention.py
    │       │   ├── test_transformers_squeezebert_attention.py
    │       │   ├── test_transformers_t5_attention.py
    │       │   ├── test_transformers_xlm_attention.py
    │       │   ├── test_transformers_xlm_roberta_attention.py
    │       │   └── test_vit_b_32.py
    │       ├── run_test.cmake
    │       ├── test_F_adaptive_avg_pool1d.py
    │       ├── test_F_adaptive_avg_pool2d.py
    │       ├── test_F_adaptive_avg_pool3d.py
    │       ├── test_F_adaptive_max_pool1d.py
    │       ├── test_F_adaptive_max_pool2d.py
    │       ├── test_F_adaptive_max_pool3d.py
    │       ├── test_F_affine_grid.py
    │       ├── test_F_alpha_dropout.py
    │       ├── test_F_avg_pool1d.py
    │       ├── test_F_avg_pool2d.py
    │       ├── test_F_avg_pool3d.py
    │       ├── test_F_batch_norm.py
    │       ├── test_F_celu.py
    │       ├── test_F_conv1d.py
    │       ├── test_F_conv2d.py
    │       ├── test_F_conv3d.py
    │       ├── test_F_conv_transpose1d.py
    │       ├── test_F_conv_transpose2d.py
    │       ├── test_F_conv_transpose3d.py
    │       ├── test_F_dropout.py
    │       ├── test_F_dropout2d.py
    │       ├── test_F_dropout3d.py
    │       ├── test_F_elu.py
    │       ├── test_F_embedding.py
    │       ├── test_F_feature_alpha_dropout.py
    │       ├── test_F_fold.py
    │       ├── test_F_gelu.py
    │       ├── test_F_glu.py
    │       ├── test_F_grid_sample.py
    │       ├── test_F_group_norm.py
    │       ├── test_F_hardshrink.py
    │       ├── test_F_hardsigmoid.py
    │       ├── test_F_hardswish.py
    │       ├── test_F_hardtanh.py
    │       ├── test_F_instance_norm.py
    │       ├── test_F_interpolate.py
    │       ├── test_F_layer_norm.py
    │       ├── test_F_leaky_relu.py
    │       ├── test_F_linear.py
    │       ├── test_F_local_response_norm.py
    │       ├── test_F_log_softmax.py
    │       ├── test_F_logsigmoid.py
    │       ├── test_F_lp_pool1d.py
    │       ├── test_F_lp_pool2d.py
    │       ├── test_F_max_pool1d.py
    │       ├── test_F_max_pool2d.py
    │       ├── test_F_max_pool3d.py
    │       ├── test_F_mish.py
    │       ├── test_F_normalize.py
    │       ├── test_F_pad.py
    │       ├── test_F_pairwise_distance.py
    │       ├── test_F_pixel_shuffle.py
    │       ├── test_F_pixel_unshuffle.py
    │       ├── test_F_prelu.py
    │       ├── test_F_relu.py
    │       ├── test_F_relu6.py
    │       ├── test_F_rms_norm.py
    │       ├── test_F_rrelu.py
    │       ├── test_F_scaled_dot_product_attention.py
    │       ├── test_F_selu.py
    │       ├── test_F_sigmoid.py
    │       ├── test_F_silu.py
    │       ├── test_F_softmax.py
    │       ├── test_F_softmin.py
    │       ├── test_F_softplus.py
    │       ├── test_F_softshrink.py
    │       ├── test_F_softsign.py
    │       ├── test_F_tanh.py
    │       ├── test_F_tanhshrink.py
    │       ├── test_F_threshold.py
    │       ├── test_F_unfold.py
    │       ├── test_F_upsample.py
    │       ├── test_F_upsample_bilinear.py
    │       ├── test_F_upsample_nearest.py
    │       ├── test_Tensor_expand.py
    │       ├── test_Tensor_fill.py
    │       ├── test_Tensor_index.py
    │       ├── test_Tensor_index_put.py
    │       ├── test_Tensor_masked_fill.py
    │       ├── test_Tensor_new_empty.py
    │       ├── test_Tensor_new_full.py
    │       ├── test_Tensor_new_ones.py
    │       ├── test_Tensor_new_zeros.py
    │       ├── test_Tensor_permute.py
    │       ├── test_Tensor_repeat.py
    │       ├── test_Tensor_reshape.py
    │       ├── test_Tensor_reshape_as.py
    │       ├── test_Tensor_select.py
    │       ├── test_Tensor_slice.py
    │       ├── test_Tensor_slice_copy.py
    │       ├── test_Tensor_to.py
    │       ├── test_Tensor_type_as.py
    │       ├── test_Tensor_unflatten.py
    │       ├── test_Tensor_view.py
    │       ├── test_convnext_tiny.py
    │       ├── test_ir_complex.py
    │       ├── test_mobilenet_v2.py
    │       ├── test_mobilenet_v3_small.py
    │       ├── test_nn_AdaptiveAvgPool1d.py
    │       ├── test_nn_AdaptiveAvgPool2d.py
    │       ├── test_nn_AdaptiveAvgPool3d.py
    │       ├── test_nn_AdaptiveMaxPool1d.py
    │       ├── test_nn_AdaptiveMaxPool2d.py
    │       ├── test_nn_AdaptiveMaxPool3d.py
    │       ├── test_nn_AlphaDropout.py
    │       ├── test_nn_AvgPool1d.py
    │       ├── test_nn_AvgPool2d.py
    │       ├── test_nn_AvgPool3d.py
    │       ├── test_nn_BatchNorm1d.py
    │       ├── test_nn_BatchNorm2d.py
    │       ├── test_nn_BatchNorm3d.py
    │       ├── test_nn_CELU.py
    │       ├── test_nn_ChannelShuffle.py
    │       ├── test_nn_ConstantPad1d.py
    │       ├── test_nn_ConstantPad2d.py
    │       ├── test_nn_ConstantPad3d.py
    │       ├── test_nn_Conv1d.py
    │       ├── test_nn_Conv2d.py
    │       ├── test_nn_Conv3d.py
    │       ├── test_nn_ConvTranspose1d.py
    │       ├── test_nn_ConvTranspose2d.py
    │       ├── test_nn_ConvTranspose3d.py
    │       ├── test_nn_Dropout.py
    │       ├── test_nn_Dropout2d.py
    │       ├── test_nn_Dropout3d.py
    │       ├── test_nn_ELU.py
    │       ├── test_nn_Embedding.py
    │       ├── test_nn_Fold.py
    │       ├── test_nn_GELU.py
    │       ├── test_nn_GLU.py
    │       ├── test_nn_GRU.py
    │       ├── test_nn_GroupNorm.py
    │       ├── test_nn_Hardshrink.py
    │       ├── test_nn_Hardsigmoid.py
    │       ├── test_nn_Hardswish.py
    │       ├── test_nn_Hardtanh.py
    │       ├── test_nn_Identity.py
    │       ├── test_nn_InstanceNorm1d.py
    │       ├── test_nn_InstanceNorm2d.py
    │       ├── test_nn_InstanceNorm3d.py
    │       ├── test_nn_LPPool1d.py
    │       ├── test_nn_LPPool2d.py
    │       ├── test_nn_LSTM.py
    │       ├── test_nn_LayerNorm.py
    │       ├── test_nn_LeakyReLU.py
    │       ├── test_nn_Linear.py
    │       ├── test_nn_LocalResponseNorm.py
    │       ├── test_nn_LogSigmoid.py
    │       ├── test_nn_LogSoftmax.py
    │       ├── test_nn_MaxPool1d.py
    │       ├── test_nn_MaxPool2d.py
    │       ├── test_nn_MaxPool3d.py
    │       ├── test_nn_Mish.py
    │       ├── test_nn_MultiheadAttention.py
    │       ├── test_nn_PReLU.py
    │       ├── test_nn_PixelShuffle.py
    │       ├── test_nn_PixelUnshuffle.py
    │       ├── test_nn_RMSNorm.py
    │       ├── test_nn_RNN.py
    │       ├── test_nn_RReLU.py
    │       ├── test_nn_ReLU.py
    │       ├── test_nn_ReLU6.py
    │       ├── test_nn_ReflectionPad1d.py
    │       ├── test_nn_ReflectionPad2d.py
    │       ├── test_nn_ReplicationPad1d.py
    │       ├── test_nn_ReplicationPad2d.py
    │       ├── test_nn_ReplicationPad3d.py
    │       ├── test_nn_SELU.py
    │       ├── test_nn_SiLU.py
    │       ├── test_nn_Sigmoid.py
    │       ├── test_nn_Softmax.py
    │       ├── test_nn_Softmax2d.py
    │       ├── test_nn_Softmin.py
    │       ├── test_nn_Softplus.py
    │       ├── test_nn_Softshrink.py
    │       ├── test_nn_Softsign.py
    │       ├── test_nn_Tanh.py
    │       ├── test_nn_Tanhshrink.py
    │       ├── test_nn_Threshold.py
    │       ├── test_nn_Unfold.py
    │       ├── test_nn_Upsample.py
    │       ├── test_nn_UpsamplingBilinear2d.py
    │       ├── test_nn_UpsamplingNearest2d.py
    │       ├── test_nn_ZeroPad2d.py
    │       ├── test_pnnx_eliminate_noop_cat.py
    │       ├── test_pnnx_eliminate_noop_expand.py
    │       ├── test_pnnx_eliminate_noop_math.py
    │       ├── test_pnnx_eliminate_noop_upsample.py
    │       ├── test_pnnx_expression.py
    │       ├── test_pnnx_fold_constant.py
    │       ├── test_pnnx_fuse_adjacent_permute.py
    │       ├── test_pnnx_fuse_adjacent_reshape.py
    │       ├── test_pnnx_fuse_channel_shuffle.py
    │       ├── test_pnnx_fuse_conv1d_batchnorm1d.py
    │       ├── test_pnnx_fuse_conv2d_batchnorm2d.py
    │       ├── test_pnnx_fuse_conv3d_batchnorm3d.py
    │       ├── test_pnnx_fuse_convtranspose1d_batchnorm1d.py
    │       ├── test_pnnx_fuse_convtranspose2d_batchnorm2d.py
    │       ├── test_pnnx_fuse_convtranspose3d_batchnorm3d.py
    │       ├── test_pnnx_fuse_input_unpack.py
    │       ├── test_pnnx_fuse_layernorm.py
    │       ├── test_pnnx_fuse_linear_batchnorm1d.py
    │       ├── test_pnnx_fuse_multiheadattention.py
    │       ├── test_pnnx_fuse_pad_conv1d.py
    │       ├── test_pnnx_fuse_pad_conv2d.py
    │       ├── test_pnnx_fuse_pixel_shuffle.py
    │       ├── test_pnnx_fuse_pixel_unshuffle.py
    │       ├── test_pnnx_fuse_rmsnorm.py
    │       ├── test_pnnx_fuse_scaled_dot_product_attention.py
    │       ├── test_pnnx_fuse_select_to_unbind.py
    │       ├── test_pnnx_fuse_slice_to_tensor_split.py
    │       ├── test_quantization_shufflenet_v2_x1_0.py
    │       ├── test_resnet18.py
    │       ├── test_shufflenet_v2_x1_0.py
    │       ├── test_squeezenet1_1.py
    │       ├── test_swin_t.py
    │       ├── test_torch_abs.py
    │       ├── test_torch_acos.py
    │       ├── test_torch_acosh.py
    │       ├── test_torch_addmm.py
    │       ├── test_torch_amax.py
    │       ├── test_torch_amin.py
    │       ├── test_torch_arange.py
    │       ├── test_torch_argmax.py
    │       ├── test_torch_argmin.py
    │       ├── test_torch_asin.py
    │       ├── test_torch_asinh.py
    │       ├── test_torch_atan.py
    │       ├── test_torch_atan2.py
    │       ├── test_torch_atanh.py
    │       ├── test_torch_bitwise_and.py
    │       ├── test_torch_bitwise_left_shift.py
    │       ├── test_torch_bitwise_not.py
    │       ├── test_torch_bitwise_or.py
    │       ├── test_torch_bitwise_right_shift.py
    │       ├── test_torch_bitwise_xor.py
    │       ├── test_torch_bmm.py
    │       ├── test_torch_cat.py
    │       ├── test_torch_ceil.py
    │       ├── test_torch_chunk.py
    │       ├── test_torch_clamp.py
    │       ├── test_torch_clone.py
    │       ├── test_torch_complex.py
    │       ├── test_torch_cos.py
    │       ├── test_torch_cosh.py
    │       ├── test_torch_cross.py
    │       ├── test_torch_cumprod.py
    │       ├── test_torch_cumsum.py
    │       ├── test_torch_diag.py
    │       ├── test_torch_einsum.py
    │       ├── test_torch_eq.py
    │       ├── test_torch_exp.py
    │       ├── test_torch_fft_fft.py
    │       ├── test_torch_fft_fft2.py
    │       ├── test_torch_fft_fftn.py
    │       ├── test_torch_fft_hfft.py
    │       ├── test_torch_fft_hfft2.py
    │       ├── test_torch_fft_hfftn.py
    │       ├── test_torch_fft_ifft.py
    │       ├── test_torch_fft_ifft2.py
    │       ├── test_torch_fft_ifftn.py
    │       ├── test_torch_fft_ihfft.py
    │       ├── test_torch_fft_ihfft2.py
    │       ├── test_torch_fft_ihfftn.py
    │       ├── test_torch_fft_irfft.py
    │       ├── test_torch_fft_irfft2.py
    │       ├── test_torch_fft_irfftn.py
    │       ├── test_torch_fft_rfft.py
    │       ├── test_torch_fft_rfft2.py
    │       ├── test_torch_fft_rfftn.py
    │       ├── test_torch_flatten.py
    │       ├── test_torch_flip.py
    │       ├── test_torch_floor.py
    │       ├── test_torch_full.py
    │       ├── test_torch_full_like.py
    │       ├── test_torch_gather.py
    │       ├── test_torch_ge.py
    │       ├── test_torch_gt.py
    │       ├── test_torch_imag.py
    │       ├── test_torch_index_select.py
    │       ├── test_torch_istft.py
    │       ├── test_torch_le.py
    │       ├── test_torch_lgamma.py
    │       ├── test_torch_log.py
    │       ├── test_torch_log10.py
    │       ├── test_torch_logaddexp.py
    │       ├── test_torch_logical_and.py
    │       ├── test_torch_logical_not.py
    │       ├── test_torch_logical_or.py
    │       ├── test_torch_logical_xor.py
    │       ├── test_torch_logsumexp.py
    │       ├── test_torch_lt.py
    │       ├── test_torch_masked_select.py
    │       ├── test_torch_matmul.py
    │       ├── test_torch_max.py
    │       ├── test_torch_maximum.py
    │       ├── test_torch_mean.py
    │       ├── test_torch_min.py
    │       ├── test_torch_minimum.py
    │       ├── test_torch_mm.py
    │       ├── test_torch_mv.py
    │       ├── test_torch_narrow.py
    │       ├── test_torch_ne.py
    │       ├── test_torch_neg.py
    │       ├── test_torch_norm.py
    │       ├── test_torch_ones.py
    │       ├── test_torch_ones_like.py
    │       ├── test_torch_positive.py
    │       ├── test_torch_pow.py
    │       ├── test_torch_prod.py
    │       ├── test_torch_real.py
    │       ├── test_torch_reciprocal.py
    │       ├── test_torch_repeat_interleave.py
    │       ├── test_torch_roll.py
    │       ├── test_torch_round.py
    │       ├── test_torch_rsqrt.py
    │       ├── test_torch_scatter_add.py
    │       ├── test_torch_sign.py
    │       ├── test_torch_sin.py
    │       ├── test_torch_sinh.py
    │       ├── test_torch_slice_scatter.py
    │       ├── test_torch_split.py
    │       ├── test_torch_sqrt.py
    │       ├── test_torch_square.py
    │       ├── test_torch_squeeze.py
    │       ├── test_torch_stack.py
    │       ├── test_torch_std.py
    │       ├── test_torch_stft.py
    │       ├── test_torch_sum.py
    │       ├── test_torch_t.py
    │       ├── test_torch_tan.py
    │       ├── test_torch_tanh.py
    │       ├── test_torch_tensor_split.py
    │       ├── test_torch_tile.py
    │       ├── test_torch_topk.py
    │       ├── test_torch_transpose.py
    │       ├── test_torch_trunc.py
    │       ├── test_torch_unbind.py
    │       ├── test_torch_unsqueeze.py
    │       ├── test_torch_view_as_complex.py
    │       ├── test_torch_view_as_real.py
    │       ├── test_torch_where.py
    │       ├── test_torch_zeros.py
    │       ├── test_torch_zeros_like.py
    │       ├── test_torchaudio_F_inverse_spectrogram.py
    │       ├── test_torchaudio_F_spectrogram.py
    │       ├── test_torchaudio_InverseSpectrogram.py
    │       ├── test_torchaudio_Spectrogram.py
    │       ├── test_torchvision_DeformConv2d.py
    │       ├── test_torchvision_RoIAlign.py
    │       ├── test_transformers_albert_attention.py
    │       ├── test_transformers_bart_attention.py
    │       ├── test_transformers_bert_attention.py
    │       ├── test_transformers_bert_generation_attention.py
    │       ├── test_transformers_blenderbot_attention.py
    │       ├── test_transformers_camembert_attention.py
    │       ├── test_transformers_chinese_clip_attention.py
    │       ├── test_transformers_clip_attention.py
    │       ├── test_transformers_ctrl_attention.py
    │       ├── test_transformers_deberta_attention.py
    │       ├── test_transformers_deepseek_v3_attention.py
    │       ├── test_transformers_distilbert_attention.py
    │       ├── test_transformers_electra_attention.py
    │       ├── test_transformers_flaubert_attention.py
    │       ├── test_transformers_fsmt_attention.py
    │       ├── test_transformers_funnel_attention.py
    │       ├── test_transformers_gpt2_attention.py
    │       ├── test_transformers_layoutlm_attention.py
    │       ├── test_transformers_longformer_attention.py
    │       ├── test_transformers_lxmert_attention.py
    │       ├── test_transformers_m2m_100_attention.py
    │       ├── test_transformers_marian_attention.py
    │       ├── test_transformers_mbart_attention.py
    │       ├── test_transformers_mobilebert_attention.py
    │       ├── test_transformers_mt5_attention.py
    │       ├── test_transformers_openai_attention.py
    │       ├── test_transformers_pegasus_attention.py
    │       ├── test_transformers_prophetnet_attention.py
    │       ├── test_transformers_qwen2_attention.py
    │       ├── test_transformers_qwen3_attention.py
    │       ├── test_transformers_reformer_attention.py
    │       ├── test_transformers_roberta_attention.py
    │       ├── test_transformers_squeezebert_attention.py
    │       ├── test_transformers_t5_attention.py
    │       ├── test_transformers_xlm_attention.py
    │       ├── test_transformers_xlm_roberta_attention.py
    │       ├── test_transformers_xlnet_attention.py
    │       └── test_vit_b_32.py
    ├── pytorch/
    │   └── README.md
    ├── quantize/
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── imreadwrite.cpp
    │   ├── imreadwrite.h
    │   ├── ncnn2int8.cpp
    │   ├── ncnn2table.cpp
    │   └── npy.hpp
    └── tensorflow/
        └── readme.txt