Repository: espressif/esp-nn Branch: master Commit: d45b843ca5f8 Files: 99 Total size: 801.9 KB Directory structure: gitextract__zjpraf8/ ├── .github/ │ └── workflows/ │ └── upload_component.yml ├── .gitignore ├── .gitlab-ci.yml ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Kconfig.projbuild ├── LICENSE ├── README.md ├── idf_component.yml ├── include/ │ ├── esp_nn.h │ ├── esp_nn_ansi_c.h │ ├── esp_nn_ansi_headers.h │ ├── esp_nn_defs.h │ ├── esp_nn_esp32p4.h │ ├── esp_nn_esp32s3.h │ └── esp_nn_generic_opt.h ├── src/ │ ├── activation_functions/ │ │ ├── esp_nn_hard_swish_ansi.c │ │ ├── esp_nn_hard_swish_s8_esp32p4.c │ │ ├── esp_nn_hard_swish_s8_esp32s3.c │ │ ├── esp_nn_relu_ansi.c │ │ ├── esp_nn_relu_s8_esp32p4.c │ │ └── esp_nn_relu_s8_esp32s3.S │ ├── basic_math/ │ │ ├── esp_nn_add_ansi.c │ │ ├── esp_nn_add_s8_esp32p4.c │ │ ├── esp_nn_add_s8_esp32s3.S │ │ ├── esp_nn_mul_ansi.c │ │ ├── esp_nn_mul_broadcast_s8_esp32s3.S │ │ ├── esp_nn_mul_s8_esp32p4.c │ │ └── esp_nn_mul_s8_esp32s3.S │ ├── common/ │ │ ├── common_functions.h │ │ ├── esp_nn_common_functions_esp32s3.S │ │ ├── esp_nn_dot_s8_esp32s3.S │ │ ├── esp_nn_mean_ansi.c │ │ ├── esp_nn_mean_s8_esp32p4.c │ │ ├── esp_nn_mean_s8_esp32s3.c │ │ ├── esp_nn_multiply_by_quantized_mult_esp32p4.S │ │ ├── esp_nn_multiply_by_quantized_mult_esp32s3.S │ │ └── esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S │ ├── convolution/ │ │ ├── esp_nn_conv_ansi.c │ │ ├── esp_nn_conv_esp32p4.c │ │ ├── esp_nn_conv_esp32s3.c │ │ ├── esp_nn_conv_opt.c │ │ ├── esp_nn_conv_s16_mult4_1x1_esp32s3.S │ │ ├── esp_nn_conv_s16_mult8_esp32s3.S │ │ ├── esp_nn_conv_s8_1x1_esp32s3.c │ │ ├── esp_nn_conv_s8_3x3_opt_esp32s3.c │ │ ├── esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S │ │ ├── esp_nn_conv_s8_mult8_1x1_esp32s3.S │ │ ├── esp_nn_depthwise_conv_ansi.c │ │ ├── esp_nn_depthwise_conv_esp32p4.c │ │ ├── esp_nn_depthwise_conv_opt.c │ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S │ │ ├── 
esp_nn_depthwise_conv_s16_mult1_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult4_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult8_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s8_esp32s3.c │ │ └── esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S │ ├── fully_connected/ │ │ ├── esp_nn_fc_s8_mac16_esp32s3.S │ │ ├── esp_nn_fully_connected_ansi.c │ │ ├── esp_nn_fully_connected_esp32s3.c │ │ ├── esp_nn_fully_connected_per_ch_s8_esp32s3.S │ │ ├── esp_nn_fully_connected_s8_esp32p4.c │ │ └── esp_nn_fully_connected_s8_esp32s3.S │ ├── logistic/ │ │ └── esp_nn_logistic_ansi.c │ ├── pooling/ │ │ ├── esp_nn_avg_pool_ansi.c │ │ ├── esp_nn_avg_pool_s8_esp32p4.c │ │ ├── esp_nn_avg_pool_s8_esp32s3.S │ │ ├── esp_nn_avg_pool_s8_esp32s3.c │ │ ├── esp_nn_max_pool_ansi.c │ │ ├── esp_nn_max_pool_s8_esp32p4.c │ │ └── esp_nn_max_pool_s8_esp32s3.S │ └── softmax/ │ ├── esp_nn_softmax_ansi.c │ ├── esp_nn_softmax_opt.c │ ├── esp_nn_softmax_s8_esp32p4.c │ ├── esp_nn_softmax_s8_esp32s3.c │ └── softmax_common.h ├── test_app/ │ ├── CMakeLists.txt │ ├── Makefile │ ├── main/ │ │ ├── CMakeLists.txt │ │ ├── component.mk │ │ └── main.c │ ├── sdkconfig.defaults │ ├── sdkconfig.defaults.esp32p4 │ └── sdkconfig.defaults.esp32s3 └── tests/ ├── CMakeLists.txt ├── README.md ├── component.mk ├── include/ │ ├── test_functions.h │ └── test_utils.h └── src/ ├── basic_math_test.c ├── convolution_test.c ├── fully_connected_test.c ├── hard_swish_test.c ├── mean_test.c ├── pooling_test.c ├── relu_test.c └── softmax_test.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/upload_component.yml ================================================ name: Push esp-nn to IDF Component Registry on: push: branches: - master jobs: upload_components: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Upload esp-nn to IDF Component 
Registry uses: espressif/upload-components-ci-action@v1 with: namespace: "espressif" name: "esp-nn" api_token: ${{ secrets.IDF_COMPONENT_API_TOKEN }} ================================================ FILE: .gitignore ================================================ .config *.o *.i *.s *.orig *.pyc # gtags GTAGS GRTAGS GPATH # emacs .dir-locals.el # emacs temp file suffixes *~ .#* \#*# # eclipse setting .settings # MacOS directory files .DS_Store # Example project files examples/**/sdkconfig examples/**/sdkconfig.old examples/**/build # Test app files test_app/build test_app/sdkconfig test_app/sdkconfig.old # Doc build artifacts docs/_build/ docs/doxygen-warning-log.txt docs/sphinx-warning-log.txt docs/sphinx-warning-log-sanitized.txt docs/xml/ docs/xml_in/ docs/man/ docs/doxygen_sqlite3.db TEST_LOGS # gcov coverage reports *.gcda *.gcno coverage.info coverage_report/ # VS Code Settings .vscode/ ================================================ FILE: .gitlab-ci.yml ================================================ stages: - build # Avoid running duplicate pipeline workflow: rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH' variables: GIT_STRATEGY: fetch GIT_SUBMODULE_STRATEGY: recursive before_script: - mkdir -p ~/.ssh - chmod 700 ~/.ssh - echo -n $GITLAB_KEY_TMP > ~/.ssh/id_rsa_base64 - base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa - chmod 600 ~/.ssh/id_rsa - echo -e "Host gitlab.espressif.cn\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config - | if [ -n "$IDF_COMPONENT_MGR_VER" ]; then pip install idf-component-manager==$IDF_COMPONENT_MGR_VER fi .test_build: &test_build # Build examples - for TARGET in $EXAMPLE_TARGETS; do - idf.py set-target $TARGET build - done .build_template: stage: build image: espressif/idf:latest tags: - build variables: PEDANTIC_FLAGS: "-Werror -Wno-error=cpp -Werror=unused-variable -Werror=unused-but-set-variable -Werror=unused-function" EXTRA_CFLAGS: 
"${PEDANTIC_FLAGS}" EXTRA_CXXFLAGS: "${PEDANTIC_FLAGS}" rules: - if: '$CI_PIPELINE_SOURCE == "schedule"' when: never - when: always script: - cd ${CI_PROJECT_DIR}/test_app # build examples - *test_build - cd ${CI_PROJECT_DIR} build_idf_v5.5: extends: .build_template image: espressif/idf:release-v5.5 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 esp32p4 build_idf_v5.2: extends: .build_template image: espressif/idf:release-v5.2 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 build_idf_v5.0: extends: .build_template image: espressif/idf:release-v5.0 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 build_idf_v4.4: extends: .build_template image: espressif/idf:release-v4.4 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 IDF_COMPONENT_MGR_VER: "1.2.0" build_idf_v4.3: extends: .build_template image: espressif/idf:release-v4.3 variables: EXAMPLE_TARGETS: esp32 build_idf_v4.2: extends: .build_template image: espressif/idf:release-v4.2 variables: EXAMPLE_TARGETS: esp32 ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.5) set(c_srcs "src/activation_functions/esp_nn_relu_ansi.c" "src/activation_functions/esp_nn_hard_swish_ansi.c" "src/common/esp_nn_mean_ansi.c" "src/basic_math/esp_nn_add_ansi.c" "src/basic_math/esp_nn_mul_ansi.c" "src/convolution/esp_nn_conv_ansi.c" "src/convolution/esp_nn_conv_opt.c" "src/convolution/esp_nn_depthwise_conv_ansi.c" "src/convolution/esp_nn_depthwise_conv_opt.c" "src/fully_connected/esp_nn_fully_connected_ansi.c" "src/softmax/esp_nn_softmax_ansi.c" "src/softmax/esp_nn_softmax_opt.c" "src/logistic/esp_nn_logistic_ansi.c" "src/pooling/esp_nn_avg_pool_ansi.c" "src/pooling/esp_nn_max_pool_ansi.c") if(CONFIG_IDF_TARGET_ESP32S3) set(s3_srcs "src/common/esp_nn_common_functions_esp32s3.S" "src/common/esp_nn_dot_s8_esp32s3.S" "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S" "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S" 
"src/activation_functions/esp_nn_relu_s8_esp32s3.S" "src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c" "src/common/esp_nn_mean_s8_esp32s3.c" "src/basic_math/esp_nn_add_s8_esp32s3.S" "src/basic_math/esp_nn_mul_s8_esp32s3.S" "src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S" "src/convolution/esp_nn_conv_esp32s3.c" "src/convolution/esp_nn_conv_s8_1x1_esp32s3.c" "src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c" "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c" "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S" "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S" "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S" "src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S" "src/fully_connected/esp_nn_fully_connected_esp32s3.c" "src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S" "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S" "src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S" "src/pooling/esp_nn_max_pool_s8_esp32s3.S" "src/pooling/esp_nn_avg_pool_s8_esp32s3.c" "src/pooling/esp_nn_avg_pool_s8_esp32s3.S" "src/softmax/esp_nn_softmax_s8_esp32s3.c") endif() if(CONFIG_IDF_TARGET_ESP32P4) set(p4_srcs "src/common/esp_nn_mean_s8_esp32p4.c" "src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S" "src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c" "src/activation_functions/esp_nn_relu_s8_esp32p4.c" "src/basic_math/esp_nn_add_s8_esp32p4.c" "src/basic_math/esp_nn_mul_s8_esp32p4.c" "src/convolution/esp_nn_conv_esp32p4.c" "src/convolution/esp_nn_depthwise_conv_esp32p4.c" 
"src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c" "src/pooling/esp_nn_avg_pool_s8_esp32p4.c" "src/pooling/esp_nn_max_pool_s8_esp32p4.c" "src/softmax/esp_nn_softmax_s8_esp32p4.c") endif() idf_component_register(SRCS "${c_srcs}" "${s3_srcs}" "${p4_srcs}" INCLUDE_DIRS "include" "src/common") if(CONFIG_IDF_TARGET_ESP32S3) target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function) else() target_compile_options(${COMPONENT_LIB} PRIVATE -O2 -Wno-unused-function) endif() if(CONFIG_NN_SKIP_NUDGE) target_compile_definitions(${COMPONENT_LIB} PRIVATE SKIP_NUDGE) endif() ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Contributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welcome! This document covers various topics related to contributions to the ESP-NN projects. Please read it if you plan to submit a PR! ## CLA We require accepting the contributor's license agreement for all pull requests. When opening a pull request the first time you will be prompted to sign the CLA by the [CLA Assistant](https://cla-assistant.io/) service. ## Large-scale Changes If you'd like to propose a change to the existing APIs or a large-scale refactoring of the implementation, we recommend opening an issue first to discuss this. ## Updating the Benchmarks Table The benchmarks table in [README.md](README.md) contains benchmarks for ESP32-S3. The benchmarks are collected by running the app in [test_app](test_app/) directory. Please update this table if you have changed the implementations of some of the functions or added the new ones. ## Releasing a new version Maintainers should follow the steps below to release a new version of ESP-NN component. Assuming the new version is `vX.Y.Z`: 1. Ensure you are on the latest `master` branch: ```bash git checkout master git pull --ff-only origin master ``` 1. 
Create the new tag: ```bash git tag -s -a -m "vX.Y.Z" vX.Y.Z ``` 1. Push the tag and the branch to the internal repository: ```bash git push origin vX.Y.Z ``` 1. CI will automatically push the tag to Github and will upload the new version to the IDF Component Registry. 1. Go to https://github.com/espressif/esp-nn/releases and create a release from the tag vX.Y.Z. 1. Write the release notes and publish the release. ================================================ FILE: Kconfig.projbuild ================================================ menu "ESP-NN" choice NN_OPTIMIZATIONS bool "Optimization for nn functions" default NN_OPTIMIZED help Use ANSI-C versions for verification and debug purpose. Optimisations are automatically picked up for a chipset. For ESP32-S3, assembly optimisations are selected. For other platforms(viz., ESP32, ESP32-C3), generic optimisations are used. config NN_ANSI_C bool "ANSI C" help ANSI C versions for verification and debug purposes. config NN_OPTIMIZED bool "Optimized versions" help Optimisations are automatically picked up for a chipset. For ESP32-S3, assembly optimisations are selected. For other platforms(viz., ESP32, ESP32-C3), generic optimisations are used. endchoice config NN_OPTIMIZATIONS int default 0 if NN_ANSI_C default 1 if NN_OPTIMIZED config NN_SKIP_NUDGE bool "Use fast (non-bit-exact) requantization" depends on NN_OPTIMIZED default n help When enabled, kernels use a faster requantize path that may differ from the TFLite reference by +/-1 LSB at half-shift boundaries. On ESP32-S3, this also skips the nudge addition in the assembly requantize for ~20% speedup. Leave disabled for bit-exact behavior (recommended for tests and for matching reference outputs). endmenu ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # ESP-NN The library contains optimised NN (Neural Network) functions for various Espressif chips. * Supported platforms: * TensorFlow Lite Micro (TFLite Micro). 
Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples) * Supported ESP chips include: * ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3) * ESP32-P4 (Optimised using PIE/QACC SIMD instructions) * ESP32 (Generic optimisations) * ESP32-C3 (Generic optimisations) ## Performance ### Kernelwise performance for s8 versions: * Kernelwise performance on ESP32-P4 chip * Numbers are ticks taken for kernel to execute * Chip config: 360MHz, SPI-RAM: HEX 200MHz, L2-Cache: 128KB | Function | ANSI C | Optimized | Opt Ratio | Data info | Memory | | ----------------| --------|---------|---------|-------------|-----------| | elementwise_add | 190786 | 88451 | 2.16 | size = 1615 | External | | elementwise_mul | 76585 | 47601 | 1.60 | size = 1615 | External | | convolution | 4005512 | 572459 | 7.00 | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External | | convolution | 249700 | 71104 | 3.51 | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External | | convolution | 816975 | 533318 | 1.53 | input(10,10), filter(64x3x3x3), pad(0,0), stride(1,1) | External | | depthwise conv | 962834 | 482389 | 2.00 | input (16, 16), pad(0,0), stride(1,1) filter: 1x3x3x16 | External | | depthwise conv | 1365066 | 703989 | 1.94 | input (12, 12), pad(1,1), stride(1,1) filter: 8x5x5x4 | External | | max pool | 482184 | 24178 | 19.94 | input(16,16), filter (1x3x3x16) | Internal | | avg pool | 303210 | 84401 | 3.59 | input(16,16), filter (1x3x3x16) | Internal | | fully connected | 7650 | 915 | 8.36 | len: 271, ch = 3 | Internal | | prelu (relu6) | 1195 | 154 | 7.76 | size, 1615 | Internal | | softmax | 14260 | 8587 | 1.66 | width: 256 | Internal | | hard_swish | 703970 | 516582 | 1.36 | size: 12544 | External | | mean | 10113 | 4686 | 2.16 | 7x7x16 | Internal | * Kernelwise performance on ESP32-S3 chip * Numbers are ticks taken for kernel to execute * Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB | Function | ANSI C | 
Optimized | Opt Ratio | Data info | Memory | | ----------------| ---------|-----------|-----------|-------------|-----------| | elementwise_add | 281337 | 74440 | 3.78 | size = 1615 | External | | elementwise_mul | 122703 | 35002 | 3.51 | size = 1615 | External | | convolution | 4712500 | 331008 | 14.24 | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External | | convolution | 312754 | 39022 | 8.01 | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External | | convolution | 2193289 | 394842 | 5.55 | input(8,8), filter(64x3x3x3), pad(0,0), stride(1,1) | External | | depthwise conv | 1159831 | 184176 | 6.30 | input(18,18), pad(0,0), stride(1,1), filter: 1x3x3x16 | External | | depthwise conv | 1671363 | 372435 | 4.49 | input(12,12), pad(1,1), stride(1,1), filter: 8x5x5x4 | External | | max pool | 376294 | 48069 | 7.83 | input(16,16), filter(1x3x3x16) | Internal | | avg pool | 427293 | 118052 | 3.62 | input(16,16), filter(1x3x3x16) | Internal | | fully connected | 8443 | 1078 | 7.83 | len: 271, ch = 3 | Internal | | softmax | 15209 | 11107 | 1.37 | h: 8, w: 32 | Internal | | prelu (relu6) | 1125 | 98 | 11.48 | size: 1615 | Internal | ### Model-level performance: * **Person Detection** (Visual Wake Words, INT8 quantized — from [esp-tflite-micro](https://github.com/espressif/esp-tflite-micro)) * Numbers are time (ms) for `invoke()` call, using internal memory | Chip | CPU Freq | without ESP-NN | with ESP-NN | | -------- | -------- | -------------- | ----------- | | ESP32-P4 | 360MHz | 1395ms | 73ms | | ESP32-S3 | 240MHz | 2300ms | 54ms | | ESP32 | 240MHz | 4084ms | 380ms | | ESP32-C3 | 160MHz | 3355ms | 426ms | * **MobileNetV3 Small** (INT8 quantized, 224x224x3, 1000 classes) | Chip | CPU Freq | without ESP-NN | with ESP-NN | | -------- | -------- | -------------- | ----------- | | ESP32-S3 | 240MHz | 26000ms | 1434ms | | ESP32-P4 | 360MHz | 11600ms | 1050ms | > **Note**: - The above is time taken for execution of the `invoke()` call - SPIRAM used for 
TensorArena. - Person detection on ESP32-S3 with internal RAM: 47ms - ESP32-P4 optimisation is work in progress - `Without ESP-NN` case is when `esp-nn` is completely disabled by removing below flag from [CMakeLists.txt](CMakeLists.txt): ```cmake # enable ESP-NN optimizations by Espressif target_compile_options(${COMPONENT_LIB} PRIVATE -DESP_NN) ``` ## Configuration * To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS` * There are two options presented: * Optimized versions * ANSI C * Default selection is for `Optimized versions`. For ESP32-S3 and ESP32-P4, assembly versions are automatically selected, whereas for other chips (viz., ESP32, ESP32-C3), generic optimisations are selected. * For debugging purposes, you may want to select `ANSI C` reference versions. ## Contributing If you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on the Github. For general questions related to this library, please use the esp32.com forum. Please check [CONTRIBUTING.md](CONTRIBUTING.md) for further information if you'd like to contribute to ESP-NN. ## Copyrights and License All original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE. 
================================================ FILE: idf_component.yml ================================================ version: "1.2.3" description: Optimized NN (Neural Network) functions for Espressif chips url: https://github.com/espressif/esp-nn repository: https://github.com/espressif/esp-nn.git issues: https://github.com/espressif/esp-nn/issues dependencies: idf: version: ">=4.2" files: exclude: - test_app - tests ================================================ FILE: include/esp_nn.h ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once #if defined(CONFIG_NN_OPTIMIZED) // select apt optimisations #ifdef CONFIG_IDF_TARGET_ESP32P4 #define ARCH_ESP32_P4 1 #endif #ifdef CONFIG_IDF_TARGET_ESP32S3 #define ARCH_ESP32_S3 1 #endif #ifdef CONFIG_IDF_TARGET_ESP32 #define ARCH_ESP32 1 #endif #endif #ifdef __cplusplus extern "C" { #endif /* reference kernels included by default */ #include "esp_nn_ansi_headers.h" #if defined(CONFIG_NN_OPTIMIZED) #if defined(ARCH_ESP32_P4) #include "esp_nn_esp32p4.h" #elif defined(ARCH_ESP32_S3) #include "esp_nn_esp32s3.h" #else // for other platforms use generic optimisations #include "esp_nn_generic_opt.h" #endif // #if defined(ARCH_ESP32_S3) #else #include "esp_nn_ansi_c.h" #endif #ifdef __cplusplus } #endif ================================================ FILE: include/esp_nn_ansi_c.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for ANSI C versions. * These are just typedefs to pick up ANSI versions. 
*/ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi #define esp_nn_conv_s8 esp_nn_conv_s8_ansi #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi #define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi #define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi #define esp_nn_get_hard_swish_scratch_size() 0 #define esp_nn_set_hard_swish_scratch_buf(buf) #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi #define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi #define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi #define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi #define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: include/esp_nn_ansi_headers.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #pragma once /** * @file Header definitions to include for esp_nn reference functions */ #include "esp_nn_defs.h" /************************** 
Basic math functions ****************************/ /** * @brief elementwise addition * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * shift values are expected to be <= 0 */ void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); /** * @brief elementwise multiplication * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * output shift is expected to be <= 0 */ void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); /** * @brief broadcast MUL for [H,W,C] * [1,1,C] pattern (SE-block) * * @note input2_per_ch has `channels` elements, broadcast to all spatial positions. * Uses fast requantization (constant nudge). 
*/ void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1, const int8_t *input2_per_ch, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t output_offset, const int32_t output_mult, const int32_t output_shift, const int32_t activation_min, const int32_t activation_max, const int32_t total_spatial, const int32_t channels); /************************** Convolution functions *****************************/ /** * @brief depthwise convolution per channel * * @note inputs type: int8_t, output: int8_t * Version used in tflite is per channel. * This version follows the same footprints. * Meaning, it has per out_channel shift and multiplier for * requantization * * optimization notes: Though input_offset is int32 type, * offset values are contained in 8 bits [-128, 127] */ void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); /** * @brief 2d-convolution channelwise * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_ansi(const void *buf); int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t 
*conv_params); void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf); /************************** Activation functions *****************************/ /** * @brief relu6 * * @note inout: int8_t */ void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size); /** * @brief hard_swish activation: y = x * relu6(x + 3) / 6 * * @note Quantized int8 fixed-point implementation */ void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); /** * @brief mean reduction over spatial dims (H,W) for NHWC int8 tensor * * @note Specialized for 4D [N,H,W,C] → [N,1,1,C] reduction. * Used by Squeeze-and-Excite in MobileNetV3. */ void esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift); /************************** Pooling functions *****************************/ /** * @brief max_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_max_pool_s8_ansi(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /** * @brief avg_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_avg_pool_s8_ansi(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t 
output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /************************** Fully connected functions ***********************/ /** * @brief fully connected * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief fully connected * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * out_mult, out_shift: int32_t* containing per-channel data */ void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t* out_shift, const int32_t* out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief Get scratch buffer size needed by softmax function * * @param width * @param height * @return size in bytes * * @note buffer must be 4 byte aligned */ int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height); /* ANSI C function to be hooked up when optimised version needed */ int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height); /** * @brief Set scratch buffer to be used by softmax function * * @param buffer this can be NULL if one needs to 
unset it * must be aligned to 4 bytes */ void esp_nn_set_softmax_scratch_buf_ansi(void *buffer); /** * @brief reference softmax function * * @note inputs type: int8_t, output: int8_t */ void esp_nn_softmax_s8_ansi(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); //////////////////////////// Generic optimisations ///////////////////////////// /************************** Convolution functions *****************************/ /** * @brief 2d-convolution channelwise optimized version * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data); /** * @brief depthwise convolution per channel optimized version * * @note inputs type: int8_t, output: int8_t * Version used in tflite is per channel. * This version follows the same footprints. 
* Meaning, it has per out_channel shift and multiplier for * requantization * * optimization notes: Though input_offset is int32 type, * offset values are contained in 8 bits [-128, 127] */ void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_opt(const void *buf); int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params); void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf); /* ANSI C function to be hooked up when optimised version needed */ void esp_nn_set_softmax_scratch_buf_opt(void *buffer); /** * @brief optimised version of softmax function * * @note the function uses extra buffer (4 * width bytes) * hence, scratch buffers must be set before calling this. */ void esp_nn_softmax_s8_opt(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); /** * @brief Get scratch buffer size for int8 logistic (sigmoid). * @return 256 (size of LUT in bytes) */ int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void); /** * @brief Prepare LUT for int8 logistic (sigmoid). * Call once during model preparation after scratch is allocated. * * @param scratch_buf Scratch buffer (256 bytes, from get_scratch_size) * @param input_zero_point Input quantization zero point * @param input_scale Input quantization scale (float) * * @note Output quantization is fixed: scale=1/256, zero_point=-128. 
*/ void esp_nn_logistic_s8_prepare_ansi(int8_t *scratch_buf, int32_t input_zero_point, float input_scale); /** * @brief Apply int8 logistic (sigmoid) using precomputed LUT. * * @param input Input int8 data * @param output Output int8 data * @param size Number of elements * @param scratch_buf 256-byte LUT from esp_nn_logistic_s8_prepare() */ void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output, int32_t size, const int8_t *scratch_buf); ================================================ FILE: include/esp_nn_defs.h ================================================ // Copyright 2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once #include <stdint.h> /** * @brief structure to club data dims * this structure can be used for input, output and filter */ typedef struct data_dims { int32_t width; int32_t height; int32_t channels; int32_t extra; // can be used as batch or any other param } data_dims_t; /** * @brief 2d data structure (width, height) * */ typedef struct data_2d { int32_t width; int32_t height; } data_2d_t; /** * @brief min/max activation */ typedef struct act_params { int32_t min; int32_t max; } act_params_t; /** * @brief per channel quant data * * @note number of shift and mult elements are equal to output channels */ typedef struct quant_data { int32_t *shift; int32_t *mult; } quant_data_t; /** * @brief params specific to convolution 2d * */ typedef struct conv_params { int32_t in_offset; int32_t out_offset; data_2d_t stride; data_2d_t padding; data_2d_t dilation; act_params_t activation; } conv_params_t; /** * @brief params specific to depthwise convolution 2d * */ typedef struct dw_conv_params { int32_t in_offset; int32_t out_offset; int32_t ch_mult; // channel multiplier. 
(in_ch * ch_mult = out_ch) data_2d_t stride; data_2d_t padding; data_2d_t dilation; act_params_t activation; } dw_conv_params_t; ================================================ FILE: include/esp_nn_esp32p4.h ================================================ /* * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for esp_nn optimized functions for * the ESP32-P4 platform */ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" /** * @brief 2d - convolution channelwise * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *output_data, const conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_esp32p4(const void *buf); /********************** function defines ***************************/ #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32p4 void 
esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32p4 void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params); void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf); #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32p4 #define esp_nn_conv_s8 esp_nn_conv_s8_esp32p4 #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32p4 #define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32p4 #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32p4 #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32p4 /* Functions not yet optimized for P4 - use ANSI fallback */ void esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32p4 #define esp_nn_get_hard_swish_scratch_size() 0 #define esp_nn_set_hard_swish_scratch_buf(buf) void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const 
int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift); #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32p4 void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size); #define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32p4 void esp_nn_avg_pool_s8_esp32p4(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32p4 void esp_nn_max_pool_s8_esp32p4(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32p4 void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max); #define 
esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32p4 #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32p4 int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height); void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer); void esp_nn_softmax_s8_esp32p4(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); #define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32p4 #define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32p4 #define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32p4 #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: include/esp_nn_esp32s3.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for esp_nn optimized functions for * the ESP32-S3 platform */ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" /************************** Basic math functions *****************************/ /** * @brief elementwise addition * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * shift values are expected to be <= 0 */ void esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t 
activation_min, const int32_t activation_max, const int32_t size); /** * @brief elementwise multiplication * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * output shift is expected to be <= 0 */ void esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); /************************** Convolution functions *****************************/ /** * @brief depthwise convolution per channel * * @note inputs type: int8_t, output: int8_t * Version used in tflite is per channel. * This version follows the same footprints. * Meaning, it has per out_channel shift and multiplier for * requantization * * optimization notes: Though input_offset is int32 type, * offset values are contained in 8 bits [-128, 127] */ void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *output_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); /** * @brief 2d - convolution channelwise * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *output_data, const conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const 
conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_esp32s3(const void *buf); int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params); void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf); /************************** Pooling functions *****************************/ /** * @brief max_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_max_pool_s8_esp32s3(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /** * @brief avg_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_avg_pool_s8_esp32s3(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /************************** Fully connected functions *****************************/ /** * @brief fully connected * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * Current version works only on aligned input. * row_len and channels should both be multiple of 8. 
*/ void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief fully connected - per channel * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * out_mult, out_shift: int32_t* containing per-channel data * * Current version works only on aligned input. * row_len and channels should both be multiple of 8. */ void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t* out_shift, const int32_t* out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief relu6 * * @note inout: int8_t */ void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size); /********************** function defines ***************************/ #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3 #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3 void esp_nn_mul_broadcast_channel_s8_esp32s3(const int8_t *input1, const int8_t *input2_per_ch, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t output_offset, const int32_t output_mult, const int32_t output_shift, const int32_t activation_min, const int32_t activation_max, const int32_t total_spatial, const int32_t channels); #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_esp32s3 #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3 #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3 
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3 #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3 #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3 #define esp_nn_conv_s8 esp_nn_conv_s8_esp32s3 #define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3 int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void); void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf); void esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); #define esp_nn_get_hard_swish_scratch_size esp_nn_get_hard_swish_scratch_size_esp32s3 #define esp_nn_set_hard_swish_scratch_buf esp_nn_set_hard_swish_scratch_buf_esp32s3 #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32s3 void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift); #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32s3 #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3 #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3 #define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3 #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32s3 int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height); void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer); void esp_nn_softmax_s8_esp32s3(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); #define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32s3 #define 
esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32s3 #define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32s3 /* Logistic (sigmoid) — LUT-based, same impl for all targets */ #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: include/esp_nn_generic_opt.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for esp_nn generic optimisations * For functions which not having optimisations, _ansi versions are picked. */ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_opt #define esp_nn_conv_s8 esp_nn_conv_s8_opt #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_opt #define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_opt #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_opt #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_opt #define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi #define esp_nn_get_hard_swish_scratch_size() 0 #define esp_nn_set_hard_swish_scratch_buf(buf) #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi #define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi #define 
esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt #define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt #define esp_nn_softmax_s8 esp_nn_softmax_s8_opt #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: src/activation_functions/esp_nn_hard_swish_ansi.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * HardSwish activation function: y = x * relu6(x + 3) / 6 * Quantized int8 implementation using fixed-point arithmetic. */ #include #include /* * Saturating left shift for int16 */ static inline int16_t sat_left_shift_s16(int16_t val, int shift) { int32_t result = (int32_t)val << shift; if (result > 32767) return 32767; if (result < -32768) return -32768; return (int16_t)result; } /* * SaturatingRoundingDoublingHighMul for int16: (a * b + (1<<14)) >> 15 */ static inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b) { if (a == b && a == -32768) return 32767; int32_t ab = (int32_t)a * (int32_t)b; return (int16_t)((ab + (1 << 14)) >> 15); } /* * SaturatingDoublingHighMul (NOT rounding): (a * b) >> 15 */ static inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b) { if (a == b && a == -32768) return 32767; return (int16_t)(((int32_t)a * (int32_t)b) / (1 << 15)); } /* * RoundingDivideByPOT for int16 */ static inline int16_t rounding_div_pot_s16(int16_t val, int exponent) { int32_t mask = (1 << exponent) - 1; int32_t remainder = val & mask; int32_t threshold = (mask >> 1) + (val < 0 ? 1 : 0); return (int16_t)((val >> exponent) + (remainder > threshold ? 
1 : 0)); } void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point) { for (int i = 0; i < size; i++) { const int16_t in_val = input[i] - input_zero_point; const int16_t in_hires = in_val * 128; /* << 7 */ /* Scale input to output scale */ const int16_t in_on_out_scale = sat_round_dbl_high_mul_s16(in_hires, output_mult_fxp); /* Compute reluish value: maps input from [-3,3] to [-1,1] */ int16_t reluish = in_hires; if (reluish_mult_exp > 0) { reluish = sat_left_shift_s16(reluish, reluish_mult_exp - 1); } reluish = sat_round_dbl_high_mul_s16(reluish, reluish_mult_fxp); if (reluish_mult_exp > 0) { reluish = sat_left_shift_s16(reluish, 1); } if (reluish_mult_exp < 0) { reluish = rounding_div_pot_s16(reluish, -reluish_mult_exp); } /* Convert from [-1,1] to [0,1] */ reluish = (reluish + (1 << 15)) >> 1; /* Multiply: output = reluish * input_on_output_scale */ const int16_t pre_out = sat_dbl_high_mul_s16(reluish, in_on_out_scale); /* Final shift and offset */ int16_t out_val = rounding_div_pot_s16(pre_out, -output_mult_exp); out_val += output_zero_point; if (out_val > 127) out_val = 127; if (out_val < -128) out_val = -128; output[i] = (int8_t)out_val; } } ================================================ FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * ESP32-P4 optimized HardSwish with: * 1. Branch hoisting (borrowed from S3): dispatch on reluish_mult_exp ONCE * 2. 2x loop unrolling for better ILP on RISC-V pipeline * 3. 
All int16 arithmetic - no 64-bit multiply bottleneck */ #include static inline __attribute__((always_inline)) int16_t sat_rnd_dbl_hi_mul(int16_t a, int16_t b) { if (__builtin_expect(a == b && a == -32768, 0)) return 32767; return (int16_t)(((int32_t)a * (int32_t)b + (1 << 14)) >> 15); } static inline __attribute__((always_inline)) int16_t sat_dbl_hi_mul(int16_t a, int16_t b) { if (__builtin_expect(a == b && a == -32768, 0)) return 32767; return (int16_t)(((int32_t)a * (int32_t)b) >> 15); } static inline __attribute__((always_inline)) int16_t sat_left_shift_s16(int32_t val) { if (val > 32767) return 32767; if (val < -32768) return -32768; return (int16_t)val; } static inline __attribute__((always_inline)) int16_t rounding_div_pot_s16(int16_t val, int exp) { int32_t mask = (1 << exp) - 1; int32_t remainder = val & mask; int32_t threshold = (mask >> 1) + (val < 0 ? 1 : 0); return (int16_t)((val >> exp) + (remainder > threshold ? 1 : 0)); } /* Core output computation shared by all paths */ static inline __attribute__((always_inline)) int8_t hard_swish_output(int16_t reluish, int16_t in_on_out_scale, int neg_out_exp, int16_t output_zero_point) { int16_t pre = sat_dbl_hi_mul(reluish, in_on_out_scale); int16_t ov = rounding_div_pot_s16(pre, neg_out_exp); int32_t result = ov + output_zero_point; if (result > 127) result = 127; if (result < -128) result = -128; return (int8_t)result; } void esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point) { const int neg_out_exp = -output_mult_exp; int i = 0; /* Branch on reluish_mult_exp ONCE - 3 specialized loops */ if (reluish_mult_exp > 0) { const int ls1 = reluish_mult_exp - 1; for (; i <= size - 2; i += 2) { int16_t iv0 = input[i] - input_zero_point; int16_t iv1 = input[i+1] - input_zero_point; int16_t hi0 = 
iv0 * 128, hi1 = iv1 * 128; int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp); int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp); int16_t rv0 = sat_left_shift_s16((int32_t)hi0 << ls1); int16_t rv1 = sat_left_shift_s16((int32_t)hi1 << ls1); rv0 = sat_rnd_dbl_hi_mul(rv0, reluish_mult_fxp); rv1 = sat_rnd_dbl_hi_mul(rv1, reluish_mult_fxp); rv0 = sat_left_shift_s16((int32_t)rv0 * 2); rv1 = sat_left_shift_s16((int32_t)rv1 * 2); rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1); rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1); output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point); output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point); } } else if (reluish_mult_exp < 0) { const int neg_relu_exp = -reluish_mult_exp; for (; i <= size - 2; i += 2) { int16_t iv0 = input[i] - input_zero_point; int16_t iv1 = input[i+1] - input_zero_point; int16_t hi0 = iv0 * 128, hi1 = iv1 * 128; int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp); int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp); int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp); int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp); rv0 = rounding_div_pot_s16(rv0, neg_relu_exp); rv1 = rounding_div_pot_s16(rv1, neg_relu_exp); rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1); rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1); output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point); output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point); } } else { for (; i <= size - 2; i += 2) { int16_t iv0 = input[i] - input_zero_point; int16_t iv1 = input[i+1] - input_zero_point; int16_t hi0 = iv0 * 128, hi1 = iv1 * 128; int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp); int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp); int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp); int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp); rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1); rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1); output[i] = 
hard_swish_output(rv0, on0, neg_out_exp, output_zero_point); output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point); } } /* Scalar remainder */ for (; i < size; i++) { int16_t iv = input[i] - input_zero_point; int16_t hi = iv * 128; int16_t on_out = sat_rnd_dbl_hi_mul(hi, output_mult_fxp); int16_t rv = hi; if (reluish_mult_exp > 0) rv = sat_left_shift_s16((int32_t)rv << (reluish_mult_exp - 1)); rv = sat_rnd_dbl_hi_mul(rv, reluish_mult_fxp); if (reluish_mult_exp > 0) rv = sat_left_shift_s16((int32_t)rv * 2); if (reluish_mult_exp < 0) rv = rounding_div_pot_s16(rv, -reluish_mult_exp); rv = (int16_t)(((int32_t)rv + 32768) >> 1); output[i] = hard_swish_output(rv, on_out, neg_out_exp, output_zero_point); } } ================================================ FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * ESP32-S3 optimized HardSwish using 256-byte lookup table. * * Key insight: HardSwish maps int8 -> int8 with fixed quantization parameters * per layer. Only 256 possible input values exist. We precompute the full * mapping once using the ANSI reference (bit-exact), then the inner loop * is a single byte load per element. * * Scratch buffer: 256 bytes (set via esp_nn_set_hard_swish_scratch_buf). 
*/ #include #include /* Use ANSI C reference to build LUT — guarantees bit-exact match */ extern void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); static int8_t *hard_swish_scratch = NULL; int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void) { return 512; /* 256 for lut_input + 256 for lut output */ } void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf) { hard_swish_scratch = (int8_t *)buf; } void esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point) { if (!hard_swish_scratch) { /* No scratch — fall through to ANSI */ esp_nn_hard_swish_s8_ansi(input, output, size, input_zero_point, output_mult_fxp, reluish_mult_fxp, reluish_mult_exp, output_mult_exp, output_zero_point); return; } /* Build 256-byte LUT using ANSI reference (bit-exact). * lut[i] = hardswish((int8_t)i) for the given quant params. * Indexed by (uint8_t)input_val for direct lookup. 
*/ int8_t *lut_input = hard_swish_scratch; int8_t *lut = hard_swish_scratch + 256; for (int i = 0; i < 256; i++) { lut_input[i] = (int8_t)i; } esp_nn_hard_swish_s8_ansi(lut_input, lut, 256, input_zero_point, output_mult_fxp, reluish_mult_fxp, reluish_mult_exp, output_mult_exp, output_zero_point); /* Apply LUT — one byte load per element */ for (int i = 0; i < size; i++) { output[i] = lut[(uint8_t)input[i]]; } } ================================================ FILE: src/activation_functions/esp_nn_relu_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size) { int32_t i; for (i = 0; i < size; i++) { int32_t ip = data[i]; ip = max(ip, 0); data[i] = min(ip, 6); } } ================================================ FILE: src/activation_functions/esp_nn_relu_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include /** * In-place ReLU6 for s8 data using ESP32-P4 PIE SIMD. * Clamps each element to [0, 6]. * Processes 16 elements per iteration via 128-bit vector ops. 
*/ void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size) { /* Enable PIE */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); int i = 0; if (size >= 16) { /* Broadcast 0 into q2 and 6 into q3 */ const int8_t zero_val = 0; const int8_t six_val = 6; asm volatile ( "esp.vldbc.8.ip q2, %0, 0 \n\t" "esp.vldbc.8.ip q3, %1, 0 \n\t" :: "r"(&zero_val), "r"(&six_val) ); int count = size >> 4; int stride = 16; asm volatile ( "mv x30, %[ptr] \n\t" "mv x31, %[cnt] \n\t" "1: \n\t" "esp.vld.128.ip q0, x30, 0 \n\t" /* load 16 bytes, no auto-increment */ "esp.vmax.s8 q0, q0, q2 \n\t" /* max(val, 0) */ "esp.vmin.s8 q0, q0, q3 \n\t" /* min(val, 6) */ "esp.vst.128.xp q0, x30, %[stride] \n\t" /* store and advance ptr by 16 */ "addi x31, x31, -1 \n\t" "bnez x31, 1b \n\t" : : [ptr] "r"(data), [cnt] "r"(count), [stride] "r"(stride) : "x30", "x31", "memory" ); i = count << 4; } /* Handle remaining elements scalar */ for (; i < size; i++) { int32_t val = data[i]; if (val < 0) val = 0; if (val > 6) val = 6; data[i] = (int8_t) val; } } ================================================ FILE: src/activation_functions/esp_nn_relu_s8_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .align 4 .literal_position # in place relu6 function. 
# a2: data, a3: size
# Program Unit: esp_nn_relu6_s8_esp32s3
# Three paths: 16-wide vector loop, 8-wide vector loop for the remainder
# multiple of 8, then a scalar loop for the final leftover bytes.
    .type esp_nn_relu6_s8_esp32s3, @function
    .align 4
    .global esp_nn_relu6_s8_esp32s3
esp_nn_relu6_s8_esp32s3:
    entry a1,48 #
    mov.n a9,a2 # [0], data
    mov.n a7,a3 # [1], size

// process multiple of 16
    movi.n a4,6 # [4]
    s8i a4,a1,0 # [5] six
    addi a10,a3,-7 # [2]
    ee.vldbc.8 q1,a1 # [6] id:72 six+0x0
    blti a3,16,.Lt_0_5634 # [7]
    srai a8,a3,4 # [0]
    ee.zero.q q2 # [1]
    loopgtz a8,.LBB37_esp_nn_relu6_s8_esp32s3 # [3]
    ee.vld.128.ip q0,a2,0 # [0*II+0] id:73
    ee.vmax.s8 q0,q0,q2 # [0*II+2]
    ee.vmin.s8 q0,q0,q1 # [0*II+3]
    ee.vst.128.ip q0,a2,16 # [0*II+4] id:74
.LBB37_esp_nn_relu6_s8_esp32s3: # 0x34
    slli a8,a8,4 # [0]

// remaining multiple of 8 data
    bge a8,a10,.Lt_0_3586 # [1]
.Lt_0_3842: # 0x3a
    sub a6,a7,a8 # [0]
    srai a6,a6,3 # [1]
    loopgtz a6,.LBB52_esp_nn_relu6_s8_esp32s3 # [2]
    ee.vld.l.64.ip q0,a2,0 # [0*II+0] id:75
    ee.vmax.s8 q0,q0,q2 # [0*II+2]
    ee.vmin.s8 q0,q0,q1 # [0*II+3]
    ee.vst.l.64.ip q0,a2,8 # [0*II+4] id:76
.LBB52_esp_nn_relu6_s8_esp32s3: # 0x4f
    addx8 a8,a6,a8 # [0]
.Lt_0_3586: # 0x52

// process leftover
    bge a8,a7,.Lt_0_6402 # [0]
.Lt_0_4866: # 0x55
    movi.n a5,0 # [0]
    sub a3,a7,a8 # [1]
    add.n a2,a8,a9 # [2]
    l8ui a6,a2,0 # [3] id:78
    addi.n a3,a3,-1 # [4]
    sext a6,a6,7
    max a6,a5,a6 # [6]
    min a6,a4,a6 # [7]
    s8i a6,a2,0 # [8] id:79
    loopgtz a3,.LBB67_esp_nn_relu6_s8_esp32s3 # [9]
    l8ui a3,a2,1 # [0*II+0] id:78
    addi.n a2,a2,1 # [1*II+1]
    sext a3,a3,7
    max a3,a5,a3 # [0*II+3]
    min a3,a4,a3 # [0*II+4]
    s8i a3,a2,0 # [0*II+5] id:79
.LBB67_esp_nn_relu6_s8_esp32s3: # 0x81
.Lt_0_6402: # 0x83
    retw.n # [0]
.Lt_0_5634: # 0x85
    blti a10,1,.Lt_0_5890 # [0]
    movi.n a8,0 # [0]
    ee.zero.q q2 # [1]
    j .Lt_0_3842 # [2]
.Lt_0_5890: # 0x90
    beqz.n a3,.Lt_0_6402 # [0]
    movi.n a8,0 # [0]
    j .Lt_0_4866 # [1]
    .size esp_nn_relu6_s8_esp32s3, . - esp_nn_relu6_s8_esp32s3


================================================
FILE: src/basic_math/esp_nn_add_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE(review): the two include names below were lost in extraction
// ("#include #include"); restore from upstream (this file uses the
// esp_nn_* requant helpers and the max/min macros).
#include #include

/**
 * @brief Element-wise quantized add (u8) reference.
 *
 * Per element: add the input offset, shift both operands up by left_shift,
 * requantize each with its own mult/shift, add, requantize the sum, add the
 * output offset, then clamp to [activation_min, activation_max].
 */
void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
                                    const uint8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    uint8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    for (int i = 0; i < size; i++) {
        int32_t tmp1 = input1_data[i] + input1_offset;
        int32_t tmp2 = input2_data[i] + input2_offset;

        tmp1 <<= left_shift;
        tmp2 <<= left_shift;

        tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult);
        tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult);

        /* shifts are non-positive by convention, hence the negation */
        tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift);
        tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift);

        int32_t out = tmp1 + tmp2;
        out = esp_nn_sat_round_doubling_high_mul(out, out_mult);
        out = esp_nn_div_by_power_of_two(out, -out_shift);
        out = out + out_offset;

        out = max(activation_min, min(out, activation_max));
        output[i] = (uint8_t) out;
    }
}

/* s8 variant — identical pipeline to the u8 version above, signed I/O. */
void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const
int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size) { for (int i = 0; i < size; i++) { int32_t tmp1 = input1_data[i] + input1_offset; int32_t tmp2 = input2_data[i] + input2_offset; tmp1 <<= left_shift; tmp2 <<= left_shift; tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult); tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult); tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift); tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift); int32_t out = tmp1 + tmp2; out = esp_nn_sat_round_doubling_high_mul(out, out_mult); out = esp_nn_div_by_power_of_two(out, -out_shift); out = out + out_offset; out = max(activation_min, min(out, activation_max)); output[i] = (int8_t) out; } } ================================================ FILE: src/basic_math/esp_nn_add_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /** * Optimized elementwise add for s8 on ESP32-P4. * Uses fast multiply-by-quantized-mult and 2x unrolling. 
/* Inline the core requantization to avoid function call overhead */

/* Fallback definitions: in-tree these come from common_functions.h (whose
 * include line was lost to extraction); the guards keep this harmless when
 * the real header is present. */
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif

/* Inlined fast requant using explicit RISC-V mul/mulh to avoid
 * compiler generating 64-bit multiply helper calls.
 *
 * High half of (val * mult) doubled, with a +2^30 rounding nudge, then an
 * optional rounding right shift by neg_shift (half away from zero).
 * NOTE(review): the nudge is always +2^30; gemmlowp's reference
 * SaturatingRoundingDoublingHighMul uses a sign-dependent nudge
 * (1 - 2^30 for negative products) — confirm this matches the project's
 * esp_nn_sat_round_doubling_high_mul before relying on bit-exact parity
 * with the ANSI path. */
static inline __attribute__((always_inline))
int32_t add_requant(int32_t val, int32_t mult, int32_t neg_shift)
{
    /* Use C 64-bit multiply - compiler already generates mul+mulh pair at -O2 */
    int64_t prod64 = (int64_t)val * mult + ((int64_t)1 << 30);
    int32_t result = (int32_t)(prod64 >> 31);
    if (neg_shift > 0) {
        /* rounding divide by 2^neg_shift, rounding half away from zero */
        int32_t rnd = (1 << (neg_shift - 1)) - (result < 0);
        result = (result + rnd) >> neg_shift;
    }
    return result;
}

/**
 * @brief Element-wise quantized add (s8), ESP32-P4 tuned.
 *
 * Requantizes both inputs to a common scale, adds, requantizes the sum,
 * then applies out_offset and clamps to [activation_min, activation_max].
 * The main loop is 2x unrolled; a scalar tail handles odd sizes.
 */
void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
                                       const int8_t *input2_data,
                                       const int32_t input1_offset,
                                       const int32_t input2_offset,
                                       const int32_t input1_mult,
                                       const int32_t input2_mult,
                                       const int32_t input1_shift,
                                       const int32_t input2_shift,
                                       const int32_t left_shift,
                                       int8_t *output,
                                       const int32_t out_offset,
                                       const int32_t out_mult,
                                       const int32_t out_shift,
                                       const int32_t activation_min,
                                       const int32_t activation_max,
                                       const int32_t size)
{
    /* shift parameters are non-positive by convention; negate once */
    const int32_t neg_in1_shift = -input1_shift;
    const int32_t neg_in2_shift = -input2_shift;
    const int32_t neg_out_shift = -out_shift;

    int i = 0;
    /* Process 2 at a time - C inline requant lets compiler optimize across calls */
    for (; i <= size - 2; i += 2) {
        int32_t a0 = (input1_data[i + 0] + input1_offset) << left_shift;
        int32_t b0 = (input2_data[i + 0] + input2_offset) << left_shift;
        a0 = add_requant(a0, input1_mult, neg_in1_shift);
        b0 = add_requant(b0, input2_mult, neg_in2_shift);
        int32_t out0 = add_requant(a0 + b0, out_mult, neg_out_shift) + out_offset;
        out0 = max(activation_min, min(out0, activation_max));

        int32_t a1 = (input1_data[i + 1] + input1_offset) << left_shift;
        int32_t b1 = (input2_data[i + 1] + input2_offset) << left_shift;
        a1 = add_requant(a1, input1_mult, neg_in1_shift);
        b1 = add_requant(b1, input2_mult, neg_in2_shift);
        int32_t out1 = add_requant(a1 + b1, out_mult, neg_out_shift) + out_offset;
        out1 = max(activation_min, min(out1, activation_max));

        output[i + 0] = (int8_t) out0;
        output[i + 1] = (int8_t) out1;
    }
    /* scalar tail for odd size */
    for (; i < size; i++) {
        int32_t tmp1 = (input1_data[i] + input1_offset) << left_shift;
        int32_t tmp2 = (input2_data[i] + input2_offset) << left_shift;
        tmp1 = add_requant(tmp1, input1_mult, neg_in1_shift);
        tmp2 = add_requant(tmp2, input2_mult, neg_in2_shift);
        int32_t out = add_requant(tmp1 + tmp2, out_mult, neg_out_shift) + out_offset;
        out = max(activation_min, min(out, activation_max));
        output[i] = (int8_t) out;
    }
}

// ================================================
// FILE: src/basic_math/esp_nn_add_s8_esp32s3.S
// ================================================
// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.text .align 4 .literal_position .literal .nudge_val, 1073741824 # Program Unit: esp_nn_add_elementwise_s8_esp32s3 .type esp_nn_add_elementwise_s8_esp32s3, @function .align 4 .global esp_nn_add_elementwise_s8_esp32s3 esp_nn_add_elementwise_s8_esp32s3: # 0x4 # temp_neg_out_shift = 0 # temp_neg_input2_shift = 4 # temp_neg_input1_shift = 8 # gra_spill_temp_2 = 12 # gra_spill_temp_3 = 16 # gra_spill_temp_4 = 20 # gra_spill_temp_5 = 24 # gra_spill_temp_6 = 28 # gra_spill_temp_7 = 32 # gra_spill_temp_8 = 36 # gra_spill_temp_9 = 40 # gra_spill_temp_10 = 44 # gra_spill_temp_11 = 48 # gra_spill_temp_12 = 52 # gra_spill_temp_13 = 56 // a2 : *input1_data // a3 : *input2_data // a4 : input1_offset // a5 : input2_offset // a6 : input1_mult // a7 : input2_mult // On stack: // 80: input1_shift // 84: input2_shift // 88: left_shift // 92: *output // 96: out_offset // 100: out_mult, loaded in `a8` // 104: out_shift // 108: activation_min // 112: activation_max // 116: size entry a1,80 # s32i.n a4,a1,48 # [10] gra_spill_temp_11, input1_offset s32i.n a5,a1,52 # [0] gra_spill_temp_12, input2_offset s32i.n a2,a1,32 # [5] gra_spill_temp_7, input1_data s32i.n a3,a1,12 # [3] gra_spill_temp_2, input2_data l32i a12,a1,116 # [11] id:720 size+0x0 mov.n a14,a2 # [6] mov.n a10,a3 # [8] blti a12,1,.exit # [1] // exit l32i a3,a1,80 # [0] id:721 input1_shift+0x0 l32i a13,a1,84 # [1] id:722 input2_shift+0x0 l32i a2,a1,104 # [8] id:723 out_shift+0x0 l32i a8,a1,100 # [1] out_mult neg a3,a3 # [12] neg a13,a13 # [7] neg a2,a2 # [11] s32i.n a3,a1,8 # [12] temp_neg_input1_shift, -input1_shift s32i.n a13,a1,4 # [7] temp_neg_input2_shift, -input2_shift s32i.n a2,a1,0 # [16] temp_neg_out_shift, -out_shift movi.n a5,1 addi a9,a3,-1 ssl a9 sll a15,a5 s32i.n a15,a1,16 # gra_spill_temp_3, 1 << (exponent - 1) for input1 addi a9,a13,-1 ssl a9 sll a15,a5 s32i.n a15,a1,20 # gra_spill_temp_4, 1 << (exponent - 1) for input2 addi a9,a2,-1 ssl a9 sll a15,a5 s32i.n a15,a1,24 # gra_spill_temp_5, 1 << (exponent - 1) for 
out movi.n a2,0 blti a12,12,.process_leftover # [23] // skip to leftover routine if inputs are unaligned or a9,a14,a10 extui a9,a9,0,4 bnez a9,.process_leftover l32i a9,a1,92 # [17] id:1279 output+0x0 l32i a13,a1,116 # [20] srai a13,a13,3 # [21] s32i.n a13,a1,56 # [22] gra_spill_temp_13 movi.n a13,8 s32i.n a13,a1,28 # gra_spill_temp_6, mult_of8 counter ee.zero.q q6 # [8] .vector_loop: // process 8 values in one go l32i a15,a1,88 # [6] left_shift ee.vld.l.64.ip q0,a14,8 # [9] id:729 s32i.n a9,a1,44 # [10] gra_spill_temp_10, out_ptr s32i.n a14,a1,40 # [20] gra_spill_temp_9 wsr.sar a15 # [21] load left shift addi.n a15,a1,48 # [14] ee.vldbc.16 q7,a15 # [21] id:1277 input1_offset ee.vcmp.lt.s8 q5,q0,q6 # [29] ee.vzip.8 q0,q5 # [31], 20 bits ee.vadds.s16 q0,q0,q7 # [34], add offset ee.vcmp.lt.s16 q2,q0,q6 # [36] ee.vzip.16 q0,q2 # [39], 32 bits ee.vsl.32 q0,q0 # [41] left_shift ee.vsl.32 q2,q2 # [42] left_shift l32r a9,.nudge_val # [15], nudge // mulhi32 for q0 ee.movi.32.a q0,a3,2 # [44] ee.movi.32.a q0,a4,3 # [45] ee.movi.32.a q0,a14,1 # [46] ee.movi.32.a q0,a5,0 # [62] mulsh a13,a6,a3 # [51] mull a3,a6,a3 # [53] mulsh a12,a6,a4 # [50] mull a4,a6,a4 # [55] mulsh a15,a6,a14 # [48] mull a14,a6,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q0,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q0,a12,3 # [62] mulsh a13,a6,a5 # [51] mull a5,a6,a5 # [53] ee.movi.32.q q0,a15,1 # [62] add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q0,a13,0 # [62] // mulhi32 for q2 ee.movi.32.a q2,a3,2 # [44] ee.movi.32.a q2,a4,3 # [45] ee.movi.32.a q2,a14,1 # [46] ee.movi.32.a q2,a5,0 # [62] mulsh a13,a6,a3 # [51] mull a3,a6,a3 # [53] mulsh a12,a6,a4 # [50] mull a4,a6,a4 # [55] mulsh a15,a6,a14 # [48] mull a14,a6,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu 
a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q2,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q2,a12,3 # [62] mulsh a13,a6,a5 # [51] mull a5,a6,a5 # [53] ee.movi.32.q q2,a15,1 # [62] l32i a3,a1,8 # [12] temp_neg_input1_shift, -input1_shift add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q2,a13,0 # [62] blti a3,1, .skip_div_by2_in0 addi.n a13,a1,16 ee.vcmp.lt.s32 q1,q0,q6 ee.vcmp.lt.s32 q3,q2,q6 ee.vldbc.32 q5,a13 // 1 << (exponent - 1) wsr.sar a3 // load right_shift ee.vadds.s32 q0,q0,q1 // subtract 1 `if (val < 0)` ee.vadds.s32 q2,q2,q3 // subtract 1 `if (val < 0)` ee.vadds.s32 q0,q0,q5 ee.vadds.s32 q2,q2,q5 ee.vsr.32 q0,q0 ee.vsr.32 q2,q2 .skip_div_by2_in0: ee.vld.l.64.ip q1,a10,8 # [11] id:1290 addi.n a15,a1,52 # [12] ee.vldbc.16 q7,a15 # [19] id:1278 input2_offset l32i a15,a1,88 # [6] left_shift s32i a10,a1,36 # [14] gra_spill_temp_8 ee.vcmp.lt.s8 q3,q1,q6 # [271] wsr.sar a15 # [21], load shift for left shift ee.vzip.8 q1,q3 # [274], 20 bits ee.vadds.s16 q1,q1,q7 # [281] ee.vcmp.lt.s16 q3,q1,q6 # [282] ee.vzip.16 q1,q3 # [283], 32 bits ee.vsl.32 q1,q1 # [284] ee.vsl.32 q3,q3 # [285] // mulhi32 for q1 ee.movi.32.a q1,a3,2 # [44] ee.movi.32.a q1,a4,3 # [45] ee.movi.32.a q1,a14,1 # [46] ee.movi.32.a q1,a5,0 # [62] mulsh a13,a7,a3 # [51] mull a3,a7,a3 # [53] mulsh a12,a7,a4 # [50] mull a4,a7,a4 # [55] mulsh a15,a7,a14 # [48] mull a14,a7,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q1,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q1,a12,3 # [62] mulsh a13,a7,a5 # [51] mull a5,a7,a5 # [53] ee.movi.32.q q1,a15,1 # [62] add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q1,a13,0 # [62] // mulhi32 for q3 ee.movi.32.a q3,a3,2 # [44] ee.movi.32.a q3,a4,3 # [45] ee.movi.32.a q3,a14,1 # [46] ee.movi.32.a q3,a5,0 # [62] 
mulsh a13,a7,a3 # [51] mull a3,a7,a3 # [53] mulsh a12,a7,a4 # [50] mull a4,a7,a4 # [55] mulsh a15,a7,a14 # [48] mull a14,a7,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q3,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q3,a12,3 # [62] mulsh a13,a7,a5 # [51] mull a5,a7,a5 # [53] ee.movi.32.q q3,a15,1 # [62] l32i a14,a1,4 # [7] temp_neg_input2_shift, -input2_shift add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q3,a13,0 # [62] // multiplication results: q0-q2 & q1-q3 blti a14,1, .skip_div_by2_in1 addi.n a5,a1,20 ee.vcmp.lt.s32 q4,q1,q6 ee.vcmp.lt.s32 q5,q3,q6 ee.vldbc.32 q7,a5 // 1 << (exponent - 1) wsr.sar a14 // load right_shift ee.vadds.s32 q4,q4,q7 // subtract 1 `if (val < 0)` ee.vadds.s32 q5,q5,q7 // subtract 1 `if (val < 0)` ee.vadds.s32 q1,q1,q4 ee.vadds.s32 q3,q3,q5 ee.vsr.32 q1,q1 ee.vsr.32 q3,q3 .skip_div_by2_in1: ee.vadds.s32 q0,q0,q1 ee.vadds.s32 q1,q2,q3 // mulhi32 for q0 ee.movi.32.a q0,a3,2 # [44] ee.movi.32.a q0,a4,3 # [45] ee.movi.32.a q0,a14,1 # [46] ee.movi.32.a q0,a5,0 # [62] mulsh a13,a8,a3 # [51] mull a3,a8,a3 # [53] mulsh a12,a8,a4 # [50] mull a4,a8,a4 # [55] mulsh a15,a8,a14 # [48] mull a14,a8,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q0,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q0,a12,3 # [62] mulsh a13,a8,a5 # [51] mull a5,a8,a5 # [53] ee.movi.32.q q0,a15,1 # [62] add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q0,a13,0 # [62] // mulhi32 for q1 ee.movi.32.a q1,a3,2 # [44] ee.movi.32.a q1,a4,3 # [45] ee.movi.32.a q1,a14,1 # [46] ee.movi.32.a q1,a5,0 # [62] mulsh a13,a8,a3 # [51] mull a3,a8,a3 # [53] mulsh a12,a8,a4 # [50] mull a4,a8,a4 # [55] mulsh a15,a8,a14 # [48] mull 
a14,a8,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q1,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q1,a12,3 # [62] mulsh a13,a8,a5 # [51] mull a5,a8,a5 # [53] ee.movi.32.q q1,a15,1 # [62] l32i a14,a1,0 # [738] temp_neg_out_shift, -out_shift add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q1,a13,0 # [62] //q0-q1 has output blti a14,1,.skip_div_by2_out addi.n a5,a1,24 ee.vcmp.lt.s32 q2,q0,q6 ee.vcmp.lt.s32 q3,q1,q6 ee.vldbc.32 q5,a5 // 1 << (exponent - 1) wsr.sar a14 // load right shift ee.vadds.s32 q0,q0,q2 // subtract 1 `if (val < 0)` ee.vadds.s32 q1,q1,q3 // subtract 1 `if (val < 0)` ee.vadds.s32 q0,q0,q5 ee.vadds.s32 q1,q1,q5 ee.vsr.32 q0,q0 ee.vsr.32 q1,q1 .skip_div_by2_out: // add offset and apply activation addi a15,a1,96 ee.vldbc.32 q3,a15 # [809] id:802 out_offset ee.vadds.s32 q0,q0,q3 # [811] ee.vadds.s32 q1,q1,q3 # [812] addi a13,a1,108 addi a14,a1,112 ee.vldbc.32 q3,a14 # [813] id:803 activation_max ee.vmin.s32 q0,q0,q3 # [815] ee.vmin.s32 q1,q1,q3 # [816] ee.vldbc.32 q3,a13 # [817] id:804 activation_min l32i a13,a1,4 # [818] temp_neg_input2_shift ee.vmax.s32 q1,q1,q3 # [819] ee.vmax.s32 q0,q0,q3 # [820] //pack the data and store l32i.n a9,a1,44 # [784] gra_spill_temp_10 ee.vunzip.16 q0,q1 # [821] ee.vunzip.8 q0,q1 # [822] l32i.n a13,a1,28 # gra_spill_temp_6, multiple of 12 index ee.vst.l.64.ip q0,a9,8 # [823] id:805 l32i a15,a1,116 # [1], size l32i.n a14,a1,40 # [20] gra_spill_temp_9 l32i.n a10,a1,36 # [14] gra_spill_temp_8 addi a13,a13,8 s32i.n a13,a1,28 # gra_spill_temp_6 bge a15,a13,.vector_loop l32i.n a2,a1,56 # [0] gra_spill_temp_13 // check for leftover l32i a10,a1,116 # [1] slli a2,a2,3 # [2] bge a2,a10,.exit # [3] // done, exit .process_leftover: l32i.n a3,a1,48 # [1] gra_spill_temp_11 l32i.n a12,a1,52 # [2] gra_spill_temp_12 l32i.n a10,a1,12 # [3] gra_spill_temp_2 
l32i.n a14,a1,32 # [8] gra_spill_temp_7 add.n a10,a2,a10 # [5] add.n a14,a2,a14 # [6] l8ui a14,a14,0 # [7] id:809, input1 l8ui a10,a10,0 # [12] id:1370, input2 sext a14,a14,7 # [9] sext a10,a10,7 # [10] add.n a10,a10,a12 # [11] // add offset2 add.n a14,a14,a3 # [16] // add offset1 l32i a12,a1,88 # [13] left_shift // sat_round_doubling_high_mul step for input1 and input2 ssl a12 # [15] sll a10,a10 # [20] sll a14,a14 # [17] l32r a12,.nudge_val # [0], nudge // a13,a3 are free, a12: nudge, a6:mult1 mulsh a13,a14,a6 mull a9,a14,a6 ssai 31 add a9,a9,a12 saltu a3,a9,a12 add.n a13,a13,a3 src a14,a13,a9 //result in a14 mulsh a13,a10,a7 mull a9,a10,a7 ssai 31 add a9,a9,a12 saltu a3,a9,a12 add.n a13,a13,a3 src a10,a13,a9 //result in a10 // divide_by_power_of2_step for input1 (a14), input2 (a10) // free registers: a13, a12, a9, a3 l32i.n a12,a1,8 // -input1_shift l32i.n a13,a1,4 // -input2_shift blti a12,1,.skip_div_by2_in0_remain l32i.n a3,a1,16 // 1 << (exponent - 1) extui a9,a14,31,1 ssr a12 // load right_shift sub a3,a3,a9 // 1 << (exponent - 1) - (val < 0) add a14,a14,a3 sra a14,a14 .skip_div_by2_in0_remain: blti a13,1,.skip_div_by2_in1_remain l32i.n a3,a1,20 // 1 << (exponent - 1) extui a9,a10,31,1 ssr a13 // load right_shift sub a3,a3,a9 // 1 << (exponent - 1) - (val < 0) add a10,a10,a3 sra a10,a10 .skip_div_by2_in1_remain: // process output l32r a12,.nudge_val # [0], nudge l32i a13,a1,0 // -out_shift add.n a10,a10,a14 # [45] // multiply and pick high32 mulsh a3,a10,a8 mull a10,a10,a8 ssai 31 # [0] add a10,a10,a12 saltu a9,a10,a12 add a12,a3,a9 src a12,a12,a10 // div by power of 2 for output l32i a9,a1,96 # [31] out_offset blti a13,1,.skip_div_by2_out_remain l32i.n a3,a1,24 // 1 << (exponent - 1) extui a14,a12,31,1 ssr a13 // load right_shift sub a3,a3,a14 // 1 << (exponent - 1) - (val < 0) add a12,a12,a3 sra a12,a12 .skip_div_by2_out_remain: // add offset add.n a9,a9,a12 # [33] // apply activation l32i a13,a1,112 # [34] activation_max l32i a12,a1,108 # [35] 
activation_min min a13,a13,a9 # [36] l32i a9,a1,92 # [37] output max a13,a13,a12 # [38] add.n a9,a2,a9 # [39] s8i a13,a9,0 # [40] id:1371 l32i a12,a1,116 addi.n a2,a2,1 # [41] blt a2,a12,.process_leftover .exit: retw.n # [0] .size esp_nn_add_elementwise_s8_esp32s3, . - esp_nn_add_elementwise_s8_esp32s3 ================================================ FILE: src/basic_math/esp_nn_mul_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include #include void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size) { for (int i = 0; i < size; i++) { int32_t tmp1 = input1_data[i] + input1_offset; int32_t tmp2 = input2_data[i] + input2_offset; int32_t out = tmp1 * tmp2; out = esp_nn_multiply_by_quantized_mult(out, out_mult, out_shift); out = out + out_offset; out = max(activation_min, min(out, activation_max)); output[i] = (int8_t) out; } } void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1, const int8_t *input2_per_ch, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t output_offset, const int32_t output_mult, const int32_t output_shift, const int32_t activation_min, const int32_t activation_max, const int32_t total_spatial, const int32_t channels) { for (int s = 0; s < total_spatial; s++) { const int8_t *in_row = input1 + s * channels; int8_t *out_row = output + s * channels; for (int c = 0; c < channels; c++) { int32_t val = ((int32_t)in_row[c] + input1_offset) * ((int32_t)input2_per_ch[c] + input2_offset); val = esp_nn_multiply_by_quantized_mult(val, output_mult, output_shift); val += output_offset; val = max(val, activation_min); val = min(val, activation_max); out_row[c] = (int8_t)val; } } } ================================================ FILE: src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S ================================================ // Copyright 2026 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Optimized broadcast MUL for SE-block pattern: [H,W,C] * [1,1,C]
// Processes 8 channels at a time using S3 SIMD.
//
// Pipeline per element (matches the ANSI reference esp_nn_mul_broadcast_channel_s8_ansi):
//   (in1 + input1_offset) * (in2_per_ch + input2_offset)
//   -> << left_shift, * output_mult (rounded high 32 via nudge 1<<30)
//   -> rounding >> right_shift, + output_offset, clamp, narrow to s8.
// The SIMD path is taken only when all three pointers are 16-byte aligned
// and channels >= 8; everything else goes through the scalar leftover loop.

.text
.align 4
.literal_position
.literal .LC_nudge, 1073741824 // 1 << 30

.type esp_nn_mul_broadcast_channel_s8_esp32s3, @function
.align 4
.global esp_nn_mul_broadcast_channel_s8_esp32s3

// void esp_nn_mul_broadcast_channel_s8_esp32s3(
//             const int8_t *input1,           // a2
//             const int8_t *input2_per_ch,    // a3
//             const int32_t input1_offset,    // a4
//             const int32_t input2_offset,    // a5
//             int8_t *output,                 // a6
//             const int32_t output_offset,    // a7
//             const int32_t output_mult,      // stack+120
//             const int32_t output_shift,     // stack+124
//             const int32_t activation_min,   // stack+128
//             const int32_t activation_max,   // stack+132
//             const int32_t total_spatial,    // stack+136
//             const int32_t channels);        // stack+140

// Stack frame layout (entry a1, 120):
//   0:  to_add (for div by power of 2)
//   4:  input2_per_ch (saved)
//   8:  output base (saved)
//   12: channels
//   16: input1 base (saved)
//   20: right_shift
//   24: input1_offset (saved)
//   28: input2_offset (saved)
//   32: spatial counter
//   36: out_ptr (current)
//   40: out_offset (from a7)
//   44: input1_offset (for vldbc)
//   48: input2_offset (for vldbc)

esp_nn_mul_broadcast_channel_s8_esp32s3:
    entry a1, 120

    // Save args
    s32i.n a3, a1, 4        // input2_per_ch base
    s32i.n a6, a1, 8        // output base
    s32i.n a2, a1, 16       // input1 base
    s32i.n a4, a1, 24       // input1_offset
    s32i.n a5, a1, 28       // input2_offset
    s32i a7, a1, 40         // out_offset
    l32i a8, a1, 136        // total_spatial
    l32i a9, a1, 140        // channels
    s32i.n a9, a1, 12       // save channels
    blti a8, 1, .Lexit      // no spatial positions
    blti a9, 1, .Lexit      // no channels

    // Prepare shift values: split output_shift into left/right parts
    l32i a15, a1, 124       // output_shift
    movi.n a11, 0
    max a14, a15, a11       // left_shift = max(shift, 0)
    sub a4, a14, a15        // right_shift = left_shift - shift
    s32i.n a4, a1, 20       // save right_shift
    l32i a13, a1, 120       // output_mult
    l32r a4, .LC_nudge      // nudge = 1 << 30

    // Store offsets for vldbc
    l32i a8, a1, 136        // reload total_spatial
    s32i a5, a1, 48         // input2_offset for vldbc
    l32i.n a5, a1, 24       // input1_offset
    s32i a5, a1, 44         // input1_offset for vldbc

    // Init spatial counter
    movi.n a10, 0
    s32i a10, a1, 32        // spatial counter = 0

    // Pointers: a2 = input1 (current), a3 = input2_per_ch (reloaded each row),
    //           a6 = output (current)
.Lspatial_loop:
    l32i a8, a1, 136        // total_spatial
    l32i a10, a1, 32        // spatial counter
    bge a10, a8, .Lexit

    // Reset input2 pointer for each spatial position
    l32i.n a3, a1, 4        // input2_per_ch base

    // Channel counter
    l32i.n a9, a1, 12       // channels
    movi.n a11, 0           // channel index
    blti a9, 8, .Lchannel_leftover

    // Check alignment for SIMD path
    or a8, a2, a3
    or a8, a8, a6
    extui a8, a8, 0, 4
    bnez a8, .Lchannel_leftover

    // Setup SIMD constants
    ee.zero.q q1            // zero register
    addi a8, a1, 44
    ee.vldbc.16 q0, a8      // input1_offset broadcast
    addi a8, a1, 48
    ee.vldbc.16 q7, a8      // input2_offset broadcast
    st.qr q0, a1, 64        // save for reload in loop

.Lchannel_simd_loop:
    addi a8, a9, -7         // channels - 7
    blt a11, a8, .Lchannel_simd_body
    j .Lchannel_leftover
.Lchannel_simd_body:
    ld.qr q4, a1, 64        // input1_offset
    ee.vld.l.64.ip q2, a2, 8 // load 8 input1 values
    movi.n a7, 16
    ee.vld.h.64.ip q2, a3, 8 // load 8 input2 values (per-ch)
    wsr.sar a7
    ee.vcmp.lt.s8 q5, q2, q1 // sign extend
    ee.vzip.8 q2, q5        // interleave to 16-bit
    ee.vadds.s16 q5, q5, q7 // add input2_offset
    ee.vadds.s16 q4, q2, q4 // add input1_offset
    ee.vmul.s16 q3, q4, q5  // multiply (high part)
    ssai 0                  // sar = 0
    ee.vmul.s16 q2, q4, q5  // multiply (low part)

    // Requantize 8 results (same pattern as elementwise mul):
    // per lane: (val << left_shift) * output_mult, + nudge, keep bits [62:31]
    wsr.sar a14             // left_shift
    ee.vzip.16 q2, q3
    ee.vsl.32 q6, q2        // left shift first 4
    ssai 31
    // Element 2 of q6
    ee.movi.32.a q6, a8, 2
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    // Element 3
    ee.movi.32.a q6, a8, 3
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q2, a5, 2
    ee.movi.32.q q2, a12, 3
    // Element 1
    ee.movi.32.a q6, a8, 1
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    // Element 0
    ee.movi.32.a q6, a8, 0
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q2, a5, 1
    ee.movi.32.q q2, a12, 0

    // Second group of 4 (q3)
    wsr.sar a14             // left_shift
    ee.vsl.32 q4, q3
    ssai 31
    ee.movi.32.a q4, a8, 2
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    ee.movi.32.a q4, a8, 3
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q0, a5, 2
    ee.movi.32.q q0, a12, 3
    ee.movi.32.a q4, a8, 1
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    ee.movi.32.a q4, a8, 0
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q0, a5, 1
    ee.movi.32.q q0, a12, 0

    // Divide by power of 2 (right_shift), rounding: val - (val<0) + (1 << (rs-1))
    l32i.n a5, a1, 20       // right_shift
    movi.n a7, 1
    blti a5, 1, .Lskip_div
    ee.vcmp.lt.s32 q5, q2, q1
    ee.vcmp.lt.s32 q6, q0, q1
    addi.n a8, a5, -1
    ssl a8
    sll a7, a7              // to_add = 1 << (right_shift - 1)
    s32i.n a7, a1, 0
    ee.vldbc.32 q4, a1      // broadcast to_add
    wsr.sar a5
    ee.vadds.s32 q5, q4, q5
    ee.vadds.s32 q5, q2, q5
    ee.vsr.32 q2, q5
    wsr.sar a5
    ee.vadds.s32 q5, q4, q6
    ee.vadds.s32 q5, q0, q5
    ee.vsr.32 q0, q5
.Lskip_div:

    // Add output offset, apply activation
    addi a8, a1, 132
    ee.vldbc.32 q4, a8      // activation_max
    addi a5, a1, 40
    ee.vldbc.32 q6, a5      // output_offset
    addi a7, a1, 128
    ee.vadds.s32 q0, q0, q6 // add offset
    ee.vadds.s32 q2, q2, q6
    ee.vldbc.32 q6, a7      // activation_min
    ee.vmin.s32 q0, q0, q4
    ee.vmin.s32 q2, q2, q4
    ee.vmax.s32 q0, q0, q6
    ee.vmax.s32 q2, q2, q6

    // Pack 32-bit -> 8-bit and store
    ee.vunzip.16 q2, q0
    ee.vunzip.8 q2, q0
    ee.vst.l.64.ip q2, a6, 8
    addi a11, a11, 8        // channel index += 8
    j .Lchannel_simd_loop

.Lchannel_leftover:
    // Process remaining channels one by one
    l32i.n a9, a1, 12       // channels
    bge a11, a9, .Lspatial_next
    ssl a14                 // left_shift
    l32i.n a8, a1, 24       // input1_offset
    l8ui a10, a2, 0         // *input1
    sext a10, a10, 7
    add.n a10, a10, a8      // + input1_offset
    l32i.n a8, a1, 28       // input2_offset
    l8ui a12, a3, 0         // *input2_per_ch
    sext a12, a12, 7
    add.n a12, a12, a8      // + input2_offset
    mull a10, a10, a12      // multiply

    // Requantize
    sll a10, a10            // left shift
    l32i.n a9, a1, 20       // right_shift
    mulsh a8, a10, a13
    mull a12, a10, a13
    ssai 31
    add.n a12, a4, a12
    saltu a10, a12, a4
    add.n a10, a10, a8
    src a10, a10, a12       // result
    blti a9, 1, .Lskip_div_scalar
    addi a8, a9, -1
    ssl a8
    movi a7, 1
    sll a7, a7              // to_add
    extui a8, a10, 31, 1    // sign bit (1 if neg, 0 if pos)
    sub a10, a10, a8        // val -= sign (fast rounding)
    add a10, a10, a7
    ssr a9
    sra a10, a10
.Lskip_div_scalar:
    l32i a8, a1, 40         // output_offset
    l32i a7, a1, 128        // activation_min
    l32i a12, a1, 132       // activation_max
    add.n a10, a10, a8
    min a10, a10, a12
    max a10, a10, a7
    s8i a10, a6, 0          // store
    addi a2, a2, 1          // input1++
    addi a3, a3, 1          // input2++
    addi a6, a6, 1          // output++
    addi a11, a11, 1        // channel index++
    j .Lchannel_leftover

.Lspatial_next:
    l32i a10, a1, 32        // spatial counter
    addi a10, a10, 1
    s32i a10, a1, 32
    j .Lspatial_loop

.Lexit:
    retw.n
    .size esp_nn_mul_broadcast_channel_s8_esp32s3, .
- esp_nn_mul_broadcast_channel_s8_esp32s3 ================================================ FILE: src/basic_math/esp_nn_mul_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /** * Elementwise multiply for s8 optimized for ESP32-P4. * Uses inlined fast requantization with 4x unrolled loop. * Interleaves independent computations to hide latency. */ void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size) { const int32_t left_shift = out_shift > 0 ? out_shift : 0; const int32_t right_shift = left_shift - out_shift; const int64_t nudge = (int64_t)1 << 30; int i = 0; for (; i <= size - 4; i += 4) { int32_t prod0 = (input1_data[i+0] + input1_offset) * (input2_data[i+0] + input2_offset); int32_t prod1 = (input1_data[i+1] + input1_offset) * (input2_data[i+1] + input2_offset); int32_t prod2 = (input1_data[i+2] + input1_offset) * (input2_data[i+2] + input2_offset); int32_t prod3 = (input1_data[i+3] + input1_offset) * (input2_data[i+3] + input2_offset); int32_t s0 = prod0 << left_shift; int32_t s1 = prod1 << left_shift; int32_t s2 = prod2 << left_shift; int32_t s3 = prod3 << left_shift; int32_t r0 = (int32_t)(((int64_t)s0 * out_mult + nudge) >> 31); int32_t r1 = (int32_t)(((int64_t)s1 * out_mult + nudge) >> 31); int32_t r2 = (int32_t)(((int64_t)s2 * out_mult + nudge) >> 31); int32_t r3 = (int32_t)(((int64_t)s3 * out_mult + nudge) >> 31); if (right_shift > 0) { int32_t rnd = (1 << (right_shift - 1)); r0 = (r0 + rnd - (r0 < 0)) >> right_shift; r1 = (r1 + rnd - (r1 < 0)) >> right_shift; r2 = (r2 + rnd - (r2 < 0)) >> right_shift; r3 = (r3 + rnd - (r3 < 0)) >> right_shift; } r0 = 
max(activation_min, min(r0 + out_offset, activation_max)); r1 = max(activation_min, min(r1 + out_offset, activation_max)); r2 = max(activation_min, min(r2 + out_offset, activation_max)); r3 = max(activation_min, min(r3 + out_offset, activation_max)); output[i+0] = (int8_t) r0; output[i+1] = (int8_t) r1; output[i+2] = (int8_t) r2; output[i+3] = (int8_t) r3; } for (; i < size; i++) { int32_t prod = (input1_data[i] + input1_offset) * (input2_data[i] + input2_offset); int32_t out = esp_nn_requantize(prod, out_mult, out_shift); out = max(activation_min, min(out + out_offset, activation_max)); output[i] = (int8_t) out; } } ================================================ FILE: src/basic_math/esp_nn_mul_s8_esp32s3.S ================================================ // Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
.text
.align 4
.literal_position
.literal .LC0_26_123, 1073741824 // `1 << 30`

# Program Unit: esp_nn_mul_elementwise_s8_esp32s3
.type esp_nn_mul_elementwise_s8_esp32s3, @function
.align 4
.global esp_nn_mul_elementwise_s8_esp32s3

// Elementwise s8 multiply, requantize, offset and clamp — SIMD path does 8
// elements per iteration; unaligned pointers or size < 8 use the scalar loop.

esp_nn_mul_elementwise_s8_esp32s3: # 0x4
    # to_add = 0
    # gra_spill_temp_0 = 4
    # gra_spill_temp_1 = 8
    # gra_spill_temp_2 = 12
    # gra_spill_temp_3 = 16
    # gra_spill_temp_4 = 20
    # gra_spill_temp_5 = 24
    # gra_spill_temp_6 = 28
    # gra_spill_temp_7 = 32
    # gra_spill_temp_8 = 36
    # gra_spill_temp_<> = 40
    # gra_spill_temp_<> = 44
    # gra_spill_temp_<> = 48
    # gra_spill_temp_13 = 64

    // registers:
    // a2: const int8_t *input1_data
    // a3: const int8_t *input2_data
    // a4: const int32_t input1_offset
    // a5: const int32_t input2_offset
    // a6: int8_t *output
    // a7: const int32_t out_offset

    // on stack:
    // 120: const int32_t out_mult
    // 124: const int32_t out_shift
    // 128: const int32_t activation_min
    // 132: const int32_t activation_max
    // 136: const int32_t size

    entry a1,120 #
    s32i.n a4,a1,24 # [0] gra_spill_temp_5, input1_offset
    s32i.n a5,a1,28 # [1] gra_spill_temp_12, input2_offset
    s32i.n a3,a1,4 # [5] gra_spill_temp_0, input2
    mov.n a10,a3 # [6]
    l32i a3,a1,136 # [18] id:361 size+0x0
    mov.n a9,a6 # [2] // out_addr
    blti a3,1,.exit # [0] // exit
    s32i.n a2,a1,16 # [9] gra_spill_temp_3, input1
    s32i a7,a1,40 # [4] id:358 out_offset+0x0
    movi.n a11,0 # [3]
    mov.n a12,a2 # [10]
    s32i a4,a1,44 # [13] id:356 input1_offset+0x0
    s32i a5,a1,48 # [14] id:357 input2_offset+0x0
    movi.n a2,1 # [15]
    l32i a15,a1,124 # [3] id:362 out_shift+0x0
    l32i a13,a1,120 # [4] id:363 out_mult+0x0
    s32i.n a6,a1,8 # [1] gra_spill_temp_1, out_addr
    max a14,a15,a11 # [11] left_shift
    sub a4,a14,a15 # right_shift
    s32i.n a4,a1,20 # [9] gra_spill_temp_4
    blti a3,8,.process_leftover # [20]

    // skip to leftover routine if inputs are unaligned
    or a6,a12,a10
    extui a6,a6,0,4
    bnez a6,.process_leftover

    // `size > 8`, s3 optimisation path...
    ee.zero.q q1 # [0]
    addi a4,a1,44 # [7]
    addi a8,a1,48 # [8]
    ee.vldbc.16 q0,a4 # [17] id:359 input1_offset
    ee.vldbc.16 q7,a8 # [16] id:360 input2_offset
    l32r a4,.LC0_26_123 # [12]
    movi a8, 8
    st.qr q0,a1,64 # [19] gra_spill_temp_13
    s32i.n a8,a1,12 # [6] gra_spill_temp_2

.Lt_0_7682: # 0x60
    s32i a9,a1,36 # [1] gra_spill_temp_8, out_addr
    ld.qr q4,a1,64 # [2] gra_spill_temp_13, input1_offset
    ee.vld.l.64.ip q2,a12,8 # [4] id:367, input1_ptr
    movi.n a7,16 # [3]
    ee.vld.h.64.ip q2,a10,8 # [5] id:368, input2_ptr
    wsr.sar a7 # [6]
    ee.vcmp.lt.s8 q5,q2,q1 # [7]
    ee.vzip.8 q2,q5 # [8]
    ee.vadds.s16 q5,q5,q7 # [9] input2_offset
    ee.vadds.s16 q4,q2,q4 # [10] input1_offset
    ee.vmul.s16 q3,q4,q5 # [11]
    wsr.sar a11 # [12]
    ee.vmul.s16 q2,q4,q5 # [13]
    wsr.sar a14 # [14] left_shift
    ee.vzip.16 q2,q3 # [15]
    ee.vsl.32 q6,q2 # [16] left_shift
    ssai 31 # [17]
    ee.movi.32.a q6,a3,2 # [18]
    ee.movi.32.a q6,a8,3 # [26]
    mulsh a6,a13,a3 # [19]
    mull a3,a13,a3 # [20]
    mulsh a7,a13,a8 # [27]
    add.n a3,a4,a3 # [22]
    saltu a2,a3,a4 # [23]
    add.n a2,a2,a6 # [24]
    src a2,a2,a3 # [25]
    mull a6,a13,a8 # [28]
    add.n a6,a4,a6 # [30]
    saltu a9,a6,a4 # [31]
    add.n a9,a9,a7 # [32]
    src a9,a9,a6 # [33]
    ee.movi.32.q q2,a2,2 # [53]
    ee.movi.32.q q2,a9,3 # [54]
    ee.movi.32.a q6,a6,1 # [34]
    mulsh a7,a13,a6 # [35]
    mull a6,a13,a6 # [36]
    add.n a6,a4,a6 # [38]
    saltu a3,a6,a4 # [39]
    add.n a3,a3,a7 # [16]
    src a3,a3,a6 # [41]
    ee.movi.32.a q6,a2,0 # [42]
    mulsh a8,a13,a2 # [43]
    mull a7,a13,a2 # [4]
    add.n a7,a4,a7 # [46]
    saltu a6,a7,a4 # [47]
    add.n a6,a6,a8 # [24]
    src a6,a6,a7 # [49]
    ee.movi.32.q q2,a3,1 # [28]
    ee.movi.32.q q2,a6,0 # [50]
    wsr.sar a14 # [10]
    ee.vsl.32 q4,q3 # [11]
    ee.movi.32.a q4,a2,2 # [13]
    mulsh a3,a13,a2 # [14]
    mull a2,a13,a2 # [15]
    ssai 31 # [12]
    add.n a2,a4,a2 # [17]
    saltu a5,a2,a4 # [18]
    add.n a5,a5,a3 # [19]
    src a5,a5,a2 # [20]
    ee.movi.32.a q4,a3,3 # [21]
    mulsh a6,a13,a3 # [22]
    mull a3,a13,a3 # [23]
    add.n a3,a4,a3 # [25]
    saltu a8,a3,a4 # [26]
    add.n a8,a8,a6 # [27]
    src a8,a8,a3 # [28]
    ee.movi.32.q q0,a5,2 # [24]
    ee.movi.32.q q0,a8,3 # [51]
    ee.movi.32.a q4,a7,1 # [29]
    mulsh a6,a13,a7 # [30]
    mull a3,a13,a7 # [31]
    add.n a3,a4,a3 # [33]
    saltu a2,a3,a4 # [34]
    add.n a2,a2,a6 # [35]
    src a2,a2,a3 # [36]
    ee.movi.32.a q4,a6,0 # [37]
    mulsh a7,a13,a6 # [38]
    mull a6,a13,a6 # [39]
    add.n a6,a4,a6 # [41]
    saltu a3,a6,a4 # [42]
    add.n a3,a3,a7 # [43]
    src a3,a3,a6 # [4]
    ee.movi.32.q q0,a2,1 # [47]
    ee.movi.32.q q0,a3,0 # [46]
    l32i.n a5,a1,20 # [0] gra_spill_temp_4, right_shift
    movi.n a7,1 # [51]
    blti a5,1,.skip_div_by_pow_of_2

    // divide by power of 2
    ee.vcmp.lt.s32 q5,q2,q1 # [56]
    ee.vcmp.lt.s32 q6,q0,q1 # [28]
    addi.n a8,a5,-1 # [1]
    ssl a8 # [2]
    sll a7,a7 # [3]
    s32i.n a7,a1,0 # [4] to_add
    ee.vldbc.32 q4,a1 # [5] id:376 to_add
    wsr.sar a5 # [6]
    ee.vadds.s32 q5,q4,q5 # [7]
    ee.vadds.s32 q5,q2,q5 # [8]
    ee.vsr.32 q2,q5 # [9]
    wsr.sar a5 # [5]
    ee.vadds.s32 q5,q4,q6 # [9]
    ee.vadds.s32 q5,q0,q5 # [11]
    ee.vsr.32 q0,q5 # [12]

.skip_div_by_pow_of_2:
    // add offset, apply activation
    addi a8,a1,132 # [54]
    ee.vldbc.32 q4,a8 # [55] id:385 activation_max
    addi a5,a1,40 # [8]
    ee.vldbc.32 q6,a5 # [10] id:384 out_offset
    addi a7,a1,128 # [4]
    ee.vadds.s32 q0,q0,q6 # [13] // add out_offset
    ee.vadds.s32 q2,q2,q6 # [14] // add out_offset
    ee.vldbc.32 q6,a7 # [16] id:386 activation_min
    ee.vmin.s32 q0,q0,q4 # [17]
    ee.vmin.s32 q2,q2,q4 # [15]
    ee.vmax.s32 q0,q0,q6 # [18]
    ee.vmax.s32 q2,q2,q6 # [19]

    // pack and store
    ee.vunzip.16 q2,q0 # [20]
    ee.vunzip.8 q2,q0 # [21]
    l32i.n a7,a1,12 // count
    l32i a9,a1,36 # [55] gra_spill_temp_8
    l32i.n a3,a1,136 # [1] , size
    ee.vst.l.64.ip q2,a9,8 # [22] id:387
    addi a7,a7,8
    s32i.n a7,a1,12 // increment count
    bge a3,a7,.Lt_0_7682
    addi a11,a7,-8
    bge a11,a3,.exit # [3] // exit

.process_leftover:
    sub a8,a3,a11 # [1]
    loopgtz a8,.LBB33_esp_nn_mul_elementwise_s8_esp32s3 # [9]
    ssl a14 # [0] left_shift
    l32i.n a8,a1,24 # [1] gra_spill_temp_5, input1_offset
    l32i.n a10,a1,4 # [2] gra_spill_temp_0, input2
    l32i.n a12,a1,16 # [3] gra_spill_temp_3, input1
    add.n a10,a11,a10 # [4], input2
    add.n a12,a11,a12 # [5], input1
    l8ui a12,a12,0 # [6] id:390
    l8ui a10,a10,0 # [7] id:391
    sext a12,a12,7 # [8]
    add.n a12,a12,a8 # [9]
    l32i.n a8,a1,28 # [10] gra_spill_temp_12, input2_offset
    sext a10,a10,7 # [11]
    add.n a10,a10,a8 # [12]
    mull a10,a12,a10 # [13] // multiplication result

    // multiply by quantised mult
    l32i.n a9,a1,20 # [0] gra_spill_temp_4, load right_shift
    sll a10,a10 # [15] // left shift
    mulsh a3,a10,a13 # [1]
    mull a8,a10,a13 # [6]
    ssai 31 # [0]
    add.n a6,a8,a4 # [8]
    saltu a8,a6,a8 # [9]
    add.n a8,a8,a3 # [10]
    src a3,a8,a6 # [19] // result
    blti a9, 1, .skip_div_by_pow_of_2_remains

    // divide by power of 2
    // calculate to_add = `1 << (exponent - 1)`
    addi a6,a9,-1
    ssl a6 # [23]
    movi a7,1
    sll a7,a7 // to_add
    extui a8,a3,31,1 # [24], sign
    // FIX: rounding is `val - (val < 0) + to_add`, matching the C reference
    // esp_nn_div_by_power_of_two_fast and the SIMD path above; the sign bit
    // must be subtracted here, not added.
    sub a3,a3,a8 // subtract sign
    add a3,a3,a7 // add to_add
    ssr a9 # [20] load right_shift
    sra a3,a3 // right shift

.skip_div_by_pow_of_2_remains:
    l32i.n a6,a1,40 # [32], out_offset
    l32i.n a8,a1,132 # [35], act_max
    l32i.n a7,a1,128 # [36], act_min
    // add offset and apply activation
    add.n a3,a3,a6 # [34], offset added
    min a8,a8,a3 # [37]
    l32i.n a3,a1,8 # [38] gra_spill_temp_1, load base out_addr
    max a8,a8,a7 # [39]
    // store
    add.n a3,a11,a3 # [16], add index from `a11`
    s8i a8,a3,0 # [41] id:392 // store
    addi.n a11,a11,1 # [42] // inc index

.LBB33_esp_nn_mul_elementwise_s8_esp32s3: # 0x2ed
.exit:
    retw.n # [0]
    .size esp_nn_mul_elementwise_s8_esp32s3, . - esp_nn_mul_elementwise_s8_esp32s3

================================================
FILE: src/common/common_functions.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#pragma once

#include #include #include /* NOTE(review): include targets lost in extraction — restore before building */

/**
 * c99 standard still doesn't strictly inline functions
 * We need to use attribute as well to do this.
 */
#define __NN_FORCE_INLINE__ __attribute((always_inline)) static inline

/* min/max macros */
#ifndef max
#define max(a, b) ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a > _b ?
_a : _b; \ })
#define min(a, b) ({ \
__typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b; \
})
#endif

/* Count leading zeros of a 32-bit value.
 * NOTE(review): the __builtin_clz fallback is undefined for in == 0 (GCC docs);
 * the software fallback returns 32 for 0 — confirm callers never pass 0. */
__NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    __asm__ volatile("nsau %0, %0" : "+r" (in));
    return in;
#elif defined(__GNUC__)
    return __builtin_clz(in);
#else
    /* portable binary-search fallback */
    int32_t count = 32;
    uint32_t x = in, y = in >> 16;
    if (y != 0) {
        count -= 16;
        x = y;
    }
    y = x >> 8;
    if (y != 0) {
        count -= 8;
        x = y;
    }
    y = x >> 4;
    if (y != 0) {
        count -= 4;
        x = y;
    }
    y = x >> 2;
    if (y != 0) {
        count -= 2;
        x = y;
    }
    y = x >> 1;
    if (y != 0) {
        return count - 2;
    }
    return count - x;
#endif
}

/**
 * Signed saturate a 32 bit value to 8 bits keeping output in 32 bit variable.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    __asm__ volatile("clamps %0, %0, 7" : "+a"(in));
    return in;
#else
    return max(INT8_MIN, min(in, INT8_MAX));
#endif
}

/* Extract bits [62:31] of a 64-bit value; for negative inputs 2^31 - 1 is
 * added first so the arithmetic shift rounds toward zero. */
__NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64)
{
    int32_t sign = (int32_t) (val64 >> 63);
    int32_t to_add = sign & ((1ul << 31) - 1);
    return (int32_t) ((int64_t) (val64 + to_add) >> 31);
}

/* Saturating rounding doubling high multiply (gemmlowp/TFLite-style):
 * returns high 32 bits of 2*in0*in1 with rounding; the single overflow case
 * INT32_MIN * INT32_MIN saturates to INT32_MAX. */
__NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1)
{
    int32_t result;
    int64_t in0_64 = (int64_t) in0;
    bool overflow = (in0 == in1) && (in0 == (int32_t) INT32_MIN);

    /* Nudge value */
    int64_t nudge_val = 1 << 30;
    if ((in0 < 0) ^ (in1 < 0)) {
        nudge_val = 1 - nudge_val;
    }

    /* Multiply and add nudge */
    int64_t mult = in0_64 * in1 + nudge_val;

    /* Round and pickup 32 bits */
    result = esp_nn_pick_sat_high32_of64(mult);

    return overflow ? INT32_MAX : result;
}

/**
 * fast version
 * this will fail for values closer to INT32_MAX and INT32_MIN by `1 << (exponent - 1)`.
 * We can afford to do this because we are at the very last stage of filter.
 * Also it is pretty rare condition as our output is going to be 8 bit.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent)
{
    int32_t to_add = (1 << (exponent - 1)) - (val < 0);
    return (int32_t) ((val + to_add) >> exponent);
}

/* Rounding divide by 2^exponent; the sign-adjusted threshold reproduces the
 * bit-exact TFLite reference rounding. */
__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent)
{
    int32_t result;
    const int32_t mask = (1 << exponent) - 1;
    const int32_t remainder = val & mask;

    result = val >> exponent;
    int32_t threshold = (mask >> 1) + (result < 0);

    if (remainder > threshold) {
        result += 1;
    }
    return result;
}

/* Bit-exact requantize: pre-shift left for positive `shift`, then doubling
 * high-mul by `mult`, then rounding right-shift for negative `shift`. */
__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift)
{
    int32_t left_shift = shift > 0 ? shift : 0;
    int32_t right_shift = shift > 0 ? 0 : -shift;
    int32_t result = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult);
    return esp_nn_div_by_power_of_two(result, right_shift);
}

#if CONFIG_IDF_TARGET_ESP32P4
/** PIE enable macro - call once before using any esp.* instructions */
#define ESP_NN_PIE_ENABLE() do { \
    asm volatile ( \
        "csrsi 0x7f2, 0b01 \n\t" \
        "li x29, 0b10 \n\t" \
        "esp.movx.w.cfg x29 \n\t" \
        ::: "x29" \
    ); \
} while(0)

/** Extract 16 int32 per-lane results from QACC into array */
#define ESP_NN_QACC_EXTRACT_S32(dst) do { \
    asm volatile ( \
        "mv x30, %0 \n\t" \
        "esp.st.qacc.l.l.128.ip x30, 16 \n\t" \
        "esp.st.qacc.l.h.128.ip x30, 16 \n\t" \
        "esp.st.qacc.h.l.128.ip x30, 16 \n\t" \
        "esp.st.qacc.h.h.128.ip x30, 0 \n\t" \
        :: "r"(dst) \
        : "x30", "memory" \
    ); \
} while(0)
#endif /* CONFIG_IDF_TARGET_ESP32P4 - PIE_ENABLE and QACC_EXTRACT */

/**
 * 2-wide interleaved requant macro for ESP32-P4 RISC-V.
 * Interleaves mulh across two independent elements for pipeline fill.
 * Outputs r0, r1 as requantized int32 values (before offset/clamp).
 */
#if CONFIG_IDF_TARGET_ESP32P4
#define ESP_NN_REQUANT_2X(x0, x1, m0, m1, s0, s1, r0, r1) do { \
    int32_t _ls0 = (s0) > 0 ? (s0) : 0; \
    int32_t _ls1 = (s1) > 0 ? (s1) : 0; \
    int32_t _v0 = (x0) << _ls0; \
    int32_t _v1 = (x1) << _ls1; \
    int32_t _rs0 = _ls0 - (s0); \
    int32_t _rs1 = _ls1 - (s1); \
    int32_t _hi0, _lo0, _hi1, _lo1; \
    asm volatile ( \
        "mulh %[h0], %[v0], %[mm0] \n\t" \
        "mulh %[h1], %[v1], %[mm1] \n\t" \
        "mul %[l0], %[v0], %[mm0] \n\t" \
        "mul %[l1], %[v1], %[mm1] \n\t" \
        : [h0] "=&r"(_hi0), [h1] "=&r"(_hi1), \
          [l0] "=&r"(_lo0), [l1] "=&r"(_lo1) \
        : [v0] "r"(_v0), [v1] "r"(_v1), \
          [mm0] "r"((int32_t)(m0)), [mm1] "r"((int32_t)(m1)) \
    ); \
    /* Add nudge (1<<30) and extract bits [31:62] */ \
    uint32_t _n = 0x40000000u; \
    uint32_t _a0 = (uint32_t)_lo0 + _n; \
    _hi0 += (_a0 < (uint32_t)_lo0); \
    (r0) = (_hi0 << 1) | (_a0 >> 31); \
    uint32_t _a1 = (uint32_t)_lo1 + _n; \
    _hi1 += (_a1 < (uint32_t)_lo1); \
    (r1) = (_hi1 << 1) | (_a1 >> 31); \
    /* Right shift with rounding */ \
    if (_rs0) { (r0) = ((r0) + (1 << (_rs0 - 1)) - ((r0) < 0)) >> _rs0; } \
    if (_rs1) { (r1) = ((r1) + (1 << (_rs1 - 1)) - ((r1) < 0)) >> _rs1; } \
} while(0)
#endif

/* Fast requantize: always nudges by +1<<30 (not sign-dependent) and skips the
 * INT32_MIN saturation check, so it is not bit-exact versus the function above. */
__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift)
{
    int32_t left_shift = max(shift, 0);
    int32_t right_shift = left_shift - shift;
    int64_t nudge_val = 1 << 30;
    int64_t in0_64 = (int64_t) (x << left_shift);

    /* Multiply and add nudge */
    int64_t mult_64 = in0_64 * mult + nudge_val;
    int32_t result = (int32_t) (mult_64 >> 31);
    if (right_shift) {
        result = esp_nn_div_by_power_of_two_fast(result, right_shift);
    }
    return result;
}

/*
 * Unified requantize wrapper. Defining either SKIP_NUDGE (legacy) or
 * CONFIG_NN_SKIP_NUDGE (Kconfig-driven) selects the faster, non-bit-exact
 * path; otherwise the bit-exact TFLite-reference path is used.
 */
*/ #if defined(SKIP_NUDGE) || defined(CONFIG_NN_SKIP_NUDGE) #define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult_fast((x), (m), (s)) #else #define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult((x), (m), (s)) #endif static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst, const uint16_t input_wd, const uint16_t input_ht, const uint16_t channels, const int32_t pad_val, const uint16_t pad_wd, const uint16_t pad_ht) { /* memset with pad_val */ memset(dst, pad_val, ((input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht)) * channels); dst += (pad_wd + input_wd + pad_wd) * pad_ht * channels; for (int i = 0; i < input_ht; i++) { dst += pad_wd * channels; for (int j = 0; j < input_wd * channels; j++) { *dst++ = *src++; } dst += pad_wd * channels; } } static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst, const uint16_t input_wd, const uint16_t input_ht, const uint16_t channels, const int32_t pad_val, const uint16_t pad_wd, const uint16_t pad_ht) { for (int i = 0; i < input_ht; i++) { for (int j = 0; j < input_wd * channels; j++) { *dst++ = *src++; } if (pad_wd) { memset(dst, pad_val, pad_wd * channels); dst += pad_wd * channels; } } /* pad end `pad_ht` lines at end */ if (pad_ht) { memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels); } } /** * @brief convert 8 bit input data to 16 bit * * @param src int8_t source data * @param dst int16_t dst data * @param size length of data * @param offset offset to be added to src data. 
Range: [-128, 127] */ __NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst, const int size, const int32_t offset) { int i = 0; for (; i < size; i += 2) { dst[i + 0] = src[i + 0] + offset; dst[i + 1] = src[i + 1] + offset; } if(i < size) { dst[i] = src[i] + offset; } } /** * @brief convert 8 bit input data to 16 bit * * @param src int8_t source data * @param dst int16_t dst data * @param size length of data */ __NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size) { int i = 0; for (; i < size; i += 2) { dst[i + 0] = src[i + 0]; dst[i + 1] = src[i + 1]; } if(i < size) { dst[i] = src[i]; } } #if CONFIG_IDF_TARGET_ESP32S3 /** * @brief s8 dot product — both pointers 16-byte aligned. * Uses ACCX accumulator with fused MAC+load. * * @param a input data (16-byte aligned) * @param b filter data (16-byte aligned) * @param len number of elements (must be multiple of 16, >= 16) * @return int32_t dot product result */ extern int32_t esp_nn_dot_s8_aligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len); /** * @brief s8 dot product — input aligned, filter may be unaligned. * Uses USAR+QUP pattern for filter data. * * @param a input data (16-byte aligned) * @param b filter data (may be unaligned) * @param len_div16 number of 16-element chunks (>= 1) * @return int32_t dot product result */ extern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16); #endif ================================================ FILE: src/common/esp_nn_common_functions_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

	.text

# Program Unit: esp_nn_aligned_s8_to_s16_with_offset_esp32s3
#
# args: a2 = src (int8), a3 = dst (int16), a4 = size, a5 = offset
# Widens int8 -> int16 while adding a broadcast `offset`, 32 elements per
# iteration on the EE.* SIMD unit (vcmp.lt.s8 derives the sign bytes,
# vzip.8 interleaves them, vadds.s16 adds the offset). A scalar pair loop
# plus single-element tail handles sizes < 32 and remainders.
	.type	esp_nn_aligned_s8_to_s16_with_offset_esp32s3, @function
	.align	4
	.global	esp_nn_aligned_s8_to_s16_with_offset_esp32s3
esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x30d
	entry	a1,48 #
	mov.n	a10,a2 # // src
	mov.n	a9,a3 # // dst
	mov.n	a8,a4 # // size
	s32i.n	a5,a1,12 # [3] // offset
	addi.n	a2,a1,12 # [4]
	blti	a4,32,.Lt_2_6402 # [5] if (size < 32) goto unopt
	addi.n	a6,a8,-1 # [0]
	ee.zero.q	q5 # [1]
	ee.vldbc.16	q4,a2 # [2] id:136 offset
	mov.n	a3,a10 # [3]
	mov.n	a2,a9 # [4]
	ee.vld.128.ip	q0,a3,16 # [5] id:137
	ee.vld.128.ip	q1,a3,16 # [6] id:138
	ee.vcmp.lt.s8	q2,q0,q5 # [7]
	ee.vzip.8	q0,q2 # [8]
	ee.vadds.s16	q0,q0,q4 # [9]
	ee.vadds.s16.st.incp	q0,a2,q0,q2,q4 # [10] id:139
	blti	a4,64,.Lt_2_7170 # [11]
	addi	a5,a4,-32 # [0]
	srai	a5,a5,5 # [1]
	slli	a4,a5,5 # [2]
	# zero-overhead loop: 32 elements per iteration, software-pipelined
	loopgtz	a5,.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 # [3]
	ee.vst.128.ip	q0,a2,16 # [0*II+0] id:140
	ee.vcmp.lt.s8	q0,q1,q5 # [0*II+1]
	ee.vzip.8	q1,q0 # [0*II+2]
	ee.vadds.s16.ld.incp	q2,a3,q3,q1,q4 # [0*II+3] id:141
	ee.vadds.s16.st.incp	q3,a2,q0,q0,q4 # [0*II+4] id:142
	ee.vcmp.lt.s8	q3,q2,q5 # [0*II+5]
	ee.vst.128.ip	q0,a2,16 # [0*II+6] id:143
	ee.vzip.8	q2,q3 # [0*II+7]
	ee.vadds.s16.ld.incp	q1,a3,q0,q2,q4 # [0*II+8] id:144
	ee.vadds.s16.st.incp	q0,a2,q0,q3,q4 # [0*II+9] id:145
.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x36d
	addi	a4,a4,32 # [0]
.Lt_2_3842: # 0x370
	# epilogue of the software pipeline: flush the last vector results
	ee.vst.128.ip	q0,a2,16 # [0] id:146
	ee.vcmp.lt.s8	q2,q1,q5 # [1]
	ee.vzip.8	q1,q2 # [2]
	ee.vadds.s16	q2,q2,q4 # [3]
	ee.vadds.s16	q3,q1,q4 # [4]
	ee.vst.128.ip	q3,a2,16 # [5] id:147
	ee.vst.128.ip	q2,a2,16 # [6] id:148
	bge	a4,a6,.Lt_2_4866 # [7]
	l32i.n	a5,a1,12 # [0] id:135 offset+0x0
.Lt_2_5122: # 0x38a
	# scalar pair loop for the remaining elements
	mov.n	a11,a4 # [0]
	add.n	a2,a4,a10 # [1]
# 576 dst[i + 0] = src[i + 0] + offset;
	l8ui	a7,a2,0 # [2] id:149
	addx2	a6,a4,a9 # [3]
	sext	a7,a7,7 # [4]
	add.n	a7,a7,a5 # [5]
	s16i	a7,a6,0 # [6] id:150
# 577 dst[i + 1] = src[i + 1] + offset;
	l8ui	a3,a2,1 # [7] id:151
	sub	a7,a8,a4 # [8]
	addi.n	a2,a2,2 # [9]
	srai	a7,a7,1 # [10]
	sext	a3,a3,7 # [11]
	add.n	a3,a3,a5 # [12]
	s16i	a3,a6,2 # [13] id:152
	addi.n	a3,a7,-1 # [14]
	loopgtz	a3,.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 # [15]
	l8ui	a3,a2,0 # [0*II+0] id:149
	addi.n	a6,a6,4 # [1*II+1]
	sext	a3,a3,7 # [0*II+2]
	add.n	a3,a3,a5 # [0*II+3]
	s16i	a3,a6,0 # [0*II+4] id:150
	l8ui	a3,a2,1 # [0*II+5] id:151
	addi.n	a2,a2,2 # [0*II+6]
	sext	a3,a3,7 # [0*II+7]
	add.n	a3,a3,a5 # [0*II+8]
	s16i	a3,a6,2 # [0*II+9] id:152
.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x3ce
	addx2	a4,a7,a11 # [0]
.Lt_2_4866: # 0x3d1
	bge	a4,a8,.Lt_2_7682 # [0]
# 580 dst[i] = src[i] + offset;
	addx2	a11,a4,a9 # [0]
	add.n	a8,a4,a10 # [1]
	l8ui	a8,a8,0 # [2] id:153
	l32i.n	a12,a1,12 # [3] id:135 offset+0x0
	sext	a8,a8,7 # [4]
	add.n	a8,a8,a12 # [5]
	s16i	a8,a11,0 # [6] id:154
	retw.n # [7]
.Lt_2_6402: # 0x3e8
	blti	a4,2,.Lt_2_6658 # [0]
	movi.n	a4,0 # [0]
	j	.Lt_2_5122 # [1]
.Lt_2_7682: # 0x3f0
	retw.n # [0]
.Lt_2_6658: # 0x3f2
	blti	a4,1,.Lt_2_7682 # [0]
	l8ui	a11,a10,0 # [0] id:153
	sext	a11,a11,7 # [2]
	add.n	a11,a11,a5 # [3]
	s16i	a11,a3,0 # [4] id:154
	retw.n # [5]
.Lt_2_7170: # 0x402
	movi.n	a4,32 # [0]
	j	.Lt_2_3842 # [1]
	.size	esp_nn_aligned_s8_to_s16_with_offset_esp32s3, . - esp_nn_aligned_s8_to_s16_with_offset_esp32s3

	.literal_position

# Program Unit: esp_nn_s8_to_s16_esp32s3
#
# args: a2 = src (int8), a3 = dst (int16), a4 = size
# Sign-extends int8 -> int16, 16 elements per iteration. Handles an
# unaligned src via SAR_BYTE + ee.src.q.qup realignment; scalar pair loop
# plus single-element tail for sizes < 16 and remainders.
# NOTE(review): dst appears to be assumed 16-byte aligned for the
# ee.vst.128.ip stores — confirm against callers.
	.type	esp_nn_s8_to_s16_esp32s3, @function
	.align	4
	.global	esp_nn_s8_to_s16_esp32s3
esp_nn_s8_to_s16_esp32s3: # 0x40b
	entry	a1,32 #
	mov.n	a9,a2 // src
	mov.n	a8,a3 // dst
	mov.n	a7,a4 // size
	blti	a4,1,.Lt_3_4866 // size == 0
	blti	a4,16,.Lt_3_4610 // if (size < 16) jump to unopt path

	// load align_len to sar_byte
	extui	a2,a2,0,4 # [0]
	wur.sar_byte	a2 # [1]
	mov.n	a2,a9 # [2]

	// preload
	ee.vld.128.ip	q0,a2,16
	ee.vld.128.ip	q1,a2,16
	ee.zero.q	q4
# 672
# 673 for (i = 16; i < size - 15; i += 16) {
	blti	a4,32,.Lt_3_5378 # [5]
	addi	a6,a4,-16 # [1]
	srai	a6,a6,4 # [2]
	slli	a4,a6,4 # [3]
	loopgtz	a6,.LBB35_esp_nn_s8_to_s16_esp32s3 # [4]
	ee.src.q.qup	q2,q0,q1 # [0*II+0]
	ee.vcmp.lt.s8	q3,q2,q4 # [0*II+1] // sign
	ee.vld.128.ip	q1,a2,16 # [0*II+2] // for next iteration
	ee.vzip.8	q2,q3 # [0*II+3]
	ee.vst.128.ip	q2,a3,16 # [0*II+4] id:93
	ee.vst.128.ip	q3,a3,16 # [0*II+5] id:94
.LBB35_esp_nn_s8_to_s16_esp32s3: # 0x449
	addi	a4,a4,16 # [0]
.Lt_3_2050: # 0x44c
	# flush the final preloaded vector
	ee.src.q.qup	q5,q0,q1 # [0]
	ee.vcmp.lt.s8	q3,q5,q4 # [1]
	ee.vzip.8	q5,q3 # [2]
	ee.vst.128.ip	q5,a3,16 # [3] id:96
	ee.vst.128.ip	q3,a3,16 # [4] id:97
# 687
# 688 skip_to_remains_s8_to_s16:
# 689 for (; i < size; i += 2) {
	bge	a4,a7,.Lt_3_4866 # [5]
.Lt_3_3330: # 0x45e
	mov.n	a11,a4 # [0]
	add.n	a2,a4,a9 # [1]
# 690 dst[i + 0] = src[i + 0];
	l8ui	a10,a2,0 # [2] id:98
	addx2	a5,a4,a8 # [3]
	sext	a10,a10,7 # [4]
	s16i	a10,a5,0 # [5] id:99
# 691 dst[i + 1] = src[i + 1];
	l8ui	a3,a2,1 # [6] id:100
	sub	a10,a7,a4 # [7]
	addi.n	a2,a2,2 # [8]
	addi.n	a10,a10,1 # [9]
	srai	a10,a10,1 # [10]
	sext	a3,a3,7 # [11]
	s16i	a3,a5,2 # [12] id:101
	addi.n	a3,a10,-1 # [13]
	loopgtz	a3,.LBB50_esp_nn_s8_to_s16_esp32s3 # [14]
	l8ui	a3,a2,0 # [0*II+0] id:98
	addi.n	a5,a5,4 # [1*II+1]
	sext	a3,a3,7 # [0*II+2]
	s16i	a3,a5,0 # [0*II+3] id:99
	l8ui	a3,a2,1 # [0*II+4] id:100
	addi.n	a2,a2,2 # [0*II+5]
	sext	a3,a3,7 # [0*II+6]
	s16i	a3,a5,2 # [0*II+7] id:101
.LBB50_esp_nn_s8_to_s16_esp32s3: # 0x49c
	addx2	a4,a10,a11 # [0]
# 692 }
# 693 if(i < size) {
	bge	a4,a7,.Lt_3_4866 # [1]
# 694 dst[i] = src[i];
	add.n	a11,a4,a9 # [0]
	l8ui	a11,a11,0 # [1] id:102
	addx2	a12,a4,a8 # [2]
	sext	a11,a11,7 # [3]
	s16i	a11,a12,0 # [4] id:103
	retw.n # [5]
.Lt_3_4610: # 0x4b2
	movi.n	a4,0 # [0]
	j	.Lt_3_3330 # [1]
.Lt_3_4866: # 0x4ba
	retw.n # [0]
.Lt_3_5378: # 0x4bc
	movi.n	a4,16 # [1]
	j	.Lt_3_2050 # [2]
	.size	esp_nn_s8_to_s16_esp32s3, . - esp_nn_s8_to_s16_esp32s3

================================================
FILE: src/common/esp_nn_dot_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

//
// Reusable s8 dot product kernels for ESP32-S3.
// Used by conv im2col, FC, and any kernel that reduces to a dot product.
//
// esp_nn_dot_s8_aligned_esp32s3:
//   Both input and filter 16-byte aligned. Uses ee.vld.128.ip + fused MAC.
//
// esp_nn_dot_s8_unaligned_esp32s3:
//   Input aligned, filter may be unaligned. Uses USAR+QUP for filter.
//

	.text
	.align 4

// ============================================================
// esp_nn_dot_s8_aligned_esp32s3
// Both pointers must be 16-byte aligned.
// a2: input_data (aligned)
// a3: filter_data (aligned)
// a4: len (must be multiple of 16, >= 16)
// Returns: int32_t dot product in a2
// ============================================================
	.type	esp_nn_dot_s8_aligned_esp32s3, @function
	.align	4
	.global	esp_nn_dot_s8_aligned_esp32s3
esp_nn_dot_s8_aligned_esp32s3:
	entry	a1, 32
	ee.zero.accx
	beqz	a4, .Lalign_done

	// Compute loop count and remainder
	srli	a5, a4, 4 // a5 = len / 16
	beqz	a5, .Lalign_done

	// Prime: load first pair
	ee.vld.128.ip	q0, a2, 16
	ee.vld.128.ip	q1, a3, 16
	addi	a5, a5, -1
	beqz	a5, .Lalign_last

	// Main loop: fused MAC + load
	// (len/16 - 1) fused MACs here plus the final MAC below = len/16 total;
	// each iteration loads the next input chunk fused with the MAC.
	loopgtz	a5, .Lalign_loop_end
	ee.vmulas.s8.accx.ld.ip	q0, a2, 16, q0, q1
	ee.vld.128.ip	q1, a3, 16
.Lalign_loop_end:
.Lalign_last:
	// Final MAC
	ee.vmulas.s8.accx	q0, q1
.Lalign_done:
	// Read lower 32 bits of ACCX (sufficient for int8 dot products)
	nop
	nop
	rur.accx_0	a2
	retw.n
	.size	esp_nn_dot_s8_aligned_esp32s3, . - esp_nn_dot_s8_aligned_esp32s3

// ============================================================
// esp_nn_dot_s8_unaligned_esp32s3
// Input must be 16-byte aligned. Filter can be unaligned.
// Uses USAR+QUP pattern for filter loads.
// a2: input_data (aligned)
// a3: filter_data (may be unaligned)
// a4: len_div16 (>= 1)
// Returns: int32_t dot product in a2
//
// NOTE(review): the pipelined ee.ld.128.usar.ip pattern issues filter loads
// one chunk ahead, so it can read up to 16 bytes past the last filter chunk
// — confirm callers provide padded/readable filter buffers.
// ============================================================
	.type	esp_nn_dot_s8_unaligned_esp32s3, @function
	.align	4
	.global	esp_nn_dot_s8_unaligned_esp32s3
esp_nn_dot_s8_unaligned_esp32s3:
	entry	a1, 32
	ee.zero.accx
	beqz	a4, .Lunalign_done

	// Prime: first unaligned filter load (sets SAR_BYTE)
	ee.ld.128.usar.ip	q0, a3, 16

	// Check if we can do 2x unrolled (need >= 2 iterations)
	srai	a5, a4, 1 // a5 = len_div16 / 2
	beqz	a5, .Lunalign_single

	// Load first input + filter pair for unrolled loop
	ee.vld.128.ip	q1, a2, 16
	ee.ld.128.usar.ip	q2, a3, 16

	// 2x unrolled main loop
	loopgtz	a5, .Lunalign_loop2_end
	ee.src.q.qup	q4, q0, q2 // align filter[i]
	ee.vld.128.ip	q3, a2, 16 // input[i+1]
	ee.vmulas.s8.accx	q4, q1 // MAC filter[i] * input[i]
	ee.ld.128.usar.ip	q0, a3, 16 // filter chunk[i+2]
	ee.src.q.qup	q5, q2, q0 // align filter[i+1]
	ee.vld.128.ip	q1, a2, 16 // input[i+2] (primed for next)
	ee.vmulas.s8.accx	q5, q3 // MAC filter[i+1] * input[i+1]
	ee.ld.128.usar.ip	q2, a3, 16 // filter chunk[i+3]
.Lunalign_loop2_end:

	// Check if there's a remaining single iteration (odd len_div16)
	bbci	a4, 0, .Lunalign_done_mac

	// Odd remainder: the 2x loop already loaded q0/q2 for the next chunk.
	// Just qup the filter and MAC with the primed input (q1).
	// But q1 was loaded as input[i+2] in the last loop iteration — we need
	// to re-read the correct input. Actually, q1 is already the right input.
	// q0 and q2 are the filter chunks ready for qup.
	ee.src.q.qup	q4, q0, q2
	ee.vmulas.s8.accx	q4, q1
	j	.Lunalign_done_mac

.Lunalign_single:
	// Called when len_div16 < 2 (single chunk only)
	ee.vld.128.ip	q1, a2, 16
	ee.ld.128.usar.ip	q2, a3, 16
	ee.src.q.qup	q4, q0, q2
	ee.vmulas.s8.accx	q4, q1

.Lunalign_done_mac:
.Lunalign_done:
	// 2-cycle gap before ACCX read
	movi.n	a3, 0
	nop
	ee.srs.accx	a2, a3, 0
	retw.n
	.size	esp_nn_dot_s8_unaligned_esp32s3, .
- esp_nn_dot_s8_unaligned_esp32s3 ================================================ FILE: src/common/esp_nn_mean_ansi.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * Quantized mean reduction over spatial dimensions (axes 1,2). * Specialized for 4D tensors [N, H, W, C] → [N, 1, 1, C]. * This is the common case in Squeeze-and-Excite blocks. */ #include #include void esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift) { const int32_t num_elements = height * width; for (int c = 0; c < channels; c++) { /* Sum over spatial dimensions */ int32_t sum = 0; for (int h = 0; h < height; h++) { for (int w = 0; w < width; w++) { sum += input[(h * width + w) * channels + c]; } } /* Apply zero point correction */ sum -= num_elements * input_zero_point; /* Requantize: multiply_by_quantized_mult(sum, multiplier, shift) */ int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift); result += output_zero_point; result = max(result, -128); result = min(result, 127); output[c] = (int8_t)result; } } ================================================ FILE: src/common/esp_nn_mean_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * ESP32-P4 optimized spatial mean reduction using QACC per-lane accumulation. * Processes 16 channels in parallel via esp.vmulas.s8.qacc (same pattern as avg_pool). 
*/

#include
#include

/* ESP32-P4 NHWC mean: 16-channel blocks are accumulated in the PIE QACC
 * (16 per-lane int32 sums via esp.vmulas.s8.qacc against a broadcast 1),
 * remaining channels fall back to a scalar loop. Requantization is the
 * bit-exact esp_nn_multiply_by_quantized_mult path. */
void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output,
                                 const int32_t height, const int32_t width,
                                 const int32_t channels,
                                 const int32_t input_zero_point,
                                 const int32_t output_zero_point,
                                 const int32_t multiplier,
                                 const int32_t shift)
{
    const int32_t num_elements = height * width;
    const int32_t ch_16 = channels >> 4;
    const int8_t one_val = 1;

    if (ch_16 > 0) {
        /* Enable PIE and broadcast 1 into q7 */
        asm volatile (
            "csrsi 0x7f2, 0b01 \n\t"
            "li x29, 0b10 \n\t"
            "esp.movx.w.cfg x29 \n\t"
            ::: "x29"
        );
        asm volatile (
            "mv x30, %0 \n\t"
            "esp.vldbc.8.ip q7, x30, 0 \n\t"
            :: "r"(&one_val)
            : "x30"
        );
    }

    /* Process all channels - QACC for 16-channel blocks, scalar for remainder */
    int ch = 0;
    for (int ch_blk = 0; ch_blk < ch_16; ch_blk++, ch += 16) {
        /* Single asm block: broadcast ones, zero QACC, accumulate all spatial
         * positions. Keeping in one block prevents compiler from clobbering
         * q7 between the broadcast and the MAC loop. */
        const int8_t *base_ptr = input + ch;
        asm volatile (
            /* Broadcast 1 into q7 */
            "mv x30, %[one] \n\t"
            "esp.vldbc.8.ip q7, x30, 0 \n\t"
            /* Zero QACC */
            "esp.zero.qacc \n\t"
            /* Accumulate loop: stride = channels between spatial positions */
            "mv x30, %[base] \n\t"
            "mv s7, %[cnt] \n\t"
            "1: \n\t"
            "esp.vld.128.ip q0, x30, 0 \n\t"
            "esp.vmulas.s8.qacc q0, q7 \n\t"
            "add x30, x30, %[stride] \n\t"
            "addi s7, s7, -1 \n\t"
            "bnez s7, 1b \n\t"
            :
            : [one] "r"(&one_val), [base] "r"(base_ptr),
              [cnt] "r"(num_elements), [stride] "r"((int32_t)channels)
            : "x30", "s7"   /* s7 is callee-saved; listing it as a clobber
                               makes the compiler save/restore it */
        );
        /* Spill the 16 per-lane QACC sums and requantize each lane. */
        int32_t sums[16] __attribute__((aligned(16)));
        ESP_NN_QACC_EXTRACT_S32(sums);
        int32_t zp_correction = num_elements * input_zero_point;
        for (int k = 0; k < 16; k++) {
            int32_t result = sums[k] - zp_correction;
            result = esp_nn_multiply_by_quantized_mult(result, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[ch + k] = (int8_t)result;
        }
    }

    /* Remaining channels scalar */
    for (; ch < channels; ch++) {
        int32_t sum = 0;
        for (int hw = 0; hw < num_elements; hw++) {
            sum += input[hw * channels + ch];
        }
        sum -= num_elements * input_zero_point;
        int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
        result += output_zero_point;
        result = max(result, -128);
        result = min(result, 127);
        output[ch] = (int8_t)result;
    }
}

================================================
FILE: src/common/esp_nn_mean_s8_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-S3 optimized mean reduction for NHWC int8 tensors.
 * Uses int16 accumulation for small spatial sizes (H*W <= 256),
 * int32 for larger. Accumulates all channels at once per spatial position.
 */

#include
#include
#include

/* NHWC mean with channel-parallel accumulation: all channels are summed in
 * one pass over the spatial positions (cache/vectorizer friendly), then
 * requantized per channel. Accumulator width is chosen by H*W so the int16
 * path cannot overflow. NOTE(review): the acc16/acc arrays are VLAs — up to
 * 1 KB / 2 KB of stack at channels == 512; confirm task stack budgets. */
void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output,
                                 const int32_t height, const int32_t width,
                                 const int32_t channels,
                                 const int32_t input_zero_point,
                                 const int32_t output_zero_point,
                                 const int32_t multiplier,
                                 const int32_t shift)
{
    const int32_t num_elements = height * width;
    const int32_t zp_correction = num_elements * input_zero_point;

    if (num_elements <= 256 && channels <= 512) {
        /* int16 accumulation (safe: 256 * 127 = 32,512 < 32,767) */
        /* Process 8 channels at a time using int16 accumulators */
        int16_t acc16[channels];
        memset(acc16, 0, channels * sizeof(int16_t));

        const int8_t *ptr = input;
        for (int i = 0; i < num_elements; i++) {
            /* Inner loop — compiler should auto-vectorize with -O2 */
            for (int c = 0; c < channels; c++) {
                acc16[c] += (int16_t)ptr[c];
            }
            ptr += channels;
        }

        /* Requantize per channel */
        for (int c = 0; c < channels; c++) {
            int32_t sum = (int32_t)acc16[c] - zp_correction;
            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[c] = (int8_t)result;
        }
    } else if (channels <= 512) {
        /* int32 accumulation for larger spatial sizes */
        int32_t acc[channels];
        memset(acc, 0, channels * sizeof(int32_t));

        const int8_t *ptr = input;
        for (int i = 0; i < num_elements; i++) {
            for (int c = 0; c < channels; c++) {
                acc[c] += ptr[c];
            }
            ptr += channels;
        }

        for (int c = 0; c < channels; c++) {
            int32_t sum = acc[c] - zp_correction;
            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[c] = (int8_t)result;
        }
    } else {
        /* Per-channel fallback for huge channel counts */
        for (int c = 0; c < channels; c++) {
            int32_t sum = 0;
            for (int i = 0; i < num_elements; i++) {
                sum += input[i * channels + c];
            }
            sum -= zp_correction;
            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[c] = (int8_t)result;
        }
    }
}

================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * Fast 2-wide requantization for ESP32-P4 (RISC-V).
 * Interleaves mul/mulh across 2 elements for better pipeline utilization.
 * Uses a0-a7 and t0-t6 only (no callee-saved registers needed).
 *
 * void esp_nn_requant_2x_esp32p4(
 *     int32_t x0,      // a0
 *     int32_t x1,      // a1
 *     int32_t mult0,   // a2
 *     int32_t mult1,   // a3
 *     int32_t shift0,  // a4
 *     int32_t shift1,  // a5
 *     int32_t *out     // a6: pointer to store 2 results
 * );
 */

	.text
	.align	4
	.global	esp_nn_requant_2x_esp32p4
	.type	esp_nn_requant_2x_esp32p4, @function
esp_nn_requant_2x_esp32p4:
	/* Compute left_shift and apply */
	/* ls = max(shift, 0); value is pre-shifted left, rs = ls - shift */
	mv	t0, a0 /* x0 */
	mv	t1, a1 /* x1 */
	bgez	a4, .Lls0_pos
	mv	t6, zero /* ls0 = 0 */
	j	.Lls0_done
.Lls0_pos:
	sll	t0, t0, a4 /* x0 <<= shift0 (positive = left shift) */
	mv	t6, a4 /* ls0 = shift0 */
.Lls0_done:
	sub	a4, t6, a4 /* rs0 = ls0 - shift0 */
	bgez	a5, .Lls1_pos
	mv	t6, zero
	j	.Lls1_done
.Lls1_pos:
	sll	t1, t1, a5
	mv	t6, a5
.Lls1_done:
	sub	a5, t6, a5 /* rs1 = ls1 - shift1 */

	/* ---- Interleaved 64-bit multiply ---- */
	/* mulh first (both elements), then mul (both elements) */
	mulh	t2, t0, a2 /* hi0 */
	mulh	t3, t1, a3 /* hi1 */
	mul	t0, t0, a2 /* lo0 */
	mul	t1, t1, a3 /* lo1 */

	/* Add nudge and combine: result = ((hi:lo) + (1<<30)) >> 31 */
	li	t4, 0x40000000 /* nudge = 1 << 30 */
	add	t5, t0, t4 /* lo0 + nudge */
	sltu	t6, t5, t0 /* carry0 */
	add	t2, t2, t6 /* hi0 += carry0 */
	srli	t5, t5, 31 /* (lo0+nudge) >> 31 */
	slli	t0, t2, 1 /* hi0 << 1 */
	or	t0, t0, t5 /* result0 */
	add	t5, t1, t4 /* lo1 + nudge */
	sltu	t6, t5, t1 /* carry1 */
	add	t3, t3, t6 /* hi1 += carry1 */
	srli	t5, t5, 31
	slli	t1, t3, 1
	or	t1, t1, t5 /* result1 */

	/* ---- Right shift with rounding ---- */
	/* matches esp_nn_div_by_power_of_two_fast: add (1<<(rs-1)) - (v<0) */
	li	t4, 1
	beqz	a4, .Lskip_rs0
	addi	t5, a4, -1
	sll	t5, t4, t5 /* round0 = 1 << (rs0-1) */
	srai	t6, t0, 31 /* -1 if negative, 0 otherwise */
	add	t5, t5, t6 /* round0 += sign */
	add	t0, t0, t5
	sra	t0, t0, a4
.Lskip_rs0:
	beqz	a5, .Lskip_rs1
	addi	t5, a5, -1
	sll	t5, t4, t5
	srai	t6, t1, 31
	add	t5, t5, t6
	add	t1, t1, t5
	sra	t1, t1, a5
.Lskip_rs1:
	/* Store results */
	sw	t0, 0(a6)
	sw	t1, 4(a6)
	ret
	.size	esp_nn_requant_2x_esp32p4, . - esp_nn_requant_2x_esp32p4

================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// the macro `use_nudge` enables adding rounding factor similar to tflite implementation
// this barely changes any accuracy
// keep this disabled for better performance
#ifndef SKIP_NUDGE
# set SKIP_NUDGE flag for ~20% faster (but not bit-exact) quantisation
	.set	use_nudge, 1
#endif

	.text
	.literal_position
	.literal .nudge_val, 1073741824 # 1 << 30

# Vectorized requantize of the 4 int32 lanes in q0 by a single multiplier
# (a2) and a single signed shift (a3). Left shift (a3 >= 0) is applied via
# ee.vsl.32; the 64-bit multiply + optional nudge + rounding right shift is
# done per lane in scalar registers, results returned in q0.
	.type	esp_nn_multiply_by_quantized_mult_asm_esp32s3, @function
	.align	4
	.global	esp_nn_multiply_by_quantized_mult_asm_esp32s3
esp_nn_multiply_by_quantized_mult_asm_esp32s3: # 0x4
# to_add = 4
	entry	a1,32
	wsr.sar	a3
	ee.zero.q	q2
	bltz	a3, .skip_left_shift
	ee.vsl.32	q0,q0 # [13]
.skip_left_shift:
	ssai	31 # [15]
# move data to general purpose registers
	ee.movi.32.a	q0,a12,0 # [17]
	ee.movi.32.a	q0,a13,1 # [16]
	ee.movi.32.a	q0,a14,2 # [18]
	ee.movi.32.a	q0,a15,3 # [19]
.ifdef use_nudge
	l32r	a6,.nudge_val
.endif
# perform 64 bit mult
	mulsh	a4,a2,a12 # [22]
	mulsh	a11,a2,a13 # [23]
	mulsh	a10,a2,a14 # [21]
	mulsh	a8,a2,a15 # [20]
	mull	a12,a2,a12 # [24]
	mull	a13,a2,a13 # [25]
	mull	a14,a2,a14 # [26]
	mull	a15,a2,a15 # [27]
# add nudge_val and discard low31
.ifdef use_nudge
	add.n	a14,a6,a14 # [41]
	saltu	a2,a14,a6 # [44]
	add.n	a10,a10,a2 # [45]
	add.n	a13,a6,a13 # [47]
	saltu	a9,a13,a6 # [50]
	add.n	a11,a11,a9 # [51]
.endif
	src	a10,a10,a14 # [88]
	src	a11,a11,a13 # [78]
	ee.movi.32.q	q0,a10,2
	ee.movi.32.q	q0,a11,1
.ifdef use_nudge
	add.n	a15,a6,a15 # [36]
	saltu	a2,a15,a6 # [39]
	add.n	a8,a8,a2 # [40]
	add.n	a12,a6,a12 # [54]
	saltu	a10,a12,a6 # [57]
	add.n	a4,a4,a10 # [58]
.endif
	src	a8,a8,a15 # [95]
	src	a4,a4,a12 # [69]
# discard lower 31 bits
	ee.movi.32.q	q0,a8,3
	ee.movi.32.q	q0,a4,0
	bgez	a3, .skip_div_by_power_of_2
# negative shift: vector rounding divide-by-power-of-2 on all 4 lanes
	neg	a5,a3 # [0] right_shift/exponent = -shift
	ee.vcmp.lt.s32	q2,q0,q2 # [97]
	addi.n	a7,a5,-1 # [0] exponent - 1
	ssl	a7 # [1]
	movi.n	a6,1 # [92]
	sll	a6,a6 # [2]
	s32i.n	a6,a1,4 # [3] to_add
	addi.n	a4,a1,4 # [94] to_add_addr
	ee.vldbc.32	q1,a4 # [4] id:148 to_add
	wsr.sar	a5
	ee.vadds.s32	q1,q1,q2
	ee.vadds.s32	q0,q0,q1
	ee.vsr.32	q0,q0
.skip_div_by_power_of_2:
	retw.n # [9]
	.size	esp_nn_multiply_by_quantized_mult_asm_esp32s3, . - esp_nn_multiply_by_quantized_mult_asm_esp32s3

================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// quantisation version where we deal with different shifts and mults.
	.set	use_nudge, 1
	.text
	.literal_position
	.literal .LC3_19_48, 1073741824

# Program Unit: esp_nn_multiply_by_quantized_mult_ver1_esp32s3
#
# Per-channel variant: requantizes the 4 int32 lanes of q0 using 4 distinct
# multipliers (array at a2) and 4 distinct shifts (array at a3). Each lane:
# left shift by max(shift, 0), 64-bit multiply with +2^30 nudge, take bits
# [31:62], then a vectorized rounding right-shift step. Result in q0.
	.type	esp_nn_multiply_by_quantized_mult_ver1_esp32s3, @function
	.align	4
	.global	esp_nn_multiply_by_quantized_mult_ver1_esp32s3
esp_nn_multiply_by_quantized_mult_ver1_esp32s3: # 0x1ee
	entry	a1,32 #
	ee.zero.q	q3 # [0]
	l32i.n	a8,a3,0 # [5] id:200 // shift0
	l32i.n	a7,a3,4 # [2] id:201 // shift1
	l32i.n	a12,a2,0 # [3] id:204 // mult0
	l32i.n	a15,a2,4 # [1] id:205 // mult1
	movi.n	a10,0 # [7]
	max	a6,a10,a8 # [1] // left_shift0
	max	a5,a10,a7 # [7] // left_shift1
	sub	a8,a6,a8 # [2] // right_shift0
	sub	a7,a5,a7 # [8] // right_shift1
	ee.movi.32.a	q0,a9,0 # [4]
	ee.movi.32.a	q0,a11,1 # [11]
	ssl	a6 # [3]
	sll	a9,a9 # [4]
	mulsh	a4,a12,a9 # [6]
	mull	a12,a12,a9 # [9]
	ssl	a5 # [10]
	sll	a11,a11 # [12]
	mulsh	a14,a15,a11 # [14]
	mull	a15,a15,a11 # [16]
	l32r	a13,.LC3_19_48 # [23]
	ee.movi.32.q	q0,a9,0 # [5]
	ee.movi.32.q	q0,a11,1 # [15]
	l32i.n	a6,a3,8 # [6] id:202 // shift2
	l32i.n	a9,a2,8 # [19] id:206 // mult2
	max	a5,a10,a6 # [0] // left_shift2
	sub	a6,a5,a6 # [24] // right_shift2
	ee.movi.32.a	q0,a11,2 # [17]
	ssl	a5 # [13]
	sll	a11,a11 # [18]
	ee.movi.32.q	q0,a11,2 # [20]
	mulsh	a5,a9,a11 # [21]
	mull	a9,a9,a11 # [22]
	mov	a11, a5
// add nudge to result0 & result1
	add.n	a12,a13,a12 # [25]
	saltu	a5,a12,a13 # [26]
	add.n	a15,a13,a15 # [27]
	add.n	a5,a5,a4 # [28]
	saltu	a4,a15,a13 # [29]
	add.n	a4,a4,a14 # [30]
	l32i.n	a14,a3,12 # [31] id:203 // shift3
	add.n	a9,a13,a9 # [32] // add nudge low2
	max	a10,a10,a14 # [33] // left_shift3
	sub	a14,a10,a14 # [34] // right_shift3
	ssl	a10 # [35]
	ee.movi.32.a	q0,a10,3 # [36]
	sll	a10,a10 # [37]
// select high32 from result0 and resul1
	ssai	31 # [39]
	src	a5,a5,a12 # [40]
	src	a4,a4,a15 # [41]
	movi.n	a12,1 # [42]
	ee.movi.32.q	q0,a5,0 # [43]
	saltu	a15,a9,a13 # [44]
	add.n	a15,a15,a11 # [45]
	ee.movi.32.q	q0,a4,1 # [46]
	l32i.n	a11,a2,12 # [47] id:207 // mult3
	src	a15,a15,a9 # [48]
	ee.movi.32.q	q0,a15,2 # [49]
	mull	a9,a11,a10 # [50]
	mulsh	a11,a11,a10 # [51]
	add.n	a9,a13,a9 # [52]
	saltu	a13,a9,a13 # [53]
	add.n	a13,a13,a11 # [54]
	src	a13,a13,a9 # [55]
	ee.movi.32.q	q0,a13,3 # [57]
// divide_by_power_of2_step
# builds per-lane masks (1 << rs) - 1 in q2, applies per-lane arithmetic
# right shifts in scalar regs, then the vector compare/subtract sequence
# performs the rounding correction on all 4 lanes at once.
	ssl	a8 # [56]
	sll	a9,a12 # [58]
	ssl	a7 # [59]
	addi.n	a9,a9,-1 # [60]
	ee.movi.32.q	q2,a9,0 # [61]
	sll	a11,a12 # [62]
	addi.n	a11,a11,-1 # [63]
	ssl	a6 # [64]
	sll	a10,a12 # [65]
	ee.movi.32.q	q2,a11,1 # [66]
	ssl	a14 # [67]
	addi.n	a10,a10,-1 # [68]
	ee.movi.32.q	q2,a10,2 # [69]
	sll	a9,a12 # [70]
	addi.n	a9,a9,-1 # [71]
	ee.movi.32.q	q2,a9,3 # [74]
	ee.andq	q1,q0,q2 # [75]
	ssr	a8 # [72]
	sra	a5,a5 # [73]
	ssr	a7 # [76]
	sra	a4,a4 # [78]
	ssr	a6 # [79]
	sra	a15,a15 # [81]
	ssr	a14 # [82]
	sra	a13,a13 # [84]
	wsr.sar	a12 # [85]
	ee.movi.32.q	q7,a5,0 # [77]
	ee.movi.32.q	q7,a4,1 # [80]
	ee.movi.32.q	q7,a15,2 # [83]
	ee.movi.32.q	q7,a13,3 # [86]
	ee.vcmp.lt.s32	q3,q7,q3 # [87]
	ee.vsr.32	q2,q2 # [88]
	ee.vsubs.s32	q2,q2,q3 # [89]
	ee.vcmp.gt.s32	q1,q1,q2 # [90]
	ee.vsubs.s32	q0,q7,q1 # [91]
// return
	retw.n # [92]
	.size	esp_nn_multiply_by_quantized_mult_ver1_esp32s3, . - esp_nn_multiply_by_quantized_mult_ver1_esp32s3

================================================
FILE: src/convolution/esp_nn_conv_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include #include int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { return 0; } void esp_nn_set_conv_scratch_buf_ansi(const void *buf) { } /** * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dialation width = 1 */ void esp_nn_conv_u8_ansi(const uint8_t *input_data, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const uint16_t pad_wd, const uint16_t pad_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint8_t *filter_data, const uint16_t filter_wd, const uint16_t filter_ht, const int32_t filter_offset, const int32_t *bias, uint8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max) { for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; for (int out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {//channel_loop int32_t result = 0; /* Select filter so as the point doesn't lie outside block */ int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; for (int in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { int32_t input_index = (idx_y * input_wd + idx_x) * in_channels + in_ch_idx; int32_t filter_index = 
((out_ch_idx * filter_ht + filter_y_idx) * filter_wd + filter_x_idx) * in_channels + in_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index] + filter_offset; result += input_val * filter_val; } } } if (bias) { result += bias[out_ch_idx]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); int out_index = (out_y * out_wd + out_x) * out_channels + out_ch_idx; out_data[out_index] = (uint8_t) result; } } } } /** * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dialation width = 1 */ void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Fall back to in_channels when filter_dims->channels is unset (legacy callers). 
*/ const uint16_t filter_ch = filter_dims->channels ? filter_dims->channels : in_channels; const int32_t groups = in_channels / filter_ch; const int32_t filters_per_group = out_channels / groups; int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; for (out_y = 0; out_y < out_ht; out_y++) { for (out_x = 0; out_x < out_wd; out_x++) { for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0; const int32_t group = out_ch_idx / filters_per_group; const int32_t in_ch_start = group * filter_ch; const int32_t base_y = stride_ht * out_y - pad_ht; const int32_t base_x = stride_wd * out_x - pad_wd; const int32_t filter_y_start = max(0, -base_y); const int32_t filter_x_start = max(0, -base_x); const int32_t filter_y_end = min(filter_ht, input_ht - base_y); const int32_t filter_x_end = min(filter_wd, input_wd - base_x); for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x + filter_x_idx; int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels + in_ch_start; int32_t filter_base_offset = out_ch_idx * filter_ch * filter_ht * filter_wd + (filter_y_idx * filter_wd + filter_x_idx) * filter_ch; for (in_ch_idx = 0; in_ch_idx < filter_ch; in_ch_idx++) { conv_out += (input_data[input_base_offset + in_ch_idx] + input_offset) * filter_data[filter_base_offset + in_ch_idx]; } } } if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } ================================================ FILE: src/convolution/esp_nn_conv_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 
2024-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * Optimizations strategies used: * Below optimizations are capable of any size of input/filter: * * 1. For filter wdxht = 1x1 (Refer esp_nn_conv_s8_mult8_1x1_esp32p4 function) * - For this specific version, the strategy we employ: * > This particular filter has only the channel * dimension and we have `out_ch` number of such filters. * > We take 8 input lines at a time and transpose those. * > Keep loading and multiplying filter values one by one, * to produce 8 outputs in parallel * * 2. General version: (Refer esp_nn_conv_s8_filter_aligned_input_padded_esp32p4) * - For all other cases: * > Consider `filter_wd * in_ch` as a single row. These many values can * be continuosly loaded from inputs as well. * > multiply accumulate into a single filter output. * > To speed things up further, we pre-calculate * (filter * in_offset + bias term) earlier and add it at the end of filter * * About ((filter * in_offset + bias term)) accumulate term: * > The conv operation before requantization is as follows: * for i in filter_size: * conv_out += (input + input_offset) * filter; * conv_out += bias * * > where input_offset is constant term hence, we can see that * this term can be precalculated as: * for i in filter_size: * acc_term += input_offset * filter[i]; * acc_term += bias * OR * for i in filter_size: * acc_term += filter[i]; // accumulate filter values * acc_term = acc_term * input_offset + bias * * * In both the above versions we align the filter if needed, pad the input with * -input_offset if needed and extend the channels to make those multiple * of 8/16 as per function needs */ #include #include #include #include "esp_nn_generic_opt.h" #include static int16_t *scratch_buffer = NULL; /** * Reusable PIE-accelerated dot product (same as FC version). * Processes 32 elements/iter (double-pump) for len >= 32, * 16 elements/iter for len >= 16, scalar remainder. 
*/ static inline __attribute__((always_inline)) int32_t pie_dot_s8(const int8_t *a, const int8_t *b, int32_t len) { int32_t result = 0; int32_t idx = 0; if (len >= 32) { asm volatile ( "esp.zero.xacc \n\t" "mv x30, %[in] \n\t" "mv x31, %[flt] \n\t" "li %[idx], 32 \n\t" "addi s7, %[len], -31 \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q2, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vld.128.ip q3, x31, 16 \n\t" "j 2f \n\t" "1: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vmulas.s8.xacc.ld.ip q2, x30, 16, q2, q3 \n\t" "esp.vld.128.ip q3, x31, 16 \n\t" "addi %[idx], %[idx], 32 \n\t" "2: \n\t" "blt %[idx], s7, 1b \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "esp.vmulas.s8.xacc q2, q3 \n\t" "addi s7, %[len], -15 \n\t" "bge %[idx], s7, 3f \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "addi %[idx], %[idx], 16 \n\t" "3: \n\t" "esp.movx.r.xacc.l x30 \n\t" "mv %[res], x30 \n\t" : [idx] "+r"(idx), [res] "=r"(result) : [in] "r"(a), [flt] "r"(b), [len] "r"(len) : "x30", "x31", "s7" ); } else if (len >= 16) { asm volatile ( "esp.zero.xacc \n\t" "mv x30, %[in] \n\t" "mv x31, %[flt] \n\t" "li %[idx], 16 \n\t" "addi s7, %[len], -15 \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "j 5f \n\t" "4: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "addi %[idx], %[idx], 16 \n\t" "5: \n\t" "blt %[idx], s7, 4b \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "esp.movx.r.xacc.l x30 \n\t" "mv %[res], x30 \n\t" : [idx] "+r"(idx), [res] "=r"(result) : [in] "r"(a), [flt] "r"(b), [len] "r"(len) : "x30", "x31", "s7" ); } for (; idx < len; idx++) { result += (int32_t)a[idx] * (int32_t)b[idx]; } return result; } /** * Batched 1x1 conv using QACC per-lane: processes 16 pixels simultaneously. * Transposes input so each QACC lane = one pixel, then broadcasts filter * coefficients for per-lane accumulation. 
Critical for small in_ch where * XACC can't be used (in_ch < 16). * * For in_ch=8: 4.5x faster than scalar per-pixel approach. */ __attribute__((noinline)) static void conv_1x1_batch16(const int8_t *pixel_ptrs[16], const int8_t *filter_data, const int32_t *filter_sum, const int32_t *bias, int8_t *out_ptrs[16], int32_t in_ch, int32_t out_ch, int32_t out_offset, const int32_t *out_mult, const int32_t *out_shift, int32_t act_min, int32_t act_max) { /* Ensure PIE is enabled (might be lost across noinline function call) */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); /* Transpose: arrange 16 pixels' data as ch0[p0..p15], ch1[p0..p15], ... */ int8_t transposed[16 * 16] __attribute__((aligned(16))); /* in_ch <= 16 for this path */ for (int c = 0; c < in_ch; c++) { for (int p = 0; p < 16; p++) { transposed[c * 16 + p] = pixel_ptrs[p][c]; } } /* For each output channel: QACC per-lane MAC with broadcast filter. * Use single asm block for zero + accumulate loop to prevent * q register clobber between separate asm blocks. */ const int8_t *filt = filter_data; for (int32_t oc = 0; oc < out_ch; oc++) { /* Single asm: zero QACC, then loop over in_ch channels: * broadcast filter[ch], load 16 transposed pixels, MAC per-lane */ asm volatile ( "esp.zero.qacc \n\t" "mv x30, %[trans] \n\t" /* transposed base */ "mv x31, %[flt] \n\t" /* filter base */ "mv s7, %[cnt] \n\t" /* in_ch count */ "1: \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" /* load 16 pixel values, advance by 16 */ "esp.vldbc.8.ip q1, x31, 1 \n\t" /* broadcast filter[ch], advance by 1 */ "esp.vmulas.s8.qacc q0, q1 \n\t" "addi s7, s7, -1 \n\t" "bnez s7, 1b \n\t" : : [trans] "r"(transposed), [flt] "r"(filt), [cnt] "r"(in_ch) : "x30", "x31", "s7" ); /* Extract 16 results */ int32_t results[16] __attribute__((aligned(16))); ESP_NN_QACC_EXTRACT_S32(results); /* Add filter_sum + bias, requant, clamp, store for each pixel */ int32_t fs = filter_sum[oc]; int32_t b = bias ? 
bias[oc] : 0; int32_t combined = fs + b; int32_t m = out_mult[oc]; int32_t s = out_shift[oc]; for (int p = 0; p < 16; p++) { int32_t r = results[p] + combined; r = esp_nn_multiply_by_quantized_mult(r, m, s); r += out_offset; r = max(r, act_min); r = min(r, act_max); out_ptrs[p][oc] = (int8_t) r; } filt += in_ch; } } __attribute__ ((noinline)) static void esp_nn_conv_s8_1x1(const data_dims_t *input_dims, const int8_t *input_data, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; int32_t *filter_sum = (int32_t *) scratch; // alignment of 4 bytes assumed /* pre-calculate filter_sum * input_offset */ const int8_t *filter_ptr = filter_data; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t sum = 0; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { sum += *filter_ptr++; } filter_sum[out_ch_idx] = sum * input_offset; } /* When in_ch < 16: use QACC batch path (16 pixels at once) or channel padding. * QACC batch: transpose pixels, broadcast filter, per-lane MAC. * Channel pad: pad in/filter to 16 ch for XACC. */ /* When in_ch < 16: use QACC batch (16 pixels at a time with broadcast filter). * Falls back to channel-padding for remaining pixels. 
*/ if (in_channels < 16) { /* Enable PIE for QACC */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); int32_t total_pixels = out_wd * out_ht; int32_t pix = 0; /* Process batches of 16 pixels using QACC per-lane */ for (; pix <= total_pixels - 16; pix += 16) { const int8_t *pp[16]; int8_t *op[16]; for (int p = 0; p < 16; p++) { pp[p] = input_data + (pix + p) * in_channels; op[p] = out_data + (pix + p) * out_channels; } conv_1x1_batch16(pp, filter_data, filter_sum, bias, op, in_channels, out_channels, out_offset, quant_data->mult, quant_data->shift, activation_min, activation_max); } /* Remaining pixels (< 16): scalar fallback */ for (; pix < total_pixels; pix++) { const int8_t *inp = input_data + pix * in_channels; filter_ptr = filter_data; for (int32_t oc = 0; oc < out_channels; oc++) { int32_t conv_out = 0; for (int32_t ic = 0; ic < in_channels; ic++) { conv_out += inp[ic] * filter_ptr[ic]; } conv_out += filter_sum[oc]; if (bias) conv_out += bias[oc]; conv_out = esp_nn_multiply_by_quantized_mult(conv_out, quant_data->mult[oc], quant_data->shift[oc]); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); out_data[pix * out_channels + oc] = (int8_t) conv_out; filter_ptr += in_channels; } } return; } for (int32_t in_row = 0; in_row < out_ht; in_row++) { for (int32_t in_col = 0; in_col < out_wd; in_col++) { const int32_t *out_mult = quant_data->mult; const int32_t *out_shift = quant_data->shift; filter_ptr = filter_data; const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { /* initializations */ int32_t conv_out = 0; const int8_t *input_ptr = input_base_ptr; int32_t in_ch_idx = 0; #if 1 // inline asm // for now check for the alignment as well if (in_channels < 16) {// || ((uint32_t) input_ptr & 15) || ((uint32_t) filter_ptr & 15)) { goto skip_asm; } asm volatile 
( "li %0, 16 \n\t" "addi s7, %4, -15 \n\t" "mv x30, %1 \n\t" "mv x31, %2 \n\t" "esp.zero.xacc \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "j .loop16_end \n\t" ".loop16_start: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "addi %0, %0, 16 \n\t" // in_ch_idx += 16 ".loop16_end: \n\t" "blt %0, s7, .loop16_start \n\t" // if in_ch_idx < `in_channels - 15` abort // move input_ptr, filter_ptr and conv_out "mv %1, x30 \n\t" "mv %2, x31 \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "esp.movx.r.xacc.l %3 \n\t" : "+r" (in_ch_idx), "+r" (input_ptr), "+r" (filter_ptr), "=r" (conv_out) : "r"(in_channels) : "x30", "x31", "s7" ); skip_asm: #endif for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += *input_ptr++ * *filter_ptr++; conv_out += *input_ptr++ * *filter_ptr++; conv_out += *input_ptr++ * *filter_ptr++; conv_out += *input_ptr++ * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx++) { conv_out += *input_ptr++ * *filter_ptr++; } conv_out = conv_out + filter_sum[out_ch_idx]; if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } __attribute__ ((noinline)) static void esp_nn_conv_s8_padded( const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = 
conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */ if (in_channels != filter_dims->channels) { esp_nn_conv_s8_ansi(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } int32_t *filter_sum = (int32_t *) scratch; // alignment of 4 bytes assumed /* pre-calculate filter_sum * input_offset */ const int8_t *filter_ptr = filter_data; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t sum = 0; int32_t filter_len = filter_wd * filter_ht * in_channels; int32_t filter_idx = 0; for (; filter_idx < filter_len - 3; filter_idx += 4) { sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; } for (; filter_idx < filter_len; filter_idx++) { sum += *filter_ptr++; } filter_sum[out_ch_idx] = sum * input_offset; } const int32_t row_size = filter_wd * in_channels; bool right_pad = max(0, ((out_wd - 1) * stride_wd + filter_wd - input_wd)); bool bottom_pad = max(0, ((out_ht - 1) * stride_ht + filter_ht - input_ht)); for (int32_t out_y = 0; out_y < out_ht - bottom_pad; out_y++) { for (int32_t out_x = 0; out_x < out_wd - right_pad; out_x++) { const int32_t base_y = stride_ht * out_y; const int32_t base_x = stride_wd * out_x; const int32_t *out_mult_ptr = out_mult; const int32_t *out_shift_ptr = out_shift; const int32_t *bias_ptr = bias; const int8_t *filter_data_ptr = filter_data; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0, 
filter_y_idx; if (row_size >= 16) { asm volatile("esp.zero.xacc \n\t"); } for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x; const int8_t *input_data_ptr = input_data + (in_row * input_wd + in_col) * in_channels; int32_t row_idx = 0; #if 1 // inline asm // for now check for the alignment as well if (row_size < 16) {// || ((uint32_t) input_ptr & 15) || ((uint32_t) filter_ptr & 15)) { goto skip_asm_pad0; } asm volatile ( "li %0, 16 \n\t" "addi s7, %3, -15 \n\t" "mv x30, %1 \n\t" "mv x31, %2 \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "j .loop16_pad0_end \n\t" ".loop16_pad0_start: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "addi %0, %0, 16 \n\t" // in_ch_idx += 16 ".loop16_pad0_end: \n\t" "blt %0, s7, .loop16_pad0_start \n\t" // if in_ch_idx < `in_channels - 15` abort // move input_ptr, filter_ptr and conv_out "mv %1, x30 \n\t" "mv %2, x31 \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" : "+r" (row_idx), "+r" (input_data_ptr), "+r" (filter_data_ptr) : "r"(row_size) : "x30", "x31", "s7" ); skip_asm_pad0: #endif for (; row_idx < row_size - 3; row_idx += 4) { conv_out += *input_data_ptr++ * *filter_data_ptr++; conv_out += *input_data_ptr++ * *filter_data_ptr++; conv_out += *input_data_ptr++ * *filter_data_ptr++; conv_out += *input_data_ptr++ * *filter_data_ptr++; } for (; row_idx < row_size; row_idx++) { conv_out += *input_data_ptr++ * *filter_data_ptr++; } } if (row_size >= 16) { asm volatile ( "esp.movx.r.xacc.l x30 \n\t" "add %0, %0, x30 \n\t" : "+r" (conv_out) : : "x30" ); } /* add input_offset term */ conv_out += filter_sum[out_ch_idx]; if (bias) { conv_out += *bias_ptr++; } conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } for (int32_t 
out_x = out_wd - right_pad; out_x < out_wd; out_x++) { const int32_t base_y = stride_ht * out_y; const int32_t base_x = stride_wd * out_x; const int32_t *out_mult_ptr = out_mult; const int32_t *out_shift_ptr = out_shift; const int32_t *bias_ptr = bias; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0, filter_y_idx; for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) { for (int32_t filter_x_idx = 0; filter_x_idx < filter_wd - right_pad; filter_x_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x + filter_x_idx; const int8_t *input_ptr = input_data + (in_row * input_wd + in_col) * in_channels; const int8_t *filter_ptr = filter_data + out_ch_idx * in_channels * filter_ht * filter_wd + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } } } if (bias) { conv_out += *bias_ptr++; } conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } // Calculate the last row if needed if (bottom_pad) { int in_row = input_dims->height - filter_dims->height + 1; esp_nn_conv_s8_opt(&(data_dims_t){input_dims->width, 2, input_dims->channels, 0}, input_data + in_row * input_dims->width * input_dims->channels, filter_dims, filter_data, bias, &(data_dims_t){output_dims->width, 1, output_dims->channels, 0}, out_data, conv_params, quant_data); } } /* L1D cache budget: use half of 64KB to leave room for filter streaming */ 
#define L1D_BUDGET 32768 /** * Im2col convolution for small in_ch where filter_wd * in_ch < 16. * * Instead of padding channels (81% wasted MACs for in_ch=3), * concatenates the entire filter window into one contiguous vector: * window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27) * * For each output pixel: copy the input window into a contiguous scratch * buffer, then use PIE dot product on the full window. No wasted MACs. * * Scratch layout: [filter_sum | im2col_buf] * im2col_buf = filter_wd * filter_ht * in_ch bytes */ __attribute__ ((noinline)) static void esp_nn_conv_s8_im2col( const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const int32_t window_len = filter_wd * filter_ht * in_ch; const int8_t pad_val = (int8_t)(-input_offset); /* Scratch: filter_sum[out_ch] + im2col_buf[window_len] */ int32_t *filter_sum = (int32_t *)scratch; int8_t *im2col_buf = (int8_t *)scratch + out_ch * sizeof(int32_t); /* Pre-compute filter_sum * input_offset */ const 
int8_t *fptr = filter_data; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t sum = 0; for (int32_t fi = 0; fi < window_len; fi++) { sum += *fptr++; } filter_sum[oc] = sum * input_offset; } /* Process each output pixel */ int8_t *out_ptr = out_data; for (int32_t out_y = 0; out_y < out_ht; out_y++) { for (int32_t out_x = 0; out_x < out_wd; out_x++) { const int32_t base_y = out_y * stride_ht - pad_ht; const int32_t base_x = out_x * stride_wd - pad_wd; /* Copy input window into contiguous im2col buffer */ int8_t *buf = im2col_buf; for (int32_t fy = 0; fy < filter_ht; fy++) { int32_t in_y = base_y + fy; for (int32_t fx = 0; fx < filter_wd; fx++) { int32_t in_x = base_x + fx; if (in_y >= 0 && in_y < input_ht && in_x >= 0 && in_x < input_wd) { const int8_t *src = input_data + (in_y * input_wd + in_x) * in_ch; for (int c = 0; c < in_ch; c++) { *buf++ = src[c]; } } else { /* Padding pixel */ for (int c = 0; c < in_ch; c++) { *buf++ = pad_val; } } } } /* Dot product against each output channel's filter */ const int32_t *out_mult = quant_data->mult; const int32_t *out_shift = quant_data->shift; const int8_t *filter_ptr = filter_data; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t conv_out = pie_dot_s8(im2col_buf, filter_ptr, window_len); conv_out += filter_sum[oc]; if (bias) conv_out += bias[oc]; conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_ptr++ = (int8_t) conv_out; filter_ptr += window_len; } } } } /** * Tiled convolution: process T output rows at a time. * Converts padded conv into a series of no-pad sub-problems by * copying/padding input tiles into the scratch buffer. * * This keeps the working set in L1D for large input tensors. * Reuses the existing esp_nn_conv_s8_padded PIE inner loop per tile. 
__attribute__ ((noinline))
static void esp_nn_conv_s8_tiled(
        const data_dims_t *input_dims, const int8_t *input_data,
        const data_dims_t *filter_dims, const int8_t *filter_data,
        const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data,
        const conv_params_t *conv_params, const quant_data_t *quant_data,
        void *scratch)
{
    const uint16_t input_wd = input_dims->width;
    const uint16_t input_ht = input_dims->height;
    const uint16_t in_ch = input_dims->channels;
    const uint16_t filter_wd = filter_dims->width;
    const uint16_t filter_ht = filter_dims->height;
    const uint16_t out_wd = output_dims->width;
    const uint16_t out_ht = output_dims->height;
    const uint16_t out_ch = output_dims->channels;
    const uint16_t pad_wd = conv_params->padding.width;
    const uint16_t pad_ht = conv_params->padding.height;
    const uint16_t stride_ht = conv_params->stride.height;
    const int32_t input_offset = conv_params->in_offset;

    /* Check if we need channel padding for PIE (row_size must be >= 16) */
    int new_ch = in_ch;
    int need_ch_pad = 0;
    if (filter_wd * in_ch < 16) {
        new_ch = (16 + filter_wd - 1) / filter_wd; /* minimum channels for PIE */
        new_ch = (new_ch + 15) & ~15;              /* align to 16 */
        need_ch_pad = 1;
    }

    /* tiles are built with left/right padding already materialized */
    int padded_input_wd = input_wd + 2 * pad_wd;

    /* Scratch layout:
     * [0] filter_sum: out_ch * 4 bytes
     * [after filter_sum] aligned_filter (if ch padding): filter_wd * filter_ht * new_ch * out_ch
     * [after filter] tile_input_buf: variable per tile
     */
    int32_t *filter_sum = (int32_t *) scratch;
    int filter_sum_size = out_ch * sizeof(int32_t);

    /* Pre-compute filter_sum * input_offset (once for entire layer) */
    const int8_t *fptr = filter_data;
    for (int32_t oc = 0; oc < out_ch; oc++) {
        int32_t sum = 0;
        int32_t flen = filter_wd * filter_ht * in_ch;
        for (int32_t fi = 0; fi < flen; fi++) {
            sum += *fptr++;
        }
        filter_sum[oc] = sum * input_offset;
    }

    /* Channel-pad filter if needed (pad with 0s - doesn't affect filter_sum).
     * Zero filter taps also cancel whatever value sits in the padded input
     * channels, so the extra lanes contribute nothing to the MAC. */
    int8_t *aligned_filter = NULL;
    int aligned_filter_size = 0;
    if (need_ch_pad) {
        aligned_filter = (int8_t *)scratch + filter_sum_size;
        aligned_filter_size = filter_wd * filter_ht * new_ch * out_ch;
        memset(aligned_filter, 0, aligned_filter_size);
        const int8_t *src_f = filter_data;
        int8_t *dst_f = aligned_filter;
        for (int oc = 0; oc < out_ch; oc++) {
            for (int fh = 0; fh < filter_ht; fh++) {
                for (int fw = 0; fw < filter_wd; fw++) {
                    memcpy(dst_f, src_f, in_ch);
                    src_f += in_ch;
                    dst_f += new_ch; /* zero-padded channels */
                }
            }
        }
    }

    /* Tile input buffer starts after filter_sum + aligned_filter */
    int8_t *tile_buf = (int8_t *)scratch + filter_sum_size + aligned_filter_size;

    /* Use effective channel count for tile buffer sizing */
    int eff_ch = need_ch_pad ? new_ch : in_ch;
    int tile_input_row_bytes = padded_input_wd * eff_ch;

    /* Compute tile height T (output rows per tile): largest T whose
     * input rows (filter_ht + (T-1)*stride_ht) fit in the L1D budget.
     * If even filter_ht rows don't fit, fall through with T = out_ht
     * (single untiled pass). */
    int tile_T = out_ht;
    int total_input_bytes = padded_input_wd * (input_ht + 2 * pad_ht) * eff_ch;
    int used_scratch = filter_sum_size + aligned_filter_size;
    if (total_input_bytes + used_scratch > L1D_BUDGET) {
        int budget_for_input = L1D_BUDGET - used_scratch;
        int min_input_rows = filter_ht;
        if (min_input_rows * tile_input_row_bytes <= budget_for_input) {
            tile_T = (budget_for_input - filter_ht * tile_input_row_bytes)
                     / (stride_ht * tile_input_row_bytes) + 1;
            if (tile_T < 1) tile_T = 1;
            if (tile_T > out_ht) tile_T = out_ht;
        }
    }

    /* Process tiles */
    const int8_t *use_filter = need_ch_pad ? aligned_filter : filter_data;
    data_dims_t eff_filter_dims = {filter_wd, filter_ht, eff_ch, 0};

    for (int32_t tile_y = 0; tile_y < out_ht; tile_y += tile_T) {
        int32_t actual_T = min(tile_T, out_ht - tile_y);
        /* input row span feeding this tile (may be negative / past the end:
         * those rows are synthesized as padding below) */
        int32_t in_row_start = tile_y * stride_ht - pad_ht;
        int32_t in_row_end = (tile_y + actual_T - 1) * stride_ht + filter_ht - 1;
        int32_t tile_input_ht = in_row_end - in_row_start + 1;

        /* Copy/pad input rows into tile buffer, with channel padding if needed */
        int8_t pad_val = (int8_t)(-input_offset);
        int8_t *dst = tile_buf;
        for (int32_t row = in_row_start; row <= in_row_end; row++) {
            if (row < 0 || row >= input_ht) {
                /* fully-padded row (top/bottom padding) */
                memset(dst, pad_val, padded_input_wd * eff_ch);
            } else {
                /* For each pixel in padded row */
                int8_t *row_dst = dst;
                /* Left padding */
                for (int px = 0; px < pad_wd; px++) {
                    memset(row_dst, pad_val, eff_ch);
                    row_dst += eff_ch;
                }
                /* Valid pixels - with optional channel padding */
                const int8_t *row_src = input_data + row * input_wd * in_ch;
                if (need_ch_pad) {
                    for (int px = 0; px < input_wd; px++) {
                        memcpy(row_dst, row_src, in_ch);
                        if (eff_ch > in_ch) {
                            memset(row_dst + in_ch, pad_val, eff_ch - in_ch);
                        }
                        row_src += in_ch;
                        row_dst += eff_ch;
                    }
                } else {
                    memcpy(row_dst, row_src, input_wd * in_ch);
                    row_dst += input_wd * in_ch;
                }
                /* Right padding */
                for (int px = 0; px < pad_wd; px++) {
                    memset(row_dst, pad_val, eff_ch);
                    row_dst += eff_ch;
                }
            }
            dst += padded_input_wd * eff_ch;
        }

        /* Sub-problem with pad=0, effective channels.
         * filter_sum is passed as the sub-call's scratch: esp_nn_conv_s8_padded
         * recomputes its own filter_sum into that same region, which is safe
         * because this function does not read filter_sum afterwards. */
        data_dims_t tile_input_dims = {padded_input_wd, tile_input_ht, eff_ch, 0};
        data_dims_t tile_output_dims = {out_wd, actual_T, out_ch, 0};
        conv_params_t tile_conv_params = *conv_params;
        tile_conv_params.padding.width = 0;
        tile_conv_params.padding.height = 0;

        esp_nn_conv_s8_padded(&tile_input_dims, tile_buf, &eff_filter_dims,
                              use_filter, bias, &tile_output_dims,
                              out_data + tile_y * out_wd * out_ch,
                              &tile_conv_params, quant_data, filter_sum);
    }
}

int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t
*filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; int new_channels = (in_ch + 7) & ~7; int input_scratch = input_wd * input_ht * in_ch; int filter_scratch = filter_wd * filter_ht * in_ch * out_ch; int align_buf_size = 32; /* extra buffer for alignment */ if ((filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0) && (stride_wd == 1 && stride_ht == 1)) { if (in_ch < 16) { /* Channel-padding path: filter_sum + padded_filter + padded_input */ int filter_sum_sz = out_ch * 4; int padded_filter_sz = 16 * out_ch; int padded_input_sz = 32; /* 16 bytes + alignment */ return filter_sum_sz + padded_filter_sz + padded_input_sz + align_buf_size; } int transpose_buf_size = 2 * (8 * new_channels); if (input_wd * input_ht < 8) { transpose_buf_size = 0; } if (in_ch % 8) { input_scratch = input_wd * input_ht * new_channels; } else { input_scratch = 0; } filter_scratch = new_channels * out_ch; return input_scratch + filter_scratch + transpose_buf_size + align_buf_size; } else { new_channels = (in_ch + 15) & ~15; int offset_acc_scratch = out_ch * 4; if (pad_wd == 0 && pad_ht == 0 && filter_wd * in_ch >= 16) { /* Direct no-pad path: no input scratch needed */ input_scratch = 0; filter_scratch = filter_wd * filter_ht * new_channels * out_ch; return input_scratch + filter_scratch + align_buf_size + offset_acc_scratch; } /* Im2col path: scratch = filter_sum + im2col_buf */ if (filter_wd * filter_ht * in_ch >= 16) { int window_len = filter_wd * filter_ht * in_ch; int 
im2col_scratch = window_len; /* one window buffer */ return offset_acc_scratch + im2col_scratch + align_buf_size; } if (pad_wd == 0 && pad_ht == 0) { /* Very small window (< 16 elements total): tiled path */ int eff_ch = ((16 + filter_wd - 1) / filter_wd + 15) & ~15; int filt_aligned = filter_wd * filter_ht * eff_ch * out_ch; int tile_input = input_wd * input_ht * eff_ch; return offset_acc_scratch + filt_aligned + tile_input + align_buf_size; } /* Padded case: check if tiling is beneficial */ int padded_input_wd = input_wd + 2 * pad_wd; int full_input_size = padded_input_wd * (input_ht + 2 * pad_ht) * in_ch; if (full_input_size + offset_acc_scratch > L1D_BUDGET) { /* Tiled path: compute tile input size */ int eff_ch = in_ch; int filt_aligned = 0; if (filter_wd * in_ch < 16) { eff_ch = ((16 + filter_wd - 1) / filter_wd + 15) & ~15; filt_aligned = filter_wd * filter_ht * eff_ch * out_ch; } int tile_row_bytes = padded_input_wd * eff_ch; int budget_for_input = L1D_BUDGET - offset_acc_scratch - filt_aligned; int tile_T = 1; if (budget_for_input > 0 && filter_ht * tile_row_bytes <= budget_for_input) { tile_T = (budget_for_input - filter_ht * tile_row_bytes) / (stride_ht * tile_row_bytes) + 1; if (tile_T > (int)(output_dims->height)) tile_T = output_dims->height; } int tile_input_rows = (tile_T - 1) * stride_ht + filter_ht + 2 * pad_ht; input_scratch = tile_input_rows * tile_row_bytes; filter_scratch = filt_aligned; } else { /* Monolithic padded path */ input_scratch = full_input_size; filter_scratch = filter_wd * filter_ht * new_channels * out_ch; } return input_scratch + filter_scratch + align_buf_size + offset_acc_scratch; } return align_buf_size; } void esp_nn_set_conv_scratch_buf_esp32p4(void *buf) { // We are going to use the vector extensions asm volatile ( "csrsi 0x7f2, 0b01 \n\t" // enable `esp` vector extension "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" : : : "x29" ); scratch_buffer = (int16_t *) buf; } void esp_nn_conv_s8_esp32p4(const data_dims_t 
*input_dims, const int8_t *input, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { if (scratch_buffer == NULL) { printf("esp_nn_conv error! scratch_buffer not set!\n"); return; } const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; if (filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { esp_nn_conv_s8_1x1(input_dims, input, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else if (pad_wd == 0 && pad_ht == 0 && filter_wd * input_dims->channels >= 16) { /* No-pad, channels large enough for PIE: use direct padded path */ esp_nn_conv_s8_padded(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else if (filter_wd * filter_ht * input_dims->channels >= 16) { /* Small in_ch but window_len >= 16: use im2col for zero-waste PIE. * Also handles padded cases naturally. 
*/ esp_nn_conv_s8_im2col(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else if (pad_wd != 0 || pad_ht != 0) { /* Padded case with very small window: use tiled path */ esp_nn_conv_s8_tiled(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else { /* Tiny output: fall back to generic opt */ esp_nn_conv_s8_opt(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); } } ================================================ FILE: src/convolution/esp_nn_conv_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * Optimizations strategies used: * Below optimizations are capable of any size of input/filter: * * 1. For filter wdxht = 1x1 (Refer esp_nn_conv_s8_mult8_1x1_esp32s3 function) * - For this specific version, the strategy we employ: * > This particular filter has only the channel * dimension and we have `out_ch` number of such filters. * > We take 8 input lines at a time and transpose those. * > Keep loading and multiplying filter values one by one, * to produce 8 outputs in parallel * * 2. General version: (Refer esp_nn_conv_s8_filter_aligned_input_padded_esp32s3) * - For all other cases: * > Consider `filter_wd * in_ch` as a single row. These many values can * be continuosly loaded from inputs as well. * > multiply accumulate into a single filter output. 
 *   > To speed things up further, we pre-calculate
 *     the (filter * input_offset + bias) term earlier and add it at the end of the filter pass
 *
 * About the (filter * input_offset + bias) accumulate term:
 *   > The conv operation before requantization is as follows:
 *       for i in filter_size:
 *           conv_out += (input + input_offset) * filter;
 *       conv_out += bias
 *
 *   > Since input_offset is a constant, this term can be precalculated as:
 *       for i in filter_size:
 *           acc_term += input_offset * filter[i];
 *       acc_term += bias
 *     OR
 *       for i in filter_size:
 *           acc_term += filter[i];   // accumulate filter values
 *       acc_term = acc_term * input_offset + bias
 *
 *
 * In both of the above versions we align the filter if needed, pad the input with
 * -input_offset if needed, and extend the channels to make them a multiple
 * of 8/16, as each function needs
 *
 *  3. Im2col version: (for small in_ch where filter_wd * in_ch < 16)
 *     - Inspired by ESP32-P4 im2col approach.
 *     - Instead of padding channels (wastes 81% of SIMD lanes for in_ch=3),
 *       flatten the entire filter window into one contiguous vector:
 *          window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27)
 *     - For each output pixel: copy the input window into a scratch buffer,
 *       then use ACCX dot product on the full window. No wasted MACs.
*/ #include #include #include #include #include /* 3x3 optimized path — im2col per pixel, iterate OC with input in cache */ extern int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht, int in_channels); extern void esp_nn_conv_s8_3x3_opt(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const uint16_t stride_wd, const uint16_t stride_ht, const int8_t *filter_data, const int32_t *bias, int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *scratch); /* ANSI C reference conv for comparison */ extern void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data); /* 1x1 conv — correct SIMD implementation */ extern int esp_nn_conv_s8_1x1_scratch_size(int out_channels); extern void esp_nn_conv_s8_1x1(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const int8_t *filter_data, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *scratch); /* Debug heap checks — enable to find buffer overruns */ #if CONFIG_IDF_CMAKE #include "esp_heap_caps.h" #define CONV_HEAP_CHECK(tag) do { \ if (!heap_caps_check_integrity_all(false)) { \ printf("CONV HEAP CORRUPT: %s\n", tag); \ } \ } while(0) #else #define CONV_HEAP_CHECK(tag) #endif static int16_t *scratch_buffer = NULL; extern void esp_nn_conv_s8_mult8_1x1_esp32s3( const int8_t *input_data, const uint16_t input_wd, const uint16_t 
input_ht, const uint16_t in_channels, const int32_t input_offset, const int8_t *filter_aligned, const int32_t *bias, int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *buffer /* scratch buffer */); extern void esp_nn_conv_s8_filter_aligned_input_padded_esp32s3( const int8_t *input_data, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const uint16_t stride_wd, const uint16_t stride_ht, const int8_t *filter_data, const uint16_t filter_wd, const uint16_t filter_ht, const int32_t *bias, int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *scratch_buffer); /* Use shared dot product from common — see esp_nn_dot_s8_esp32s3.S */ /** * Im2col convolution for small in_ch (filter_wd * in_ch < 16). * * Instead of padding channels to 16 (wasting 81% MACs for in_ch=3), * flatten the entire filter window into one contiguous vector: * window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27) * * For each output pixel: copy the input window into a contiguous scratch * buffer, then use ACCX dot product. No wasted MACs. 
* * Scratch layout: [filter_sum[out_ch] | im2col_buf[window_len_aligned]] */ __attribute__ ((noinline)) static void esp_nn_conv_s8_im2col_s3( const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const int32_t window_len = filter_wd * filter_ht * in_ch; /* Align to 16 for SIMD: zero-padded tail doesn't affect dot product */ const int32_t window_len_aligned = (window_len + 15) & ~15; const int8_t pad_val = (int8_t)(-input_offset); /* Scratch layout (16-byte aligned): * [filter_sum: out_ch * 4] * [aligned_filter: out_ch * window_len_aligned] -- zero-padded copy * [im2col_buf: window_len_aligned] */ int32_t *filter_sum = (int32_t *)scratch; int8_t *aligned_filter = (int8_t *)((uintptr_t)((int8_t *)scratch + out_ch * sizeof(int32_t) + 15) & ~15); int8_t *im2col_buf = (int8_t *)((uintptr_t)(aligned_filter + out_ch * window_len_aligned + 15) & ~15); /* Pre-compute filter_sum * input_offset AND copy filter with zero-padded tail */ const int8_t *fptr = filter_data; 
int8_t *af_ptr = aligned_filter; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t sum = 0; for (int32_t fi = 0; fi < window_len; fi++) { sum += fptr[fi]; } filter_sum[oc] = sum * input_offset; /* Copy filter + zero-pad tail for safe SIMD reads */ memcpy(af_ptr, fptr, window_len); memset(af_ptr + window_len, 0, window_len_aligned - window_len); fptr += window_len; af_ptr += window_len_aligned; } /* Zero the tail of im2col buffer once (for aligned SIMD reads) */ memset(im2col_buf + window_len, 0, window_len_aligned - window_len); /* Compute safe interior region where no bounds checking needed. * Interior: all filter taps fall within valid input. */ const int32_t row_bytes = filter_wd * in_ch; int32_t safe_y_start = (pad_ht + stride_ht - 1) / stride_ht; int32_t safe_y_end = (input_ht - filter_ht + pad_ht) / stride_ht + 1; int32_t safe_x_start = (pad_wd + stride_wd - 1) / stride_wd; int32_t safe_x_end = (input_wd - filter_wd + pad_wd) / stride_wd + 1; if (safe_y_start > out_ht) safe_y_start = out_ht; if (safe_y_end > out_ht) safe_y_end = out_ht; if (safe_y_end < safe_y_start) safe_y_end = safe_y_start; if (safe_x_start > out_wd) safe_x_start = out_wd; if (safe_x_end > out_wd) safe_x_end = out_wd; if (safe_x_end < safe_x_start) safe_x_end = safe_x_start; /* Process each output pixel */ int8_t *out_ptr = out_data; for (int32_t out_y = 0; out_y < out_ht; out_y++) { const int32_t base_y = out_y * stride_ht - pad_ht; int is_safe_y = (out_y >= safe_y_start && out_y < safe_y_end); for (int32_t out_x = 0; out_x < out_wd; out_x++) { const int32_t base_x = out_x * stride_wd - pad_wd; /* Copy input window into contiguous im2col buffer */ int8_t *buf = im2col_buf; if (is_safe_y && out_x >= safe_x_start && out_x < safe_x_end) { /* FAST PATH: interior pixel — no bounds checking needed. * All filter taps guaranteed to be within valid input. 
*/ for (int32_t fy = 0; fy < filter_ht; fy++) { const int8_t *src = input_data + ((base_y + fy) * input_wd + base_x) * in_ch; memcpy(buf, src, row_bytes); buf += row_bytes; } } else { /* SLOW PATH: edge pixel — per-element bounds checking */ for (int32_t fy = 0; fy < filter_ht; fy++) { int32_t in_y = base_y + fy; if (in_y >= 0 && in_y < input_ht) { for (int32_t fx = 0; fx < filter_wd; fx++) { int32_t in_x = base_x + fx; if (in_x >= 0 && in_x < input_wd) { const int8_t *src = input_data + (in_y * input_wd + in_x) * in_ch; memcpy(buf, src, in_ch); } else { memset(buf, pad_val, in_ch); } buf += in_ch; } } else { memset(buf, pad_val, row_bytes); buf += row_bytes; } } } /* Dot product against each output channel's filter (aligned copy) */ const int32_t *out_mult_ptr = quant_data->mult; const int32_t *out_shift_ptr = quant_data->shift; const int8_t *filter_ptr = aligned_filter; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t conv_out = esp_nn_dot_s8_aligned_esp32s3(im2col_buf, filter_ptr, window_len_aligned); conv_out += filter_sum[oc]; if (bias) conv_out += bias[oc]; conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_ptr++ = (int8_t) conv_out; filter_ptr += window_len_aligned; } } } } int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = 
conv_params->stride.height; int new_channels = (in_ch + 7) & ~7; int input_scratch = input_wd * input_ht * in_ch; int filter_scratch = filter_wd * filter_ht * in_ch * out_ch; int align_buf_size = 64; /* alignment (16) + assembly pre/post access margin (48) */ if ((filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0) && (stride_wd == 1 && stride_ht == 1)) { int transpose_buf_size = 2 * (8 * new_channels); if (input_wd * input_ht < 8) { transpose_buf_size = 0; } if (in_ch % 8) { input_scratch = input_wd * input_ht * new_channels; } else { input_scratch = 0; } filter_scratch = new_channels * out_ch; return input_scratch + filter_scratch + transpose_buf_size + align_buf_size; } else { int32_t filter_row_size = filter_wd * in_ch; int32_t window_len = filter_wd * filter_ht * in_ch; /* Im2col path: filter_wd * in_ch < 16 but window_len >= 16 */ if (filter_row_size < 16 && window_len >= 16) { int32_t window_len_aligned = (window_len + 15) & ~15; /* filter_sum + aligned_filter_copy + im2col_buf + alignment padding */ int im2col_scratch = out_ch * 4 + 16 + out_ch * window_len_aligned + 16 + window_len_aligned; return im2col_scratch + align_buf_size; } new_channels = (in_ch + 15) & ~15; if (pad_wd == 0 && pad_ht == 0) { input_scratch = 0; } else { input_scratch = (input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht) * in_ch; } filter_scratch = filter_wd * filter_ht * new_channels * out_ch; // Account for filter alignment padding (worst case) int32_t aligned_filter_row_size = ((filter_row_size + 15) / 16) * 16; int filter_alignment_scratch = aligned_filter_row_size * filter_ht * out_ch; // Account for right/bottom padding even when pad_wd=0, pad_ht=0 int pad_right = max(0, (output_dims->width * stride_wd + filter_wd - 1) - input_wd); int pad_bottom = max(0, (output_dims->height * stride_ht + filter_ht - 1) - input_ht); int boundary_padding_scratch = 0; if (pad_right > 0 || pad_bottom > 0) { boundary_padding_scratch = (input_wd + pad_right) * (input_ht + pad_bottom) * 
in_ch; } int offset_acc_scratch = out_ch * 4; return input_scratch + filter_scratch + filter_alignment_scratch + boundary_padding_scratch + align_buf_size + offset_acc_scratch; } return align_buf_size; } void esp_nn_set_conv_scratch_buf_esp32s3(void *buf) { scratch_buffer = (int16_t *) buf; } void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */ if (channels != filter_dims->channels) { esp_nn_conv_s8_ansi(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } int filter_size = filter_wd * filter_ht * channels * out_channels; /* 1x1 stride-1 conv */ if (filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { if (channels % 8 == 0) { /* Full asm path — requires mult8 channels + 8-byte 
aligned filter */ esp_nn_conv_s8_mult8_1x1_esp32s3(input, input_wd, input_ht, channels, input_offset, filter_data, bias, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_buffer); } else { /* Fallback: handles any alignment + any channel count */ esp_nn_conv_s8_1x1(input, input_wd, input_ht, channels, input_offset, filter_data, bias, out_data, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_buffer); } return; } if (scratch_buffer == NULL) { printf("esp_nn_conv error! scratch_buffer not set!\n"); return; } { int32_t filter_row_size = filter_wd * channels; int32_t window_len = filter_wd * filter_ht * channels; /* 3x3 optimized path: im2col per pixel, iterate OC with input in cache. * TODO: fix inline asm priming + performance regression before enabling. * Avoids the 128× input reload of the general aligned asm. */ #if 0 if (esp_nn_conv_s8_3x3_can_use(filter_wd, filter_ht, channels) && pad_wd == 0 && pad_ht == 0) { esp_nn_conv_s8_3x3_opt(input, input_wd, input_ht, channels, input_offset, stride_wd, stride_ht, filter_data, bias, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, (void *)scratch_buffer); return; } #endif /* Im2col path: small in_ch where per-row SIMD is wasteful, * but entire window is large enough for SIMD dot product. * E.g., 3x3 conv with in_ch=3: row=9 (<16), window=27 (>=16). */ if (filter_row_size < 16 && window_len >= 16) { esp_nn_conv_s8_im2col_s3(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); return; } // align the `filter width * channels` to 16 bytes. 
Do zero padding for the same int32_t filter_alignment_padding = 16 - (filter_row_size & 15); int8_t *filter_data_aligned = (int8_t *) filter_data; int8_t *input_padded = (int8_t *) input; int8_t *scratch_data = (int8_t *) scratch_buffer; int new_input_wd = input_wd, new_input_ht = input_ht; if (filter_alignment_padding != 16) { // pad filter_data int32_t new_row_size = filter_wd * channels + filter_alignment_padding; filter_data_aligned = scratch_data; int8_t *row_ptr = filter_data_aligned; const int8_t *filter_data_ptr = filter_data; for (int32_t ch_idx = 0; ch_idx < out_channels; ch_idx++) { for (int32_t row_idx = 0; row_idx < filter_ht; row_idx++) { memcpy(row_ptr, filter_data_ptr, filter_row_size); memset(row_ptr + filter_row_size, 0, new_row_size - filter_row_size); filter_data_ptr += filter_row_size; row_ptr += new_row_size; } } scratch_data += new_row_size * filter_ht * out_channels; filter_row_size = new_row_size; } else if ((int) filter_data & 15) { filter_data_aligned = scratch_data; memcpy(filter_data_aligned, filter_data, filter_size); scratch_data += filter_size; } // Calculate if right/bottom padding is needed even when pad_wd=0, pad_ht=0 // This happens when the filter extends beyond input boundaries at the edges // Formula matches depthwise convolution: (out_wd * stride_wd + filter_wd - 1) - input_wd int32_t pad_right = max(0, (out_wd * stride_wd + filter_wd - 1) - input_wd); int32_t pad_bottom = max(0, (out_ht * stride_ht + filter_ht - 1) - input_ht); // Apply padding if explicitly requested (pad_wd/pad_ht) OR if needed for boundary handling if (pad_wd != 0 || pad_ht != 0) { // Full padding (top, bottom, left, right) when pad_wd/pad_ht are set input_padded = (int8_t *) scratch_data; esp_nn_aligned_s8_pad_with_value(input, input_padded, input_wd, input_ht, channels, -input_offset, pad_wd, pad_ht); new_input_wd = input_wd + 2 * pad_wd; new_input_ht = input_ht + 2 * pad_ht; scratch_data += new_input_wd * new_input_ht * channels; } else if (pad_right > 
0 || pad_bottom > 0) { // Only right/bottom padding needed for boundary handling (like depthwise conv) input_padded = (int8_t *) scratch_data; esp_nn_aligned_s8_pad_end_with_value(input, input_padded, input_wd, input_ht, channels, -input_offset, (uint16_t)pad_right, (uint16_t)pad_bottom); new_input_wd = input_wd + pad_right; new_input_ht = input_ht + pad_bottom; scratch_data += new_input_wd * new_input_ht * channels; } int filter_total = filter_wd * filter_ht * channels * out_channels; if (input_offset != 0 && filter_total > 16384) { int32_t *corrections = (int32_t *)scratch_data; int32_t filter_ch_size = filter_wd * filter_ht * channels; const int8_t *f_src = filter_data; // use ORIGINAL (not aligned) filter for sum for (int ch = 0; ch < out_channels; ch++) { int32_t filter_sum = 0; for (int i = 0; i < filter_ch_size; i++) { filter_sum += f_src[i]; } corrections[ch] = filter_sum * input_offset; if (bias) { corrections[ch] += bias[ch]; } f_src += filter_ch_size; } // Pass input_offset=0 to assembly so it skips its pre-computation. // Pass scratch_data as "bias" pointer — the assembly's bias-copy loop // will read from scratch and write to scratch (identity, no-op). 
esp_nn_conv_s8_filter_aligned_input_padded_esp32s3( input_padded, new_input_wd, new_input_ht, channels, 0, stride_wd, stride_ht, filter_data_aligned, filter_wd, filter_ht, (const int32_t *)scratch_data, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_data); CONV_HEAP_CHECK("general: after asm (precomp)"); } else { esp_nn_conv_s8_filter_aligned_input_padded_esp32s3( input_padded, new_input_wd, new_input_ht, channels, input_offset, stride_wd, stride_ht, filter_data_aligned, filter_wd, filter_ht, bias, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_data); CONV_HEAP_CHECK("general: after asm (normal)"); } } } ================================================ FILE: src/convolution/esp_nn_conv_opt.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include #include #include int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { return 0; } void esp_nn_set_conv_scratch_buf_opt(const void *buf) { } __attribute__ ((noinline)) static void esp_nn_conv_s8_1x1(const data_dims_t *input_dims, const int8_t *input_data, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; for (int32_t in_row = 0; in_row < out_ht * stride_ht; in_row += stride_ht) { for (int32_t in_col = 0; in_col < out_wd * stride_wd; in_col += stride_wd) { const int32_t *out_mult = quant_data->mult; const int32_t *out_shift = quant_data->shift; const int8_t *filter_ptr = filter_data; const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels; int32_t out_ch_idx = 0; for (; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0; const int8_t *input_ptr = input_base_ptr; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { conv_out 
+= (*input_ptr++ + input_offset) * *filter_ptr++; } if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } /** * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dialation width = 1 */ void esp_nn_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; if (filter_wd == 1 && filter_ht == 1) { esp_nn_conv_s8_1x1(input_dims, input_data, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */ if (in_channels != filter_dims->channels) { esp_nn_conv_s8_ansi(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } int32_t out_ch_idx, out_y, out_x, filter_y_idx, 
filter_x_idx; for (out_y = 0; out_y < out_ht; out_y++) { for (out_x = 0; out_x < out_wd; out_x++) { const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0; const int32_t base_y = stride_ht * out_y - pad_ht; const int32_t base_x = stride_wd * out_x - pad_wd; const int32_t filter_y_start = max(0, -base_y); const int32_t filter_x_start = max(0, -base_x); const int32_t filter_y_end = min(filter_ht, input_ht - base_y); const int32_t filter_x_end = min(filter_wd, input_wd - base_x); for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x + filter_x_idx; const int8_t *input_ptr = input_data + (in_row * input_wd + in_col) * in_channels; const int8_t *filter_ptr = filter_data + out_ch_idx * in_channels * filter_ht * filter_wd + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } } } if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } ================================================ FILE: src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // 
// Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .literal_position .literal .nudge_val, 1073741824 # Program Unit: esp_nn_conv_s16_mult4_1x1_esp32s3 .type esp_nn_conv_s16_mult4_1x1_esp32s3, @function .align 4 .global esp_nn_conv_s16_mult4_1x1_esp32s3 esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xa62 # scratch_buf = 0 # to_add = 32 # gra_spill_temp_139 = 36 # gra_spill_temp_140 = 40 # gra_spill_temp_141 = 44 # gra_spill_temp_155 = 48 # gra_spill_temp_156 = 52 # gra_spill_temp_144 = 56 # gra_spill_temp_145 = 60 # gra_spill_temp_146 = 64 # gra_spill_temp_147 = 68 # gra_spill_temp_148 = 72 # gra_spill_temp_149 = 76 # gra_spill_temp_150 = 80 # gra_spill_temp_151 = 84 # gra_spill_temp_152 = 88 # gra_spill_temp_153 = 92 # lgra_spill_temp_165 = 96 # lgra_spill_temp_166 = 100 # lgra_spill_temp_167 = 104 # lgra_spill_temp_168 = 108 # gra_spill_temp_158 = 112 # gra_spill_temp_159 = 116 # gra_spill_temp_160 = 120 // registers: // a2: int16_t *input_data // a3: uint16_t input_wd // a4: uint16_t input_ht // a5: uint16_t in_channels // a6: int16_t *filter_data // a7: int32_t *bias // on stack: // 160: int8_t *out_data // 164: uint16_t out_wd // 168: uint16_t out_ht // 172: uint16_t out_channels // 176: int32_t out_offset // 180: int32_t *out_shift // 184: int32_t *out_mult // 188: int32_t activation_min // 192: int32_t activation_max // 196: *buffer /* scratch buffer */ entry a1,160 # s32i.n a2,a1,40 # [0] gra_spill_temp_140 s32i a6,a1,68 # [1] gra_spill_temp_147 s32i a7,a1,116 # [2] 
gra_spill_temp_159 mul16u a3,a3,a4 # [3] addi a10,a1,112 # [4] addmi a11,a1,176 # [5] addmi a8,a1,176 # [6] addmi a9,a1,176 # [7] addi.n a9,a9,12 # [8] addi a8,a8,16 # [9] ee.vldbc.32 q5,a11 # [10] id:188 out_offset ee.vldbc.32 q7,a8 # [12] id:270 activation_max ee.vldbc.32 q6,a9 # [13] id:269 activation_min blti a3,4,.Lt_3_6402 # [14] .LBB3_esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xa90 l32i a13,a1,160 # [0] id:280 out_data+0x0 srai a8,a5,2 # [1] addi a10,a3,-3 # [2] addi a9,a5,-3 # [3] movi.n a12,0 # [4] slli a11,a5,2 # [5] slli a15,a5,1 # [6] l16ui a14,a1,172 # [7] id:271 out_channels+0x0 s32i.n a15,a1,36 # [9] gra_spill_temp_139 s32i.n a11,a1,56 # [10] gra_spill_temp_144 s32i a12,a1,84 # [11] gra_spill_temp_151 s32i a9,a1,52 # [12] gra_spill_temp_156 s32i.n a10,a1,60 # [13] gra_spill_temp_145 s32i a8,a1,88 # [14] gra_spill_temp_152 movi.n a10,0 # [15] l32i a8,a1,196 # [16] id:281 buffer+0x0 slli a11,a11,1 # [19] l32i a15,a1,184 # [20] id:192 out_mult+0x0 s32i a11,a1,64 # [22] gra_spill_temp_146 s32i a8,a1,112 # [25] gra_spill_temp_158 s32i a10,a1,92 # [26] gra_spill_temp_153 movi.n a8,0 # [27] s32i a10,a1,80 # [31] gra_spill_temp_150 s32i a8,a1,76 # [32] gra_spill_temp_149 slli a8,a14,1 # [34] addx2 a9,a14,a14 # [35] s32i a9,a1,72 # [36] gra_spill_temp_148 s32i.n a8,a1,44 # [37] gra_spill_temp_141 addx4 a14,a14,a15 # [38] s32i a14,a1,48 # [39] gra_spill_temp_155 j .Lt_3_6914 # [40] .Lt_3_8194: # 0xb00 # Part of loop body line 305, head labeled .Lt_3_6914 l32i.n a12,a1,60 # [0] gra_spill_temp_145 l32i.n a9,a1,56 # [1] gra_spill_temp_144 l32i a8,a1,76 # [2] gra_spill_temp_149 l32i a15,a1,64 # [3] gra_spill_temp_146 l32i a11,a1,72 # [4] gra_spill_temp_148 l32i a14,a1,84 # [5] gra_spill_temp_151 add.n a13,a13,a11 # [6] l32i a11,a1,80 # [7] gra_spill_temp_150 add.n a14,a14,a15 # [8] add.n a8,a8,a9 # [9] s32i a8,a1,76 # [10] gra_spill_temp_149 s32i a14,a1,84 # [11] gra_spill_temp_151 addi.n a11,a11,4 # [12] s32i a11,a1,80 # [13] gra_spill_temp_150 bge a11,a12,.Lt_3_6402 
# [14] .Lt_3_6914: # 0xb27 l32i a12,a1,52 # [0] gra_spill_temp_156 l32i a4,a1,112 # [1] gra_spill_temp_158 blti a12,1,.Lt_3_7170 # [2] .LBB6_esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xb30 l32i a3,a1,88 # [0] gra_spill_temp_152 l32i.n a5,a1,40 # [1] gra_spill_temp_140 l32i a2,a1,84 # [3] gra_spill_temp_151 add.n a2,a2,a5 # [7] l32i.n a5,a1,36 # [9] gra_spill_temp_139 // load and transose 4 lines of input 4xchannels, loopgtz a3,.transpose_loop_end mov.n a3,a2 # [0*II+0] ee.vld.l.64.xp q0,a3,a5 # [0*II+2] id:282 ee.vld.l.64.xp q1,a3,a5 # [0*II+3] id:283 ee.vld.l.64.xp q2,a3,a5 # [0*II+4] id:284 ee.vld.l.64.xp q3,a3,a5 # [0*II+5] id:285 ee.vzip.16 q0,q1 # [0*II+6] ee.vzip.16 q2,q3 # [0*II+7] ee.vzip.32 q0,q2 # [0*II+8] ee.vst.128.ip q0,a4,16 # [0*II+9] id:286 ee.vst.128.ip q2,a4,16 # [0*II+10] id:287 addi.n a2,a2,8 # [0*II+1] .transpose_loop_end: .Lt_3_7170: # 0xb7c l32i a2,a1,68 # [0] gra_spill_temp_147 l32i a9,a1,116 # [1] gra_spill_temp_159 l16ui a8,a1,172 # [2] out_channels s32i a9,a1,120 # [3] gra_spill_temp_160 beqz.n a8,.Lt_3_8194 # [4] l32i a9,a1,180 # [0] out_shift l32i a11,a1,184 # [1] out_mult l32i a15,a1,72 # [2] gra_spill_temp_148 l32i.n a14,a1,44 # [3] gra_spill_temp_141 add.n a15,a15,a13 # [4] add.n a14,a14,a13 # [5] j .Lt_3_8706 # [6] .Lt_3_10754: # 0xb9a movi.n a3,0 # [0] .Lt_3_10498: # 0xb9c // esp_nn_multiply_by_quantized_mult_esp32s3 ee.zero.q q0 # [0] l32i a5,a1,92 # [1] gra_spill_temp_153 s32i a2,a1,96 # [2] lgra_spill_temp_165 s32i a11,a1,104 # [3] lgra_spill_temp_167 s32i a13,a1,108 # [4] lgra_spill_temp_168 s32i a9,a1,100 # [5] lgra_spill_temp_166 movi.n a13,0 # [6] max a12,a12,a13 # [7] wsr.sar a12 # [8] ee.vsl.32 q1,q1 # [9] ssai 31 # [10] ee.movi.32.a q1,a7,0 # [11] ee.movi.32.a q1,a8,1 # [12] ee.movi.32.a q1,a6,3 # [13] ee.movi.32.a q1,a9,2 # [14] mulsh a12,a4,a9 # [15] mulsh a11,a4,a6 # [16] mulsh a2,a4,a8 # [17] mulsh a13,a7,a4 # [18] mull a8,a4,a8 # [19] mull a7,a7,a4 # [20] mull a6,a4,a6 # [24] add.n a11,a5,a11 # [21] add.n a12,a5,a12 # 
[22] add.n a2,a5,a2 # [23] add.n a5,a5,a13 # [25] l32r a13,.nudge_val mull a9,a4,a9 # [27] add.n a6,a13,a6 # [28] add.n a9,a13,a9 # [29] add.n a10,a13,a7 # [30] add.n a8,a13,a8 # [32] saltu a7,a10,a13 # [33] add.n a7,a7,a5 # [34] saltu a5,a8,a13 # [35] add.n a5,a5,a2 # [36] src a5,a5,a8 # [37] saltu a2,a9,a13 # [38] add.n a2,a2,a12 # [40] saltu a13,a6,a13 # [41] addi.n a12,a3,-1 # [42] src a2,a2,a9 # [43] ee.movi.32.q q3,a5,1 # [51] ee.movi.32.q q3,a2,2 # [54] add.n a13,a13,a11 # [44] addi a9,a1,32 # [45] to_add movi.n a11,1 # [46] src a7,a7,a10 # [47] src a13,a13,a6 # [48] ee.movi.32.q q3,a7,0 # [50] ee.movi.32.q q3,a13,3 # [57] addi a8,a1,112 # [49] l32i a7,a1,48 # [52] gra_spill_temp_155 l16ui a5,a1,172 # [53] out_channels ssl a12 # [55] sll a11,a11 # [56] wsr.sar a3 # [58] ee.vcmp.lt.s32 q0,q3,q0 # [59] l32i a13,a1,108 # [60] lgra_spill_temp_168 s32i.n a11,a1,32 # [61] to_add ee.vldbc.32 q1,a9 # [62] id:317 to_add add.n a5,a5,a13 # [63] l32i a9,a1,100 # [64] lgra_spill_temp_166 ee.vadds.s32 q1,q1,q0 # [65] addi.n a9,a9,4 # [66] ee.vadds.s32 q1,q3,q1 # [67] ee.vsr.32 q1,q1 # [69] # add offset, apply activation and store ee.vadds.s32 q1,q1,q5 # [70] ee.vmin.s32 q1,q1,q7 # [72] ee.vmax.s32 q1,q1,q6 # [73] ee.vst.128.ip q1,a1,0 # [74] id:320 l8ui a6,a1,0 # [75] scratch_buf s8i a6,a13,0 # [76] addi.n a13,a13,1 # [77] l8ui a2,a1,4 # [78] scratch_buf+4 s8i a2,a5,0 # [79] l8ui a12,a1,8 # [80] scratch_buf+8 l32i a2,a1,96 # [81] lgra_spill_temp_165 s8i a12,a14,0 # [82] addi.n a14,a14,1 # [83] l8ui a11,a1,12 # [84] scratch_buf+12 s8i a11,a15,0 # [85] l32i a11,a1,104 # [86] lgra_spill_temp_167 addi.n a15,a15,1 # [87] addi.n a11,a11,4 # [88] sub a7,a11,a7 # [89] beqz a7,.Lt_3_8194 # [90] .Lt_3_8706: # 0xc97 ee.zero.qacc # [0] l32i a8,a1,52 # [1] gra_spill_temp_156 l32i a3,a1,112 # [2] gra_spill_temp_158 blti a8,1,.Lt_3_8962 # [3] l32i a4,a1,88 # [0] gra_spill_temp_152 loopgtz a4,.LBB53_esp_nn_conv_s16_mult4_1x1_esp32s3 # [2] ee.vld.l.64.ip q0,a2,8 # [0*II+0] id:289 
ee.vld.l.64.ip q1,a3,8 # [0*II+1] id:290 ee.vld.l.64.ip q2,a3,8 # [0*II+2] id:291 ee.vsmulas.s16.qacc q1,q0,0 # [0*II+3] ee.vld.l.64.ip q3,a3,8 # [0*II+4] id:292 ee.vsmulas.s16.qacc q2,q0,1 # [0*II+5] ee.vld.l.64.ip q4,a3,8 # [0*II+6] id:293 ee.vsmulas.s16.qacc q3,q0,2 # [0*II+7] ee.vsmulas.s16.qacc q4,q0,3 # [0*II+8] .LBB53_esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xcc4 .Lt_3_8962: # 0xcc4 // extract data: mov a10,a1 ee.st.qacc_l.l.128.ip a10,16 # [0] id:298 ee.st.qacc_l.h.32.ip a10,-16 # [1] id:299 l8ui a12,a1,16 # [2] scratch_buf+16 l8ui a8,a1,6 # [3] scratch_buf+6 s8i a8,a1,3 # [4] scratch_buf+3 s8i a12,a1,7 # [5] scratch_buf+7 l8ui a8,a1,15 # [6] scratch_buf+15 l8ui a12,a1,5 # [7] scratch_buf+5 s8i a12,a1,2 # [8] scratch_buf+2 s8i a8,a1,6 # [9] scratch_buf+6 l16ui a12,a1,10 # [10] scratch_buf+10 movi.n a8,16 # [11] ee.srcmb.s16.qacc q2,a8,0 # [12] s16i a12,a1,4 # [13] scratch_buf+4 ee.vld.l.64.ip q1,a10,0 # [14] id:309 l32i a12,a1,116 # [15] gra_spill_temp_159, bias ee.vzip.16 q1,q2 # [16] beqz.n a12,.Lt_3_9986 # [17] // skip bias // add bias: l32i a8,a1,120 # [0] gra_spill_temp_160 ee.vldbc.32.ip q0,a8,4 # [2] id:311 s32i a8,a1,120 # [3] gra_spill_temp_160 ee.vadds.s32 q1,q1,q0 # [4] .Lt_3_9986: # 0xd04 l32i.n a12,a9,0 # [0] id:313 l32i.n a4,a11,0 # [1] id:312 bgei a12,1,.Lt_3_10754 # [2] neg a3,a12 # [0] j .Lt_3_10498 # [1] .Lt_3_6402: # 0xd11 retw.n # [0] .size esp_nn_conv_s16_mult4_1x1_esp32s3, . - esp_nn_conv_s16_mult4_1x1_esp32s3 ================================================ FILE: src/convolution/esp_nn_conv_s16_mult8_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .literal_position .literal .LC10_28_153, -2147483648 .literal .LC11_28_154, -1073741823 .literal .LC12_28_155, 2147483647 .literal .LC13_28_156, 1073741824 # Program Unit: esp_nn_conv_s16_mult8_esp32s3 .type esp_nn_conv_s16_mult8_esp32s3, @function .align 4 .global esp_nn_conv_s16_mult8_esp32s3 esp_nn_conv_s16_mult8_esp32s3: # 0x6e2 # qacc_scratch = 0 # gra_spill_temp_96 = 48 # gra_spill_temp_97 = 52 # gra_spill_temp_98 = 56 # gra_spill_temp_99 = 60 # gra_spill_temp_100 = 64 # gra_spill_temp_101 = 68 # gra_spill_temp_102 = 72 # gra_spill_temp_103 = 76 # gra_spill_temp_104 = 80 # gra_spill_temp_105 = 84 # gra_spill_temp_106 = 88 # gra_spill_temp_107 = 92 # gra_spill_temp_108 = 96 # gra_spill_temp_109 = 100 # gra_spill_temp_110 = 104 # gra_spill_temp_111 = 108 # gra_spill_temp_112 = 112 # gra_spill_temp_113 = 116 # gra_spill_temp_114 = 120 # gra_spill_temp_115 = 124 # gra_spill_temp_116 = 128 # gra_spill_temp_117 = 132 # gra_spill_temp_118 = 136 # gra_spill_temp_119 = 140 # gra_spill_temp_120 = 144 # gra_spill_temp_121 = 148 # gra_spill_temp_122 = 152 # gra_spill_temp_123 = 156 # gra_spill_temp_124 = 160 # gra_spill_temp_125 = 164 # gra_spill_temp_126 = 168 # gra_spill_temp_127 = 172 # gra_spill_temp_128 = 176 # gra_spill_temp_129 = 180 # gra_spill_temp_130 = 184 # gra_spill_temp_131 = 188 # gra_spill_temp_132 = 192 # gra_spill_temp_133 = 196 # gra_spill_temp_134 = 200 # gra_spill_temp_135 = 204 # gra_spill_temp_136 = 208 # gra_spill_temp_137 = 212 // registers: // a2: const int16_t *input_data // a3: const uint16_t input_wd // a4: const uint16_t 
input_ht // a5: const uint16_t in_channels // a6: const uint16_t pad_wd // a7: const uint16_t pad_ht // on stack: // const uint16_t stride_wd // const uint16_t stride_ht // const int16_t *filter_data // const uint16_t filter_wd // const uint16_t filter_ht // const int32_t *bias // int8_t *out_data // const uint16_t out_wd // const uint16_t out_ht // const uint16_t out_channels // const int32_t out_offset // const int32_t *out_shift // const int32_t *out_mult // const int32_t activation_min // const int32_t activation_max entry a1,256 # s32i a2,a1,176 # [0] gra_spill_temp_128 s32i a3,a1,192 # [1] gra_spill_temp_132 s32i.n a6,a1,60 # [2] gra_spill_temp_99 l16ui a8,a1,288 # [3] id:282 out_ht+0x0 s32i a8,a1,68 # [4] gra_spill_temp_101 beqz.n a8,.Lt_2_11778 # [5] s32i a7,a1,76 # [0] gra_spill_temp_103 s32i a1,a1,156 # [1] gra_spill_temp_123 l16ui a8,a1,272 # [2] id:285 filter_ht+0x0 neg a11,a7 # [3] movi.n a12,0 # [4] neg a14,a6 # [5] l16ui a15,a1,268 # [6] id:286 filter_wd+0x0 l16ui a9,a1,292 # [7] id:283 out_channels+0x0 l32i a10,a1,304 # [8] id:284 out_mult+0x0 s32i a10,a1,88 # [9] gra_spill_temp_106 s32i a9,a1,96 # [10] gra_spill_temp_108 s32i a15,a1,196 # [11] gra_spill_temp_133 s32i.n a14,a1,48 # [12] gra_spill_temp_96 s32i a12,a1,72 # [13] gra_spill_temp_102 s32i a11,a1,80 # [14] gra_spill_temp_104 s32i.n a8,a1,52 # [15] gra_spill_temp_97 sub a13,a3,a14 # [16] mul16u a8,a5,a8 # [17] s32i.n a13,a1,56 # [18] gra_spill_temp_98 sub a11,a4,a11 # [19] l32i a12,a1,276 # [20] id:292 bias+0x0 s32i a12,a1,152 # [21] gra_spill_temp_122 s32i a11,a1,84 # [22] gra_spill_temp_105 l32i a14,a1,308 # [23] id:290 activation_min+0x0 l32i a13,a1,312 # [24] id:291 activation_max+0x0 s32i a13,a1,144 # [25] gra_spill_temp_120 mull a15,a15,a8 # [26] addx4 a9,a9,a10 # [27] s32i a14,a1,140 # [28] gra_spill_temp_119 l32i a11,a1,300 # [29] id:293 out_shift+0x0 s32i a11,a1,92 # [30] gra_spill_temp_107 slli a14,a5,1 # [31] s32i a9,a1,124 # [32] gra_spill_temp_115 s32i a15,a1,128 # [33] 
gra_spill_temp_116 l32i a8,a1,280 # [34] id:288 out_data+0x0 movi.n a10,0 # [35] s32i a10,a1,160 # [36] gra_spill_temp_124 s32i a8,a1,132 # [37] gra_spill_temp_117 l32i a15,a1,296 # [38] id:289 out_offset+0x0 l32i a9,a1,264 # [39] id:287 filter_data+0x0 s32i a9,a1,180 # [40] gra_spill_temp_129 s32i a15,a1,136 # [41] gra_spill_temp_118 l16ui a8,a1,284 # [42] id:296 out_wd+0x0 l16ui a10,a1,256 # [43] id:294 stride_wd+0x0 s32i a10,a1,100 # [44] gra_spill_temp_109 s32i a8,a1,104 # [45] gra_spill_temp_110 addi.n a15,a5,-1 # [46] l16ui a9,a1,260 # [47] id:295 stride_ht+0x0 s32i a9,a1,64 # [48] gra_spill_temp_100 srai a15,a15,3 # [49] j .Lt_2_12290 # [50] .Lt_2_12546: # 0x788 l32i a8,a1,68 # [0] gra_spill_temp_101 l32i a12,a1,80 # [1] gra_spill_temp_104 l32i a11,a1,84 # [2] gra_spill_temp_105 l32i a10,a1,64 # [3] gra_spill_temp_100 l32i a13,a1,72 # [4] gra_spill_temp_102 l32i a9,a1,76 # [5] gra_spill_temp_103 addi.n a13,a13,1 # [6] s32i a13,a1,72 # [7] gra_spill_temp_102 sub a9,a9,a10 # [8] sub a11,a11,a10 # [9] add.n a12,a12,a10 # [10] s32i a12,a1,80 # [11] gra_spill_temp_104 s32i a11,a1,84 # [12] gra_spill_temp_105 s32i a9,a1,76 # [13] gra_spill_temp_103 sub a13,a13,a8 # [14] beqz a13,.Lt_2_11778 # [15] .Lt_2_12290: # 0x7b6 // width loop l32i a13,a1,104 # [0] gra_spill_temp_110 beqz.n a13,.Lt_2_12546 # [2] l32i a8,a1,192 # [0] gra_spill_temp_132 l32i a9,a1,80 # [1] gra_spill_temp_104 movi.n a11,0 # [2] l32i a10,a1,76 # [3] gra_spill_temp_103 l32i.n a12,a1,60 # [4] gra_spill_temp_99 l32i.n a13,a1,56 # [5] gra_spill_temp_98 s32i a13,a1,116 # [6] gra_spill_temp_113 s32i a12,a1,112 # [7] gra_spill_temp_112 max a10,a10,a11 # [8] s32i a10,a1,148 # [9] gra_spill_temp_121 add.n a9,a9,a10 # [10] l32i.n a11,a1,48 # [11] gra_spill_temp_96 s32i a11,a1,184 # [12] gra_spill_temp_130 mull a8,a8,a9 # [13] l32i a10,a1,84 # [14] gra_spill_temp_105 s32i a8,a1,120 # [15] gra_spill_temp_114 l32i.n a9,a1,52 # [16] gra_spill_temp_97 movi.n a8,0 # [17] s32i a8,a1,108 # [18] gra_spill_temp_111 
min a9,a9,a10 # [19] s32i a9,a1,204 # [20] gra_spill_temp_135 j .Lt_2_13058 # [21] .Lt_2_13314: # 0x7f6 # Part of loop body line 186, head labeled .Lt_2_13058 l32i a13,a1,104 # [0] gra_spill_temp_110 l32i a11,a1,112 # [1] gra_spill_temp_112 l32i a10,a1,184 # [2] gra_spill_temp_130 l32i a9,a1,100 # [3] gra_spill_temp_109 l32i a12,a1,108 # [4] gra_spill_temp_111 l32i a8,a1,116 # [5] gra_spill_temp_113 addi.n a12,a12,1 # [6] s32i a12,a1,108 # [7] gra_spill_temp_111 sub a8,a8,a9 # [8] add.n a10,a10,a9 # [9] sub a11,a11,a9 # [10] s32i a11,a1,112 # [11] gra_spill_temp_112 s32i a10,a1,184 # [12] gra_spill_temp_130 s32i a8,a1,116 # [13] gra_spill_temp_113 beq a12,a13,.Lt_2_12546 # [14] .Lt_2_13058: # 0x821 // channel loop l32i a12,a1,96 # [0] gra_spill_temp_108 beqz.n a12,.Lt_2_13314 # [2] movi.n a11,0 # [0] l32i a10,a1,112 # [1] gra_spill_temp_112 l32i a13,a1,92 # [2] gra_spill_temp_107 l32i a8,a1,152 # [3] gra_spill_temp_122 movi.n a9,0 # [4] l32i a12,a1,88 # [5] gra_spill_temp_106 s32i a12,a1,168 # [6] gra_spill_temp_126 s32i a9,a1,188 # [7] gra_spill_temp_131 s32i a8,a1,164 # [8] gra_spill_temp_125 s32i a13,a1,172 # [9] gra_spill_temp_127 l32i a8,a1,116 # [10] gra_spill_temp_113 l32i a13,a1,196 # [11] gra_spill_temp_133 max a10,a10,a11 # [12] s32i a10,a1,208 # [13] gra_spill_temp_136 min a13,a13,a8 # [14] s32i a13,a1,200 # [15] gra_spill_temp_134 j .Lt_2_13826 # [16] .Lt_2_14082: # 0x857 // extract data l32i a4,a1,156 # [0] gra_spill_temp_123 ee.st.qacc_l.l.128.ip a4,16 # [2] id:303 ee.st.qacc_l.h.32.ip a4,0 # [3] id:304 l8ui a9,a1,15 # [4] qacc_scratch+15 l16ui a8,a1,10 # [5] qacc_scratch+10 l8ui a12,a1,16 # [6] qacc_scratch+16 l8ui a11,a1,6 # [7] qacc_scratch+6 l8ui a10,a1,5 # [8] qacc_scratch+5 s8i a10,a1,2 # [9] qacc_scratch+2 s8i a11,a1,3 # [10] qacc_scratch+3 s8i a12,a1,7 # [11] qacc_scratch+7 s16i a8,a1,4 # [12] qacc_scratch+4 s8i a9,a1,6 # [13] qacc_scratch+6 ee.st.qacc_h.l.128.ip a4,16 # [14] id:314 ee.st.qacc_h.h.32.ip a4,-32 # [15] id:315 l8ui a13,a1,32 # 
[16] qacc_scratch+32 l8ui a9,a1,21 # [17] qacc_scratch+21 l8ui a12,a1,31 # [18] qacc_scratch+31 l16ui a11,a1,26 # [19] qacc_scratch+26 l8ui a10,a1,22 # [20] qacc_scratch+22 l16ui a8,a1,16 # [21] qacc_scratch+16 s16i a8,a1,8 # [22] qacc_scratch+8 s8i a10,a1,11 # [23] qacc_scratch+11 s16i a11,a1,12 # [24] qacc_scratch+12 s8i a12,a1,14 # [25] qacc_scratch+14 s8i a9,a1,10 # [26] qacc_scratch+10 s8i a13,a1,15 # [27] qacc_scratch+15 l32i a9,a1,152 # [28] gra_spill_temp_122, bias movi.n a13,16 # [29] ee.srcmb.s16.qacc q1,a13,0 # [30] ee.vld.128.ip q0,a4,0 # [31] id:327 s32i a4,a1,156 # [32] gra_spill_temp_123 ee.vzip.16 q0,q1 # [33] ee.vadds.s32 q0,q0,q1 # [34] ee.movi.32.a q0,a12,3 # [35] ee.movi.32.a q0,a11,2 # [36] ee.movi.32.a q0,a10,0 # [37] add.n a11,a11,a12 # [38] ee.movi.32.a q0,a12,1 # [39] add.n a10,a10,a12 # [40] add.n a10,a10,a11 # [41] beqz.n a9,.Lt_2_17154 # [42] // skip bias l32i a13,a1,164 # [0] gra_spill_temp_125 l32i.n a13,a13,0 # [2] id:329 add.n a10,a10,a13 # [4] .Lt_2_17154: # 0x8d7 # 259 conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); l32i a11,a1,172 # [0] gra_spill_temp_127 l32i a4,a1,168 # [1] gra_spill_temp_126 l32i.n a11,a11,0 # [2] id:331 l32i.n a4,a4,0 # [3] id:330 blti a11,1,.LBB26_esp_nn_conv_s16_mult8_esp32s3 # [4] movi.n a13,0 # [0] j .Lt_2_17666 # [1] .LBB26_esp_nn_conv_s16_mult8_esp32s3: # 0xa4e neg a13,a11 # [0] .Lt_2_17666: # 0x8e6 movi.n a12,0 # [0] max a12,a11,a12 # [1] movi.n a11,0 # [2] ssl a12 # [3] sll a10,a10 # [4] bne a10,a4,.Lt_2_20994 # [5] l32r a9,.LC10_28_153 # [0] movi.n a8,1 # [1] sub a9,a10,a9 # [2] moveqz a11,a8,a9 # [3] .Lt_2_20994: # 0x901 extui a8,a4,31,1 # [0] extui a12,a10,31,1 # [1] xor a12,a12,a8 # [2] extui a12,a12,0,8 # [3] beqz.n a12,.Lt_2_18434 # [4] movi.n a12,-1 # [0] l32r a9,.LC11_28_154 # [1] j .Lt_2_18178 # [2] .Lt_2_18434: # 0xa54 movi.n a12,0 # [0] l32r a9,.LC13_28_156 # [1] .Lt_2_18178: # 0x914 ssai 31 # [0] l32r a8,.LC12_28_155 # [1] mulsh a6,a4,a10 
# [2] mull a4,a4,a10 # [3] add.n a6,a6,a12 # [4] add.n a7,a4,a9 # [5] saltu a4,a7,a4 # [6] add.n a4,a4,a6 # [7] srai a6,a4,31 # [8] and a6,a6,a8 # [9] add.n a7,a6,a7 # [10] srai a3,a6,31 # [11] add.n a3,a3,a4 # [12] saltu a6,a7,a6 # [13] add.n a6,a6,a3 # [14] src a6,a6,a7 # [15] extui a3,a11,0,8 # [16] movi.n a7,1 # [17] ssr a13 # [18] movnez a6,a8,a3 # [19] sra a8,a6 # [20] addi.n a3,a8,1 # [21] ssl a13 # [22] sll a7,a7 # [23] extui a4,a8,31,1 # [24] addi.n a7,a7,-1 # [25] and a6,a6,a7 # [26] srai a7,a7,1 # [27] add.n a4,a4,a7 # [28] l32i a7,a1,164 # [29] gra_spill_temp_125 salt a4,a4,a6 # [30] movnez a8,a3,a4 # [31] l32i a6,a1,172 # [32] gra_spill_temp_127 l32i a4,a1,132 # [33] gra_spill_temp_117 l32i a3,a1,160 # [34] gra_spill_temp_124 addi.n a7,a7,4 # [35] s32i a7,a1,164 # [36] gra_spill_temp_125 addi.n a6,a6,4 # [37] s32i a6,a1,172 # [38] gra_spill_temp_127 l32i a7,a1,136 # [39] gra_spill_temp_118 l32i a6,a1,140 # [40] gra_spill_temp_119 add.n a4,a3,a4 # [41] add.n a7,a7,a8 # [42] addi.n a3,a3,1 # [43] l32i a8,a1,128 # [44] gra_spill_temp_116 max a6,a6,a7 # [45] s32i a3,a1,160 # [46] gra_spill_temp_124 l32i a7,a1,188 # [47] gra_spill_temp_131 l32i a3,a1,144 # [48] gra_spill_temp_120 add.n a7,a7,a8 # [49] min a3,a3,a6 # [50] s8i a3,a4,0 # [51] id:332 s32i a7,a1,188 # [52] gra_spill_temp_131 l32i a4,a1,168 # [53] gra_spill_temp_126 l32i a6,a1,124 # [54] gra_spill_temp_115 addi.n a4,a4,4 # [55] s32i a4,a1,168 # [56] gra_spill_temp_126 sub a4,a4,a6 # [57] beqz a4,.Lt_2_13314 # [58] .Lt_2_13826: # 0x9b4 ee.zero.qacc # [0] l32i a9,a1,204 # [1] gra_spill_temp_135 l32i a8,a1,148 # [2] gra_spill_temp_121 s32i a8,a1,212 # [3] gra_spill_temp_137 bge a8,a9,.Lt_2_14082 # [4] .LBB12_esp_nn_conv_s16_mult8_esp32s3: # 0x9c3 # Part of loop body line 187, head labeled .Lt_2_13826 l32i a8,a1,196 # [0] gra_spill_temp_133 l32i a7,a1,212 # [1] gra_spill_temp_137 l32i a13,a1,200 # [2] gra_spill_temp_134 mull a7,a7,a8 # [3] l32i a6,a1,120 # [4] gra_spill_temp_114 add.n a13,a7,a13 # 
[5] j .Lt_2_14594 # [6] .Lt_2_14850: # 0x9d7 # Part of loop body line 201, head labeled .Lt_2_14594 l32i a9,a1,204 # [0] gra_spill_temp_135 l32i a10,a1,212 # [1] gra_spill_temp_137 l32i a12,a1,192 # [2] gra_spill_temp_132 l32i a11,a1,196 # [3] gra_spill_temp_133 add.n a6,a6,a12 # [4] add.n a7,a7,a11 # [5] add.n a13,a13,a11 # [6] addi.n a10,a10,1 # [7] s32i a10,a1,212 # [8] gra_spill_temp_137 sub a9,a9,a10 # [9] beqz a9,.Lt_2_14082 # [10] .Lt_2_14594: # 0x9f4 l32i a9,a1,200 # [0] gra_spill_temp_134 l32i a8,a1,208 # [1] gra_spill_temp_136 bge a8,a9,.Lt_2_14850 # [3] l32i a11,a1,176 # [0] gra_spill_temp_128 l32i a10,a1,184 # [1] gra_spill_temp_130 add.n a12,a7,a8 # [2] add.n a10,a10,a8 # [3] add.n a10,a6,a10 # [4] mull a10,a5,a10 # [5] mull a8,a12,a5 # [6] addx2 a10,a10,a11 # [7] l32i a11,a1,188 # [8] gra_spill_temp_131 add.n a11,a11,a8 # [10] l32i a8,a1,180 # [11] gra_spill_temp_129 mov.n a2,a10 # [12] addx2 a11,a11,a8 # [13] movi.n a8,8 # [14] mov.n a3,a11 # [15] j .Lt_2_15362 # [16] .LBB18_esp_nn_conv_s16_mult8_esp32s3: # 0xa26 loopgtz a15,.LBB54_esp_nn_conv_s16_mult8_esp32s3 # [0] ee.vmulas.s16.qacc.ld.ip q0,a2,16,q0,q1 # [0*II+0] id:300 ee.vld.128.ip q1,a3,16 # [0*II+1] id:301 .LBB54_esp_nn_conv_s16_mult8_esp32s3: # 0xa30 .Lt_2_15618: # 0xa30 ee.vmulas.s16.qacc q0,q1 # [0] movi.n a8,8 # [1] add.n a10,a10,a14 # [2] add.n a11,a11,a14 # [3] mov.n a3,a11 # [4] mov.n a2,a10 # [5] beq a12,a13,.Lt_2_14850 # [6] .Lt_2_15362: # 0xa40 ee.vld.128.ip q1,a3,16 # [0] id:299 ee.vld.128.ip q0,a2,16 # [1] id:298 addi.n a12,a12,1 # [2] bltu a8,a5,.LBB18_esp_nn_conv_s16_mult8_esp32s3 # [3] j .Lt_2_15618 # [0] .Lt_2_11778: # 0xa5c retw.n # [0] .size esp_nn_conv_s16_mult8_esp32s3, . 
- esp_nn_conv_s16_mult8_esp32s3

================================================
FILE: src/convolution/esp_nn_conv_s8_1x1_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * 1x1 convolution for ESP32-S3 using transpose + parallel MAC.
 * Processes 8 spatial positions simultaneously via QACC lanes.
 */

/* NOTE(review): the header names on the four include lines below were lost
 * during extraction — restore the originals (the code uses memcpy/uintptr_t/
 * fixed-width ints, so <string.h>/<stdint.h> are presumably among them). */
#include
#include
#include
#include

/**
 * Scratch size (bytes) needed by esp_nn_conv_s8_1x1().
 *
 * @param out_channels  unused here; kept for interface symmetry with the
 *                      other esp_nn_*_scratch_size() helpers.
 * @return fixed size: one 128-byte transpose chunk plus 64 bytes of slack
 *         for 16-byte alignment of the scratch pointer.
 */
int esp_nn_conv_s8_1x1_scratch_size(int out_channels)
{
    /* Transpose buffer: 8 channels × 8 positions × 2 bytes = 128 bytes per chunk.
     * Multiple chunks processed sequentially, so 128 is enough. */
    return 128 + 64; /* transpose + alignment */
}

/*
 * Transpose 8 spatial positions × 8 channels from int8 to int16 with offset.
 * C fallback for when input address is not 8-byte aligned.
 *
 * Output layout is channel-major: out_buf[ch * 8 + pos], matching the SIMD
 * variant below so the MAC stage can consume either.
 */
static inline void transpose_8x8_s16_c(const int8_t *input, int stride,
                                       int32_t input_offset, int16_t *out_buf)
{
    for (int ch = 0; ch < 8; ch++) {
        for (int pos = 0; pos < 8; pos++) {
            /* int8 + offset always fits in int16 (range ±383 at most) */
            out_buf[ch * 8 + pos] = (int16_t)(input[pos * stride + ch] + input_offset);
        }
    }
}

/*
 * SIMD transpose: 8 positions × 8 channels → channel-major int16 with offset.
 * Uses vzip.8/16/32 chain (same as original .S transpose, verified correct).
 *
 * Input: 8 consecutive spatial positions, each `stride` bytes apart.
 * Input address MUST be 8-byte aligned.
 * Output: int16 buffer [ch0: pos0..pos7, ch1: pos0..pos7, ...] (16-byte aligned)
 *
 * NOTE(review): the asm writes q0-q7 but the EE q-registers cannot be named
 * in the clobber list; callers must not hold live q-register state — TODO
 * confirm against the rest of this file's asm usage.
 */
static inline void transpose_8x8_s16_simd(const int8_t *input, int stride,
                                          int16_t offset16, int16_t *out_buf)
{
    const int8_t *p = input;
    int16_t *out = out_buf;
    int16_t *off_ptr = &offset16;
    __asm__ volatile(
        /* Load input_offset broadcast to all 8 int16 lanes */
        "ee.vldbc.16 q5, %[off]\n"
        /* Zero register for sign extension comparisons */
        "ee.zero.q q7\n"
        /* Load 8 positions × 8 channels into q0-q3 using paired l/h loads.
         * Each vld.l.64.xp loads 8 bytes (1 position) into low half, advances by stride.
         * Each vld.h.64.xp loads 8 bytes into high half, advances by stride.
         * Result: q0=[pos0|pos2], q1=[pos1|pos3], q2=[pos4|pos6], q3=[pos5|pos7] */
        "ee.vld.l.64.xp q0, %[p], %[s]\n"
        "ee.vld.l.64.xp q1, %[p], %[s]\n"
        "ee.vld.h.64.xp q0, %[p], %[s]\n"
        "ee.vld.h.64.xp q1, %[p], %[s]\n"
        "ee.vld.l.64.xp q2, %[p], %[s]\n"
        "ee.vzip.8 q0, q1\n"
        "ee.vld.l.64.xp q3, %[p], %[s]\n"
        "ee.vld.h.64.xp q2, %[p], %[s]\n"
        "ee.vld.h.64.ip q3, %[p], 0\n"
        "ee.vzip.16 q0, q1\n"
        "ee.vzip.8 q2, q3\n"
        "ee.vzip.16 q2, q3\n"
        "ee.vzip.32 q0, q2\n"
        /* First 4 channels: sign-extend q0→(q0,q6), q2→(q2,q4), add offset, store.
         * vcmp.lt.s8 against zero yields 0xFF for negative bytes, so the
         * following vzip.8 interleaves the correct high byte per lane. */
        "ee.vcmp.lt.s8 q4, q2, q7\n"
        "ee.vzip.8 q2, q4\n"
        "ee.vcmp.lt.s8 q6, q0, q7\n"
        "ee.vzip.8 q0, q6\n"
        "ee.vadds.s16 q0, q0, q5\n"
        "ee.vst.128.ip q0, %[out], 16\n"
        "ee.vadds.s16 q6, q6, q5\n"
        "ee.vst.128.ip q6, %[out], 16\n"
        "ee.vadds.s16 q2, q2, q5\n"
        "ee.vst.128.ip q2, %[out], 16\n"
        "ee.vadds.s16 q4, q4, q5\n"
        "ee.vst.128.ip q4, %[out], 16\n"
        /* Last 4 channels: sign-extend q1→(q1,q6), q3→(q3,q4), add offset, store */
        "ee.vzip.32 q1, q3\n"
        "ee.vcmp.lt.s8 q4, q3, q7\n"
        "ee.vzip.8 q3, q4\n"
        "ee.vcmp.lt.s8 q6, q1, q7\n"
        "ee.vzip.8 q1, q6\n"
        "ee.vadds.s16 q1, q1, q5\n"
        "ee.vst.128.ip q1, %[out], 16\n"
        "ee.vadds.s16 q6, q6, q5\n"
        "ee.vst.128.ip q6, %[out], 16\n"
        "ee.vadds.s16 q3, q3, q5\n"
        "ee.vst.128.ip q3, %[out], 16\n"
        "ee.vadds.s16 q4, q4, q5\n"
        "ee.vst.128.ip q4, %[out], 16\n"
        : [p] "+r" (p), [out] "+r" (out), [off] "+r" (off_ptr)
        : [s] "r" (stride)
        : "memory"
    );
}

/*
 * MAC 8 filter channels against 8 positions using QACC.
 * data_buf: [ch0: 8 int16, ch1: 8 int16, ...] = 128 bytes, 16-byte aligned
 * filter: 8 int8 values, sign-extended to int16 internally
 * Accumulates into QACC lanes 0-7 (must be zeroed before first call per oc)
 *
 * NOTE: filter pointer may not be 8-byte aligned, so we copy to an aligned
 * local buffer before using ee.vld.l.64.ip (which ignores unaligned address bits).
 */
static inline void mac_8pos_8ch_simd(const int16_t *data_buf, const int8_t *filter)
{
    /* Copy filter to aligned buffer — ee.vld.l.64.ip requires 8-byte alignment */
    int8_t __attribute__((aligned(16))) f_aligned[16];
    memcpy(f_aligned, filter, 8);

    const int16_t *dp = data_buf;
    const int8_t *fp = f_aligned;
    __asm__ volatile(
        /* Sign-extend filter: load 8 int8 → 8 int16 in q7 */
        "ee.zero.q q5\n"
        "ee.vld.l.64.ip q7, %[f], 0\n"
        /* Pre-load first two data chunks during sign extension */
        "ee.vld.128.ip q0, %[d], 16\n"
        "ee.vld.128.ip q1, %[d], 16\n"
        /* vcmp.lt.s8 against zero → 0xFF for negative bytes; vzip.8
         * interleaves those as the high bytes, completing the widening */
        "ee.vcmp.lt.s8 q6, q7, q5\n"
        "ee.vzip.8 q7, q6\n"
        /* Pipelined: MAC current + load next in one instruction.
         * The trailing immediate selects the filter lane (channel index). */
        "ee.vsmulas.s16.qacc.ld.incp q2, %[d], q0, q7, 0\n"
        "ee.vsmulas.s16.qacc.ld.incp q3, %[d], q1, q7, 1\n"
        "ee.vsmulas.s16.qacc.ld.incp q0, %[d], q2, q7, 2\n"
        "ee.vsmulas.s16.qacc.ld.incp q1, %[d], q3, q7, 3\n"
        "ee.vsmulas.s16.qacc.ld.incp q2, %[d], q0, q7, 4\n"
        "ee.vsmulas.s16.qacc.ld.incp q3, %[d], q1, q7, 5\n"
        /* Last two: plain MAC, no more data to load */
        "ee.vsmulas.s16.qacc q2, q7, 6\n"
        "ee.vsmulas.s16.qacc q3, q7, 7\n"
        : [d] "+r" (dp), [f] "+r" (fp)
        :
        : "memory"
    );
}

/*
 * 1x1 s8 convolution entry point: per batch of 8 spatial positions,
 * transpose the input once and reuse it across all output channels.
 * Interface mirrors the other esp_nn conv kernels in this directory.
 */
void esp_nn_conv_s8_1x1(const int8_t *input,
                        const uint16_t input_wd,
                        const uint16_t input_ht,
                        const uint16_t in_channels,
                        const int32_t input_offset,
                        const int8_t *filter_data,
                        const int32_t *bias,
                        int8_t *out_data,
                        const uint16_t out_channels,
                        const int32_t out_offset,
                        const int32_t *out_shift,
                        const int32_t *out_mult,
                        const int32_t activation_min,
                        const int32_t activation_max,
                        void *scratch)
{
    const int size = input_wd * input_ht;  /* number of spatial positions */
    const int ch8 = in_channels / 8;       /* full 8-channel groups */

    /* SIMD transpose requires 8-byte aligned input; check once */
    const int use_simd_transpose = (in_channels % 8 == 0) &&
                                   (((uintptr_t)input & 7) == 0);
    const int16_t offset16 = (int16_t)input_offset;

    /* Use scratch buffer for transpose data — holds ALL channel groups at once.
     * Layout: [cg0: 8 int16 × 8 pos, cg1: 8 int16 × 8 pos, ...] = ch8 × 128 bytes.
* Aligned to 16 bytes for SIMD loads. */ int16_t *tbuf = (int16_t *)((uintptr_t)((int8_t *)scratch + 15) & ~15); int pos = 0; for (; pos + 7 < size; pos += 8) { const int8_t *in_base = input + pos * in_channels; /* Transpose ALL channel groups ONCE per position batch. * This is the key optimization — reuse transposed data across all out_channels. */ for (int cg = 0; cg < ch8; cg++) { int16_t *cg_buf = tbuf + cg * 64; /* 64 int16 per channel group */ if (use_simd_transpose) { transpose_8x8_s16_simd(in_base + cg * 8, in_channels, offset16, cg_buf); } else { transpose_8x8_s16_c(in_base + cg * 8, in_channels, input_offset, cg_buf); } } __asm__ volatile("" ::: "memory"); for (int oc = 0; oc < out_channels; oc++) { const int8_t *filt = filter_data + oc * in_channels; /* MAC across all channel groups using pre-transposed data */ __asm__ volatile("ee.zero.qacc"); for (int cg = 0; cg < ch8; cg++) { mac_8pos_8ch_simd(tbuf + cg * 64, filt + cg * 8); } /* Extract QACC → 8 int32 values */ int32_t qacc[8]; { int8_t __attribute__((aligned(16))) qraw[24]; int8_t *qp = qraw; __asm__ volatile( "ee.st.qacc_l.l.128.ip %[p], 16\n" "ee.st.qacc_l.h.32.ip %[p], -16\n" : [p] "+r" (qp) : : "memory" ); qacc[0] = *(int32_t *)(qraw + 0); qacc[1] = *(int32_t *)(qraw + 5); qacc[2] = *(int32_t *)(qraw + 10); qacc[3] = *(int32_t *)(qraw + 15); qp = qraw; __asm__ volatile( "ee.st.qacc_h.l.128.ip %[p], 16\n" "ee.st.qacc_h.h.32.ip %[p], -16\n" : [p] "+r" (qp) : : "memory" ); qacc[4] = *(int32_t *)(qraw + 0); qacc[5] = *(int32_t *)(qraw + 5); qacc[6] = *(int32_t *)(qraw + 10); qacc[7] = *(int32_t *)(qraw + 15); } /* Remainder channels (scalar) */ for (int c = ch8 * 8; c < in_channels; c++) { int16_t f = (int16_t)filt[c]; for (int p = 0; p < 8; p++) { qacc[p] += ((int32_t)in_base[p * in_channels + c] + input_offset) * f; } } /* Bias + requant + store for 8 positions */ for (int p = 0; p < 8; p++) { int32_t acc = qacc[p]; if (bias) acc += bias[oc]; acc = esp_nn_multiply_by_quantized_mult(acc, 
out_mult[oc], out_shift[oc]); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[(pos + p) * out_channels + oc] = (int8_t)acc; } } } /* Leftover positions (< 8 remaining) */ for (; pos < size; pos++) { const int8_t *in_ptr = input + pos * in_channels; for (int oc = 0; oc < out_channels; oc++) { const int8_t *filt = filter_data + oc * in_channels; int32_t acc = 0; int c = 0; for (; c + 2 < in_channels; c += 3) { acc += ((int32_t)in_ptr[c] + input_offset) * (int32_t)filt[c]; acc += ((int32_t)in_ptr[c + 1] + input_offset) * (int32_t)filt[c + 1]; acc += ((int32_t)in_ptr[c + 2] + input_offset) * (int32_t)filt[c + 2]; } for (; c < in_channels; c++) { acc += ((int32_t)in_ptr[c] + input_offset) * (int32_t)filt[c]; } if (bias) acc += bias[oc]; acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[pos * out_channels + oc] = (int8_t)acc; } } } ================================================ FILE: src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * Optimized 3x3 convolution for ESP32-S3. * * Key optimization vs the general aligned asm: * The general asm reloads input for each output channel (128× per pixel). * This version pre-loads the 3x3 input window into scratch (9 rows × in_ch bytes), * then iterates output channels with the input in L1 cache. * * For Conv[11] (26×26×128→12×12×128, 3×3 s2): * - Input window: 3 × 3 × 128 = 1,152 bytes (fits in L1) * - Filter per OC: 3 × 3 × 128 = 1,152 bytes * - Total for all 128 OC: 147,456 bytes (cycles through L1) * - Input loaded once vs 128× in the general asm */ #include #include #include #include /* * Check if a conv can use the optimized 3x3 path. 
* Requirements:
 * - filter_wd == 3 && filter_ht == 3
 * - in_channels >= 16 (SIMD worth it)
 * - in_channels % 16 == 0 (aligned for ee.vld.128)
 */
int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht, int in_channels)
{
    /* Pure predicate, no side effects: true only for 3x3 kernels whose
     * channel count is a non-zero multiple of 16, so every im2col row can
     * be consumed with 128-bit (16-byte) vector loads. */
    return (filter_wd == 3 && filter_ht == 3 && in_channels >= 16 && (in_channels % 16) == 0);
}

/*
 * Scratch size (bytes) needed by esp_nn_conv_s8_3x3_opt():
 * - im2col buffer: 3 x 3 x in_channels bytes (one input window)
 * - corrections:   out_channels x 4 bytes (bias + filter_sum * input_offset)
 * - +32 slack: covers the two 16-byte round-ups done inside the kernel
 *   (16-byte alignment of the scratch base plus padding of the window
 *   length up to the next multiple of 16).
 */
int esp_nn_conv_s8_3x3_scratch_size(int in_channels, int out_channels)
{
    int im2col = 9 * in_channels;       /* 3x3 input window */
    int corrections = out_channels * 4; /* bias + filter_sum * offset */
    return im2col + corrections + 32;   /* + alignment */
}

/*
 * 3x3 convolution: im2col per output pixel, then one ACCX dot product
 * (ee.vmulas.s8.accx) per output channel over the 3 x 3 x in_ch window.
 *
 * Contract as visible in this code:
 * - No padding handling: rows in_y..in_y+2 and columns in_x..in_x+2 are
 *   read unconditionally, so the caller must guarantee the whole window
 *   lies inside the input.  input_ht is accepted but never read here —
 *   presumably kept for signature symmetry; verify against callers.
 * - scratch must be at least esp_nn_conv_s8_3x3_scratch_size() bytes.
 * - Output is written oc-fastest (NHWC) via the advancing out_data pointer.
 */
void esp_nn_conv_s8_3x3_opt(const int8_t *input,
                            const uint16_t input_wd,
                            const uint16_t input_ht,
                            const uint16_t in_channels,
                            const int32_t input_offset,
                            const uint16_t stride_wd,
                            const uint16_t stride_ht,
                            const int8_t *filter_data,
                            const int32_t *bias,
                            int8_t *out_data,
                            const uint16_t out_wd,
                            const uint16_t out_ht,
                            const uint16_t out_channels,
                            const int32_t out_offset,
                            const int32_t *out_shift,
                            const int32_t *out_mult,
                            const int32_t activation_min,
                            const int32_t activation_max,
                            void *scratch)
{
    const int window_len = 9 * in_channels;                 /* 3x3 window in bytes */
    const int window_len_aligned = (window_len + 15) & ~15; /* padded for 16-byte SIMD loads */

    /* Scratch layout: [im2col_buf | corrections], im2col_buf 16-byte aligned. */
    int8_t *im2col_buf = (int8_t *)((uintptr_t)((int8_t *)scratch + 15) & ~15);
    int32_t *corrections = (int32_t *)(im2col_buf + window_len_aligned);

    /* Pre-compute per-output-channel corrections: filter_sum * input_offset
     * (the offset term factored out of the dot product) plus bias if present. */
    const int8_t *f_ptr = filter_data;
    for (int oc = 0; oc < out_channels; oc++) {
        int32_t filter_sum = 0;
        for (int i = 0; i < window_len; i++) {
            filter_sum += f_ptr[i];
        }
        corrections[oc] = filter_sum * input_offset;
        if (bias) corrections[oc] += bias[oc];
        f_ptr += window_len;
    }

    /* Zero-pad the tail of im2col buffer so the padded bytes contribute
     * nothing to the aligned SIMD dot product below. */
    memset(im2col_buf + window_len, 0, window_len_aligned - window_len);

    const int in_row_stride = input_wd * in_channels;

    for (int out_y = 0; out_y < out_ht; out_y++) {
        for (int out_x = 0; out_x < out_wd; out_x++) {
            /* Phase 1: Build im2col for this output pixel (one-time per pixel).
             * Copies 3 rows of 3*in_channels contiguous bytes each. */
            const int in_y = out_y * stride_ht;
            const int in_x = out_x * stride_wd;
            int8_t *dst = im2col_buf;
            for (int fy = 0; fy < 3; fy++) {
                const int8_t *src = input + (in_y + fy) * in_row_stride + in_x * in_channels;
                memcpy(dst, src, 3 * in_channels);
                dst += 3 * in_channels;
            }

            /* Phase 2: Dot product against each output channel's filter.
             * The im2col window stays hot in cache across all out_channels. */
            const int8_t *filter_ptr = filter_data;
            for (int oc = 0; oc < out_channels; oc++) {
                /* ACCX dot product: im2col_buf . filter_ptr */
                int32_t acc = 0;
                const int8_t *a = im2col_buf;  /* input window (aligned) */
                const int8_t *b = filter_ptr;  /* filter (possibly unaligned: USAR path) */
                int remaining = window_len_aligned;
                __asm__ volatile("ee.zero.accx");
                /* Primed unaligned load for input.
                 * NOTE(review): only q0 is primed here, yet the first loop
                 * iteration below consumes q1 (filter) and q2 (shifted input)
                 * in ee.vmulas — the .S variant of this kernel additionally
                 * loads q4/q1 and does ee.src.q.qup before entering the loop.
                 * Verify q1/q2 are not used uninitialized on the first pass. */
                __asm__ volatile(
                    "ee.ld.128.usar.ip q0, %[a], 16\n"
                    : [a] "+r" (a) : : "memory"
                );
                /* Main loop: two 16-byte MACs per iteration; the ee.orq pair
                 * rotates q4/q2 into q0/q2 for the next iteration's qup chain. */
                while (remaining >= 32) {
                    __asm__ volatile(
                        "ee.vld.128.ip q4, %[a], 16\n"
                        "ee.vmulas.s8.accx.ld.ip.qup q3, %[b], 16, q2, q1, q0, q4\n"
                        "ee.vld.128.ip q2, %[a], 16\n"
                        "ee.vmulas.s8.accx.ld.ip.qup q1, %[b], 16, q0, q3, q4, q2\n"
                        "ee.orq q0, q2, q2\n"
                        "ee.orq q2, q4, q4\n"
                        : [a] "+r" (a), [b] "+r" (b) : : "memory"
                    );
                    remaining -= 32;
                }
                /* At most one trailing 16-byte chunk (window padded to 16). */
                if (remaining >= 16) {
                    __asm__ volatile(
                        "ee.vmulas.s8.accx.ld.ip q4, %[a], 16, q2, q1\n"
                        "ee.src.q.ld.ip q1, %[b], 16, q0, q4\n"
                        "ee.orq q2, q0, q0\n"
                        : [a] "+r" (a), [b] "+r" (b) : : "memory"
                    );
                    remaining -= 16;
                }
                /* Flush the last MAC and read the 40-bit ACCX accumulator
                 * (saturating shift-right by 0) into acc; 'remaining' is
                 * reused as the required zero shift-amount register. */
                __asm__ volatile(
                    "ee.vmulas.s8.accx q2, q1\n"
                    "movi.n %[tmp], 0\n"
                    "ee.srs.accx %[acc], %[tmp], 0\n"
                    : [acc] "=r" (acc), [tmp] "=r" (remaining)
                    : : "memory"
                );

                /* Bias/offset correction, requantize, saturate, store. */
                acc += corrections[oc];
                acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]);
                acc += out_offset;
                acc = max(acc, activation_min);
                acc = min(acc, activation_max);
                *out_data++ = (int8_t)acc;
                filter_ptr += window_len;
            }
        }
    }
}
================================================ FILE: src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S ================================================
//
// SPDX-FileCopyrightText: 2023-2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//
//
// Constraints used by this function are:
// 1. pad_wd and pad_ht is 0. For versions needing padding we do this
//    explicitly
// 2. All the filter rows are aligned to 16 bytes boundary. To make sure
//    this is indeed the case, for filter rows (filter_wd * channels) not
//    multiple of 16, we add zeros to fill it till 16 boundary.
//
// The optimized kernel assumes this and skips filter row with following
// size: ((filter_wd * input_ch) + 15) & ~15.

    .text
    .literal_position
    .literal .LC1, 1073741824

# Program Unit: esp_nn_conv_s8_filter_aligned_input_padded_esp32s3
    .type esp_nn_conv_s8_filter_aligned_input_padded_esp32s3, @function
    .align 4
    .global esp_nn_conv_s8_filter_aligned_input_padded_esp32s3

// registers:
// a2: const int16_t *input_data
// a3: const uint16_t input_wd
// a4: const uint16_t input_ht
// a5: const uint16_t in_ch
// a6: const uint16_t input_offset
// a7: const uint16_t stride_wd
// on stack:
// const uint16_t stride_ht    : 80
// const int8_t *filter_data   : 84
// const uint16_t filter_wd    : 88
// const uint16_t filter_ht    : 92
// const int32_t *bias         : 96
// int8_t *out_data            : 100
// const uint16_t out_wd       : 104
// const uint16_t out_ht       : 108
// const uint16_t out_channels : 112
// const int32_t out_offset    : 116
// const int32_t *out_shift    : 120
// const int32_t *out_mult     : 124
// const int32_t activation_min: 128
// const int32_t activation_max: 132
// void *scratch_buffer: 136

esp_nn_conv_s8_filter_aligned_input_padded_esp32s3:
    entry sp, 80
    s32i.n a2, sp, 40 # input_data
    mov a11, a6 # input_offset
    l16ui a2, sp, 88 # filter_wd
    l32i a8, sp, 100 # out_data
    l16ui a6, sp, 80 #
stride_ht
    mov.n a15, a5
    mull a4, a2, a15 # filter_row_sz
    s32i.n a8, sp, 24 # out_data_ptr
    movi.n a9, 0
    s32i.n a9, sp, 36 # out_y
    // Round the filter row size up to the next 16-byte boundary; a12 then
    // holds the aligned row stride the kernel skips per filter row.
    addi.n a4, a4, 15 # to round the size up
    srli a2, a4, 4 # (filter_row_sz) >> 4
    slli a12, a2, 4 # ((filter_row_sz) >> 4) << 4
    mull a4, a6, a3 # stride_ht * input_wd
    mull a5, a3, a15 # input_wd * in_ch
    l32i.n a10, sp, 112 # out_ch
    mull a9, a7, a15 # stride_wd * in_ch
    mull a4, a4, a15 # (stride_ht * input_wd) * in_ch
    slli a3, a10, 2 # out_ch * 4
    s32i.n a3, sp, 32 # out_ch * 4
    s32i.n a5, sp, 12 # input_wd * in_ch
    s32i.n a9, sp, 52 # stride_wd * in_ch
    s32i a4, sp, 56 # (stride_ht * input_wd) * in_ch
    l32i.n a3, sp, 92 # filter_ht
    l32i a13, sp, 136 # scratch_buf
    l32i a5, sp, 84 # filter_data
    mull a4, a12, a3 # (filter_wd * filter_ht * in_ch)
    // Halved: the accumulation loop below consumes 2 filter bytes/iteration.
    srai a4, a4, 1
    addx4 a10, a10, a13 # scratch_buf + 4 * out_ch
    l32i a3, sp, 96

    // Skip filter sum accumulation if input_offset is 0 (common in TFLite)
    // In that case, correction = just bias (pre-filled by C wrapper)
    beqz a11, .L_skip_acc_loop

    // accumulate filter values per channel into scratch buffer
.L_acc_out_channel_loop:
    movi.n a9, 0 // acc
    loop a4, .L_acc_filter_size_loop
    l8ui a14, a5, 0
    l8ui a7, a5, 1
    addi.n a5, a5, 2
    sext a14, a14, 7
    sext a7, a7, 7
    add a9, a9, a14
    add a9, a9, a7
.L_acc_filter_size_loop:
    // multiply by offset, add bias and store the acc value per channel
    mull a9, a9, a11
    beqz.n a3, .L_skip_bias
    l32i a8, a3, 0
    addi a3, a3, 4
    // this will remain 0 if bias not present
    add a9, a9, a8
.L_skip_bias:
    s32i a9, a13, 0
    addi.n a13, a13, 4
    blt a13, a10, .L_acc_out_channel_loop
    j .L_acc_done

.L_skip_acc_loop:
    // input_offset == 0: correction = bias only
    // Fill scratch_buf with bias values
    beqz.n a3, .L_skip_acc_zero_bias
.L_copy_bias_loop:
    l32i a8, a3, 0
    s32i a8, a13, 0
    addi a3, a3, 4
    addi.n a13, a13, 4
    blt a13, a10, .L_copy_bias_loop
    j .L_acc_done

.L_skip_acc_zero_bias:
    // No bias either: zero the scratch buffer
.L_zero_scratch_loop:
    movi.n a8, 0
    s32i a8, a13, 0
    addi.n a13, a13, 4
    blt a13, a10, .L_zero_scratch_loop

.L_acc_done:
    movi.n a4, 0 # 0

// Main spatial loops: out_y (height), out_x (width), then per output
// channel an ACCX dot product over filter_ht aligned filter rows.
.L_height_loop:
    l32i.n a8, sp, 40 # in_row_ptr
    movi.n a9, 0
    l32i.n a10, sp, 104 # out_wd
    s32i.n a8, sp, 28 # input_ptr
    s32i.n a9, sp, 44 # out_x
.L_width_loop:
    movi.n a9, 0
    l32i a5, sp, 84 # filter_data
    s32i.n a9, sp, 20
    l32i a3, sp, 136 # scratch_buf
.L_out_ch_loop:
    movi.n a6, 0
    l32i.n a9, sp, 28 # input_ptr
    mov.n a10, a6
.L_filter_ht_loop:
    add.n a8, a5, a12
    mov.n a13, a9
    ee.zero.accx
    // Prime the unaligned-input pipeline: q0/q4 + USAR, filter row in q1.
    ee.ld.128.usar.ip q0, a13, 16
    ee.vld.128.ip q4, a13, 16
    ee.vld.128.ip q1, a5, 16
    sub a15, a8, a5 // row_len - 16
    extui a14, a15, 4, 1 // if multiple of 16 and not 32
    srai a15, a15, 5 // multiples of 32
    ee.src.q.qup q2, q0, q4
    beqz a15, .L_vector_32_loop_end
    loop a15, .L_vector_32_loop_end
    ee.vld.128.ip q4, a13, 16
    ee.vmulas.s8.accx.ld.ip.qup q3, a5, 16, q2, q1, q0, q4
    ee.vld.128.ip q2, a13, 16
    ee.vmulas.s8.accx.ld.ip.qup q1, a5, 16, q0, q3, q4, q2
    ee.orq q0, q2, q2
    ee.orq q2, q4, q4
.L_vector_32_loop_end:
    beqz a14, .L_vector_loop_end
    ee.vmulas.s8.accx.ld.ip q4, a13, 16, q2, q1
    ee.src.q.ld.ip q1, a5, 16, q0, q4
    ee.orq q2, q0, q0
.L_vector_loop_end:
    ee.vmulas.s8.accx q2, q1
    addi a13, a13, -16 // since we incremented by 16 too much
    movi a15, 0
    ee.srs.accx a14, a15, 0
    mov.n a5, a8
    add.n a6, a6, a14
.L7:
    l32i.n a8, sp, 12 # input_wd * in_ch
    l32i.n a2, sp, 92 # filter_ht
    addi.n a10, a10, 1 # filter_y_idx
    add.n a9, a9, a8
    blt a10, a2, .L_filter_ht_loop
.L9:
    l32i a7, a3, 0 # load input_offset acc
    addi a3, a3, 4 # increment offset acc ptr
    l32i.n a8, sp, 20
    add.n a6, a6, a7 # add input_offset accumulation

// Requantize: doubling-high multiply with rounding nudge (.LC1 = 1<<30),
// then rounding divide by power of two — inline form of
// esp_nn_multiply_by_quantized_mult.
.L_multiply_by_quant_mult:
    l32i a10, sp, 120
    l32i a9, sp, 124
    add.n a2, a10, a8
    l32i.n a2, a2, 0
    add.n a7, a9, a8
    l32i.n a7, a7, 0
    max a8, a2, a4
    ssl a8
    sll a6, a6
    mull a9, a6, a7
    l32r a10, .LC1
    sub a2, a8, a2
    add.n a8, a9, a10
    mulsh a6, a6, a7
    movi.n a7, 1
    bltu a8, a9, .L13
    movi.n a7, 0
.L13:
    add.n a6, a7, a6
    slli a6, a6, 1
    extui a8, a8, 31, 1
    or a6, a6, a8
    beqz.n a2, .L_skip_div_by_pow_of_2
    addi.n a7, a2, -1
    movi.n a9, 1
    extui a8, a6, 31, 1
    ssl a7
    sll a7, a9
    sub a7, a7, a8
    add.n a6, a7, a6
    ssr a2
    sra a6, a6
.L_skip_div_by_pow_of_2:
    // Add out_offset, clamp to activation range, store one int8 output.
    l32i a10, sp, 116
    l32i a8, sp, 128
    add.n a2, a10, a6
    l32i a9, sp, 132
    l32i.n a10, sp, 24 # out_data_ptr
    max a2, a2, a8
    min a2, a2, a9
    s8i a2, a10, 0
    l32i.n a2, sp, 20
    addi.n a10, a10, 1
    addi.n a2, a2, 4
    l32i.n a6, sp, 32
    s32i.n a2, sp, 20
    s32i.n a10, sp, 24 # out_data_ptr
    bne a6, a2, .L_out_ch_loop
.L4:
    l32i.n a5, sp, 44 # out_x
    l32i.n a6, sp, 28 # input_ptr (was stored by height loop)
    l32i.n a8, sp, 52 # stride_wd * in_ch
    addi.n a5, a5, 1
    add.n a6, a6, a8 # input_ptr + stride_wd * in_ch
    l32i.n a9, sp, 104 # out_wd
    s32i.n a5, sp, 44 # out_x
    s32i.n a6, sp, 28 # input_ptr
    bne a9, a5, .L_width_loop
    l32i.n a10, sp, 36 # out_y
    l32i.n a2, sp, 40 # in_row_ptr
    l32i a5, sp, 56 # (stride_ht * input_wd) * in_ch
    l32i.n a6, sp, 108 # out_ht
    addi.n a10, a10, 1
    add.n a2, a2, a5 # in_row_ptr
    s32i.n a10, sp, 36 # out_y
    s32i.n a2, sp, 40 # in_row_ptr
    blt a10, a6, .L_height_loop
    // end outer (height) loop
    retw.n
    .size esp_nn_conv_s8_filter_aligned_input_padded_esp32s3, .-esp_nn_conv_s8_filter_aligned_input_padded_esp32s3
================================================ FILE: src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
    .text
    .literal_position
    .literal .nudge_val, 1073741824

# Program Unit: esp_nn_conv_s8_mult8_1x1_esp32s3
#
# Requirements:
#  - in_channels must be a multiple of 8
#  - filter_data must be 8-byte aligned (ee.vld.l.64.ip ignores lower 3 address bits)
#  - input_data must be 8-byte aligned (ee.vld.l/h.64.xp same alignment requirement)
#  - buffer (scratch) must be 16-byte aligned
#
# If filter is not aligned, use esp_nn_conv_s8_1x1() (C+inline asm) as fallback.
#
    .type esp_nn_conv_s8_mult8_1x1_esp32s3, @function
    .align 4
    .global esp_nn_conv_s8_mult8_1x1_esp32s3

esp_nn_conv_s8_mult8_1x1_esp32s3: # 0xdbc
# Stack-frame map (offsets from a1):
# scratch_buf = 0 // to store qacc regs need 36 bytes
# gra_spill_temp_164 = 36, channel itr, (in_channels - 1) >> 3
# gra_spill_temp_165 = 40, i_out
# gra_spill_temp_166 = 44, in_channels
# gra_spill_temp_167 = 48, in_channels/8 - 1
# gra_spill_temp_168 = 52, in_channels-7
# gra_spill_temp_169 = 56, input
# gra_spill_temp_170 = 60, filter_data
# gra_spill_temp_171 = 64, input_offset
# gra_spill_temp_172 = 68, input_ptr
# gra_spill_temp_173 = 72, bias
# gra_spill_temp_174 = 76, in_channels*8
# gra_spill_temp_175 = 80, size-7
# gra_spill_temp_176 = 84, size
// registers:
// a2: int8_t *input_data
// a3: uint16_t input_wd
// a4: uint16_t input_ht
// a5: uint16_t in_channels
// a6: int32_t input_offset
// a7: int16_t *filter_data
// on stack:
// int32_t *bias            // 160
// int8_t *out_data         // 164
// uint16_t out_wd          // 168
// uint16_t out_ht          // 172
// uint16_t out_channels    // 176
// int32_t out_offset       // 180
// int32_t *out_shift       // 184
// int32_t *out_mult        // 188
// int32_t activation_min   // 192
// int32_t activation_max   // 196
// void *buffer // tmp buf  // 200
    entry a1,160 #
    s32i a5,a1,44 # [0] gra_spill_temp_166, in_channels
    s32i a6,a1,64 # [2] id:619 input_offset+0x0
    s32i a7,a1,60 # [1] gra_spill_temp_170, filter_data
    mul16u a8,a3,a4 # [3] size = input_wd * input_ht;
    s32i a2,a1,56 # [0] gra_spill_temp_169, input
    l32i a4,a1,164 # [1] id:624 out_data+0x0
    mov.n a3,a1 # [52] scratch_buf
    s32i a8,a1,84 # [4] gra_spill_temp_176, size
    blti a8,8,.prepare_leftover # [5] // process remaining lines one by one
    addi a9,a8,-7 # [32]
    s32i a9,a1,80 # [33] gra_spill_temp_175, size-7
    s32i a2,a1,68 # [2] gra_spill_temp_172 , input_ptr
    srai a15,a5,3 # [7] `in_ch/8` loop_cnt
    movi.n a11,0 # [10]
    s32i a11,a1,40 # [11] gra_spill_temp_165
    addi a15,a15,-1 # [17] `in_ch/8` loop_cnt - 1
    s32i a15,a1,48 # [18] gra_spill_temp_167
    slli a9,a5,3 # [19] in_channels*8
    s32i a9,a1,76 # [20] gra_spill_temp_174
    addi a15,a5,-7 # [31]
    s32i a15,a1,52 # [34] gra_spill_temp_168

.outer_loop: // for (; i_out < size - 7; i_out += 8) {
    l32i a10,a1,200 # [1] gra_spill_temp_165, buffer
    l32i.n a11,a1,44 # [1] gra_spill_temp_166, input_channels
    l32i.n a8,a1,68 # [2] gra_spill_temp_172, input_ptr
    srai a9,a11,3 # [7] `in_ch/8` loop_cnt for transpose loop
    ee.zero.q q7 # [0]
    addi a12,a1,64 # [6]
    ee.vldbc.16 q5,a12 # [0*II+16] id:638 input_offset
    // load and transpose 8 lines of input 8xchannels,
    // add input offset and store 16 bit data to tmp buffer
    loopgtz a9,.transpose_loop_end # [10]
    mov.n a9,a8
    ee.vld.l.64.xp q0,a9,a11
    ee.vld.l.64.xp q1,a9,a11
    ee.vld.h.64.xp q0,a9,a11
    ee.vld.h.64.xp q1,a9,a11
    ee.vld.l.64.xp q2,a9,a11
    ee.vzip.8 q0,q1
    ee.vld.l.64.xp q3,a9,a11
    ee.vld.h.64.xp q2,a9,a11
    ee.vld.h.64.ip q3,a9,0
    ee.vzip.16 q0,q1
    ee.vzip.8 q2,q3
    ee.vzip.16 q2,q3
    ee.vzip.32 q0,q2
    // Sign-extend int8 lanes to int16 via a signed compare against zero
    // (mask of 0xFF for negatives) interleaved with ee.vzip.8.
    ee.vcmp.lt.s8 q4,q2,q7
    ee.vzip.8 q2,q4
    ee.vcmp.lt.s8 q6,q0,q7
    ee.vzip.8 q0,q6
    ee.vadds.s16 q0,q0,q5
    ee.vadds.s16.st.incp q0,a10,q6,q6,q5
    ee.vadds.s16.st.incp q6,a10,q2,q2,q5
    ee.vadds.s16.st.incp q2,a10,q4,q4,q5
    ee.vst.128.ip q4,a10,16
    ee.vzip.32 q1,q3
    ee.vcmp.lt.s8 q4,q3,q7
    ee.vzip.8 q3,q4
    ee.vcmp.lt.s8 q6,q1,q7
    ee.vzip.8 q1,q6
    ee.vadds.s16 q1,q1,q5
    ee.vadds.s16.st.incp q1,a10,q6,q6,q5
    ee.vadds.s16.st.incp q6,a10,q3,q3,q5
    ee.vadds.s16.st.incp q3,a10,q4,q4,q5
    ee.vst.128.ip q4,a10,16
    addi.n a8,a8,8
.transpose_loop_end: # 0xeeb
# 468 uint32_t bias_ptr = (uint32_t) bias;
# 469 uint32_t filter_ptr = (uint32_t) (filter_data);
# 470 const int32_t *out_mult_ptr = out_mult;
# 471 const int32_t *out_shift_ptr = out_shift;
    l32i a6,a1,184 # [0] out_shift
    l32i a2,a1,188 # [1] out_mult
    l32i a5,a1,60 # [2] gra_spill_temp_170, filter
    l32i a9,a1,160 # [3] gra_spill_temp_170, bias
# 472 for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {
    l16ui a8,a1,176 # [5] id:620 out_channels+0x0
    s32i a9,a1,72 # [5] gra_spill_temp_173
    blti a8,1,.outer_ch_loop_end
    movi.n a7,0

.out_ch_loop: # 0xf3e
    l32i a8,a1,200 # [4] gra_spill_temp_165, buffer_ptr
    ee.zero.qacc # [3]
    ee.zero.q q5 #
    l32i a10,a1,52 # [1] gra_spill_temp_168, in_channels-7
    l32i a9,a1,48 # [1] gra_spill_temp_167, in_channels/8 - 1
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q7,a5,16
    ee.ld.128.usar.ip q6,a5,0
    addi a5,a5,-8 # net advance = 8
    ee.src.q q7,q7,q6
    ee.vld.128.ip q0,a8,16
    ee.vld.128.ip q1,a8,16
    ee.vcmp.lt.s8 q6,q7,q5
    ee.vzip.8 q7,q6
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,0
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,1
    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,2
    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,3
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,4
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,5
    blti a10,8,.inner_loop_end # [16]
    loopgtz a9,.inner_loop_end # [3]
    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,6 # [0*II+0] id:657
    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,7 # [0*II+1] id:658
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q7,a5,16
    ee.ld.128.usar.ip q6,a5,0
    addi a5,a5,-8
    ee.src.q q7,q7,q6
    ee.vcmp.lt.s8 q6,q7,q5
    ee.vzip.8 q7,q6
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,0 # [0*II+4] id:660
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,1 # [0*II+5] id:661
    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,2 # [0*II+6] id:662
    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,3 # [0*II+7] id:663
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,4 # [0*II+8] id:664
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,5 # [0*II+9] id:665
.inner_loop_end: # 0xfaf
    ee.vsmulas.s16.qacc q2,q7,6 # [2]
    ee.vsmulas.s16.qacc q3,q7,7 # [3]
    # store qacc registers and re-arrange data for low 16 bits
    # (QACC holds packed 20-bit accumulators, hence the 5-byte strides)
    ee.st.qacc_l.l.128.ip a3,16 # [5] id:668
    ee.st.qacc_l.h.32.ip a3,-16 # [6] id:669
    l32i.n a10, a1, 0
    l32i.n a11, a1, 5
    l32i.n a12, a1, 10
    l32i.n a13, a1, 15
    ee.movi.32.q q0, a10, 0
    ee.movi.32.q q0, a11, 1
    ee.movi.32.q q0, a12, 2
    ee.movi.32.q q0, a13, 3
    ee.st.qacc_h.l.128.ip a3,16 # [5] id:668
    ee.st.qacc_h.h.32.ip a3,-16 # [6] id:669
    l32i.n a10, a1, 0
    l32i.n a11, a1, 5
    l32i.n a12, a1, 10
    l32i.n a13, a1, 15
    ee.movi.32.q q4, a10, 0
    ee.movi.32.q q4, a11, 1
    ee.movi.32.q q4, a12, 2
    ee.movi.32.q q4, a13, 3
    l32i a9,a1,160 # [17] gra_spill_temp_170, bias
    l32i a10,a1,72 # [0] gra_spill_temp_173, bias_ptr
    # add bias
    beqz.n a9,.no_bias
    ee.vldbc.32.ip q6,a10,4
    s32i a10,a1,72 # [3] gra_spill_temp_173, bias_ptr
    ee.vadds.s32 q0,q0,q6 # [4]
    ee.vadds.s32 q4,q4,q6 # [5]
.no_bias: # 0x102e
    l32i.n a11,a6,0 # [1] id:696
    l32i.n a10,a2,0 # [3] id:695
    .global esp_nn_multiply_by_quantized_mult_asm_esp32s3
    call8 esp_nn_multiply_by_quantized_mult_asm_esp32s3 # [4] esp_nn_multiply_by_quantized_mult_asm_esp32s3
    l32i.n a10,a2,0 # [0] id:697, mult
    l32i.n a11,a6,0 # [2] id:698, shift
    mv.qr q5,q0
    mv.qr q0,q4
    call8 esp_nn_multiply_by_quantized_mult_asm_esp32s3 # [5] esp_nn_multiply_by_quantized_mult_asm_esp32s3
    addi.n a6,a6,4 # out_shift_ptr++
    addi.n a2,a2,4 # out_mult_ptr++
    addi a9,a1,180 # [7]
    addi a10,a1,192 # [5]
    addi a8,a1,196 # [6]
    # load broadcast, activation and out_offset
    ee.vldbc.32 q4,a9 # [14] id:699 out_offset
    ee.vldbc.32 q2,a10 # [11] id:700 activation_min
    ee.vldbc.32 q3,a8 # [12] id:701 activation_max
    # add offset
    ee.vadds.s32 q1,q0,q4 # [17]
    ee.vadds.s32 q0,q5,q4 # [22]
    # activation
    ee.vmin.s32 q1,q1,q3 # [19]
    ee.vmax.s32 q1,q1,q2 # [21]
    ee.vmin.s32 q0,q0,q3 # [23]
    ee.vmax.s32 q0,q0,q2 # [24]
    l16ui a9,a1,176 # [33] out_channels
    # unzip and store
    ee.vunzip.16 q0,q1 # [25]
    ee.vst.128.ip q0,a3,0 # [26] id:702, scratch_buf
    # a4 = out_data, out_channels = a1+176
    # Scatter the 8 results for this out_ch across 8 consecutive pixels
    # (stride = out_channels bytes per pixel in NHWC layout).
    l8ui a14,a1,0 # [27]
    l8ui a11,a1,2 # [30] scratch_buf+2
    add a10,a4,a9
    s8i a14,a4,0 # [28], out_data
    s8i a11,a10,0 # [31], out_data + out_channels
    l8ui a14,a1,4 # [32] scratch_buf+4
    l8ui a11,a1,6 # [37] scratch_buf+6
    add a12,a10,a9
    add a10,a12,a9
    s8i a14,a12,0 # [28]
    s8i a11,a10,0 # [31]
    l8ui a14,a1,8 # [41] scratch_buf+8
    l8ui a11,a1,10 # [47] scratch_buf+10
    add a12,a10,a9
    add a10,a12,a9
    s8i a14,a12,0 # [28]
    s8i a11,a10,0 # [31]
    l8ui a14,a1,12 # [51] scratch_buf+12
    l8ui a11,a1,14 # [55] scratch_buf+14
    add a12,a10,a9
    add a10,a12,a9
    s8i a14,a12,0 # [28]
    s8i a11,a10,0 # [31]
    addi.n a4,a4,1 # [29] out_data++;
    addi.n a7,a7,1
    bne a7,a9,.out_ch_loop
.outer_ch_loop_end:
    subx8 a11,a9,a9 # (7 * out_channels);
    l32i a10,a1,76 # [1] gra_spill_temp_174, in_channels * 8
    l32i a15,a1,40 # [4] gra_spill_temp_165
    l32i a9,a1,68 # [2] gra_spill_temp_172
    l32i a8,a1,80 # [0] gra_spill_temp_175, size-7
    add.n a4,a4,a11 # [5] out_data += (7 * out_channels);
    addi.n a15,a15,8
    s32i a15,a1,40 # [7] gra_spill_temp_165
    add.n a9,a9,a10 # [8]
    s32i a9,a1,68 # [9] gra_spill_temp_172
    blt a15,a8,.outer_loop # [10]
    # check if leftover
    l32i a15,a1,40
    l32i a13,a1,84 # [1] gra_spill_temp_176, size
    l32i a8,a1,44 # [0] gra_spill_temp_166, in_channels
    bge a15, a13, .return_function # no leftover

// This block below processes one input channel line at a time.
.process_leftover:
    l32i a15,a1,40 # [1] gra_spill_temp_165, i_out
    l32i a14,a1,56 # [2] gra_spill_temp_169, input
    mull a15,a15,a8 # [3] in_channels * i_out
    addi.n a8,a8,-1 # [4] in_channels - 1
    add.n a14,a14,a15 # [5] input_ptr = in_channels * i_out + input
    srai a8,a8,3 # [6] iterations, (in_channels - 1) >> 3
    s32i a8,a1,36 # [7] gra_spill_temp_164, iterations
    s32i a14,a1,68 # [8] gra_spill_temp_172, in_channels * i_out + input
    addi a12,a1,64
    ee.vldbc.16 q4,a12 # [8] id:716 input_offset
.leftover_outer_loop:
    l32i a15,a1,184 # [0] out_shift
    l32i a2,a1,188 # [1] out_mult
    l32i a8,a1,60 # [3] gra_spill_temp_170, filter_data
    l32i a5,a1,160 # [0] gra_spill_temp_170, bias
    movi.n a11,0 # [2]
.leftover_out_ch_loop:
    ee.zero.qacc # [0]
    ee.zero.q q3 # [1]
    l32i.n a9,a1,68 # [4] gra_spill_temp_172, input_ptr
    l32i a10,a1,36 # [1] gra_spill_temp_164, iterations, (in_channels - 1) >> 3
    ee.vld.l.64.ip q0,a9,8 # [7] id:717, input
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q1,a8,16
    ee.ld.128.usar.ip q7,a8,0
    addi a8,a8,-8
    ee.src.q q1,q1,q7
    ee.vcmp.lt.s8 q6,q0,q3
    ee.vcmp.lt.s8 q7,q1,q3
    ee.vzip.8 q0,q6
    ee.vzip.8 q1,q7
    ee.vadds.s16 q0,q0,q4 # [11] id:718, add offset
    loopgtz a10,.leftover_inner_loop_end # [3]
    ee.vmulas.s16.qacc q0,q1 # mula(q0,q1)
    ee.vld.l.64.ip q0,a9,8 # load 8 input values
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q1,a8,16
    ee.ld.128.usar.ip q7,a8,0
    addi a8,a8,-8
    ee.src.q q1,q1,q7
    ee.vcmp.lt.s8 q2,q0,q3 # sign
    ee.vcmp.lt.s8 q7,q1,q3
    ee.vzip.8 q0,q2 # 16 bit input
    ee.vzip.8 q1,q7 # 16 bit filter
    ee.vadds.s16 q0,q0,q4 # add offset
.leftover_inner_loop_end: # 0x1262
    # re-arrange data from qacc in 32 bit q registers
    ee.vmulas.s16.qacc q0,q1 # [3]
    ee.st.qacc_l.l.128.ip a3,16 # [5] id:722
    ee.st.qacc_l.h.32.ip a3,0 # [6] id:723
    l8ui a10,a1,5 # [11] scratch_buf+5
    l8ui a12,a1,6 # [10] scratch_buf+6
    l16ui a14,a1,10 # [8] scratch_buf+10
    l8ui a9,a1,15 # [7] scratch_buf+15
    l8ui a13,a1,16 # [9] scratch_buf+16
    s8i a10,a1,2 # [12] scratch_buf+2
    s8i a12,a1,3 # [13] scratch_buf+3
    s16i a14,a1,4 # [15] scratch_buf+4
    s8i a9,a1,6 # [16] scratch_buf+6
    s8i a13,a1,7 # [14] scratch_buf+7
    ee.st.qacc_h.l.128.ip a3,16 # [17] id:724
    ee.st.qacc_h.h.32.ip a3,-32 # [18] id:725
    l16ui a13,a1,16 # [30] scratch_buf+16
    l8ui a14,a1,21 # [23] scratch_buf+21
    l8ui a9,a1,22 # [22] scratch_buf+22
    l16ui a10,a1,26 # [21] scratch_buf+26
    s16i a13,a1,8 # [31] scratch_buf+8
    l8ui a12,a1,31 # [20] scratch_buf+31
    l8ui a13,a1,32 # [19] scratch_buf+32
    s8i a14,a1,10 # [24] scratch_buf+10
    s8i a9,a1,11 # [25] scratch_buf+11
    s16i a10,a1,12 # [26] scratch_buf+12
    s8i a12,a1,14 # [27] scratch_buf+14
    s8i a13,a1,15 # [28] scratch_buf+15
    movi.n a12,16
    # get data now
    ee.vld.128.ip q0,a3,0
    ee.srcmb.s16.qacc q1,a12,0
    ee.vzip.16 q0,q1
    ee.vadds.s32 q0,q0,q1
    ee.movi.32.a q0,a10,3
    ee.movi.32.a q0,a9,2
    ee.movi.32.a q0,a14,0
    add a9,a9,a10
    ee.movi.32.a q0,a10,1
    add a14,a14,a10
    add a14,a14,a9 # a14 contains conv_out
    l32i a9,a1,160 # [43] gra_spill_temp_170, bias ptr
    l32i.n a6,a15,0 # [44] id:730, shift
    beqz.n a9,.leftover_multiply_by_quant_mult # [45]
    # load and add bias
    l32i.n a9,a5,0
    add.n a14,a14,a9
.leftover_multiply_by_quant_mult: # 0x12e7
    # Scalar requantize: doubling-high multiply with rounding nudge
    # (.nudge_val = 1 << 30) followed by rounding right shift.
    l32i.n a9,a2,0 # [0] id:729, mult
    movi.n a10,0 # [1]
    max a10,a6,a10 # [2] left_shift
    ssl a10 # [3]
    sll a14,a14 # [4] (value << left_shift)
    sub a7,a10,a6 # right_shift
    l32r a13,.nudge_val
    mulsh a12,a9,a14
    mull a14,a9,a14
    ssai 31
    addi.n a2,a2,4 # [0] mult
    addi.n a15,a15,4 # [1] shift
    addi.n a5,a5,4 # [2] bias
    addi.n a11,a11,1 # [3]
    add a13,a14,a13 # low part
    saltu a14,a13,a14
    add a9,a12,a14 # high part
    src a12,a9,a13
    blti a7,1,.leftover_skip_div_by2
    addi.n a14,a7,-1
    ssl a14
    movi.n a10,1
    sll a10,a10 # 1 << (exponent - 1)
    extui a14,a12,31,1
    ssr a7
    sub a10,a10,a14 # 1 << (exponent - 1) - (val < 0)
    add a12,a12,a10 # val += to_add
    sra a12,a12
.leftover_skip_div_by2:
    l32i a10,a1,180 # [26] id:733 out_offset+0x0
    l32i a9,a1,192 # [29] id:732 activation_min+0x0
    l16ui a13,a1,176 # [5] id:620 out_channels+0x0
    l32i a14,a1,196 # [31] id:731 activation_max+0x0
    // add offset, apply activation and store
    add.n a10,a10,a12
    max a9,a9,a10
    min a14,a14,a9
    s8i a14,a4,0
    addi.n a4,a4,1
    bne a11,a13,.leftover_out_ch_loop
    l32i a15,a1,44 # [0] gra_spill_temp_166, in_channels
    l32i a14,a1,68 # [1] gra_spill_temp_172, input_ptr
    l32i a13,a1,40 # [2] gra_spill_temp_165, i_out
    l32i a12,a1,84 # [3] gra_spill_temp_176, size
    addi.n a13,a13,1 # [4]
    s32i a13,a1,40 # [5] gra_spill_temp_165, i_out
    add a14,a14,a15 # [7] input_ptr += in_channels
    s32i a14,a1,68 # [8] gra_spill_temp_172, input_ptr
    blt a13,a12,.leftover_outer_loop
.return_function:
    retw.n # [9]
.prepare_leftover:
    l32i a8,a1,44 # [0] gra_spill_temp_166, in_channels
    movi.n a15,0
    s32i a15,a1,40 # [7] gra_spill_temp_165, i_out
    j .process_leftover
    .size esp_nn_conv_s8_mult8_1x1_esp32s3, . - esp_nn_conv_s8_mult8_1x1_esp32s3
================================================ FILE: src/convolution/esp_nn_depthwise_conv_ansi.c ================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include #include int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params) { return 0; } void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf) { } void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const uint16_t ch_mult = conv_params->ch_mult; int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop for (int ch_mult_idx = 0; ch_mult_idx < ch_mult; ch_mult_idx++) { int32_t result = 0; const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; /* Select filter so as the point doesn't lie outside block */ int 
filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index]; result += input_val * filter_val; } } if (bias) { result += bias[out_ch_idx]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = result; } } } } } ================================================ FILE: src/convolution/esp_nn_depthwise_conv_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include /* Note: esp_nn_requant_2x_esp32p4.S exists but inline ESP_NN_REQUANT_2X macro * from common_functions.h is used instead (avoids function call overhead). 
*/ /* External fallback */ void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params) { return 0; } void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf) { (void) buf; } /* PIE-optimized ch_mult=1, channels>=16 path using QACC per-lane MAC. * Pre-computes filter_sum[ch] = sum of filter[ch] across all filter positions. * For non-edge output positions: result[ch] = QACC_MAC + filter_sum[ch] * input_offset * For edge positions: falls back to scalar with input_offset applied directly. */ __attribute__ ((noinline)) static void depthwise_conv_s8_ch1_pie(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = 
conv_params->activation.max; /* Enable PIE */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); /* Set up activation min/max vectors for PIE clamp */ { int8_t act_min_val = (int8_t) activation_min; int8_t act_max_val = (int8_t) activation_max; asm volatile ( "mv x30, %0 \n\t" "esp.vldbc.8.ip q4, x30, 0 \n\t" "mv x30, %1 \n\t" "esp.vldbc.8.ip q5, x30, 0 \n\t" :: "r"(&act_min_val), "r"(&act_max_val) : "x30" ); } /* Pre-compute combined offset: filter_sum * input_offset + bias per channel. * This fuses two additions per channel into one pre-computed value. * Constant for the entire layer - computed once. */ int32_t combined_offset_buf[256]; /* support up to 256 channels on stack */ int32_t *combined_offset = NULL; if (channels <= 256) { combined_offset = combined_offset_buf; for (int ch = 0; ch < channels; ch++) { int32_t s = 0; if (input_offset != 0) { for (int fy = 0; fy < filter_ht; fy++) { for (int fx = 0; fx < filter_wd; fx++) { s += filter_data[(fy * filter_wd + fx) * channels + ch]; } } s *= input_offset; } combined_offset[ch] = s + (bias ? bias[ch] : 0); } } int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { const int16_t base_x = (out_x * stride_wd) - pad_wd; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); /* Check if this is a non-edge position (full filter window) */ int is_full_window = (filter_y_start == 0 && filter_x_start == 0 && filter_y_end == filter_ht && filter_x_end == filter_wd); /* Process 16 channels at a time using QACC. * Inline helper macro for QACC MAC across filter window. 
*/ #define QACC_MAC_WINDOW(ch_off) do { \ asm volatile ("esp.zero.qacc \n\t"); \ for (int _fy = filter_y_start; _fy < filter_y_end; _fy++) { \ const int32_t _iy = base_y + _fy; \ const int8_t *_ip = input_data + (_iy * input_wd + base_x + filter_x_start) * channels + (ch_off); \ const int8_t *_fp = filter_data + (_fy * filter_wd + filter_x_start) * channels + (ch_off); \ int _fc = filter_x_end - filter_x_start; \ asm volatile ( \ "mv x30, %[ip] \n\t" \ "mv x31, %[fp] \n\t" \ "mv s7, %[cnt] \n\t" \ "1: \n\t" \ "esp.vld.128.ip q0, x30, 0 \n\t" \ "esp.vld.128.ip q1, x31, 0 \n\t" \ "esp.vmulas.s8.qacc q0, q1 \n\t" \ "add x30, x30, %[stride] \n\t" \ "add x31, x31, %[stride] \n\t" \ "addi s7, s7, -1 \n\t" \ "bnez s7, 1b \n\t" \ : \ : [ip] "r"(_ip), [fp] "r"(_fp), \ [cnt] "r"(_fc), [stride] "r"((int32_t)channels) \ : "x30", "x31", "s7" \ ); \ } \ } while(0) #define QACC_EXTRACT(dst) do { \ asm volatile ( \ "mv x30, %0 \n\t" \ "esp.st.qacc.l.l.128.ip x30, 16 \n\t" \ "esp.st.qacc.l.h.128.ip x30, 16 \n\t" \ "esp.st.qacc.h.l.128.ip x30, 16 \n\t" \ "esp.st.qacc.h.h.128.ip x30, 0 \n\t" \ :: "r"(dst) \ : "x30", "memory" \ ); \ } while(0) int ch_idx = 0; /* Process 16-channel blocks, then partial block if remainder >= 8 */ while (ch_idx < channels) { int block_ch = (ch_idx + 16 <= channels) ? 16 : (channels - ch_idx >= 8) ? 
(channels - ch_idx) : 0; if (block_ch == 0) break; /* remaining < 8, handle scalar below */ QACC_MAC_WINDOW(ch_idx); /* Extract per-lane results (only first block_ch are valid) */ int32_t result[16] __attribute__((aligned(16))); QACC_EXTRACT(result); /* Add fused offset (filter_sum * input_offset + bias) + requantize */ if (combined_offset) { if (is_full_window) { for (int k = 0; k < block_ch; k++) { result[k] += combined_offset[ch_idx + k]; } } else { for (int k = 0; k < block_ch; k++) { int32_t fsum = 0; if (input_offset != 0) { for (int fy = filter_y_start; fy < filter_y_end; fy++) { for (int fx = filter_x_start; fx < filter_x_end; fx++) { fsum += filter_data[(fy * filter_wd + fx) * channels + ch_idx + k]; } } fsum *= input_offset; } result[k] += fsum + (bias ? bias[ch_idx + k] : 0); } } } /* Per-channel requantize */ { const int32_t *mp = out_mult + ch_idx; const int32_t *sp = out_shift + ch_idx; int rq_count = block_ch & ~1; /* round down to even for 2-wide */ for (int k = 0; k < rq_count; k += 2) { int32_t r0 = result[k]; int32_t r1 = result[k+1]; int32_t m0 = mp[k], s0 = sp[k]; int32_t m1 = mp[k+1], s1 = sp[k+1]; /* 2-wide interleaved requant via inline asm macro. * Macro handles left_shift internally - do NOT pre-shift. 
*/ int32_t h0, h1; ESP_NN_REQUANT_2X(r0, r1, m0, m1, s0, s1, h0, h1); h0 += out_offset; h1 += out_offset; out_data[out_idx++] = (int8_t)max(activation_min, min(h0, activation_max)); out_data[out_idx++] = (int8_t)max(activation_min, min(h1, activation_max)); } /* Handle odd remaining channel in block */ if (block_ch & 1) { int k = rq_count; int32_t r = result[k]; r = esp_nn_requantize(r, mp[k], sp[k]); r += out_offset; out_data[out_idx++] = (int8_t)max(activation_min, min(r, activation_max)); } } ch_idx += block_ch; } /* Remaining channels < 8: scalar */ for (; ch_idx < channels; ch_idx++) { int32_t result = 0; for (int fy = filter_y_start; fy < filter_y_end; fy++) { const int32_t idx_y = base_y + fy; for (int fx = filter_x_start; fx < filter_x_end; fx++) { const int32_t idx_x = base_x + fx; result += (input_data[(idx_y * input_wd + idx_x) * channels + ch_idx] + input_offset) * filter_data[(fy * filter_wd + fx) * channels + ch_idx]; } } if (bias) result += bias[ch_idx]; result = esp_nn_requantize(result, out_mult[ch_idx], out_shift[ch_idx]); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = (int8_t) result; } } } } void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t ch_mult = conv_params->ch_mult; const uint16_t channels = input_dims->channels; if (ch_mult == 1 && channels >= 8) { depthwise_conv_s8_ch1_pie(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } /* Fall back to generic optimized */ esp_nn_depthwise_conv_s8_opt(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); } ================================================ FILE: 
src/convolution/esp_nn_depthwise_conv_opt.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params) { return 0; } void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf) { } /* common channel multiplier == 1 case */ __attribute__ ((noinline)) static void esp_nn_depthwise_conv_s8_ch_mult_1(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = 
output_dims->height; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; /* Select filter so as the point doesn't lie outside block */ int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); int ch_idx = 0; for (; ch_idx < channels - 3; ch_idx += 4) {//channel_loop int32_t result0 = 0; int32_t result1 = 0; int32_t result2 = 0; int32_t result3 = 0; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx; int32_t input_val0 = input_data[input_index + 0] + input_offset; int32_t input_val1 = input_data[input_index + 1] + input_offset; int32_t input_val2 = input_data[input_index + 2] + input_offset; int32_t input_val3 = input_data[input_index + 3] + input_offset; int32_t filter_val0 = filter_data[filter_index + 0]; int32_t filter_val1 = filter_data[filter_index + 1]; int32_t filter_val2 = filter_data[filter_index + 2]; int32_t filter_val3 = filter_data[filter_index + 3]; result0 += input_val0 * filter_val0; result1 += input_val1 * filter_val1; result2 += input_val2 * filter_val2; result3 += input_val3 * filter_val3; } } if (bias) { result0 += bias[ch_idx + 0]; result1 += bias[ch_idx + 1]; 
result2 += bias[ch_idx + 2]; result3 += bias[ch_idx + 3]; } result0 = esp_nn_requantize(result0, *out_mult++, *out_shift++); result1 = esp_nn_requantize(result1, *out_mult++, *out_shift++); result2 = esp_nn_requantize(result2, *out_mult++, *out_shift++); result3 = esp_nn_requantize(result3, *out_mult++, *out_shift++); result0 += out_offset; result1 += out_offset; result2 += out_offset; result3 += out_offset; result0 = max(result0, activation_min); result1 = max(result1, activation_min); result2 = max(result2, activation_min); result3 = max(result3, activation_min); result0 = min(result0, activation_max); result1 = min(result1, activation_max); result2 = min(result2, activation_max); result3 = min(result3, activation_max); out_data[out_idx++] = result0; out_data[out_idx++] = result1; out_data[out_idx++] = result2; out_data[out_idx++] = result3; } for (; ch_idx < channels; ch_idx++) {//channel_loop int32_t result = 0; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index]; result += input_val * filter_val; } } if (bias) { result += bias[ch_idx]; } result = esp_nn_requantize(result, *out_mult++, *out_shift++); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = result; } } } } void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const 
quant_data_t *quant_data) { const uint16_t ch_mult = conv_params->ch_mult; if (ch_mult == 1) { esp_nn_depthwise_conv_s8_ch_mult_1(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; /* Select filter so as the point doesn't lie outside block */ int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop int ch_mult_idx = 0; for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) { int32_t result0 = 0; int32_t result1 = 0; int32_t result2 = 0; int32_t result3 = 0; const int out_ch_idx = ch_idx * ch_mult + ch_mult_idx; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; 
for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val0 = filter_data[filter_index + 0]; int32_t filter_val1 = filter_data[filter_index + 1]; int32_t filter_val2 = filter_data[filter_index + 2]; int32_t filter_val3 = filter_data[filter_index + 3]; result0 += input_val * filter_val0; result1 += input_val * filter_val1; result2 += input_val * filter_val2; result3 += input_val * filter_val3; } } if (bias) { result0 += bias[out_ch_idx + 0]; result1 += bias[out_ch_idx + 1]; result2 += bias[out_ch_idx + 2]; result3 += bias[out_ch_idx + 3]; } result0 = esp_nn_requantize(result0, *out_mult++, *out_shift++); result1 = esp_nn_requantize(result1, *out_mult++, *out_shift++); result2 = esp_nn_requantize(result2, *out_mult++, *out_shift++); result3 = esp_nn_requantize(result3, *out_mult++, *out_shift++); result0 += out_offset; result1 += out_offset; result2 += out_offset; result3 += out_offset; result0 = max(result0, activation_min); result1 = max(result1, activation_min); result2 = max(result2, activation_min); result3 = max(result3, activation_min); result0 = min(result0, activation_max); result1 = min(result1, activation_max); result2 = min(result2, activation_max); result3 = min(result3, activation_max); out_data[out_idx++] = result0; out_data[out_idx++] = result1; out_data[out_idx++] = result2; out_data[out_idx++] = result3; } for (; ch_mult_idx < ch_mult; ch_mult_idx++) { int32_t result = 0; const int out_ch_idx = ch_idx * ch_mult + ch_mult_idx; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; 
filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index]; result += input_val * filter_val; } } if (bias) { result += bias[out_ch_idx]; } result = esp_nn_requantize(result, *out_mult++, *out_shift++); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = result; } } } } } ================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
.text .literal_position # Program Unit: esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3 .type esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3, @function .align 4 .global esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3 esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3: # 0x776 # qacc_scratch = 0 # gra_spill_temp_35 = 48 # gra_spill_temp_36 = 52 # gra_spill_temp_37 = 56 # gra_spill_temp_38 = 60 # gra_spill_temp_39 = 64 # gra_spill_temp_40 = 68 # gra_spill_temp_41 = 72 # gra_spill_temp_42 = 76 # gra_spill_temp_43 = 80 # gra_spill_temp_44 = 84 # gra_spill_temp_45 = 88 # gra_spill_temp_46 = 92 # gra_spill_temp_47 = 96 # gra_spill_temp_48 = 100 # gra_spill_temp_49 = 104 # gra_spill_temp_50 = 108 # gra_spill_temp_51 = 112 # gra_spill_temp_52 = 116 # gra_spill_temp_53 = 120 # gra_spill_temp_54 = 124 # gra_spill_temp_55 = 128 # gra_spill_temp_56 = 132 # gra_spill_temp_57 = 136 # gra_spill_temp_58 = 140 # gra_spill_temp_59 = 144 # gra_spill_temp_60 = 148 # gra_spill_temp_61 = 152 # gra_spill_temp_62 = 156 # gra_spill_temp_63 = 160 # gra_spill_temp_64 = 164 # gra_spill_temp_65 = 168 # gra_spill_temp_66 = 176 # gra_spill_temp_67 = 192 # gra_spill_temp_68 = 208 # gra_spill_temp_69 = 224 # gra_spill_temp_70 = 240 // registers: // a2: const int16_t *input_data // a3: const uint16_t input_wd // a4: const uint16_t input_ht // a5: const uint16_t channels // a6: const uint16_t pad_wd // a7: const uint16_t pad_ht // on stack // const uint16_t stride_wd // const uint16_t stride_ht // const int16_t *filter_data // const int32_t *bias // int8_t *out_data // const uint16_t out_wd // const uint16_t out_ht // const int32_t out_offset // const int32_t *out_shift // const int32_t *out_mult // const int32_t activation_min // const int32_t activation_max entry a1,288 # s32i a2,a1,104 # [0] gra_spill_temp_49 s32i a3,a1,112 # [1] gra_spill_temp_51 s32i a5,a1,116 # [2] gra_spill_temp_52 s32i.n a6,a1,56 # [3] gra_spill_temp_37 addi a14,a1,112 # [4] addmi a11,a1,256 # [5] addmi a13,a1,256 # [6] addmi a15,a1,256 # [7] 
l32i a9,a1,304 # [8] id:251 out_data+0x0 l16ui a8,a1,312 # [9] id:252 out_ht+0x0 s32i a8,a1,64 # [10] gra_spill_temp_39 s32i a9,a1,156 # [11] gra_spill_temp_62 addi a15,a15,60 # [12] addi a13,a13,72 # [13] addi a11,a11,76 # [14] ee.vldbc.32 q0,a11 # [15] id:250 activation_max ee.vldbc.32 q1,a13 # [16] id:249 activation_min ee.vldbc.32 q2,a15 # [17] id:248 out_offset st.qr q2,a14,80 # [18] gra_spill_temp_67-112 st.qr q1,a14,96 # [19] gra_spill_temp_68-112 st.qr q0,a14,112 # [20] gra_spill_temp_69-112 beqz.n a8,.Lt_5_7426 # [21] .LBB3_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x7b9 s32i a1,a1,160 # [0] gra_spill_temp_63 s32i a7,a1,72 # [1] gra_spill_temp_41 mul16u a6,a3,a5 # [2] l32i a14,a1,296 # [3] id:254 filter_data+0x0 l32i a15,a1,300 # [4] id:253 bias+0x0 l16ui a9,a1,308 # [5] id:259 out_wd+0x0 l16ui a13,a1,288 # [6] id:255 stride_wd+0x0 neg a8,a7 # [7] l16ui a10,a1,292 # [8] id:258 stride_ht+0x0 l32i a11,a1,324 # [9] id:257 out_mult+0x0 l32i a12,a1,320 # [10] id:256 out_shift+0x0 s32i a12,a1,84 # [11] gra_spill_temp_44 s32i a11,a1,88 # [12] gra_spill_temp_45 s32i.n a10,a1,60 # [13] gra_spill_temp_38 s32i a8,a1,124 # [14] gra_spill_temp_54 s32i a13,a1,80 # [15] gra_spill_temp_43 s32i a9,a1,92 # [16] gra_spill_temp_46 s32i a15,a1,140 # [17] gra_spill_temp_58 s32i a14,a1,108 # [18] gra_spill_temp_50 slli a6,a6,1 # [19] movi.n a14,16 # [20] extui a15,a15,0,4 # [21] addi a9,a5,-7 # [22] movi.n a13,0 # [23] sub a8,a4,a8 # [24] addx2 a7,a5,a5 # [25] slli a7,a7,1 # [26] slli a4,a5,1 # [27] s32i a13,a1,68 # [28] gra_spill_temp_40 s32i a9,a1,144 # [29] gra_spill_temp_59 s32i a15,a1,132 # [30] gra_spill_temp_56 l32i.n a9,a1,56 # [31] gra_spill_temp_37 s32i a8,a1,76 # [32] gra_spill_temp_42 neg a9,a9 # [33] s32i.n a9,a1,48 # [34] gra_spill_temp_35 sub a8,a3,a9 # [35] s32i.n a8,a1,52 # [36] gra_spill_temp_36 .Lt_5_7938: # 0x822 l32i a10,a1,92 # [0] gra_spill_temp_46 beqz.n a10,.Lt_5_8194 # [2] .LBB6_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x827 l32i.n a5,a1,52 # [0] 
gra_spill_temp_36 l32i a11,a1,76 # [1] gra_spill_temp_42 movi.n a13,0 # [2] l32i a12,a1,72 # [3] gra_spill_temp_41 movi.n a15,0 # [4] l32i.n a8,a1,48 # [5] gra_spill_temp_35 l32i.n a9,a1,56 # [6] gra_spill_temp_37 s32i a9,a1,100 # [7] gra_spill_temp_48 s32i a8,a1,128 # [8] gra_spill_temp_55 s32i a15,a1,96 # [9] gra_spill_temp_47 max a12,a12,a13 # [10] s32i a12,a1,152 # [11] gra_spill_temp_61 movi.n a13,3 # [12] min a11,a11,a13 # [13] s32i a11,a1,136 # [14] gra_spill_temp_57 sub a11,a11,a12 # [15] s32i a11,a1,120 # [16] gra_spill_temp_53 .Lt_5_8706: # 0x854 l32i a2,a1,84 # [0] gra_spill_temp_44 l32i a10,a1,144 # [1] gra_spill_temp_59 l32i a11,a1,140 # [2] gra_spill_temp_58 l32i a12,a1,88 # [3] gra_spill_temp_45 s32i a12,a1,168 # [4] gra_spill_temp_65 s32i a11,a1,148 # [5] gra_spill_temp_60 blti a10,1,.Lt_5_8962 # [6] movi.n a8,0 # [0] movi.n a13,0 # [1] l32i a3,a1,100 # [2] gra_spill_temp_48 s32i a13,a1,164 # [3] gra_spill_temp_64 max a3,a3,a8 # [4] .Lt_5_9474: # 0x876 l32i a10,a1,136 # [0] gra_spill_temp_57 l32i a9,a1,152 # [1] gra_spill_temp_61 ee.zero.qacc # [2] bge a9,a10,.Lt_5_9730 # [3] .LBB12_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x882 l32i a12,a1,128 # [0] gra_spill_temp_55 l32i a15,a1,112 # [1] gra_spill_temp_51 l32i a10,a1,116 # [2] gra_spill_temp_52 l32i a13,a1,124 # [3] gra_spill_temp_54 mull a11,a9,a10 # [4] add.n a13,a13,a9 # [5] mull a13,a13,a15 # [6] addx2 a11,a11,a11 # [7] l32i a9,a1,164 # [8] gra_spill_temp_64 add.n a12,a12,a13 # [9] mull a10,a10,a12 # [10] add.n a11,a9,a11 # [11] l32i a12,a1,108 # [12] gra_spill_temp_50 add.n a9,a9,a10 # [13] l32i a10,a1,104 # [14] gra_spill_temp_49 addx2 a11,a11,a12 # [15] l32i a12,a1,120 # [16] gra_spill_temp_53 addx2 a9,a9,a10 # [17] loopgtz a12,.LBB32_esp_nn_depthwise_conv_s16_mult1_3x3 # [18] mov.n a13,a9 # [0] mov.n a12,a11 # [1] mov.n a9,a11 # [2] mov.n a11,a13 # [3] beqz.n a3,.Lt_5_10498 # [4] if (filter_x_start) add.n a11,a4,a13 # [0] add.n a9,a4,a12 # [1] .Lt_5_10498: # 0x8c5 ee.vld.128.xp q0,a11,a4 # 
[0] id:261 ee.vld.128.xp q1,a9,a4 # [1] id:262 bnez.n a3,.Lt_5_11010 # [2] if (filter_x_start) ee.vmulas.s16.qacc q0,q1 # [0] ee.vld.128.xp q0,a11,a4 # [1] id:264 ee.vld.128.xp q1,a9,a4 # [2] id:265 .Lt_5_11010: # 0x8d6 ee.vmulas.s16.qacc q0,q1 # [0] ee.vld.128.xp q0,a11,a4 # [1] id:267 ee.vld.128.xp q1,a9,a4 # [2] id:268 add.n a9,a6,a13 # [3] blti a5,3,.Lt_5_11522 # [4] if (filter_x_end) ee.vmulas.s16.qacc q0,q1 # [0] .Lt_5_11522: # 0x8e7 add.n a11,a7,a12 # [0] .LBB32_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x8eb .Lt_5_9730: # 0x8eb // extract data l32i a9,a1,160 # [0] gra_spill_temp_63 ee.st.qacc_l.l.128.ip a9,16 # [2] id:270 ee.st.qacc_l.h.32.ip a9,0 # [3] id:271 l8ui a11,a1,15 # [4] qacc_scratch+15 l16ui a10,a1,10 # [5] qacc_scratch+10 l8ui a15,a1,16 # [6] qacc_scratch+16 l8ui a13,a1,6 # [7] qacc_scratch+6 l8ui a12,a1,5 # [8] qacc_scratch+5 s8i a12,a1,2 # [9] qacc_scratch+2 s8i a13,a1,3 # [10] qacc_scratch+3 s8i a15,a1,7 # [11] qacc_scratch+7 s16i a10,a1,4 # [12] qacc_scratch+4 s8i a11,a1,6 # [13] qacc_scratch+6 ee.st.qacc_h.l.128.ip a9,16 # [14] id:281 ee.st.qacc_h.h.32.ip a9,-32 # [15] id:282 ee.srcmb.s16.qacc q1,a14,0 # [16] l8ui a15,a1,31 # [17] qacc_scratch+31 l8ui a8,a1,32 # [18] qacc_scratch+32 l16ui a13,a1,26 # [19] qacc_scratch+26 l8ui a12,a1,22 # [20] qacc_scratch+22 l8ui a11,a1,21 # [21] qacc_scratch+21 l16ui a10,a1,16 # [22] qacc_scratch+16 s16i a10,a1,8 # [23] qacc_scratch+8 s8i a11,a1,10 # [24] qacc_scratch+10 s8i a12,a1,11 # [25] qacc_scratch+11 s16i a13,a1,12 # [26] qacc_scratch+12 s8i a8,a1,15 # [27] qacc_scratch+15 s8i a15,a1,14 # [28] qacc_scratch+14 l32i a8,a1,140 # [29] gra_spill_temp_58 , bias ee.vld.128.ip q0,a9,0 # [30] id:294 s32i a9,a1,160 # [31] gra_spill_temp_63 ee.vzip.16 q0,q1 # [32] beqz.n a8,.Lt_5_12290 # [33] // skip bias addi a8,a1,112 # [0] l32i a10,a1,132 # [1] gra_spill_temp_56 l32i a9,a1,148 # [2] gra_spill_temp_60 wur.sar_byte a10 # [3] ee.vld.128.ip q4,a9,16 # [4] id:297 ee.vld.128.ip q7,a9,16 # [5] id:298 ee.vld.128.ip 
q5,a9,0 # [6] id:299 s32i a9,a1,148 # [7] gra_spill_temp_60 ee.src.q.qup q6,q4,q7 # [8] ee.vadds.s32 q0,q0,q6 # [9] ee.src.q.qup q3,q4,q5 # [10] ee.vadds.s32 q1,q1,q3 # [11] st.qr q1,a8,64 # [12] gra_spill_temp_66-112 .Lt_5_12290: # 0x974 addi a11,a1,112 # [0] # 287 q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr); l32i a10,a1,168 # [1] gra_spill_temp_65 st.qr q1,a11,64 # [2] gra_spill_temp_66-112 mov.n a11,a2 # [3] call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [4] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # 288 out_mult_ptr += 4; # 289 out_shift_ptr += 4; # 290 # 291 q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr); l32i a10,a1,168 # [0] gra_spill_temp_65 addmi a12,a1,256 # [1] addi a11,a1,112 # [2] st.qr q0,a12,-16 # [3] gra_spill_temp_70-256 ld.qr q0,a11,64 # [4] gra_spill_temp_66-112 addi a10,a10,16 # [5] addi a11,a2,16 # [6] call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [7] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 .LBB25_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x99a # Part of loop body line 216, head labeled .Lt_5_9474 movi.n a14,16 # [0] # 292 out_mult_ptr += 4; # 293 out_shift_ptr += 4; addi a2,a2,32 # [1] l32i a15,a1,144 # [2] gra_spill_temp_59 l32i a9,a1,156 # [3] gra_spill_temp_62 l32i a8,a1,168 # [4] gra_spill_temp_65 addmi a12,a1,256 # [5] addi a13,a1,112 # [6] ld.qr q3,a13,112 # [7] gra_spill_temp_69-112 ld.qr q1,a13,80 # [8] gra_spill_temp_67-112 ld.qr q2,a12,-16 # [9] gra_spill_temp_70-256 addi a8,a8,32 # [10] s32i a8,a1,168 # [11] gra_spill_temp_65 ee.vadds.s32 q2,q2,q1 # [12] ee.vadds.s32 q1,q0,q1 # [13] ee.vmin.s32 q0,q2,q3 # [14] ee.vmin.s32 q1,q1,q3 # [15] ld.qr q2,a13,96 # [16] gra_spill_temp_68-112 l32i a13,a1,164 # [17] gra_spill_temp_64 ee.vmax.s32 q1,q1,q2 # [18] ee.vmax.s32 q0,q0,q2 # [19] addi.n a13,a13,8 # [20] s32i a13,a1,164 # [21] gra_spill_temp_64 ee.vunzip.16 q0,q1 # [22] ee.vunzip.8 q0,q1 # [23] ee.vst.l.64.ip q0,a9,8 # [24] id:302 
s32i a9,a1,156 # [25] gra_spill_temp_62 blt a13,a15,.Lt_5_9474 # [26] .Lt_5_8962: # 0x9e9 # Part of loop body line 203, head labeled .Lt_5_8706 l32i a8,a1,92 # [0] gra_spill_temp_46 l32i a11,a1,100 # [1] gra_spill_temp_48 l32i a10,a1,128 # [2] gra_spill_temp_55 l32i a9,a1,80 # [3] gra_spill_temp_43 l32i a15,a1,96 # [4] gra_spill_temp_47 sub a5,a5,a9 # [5] addi.n a15,a15,1 # [6] s32i a15,a1,96 # [7] gra_spill_temp_47 add.n a10,a10,a9 # [8] sub a11,a11,a9 # [9] s32i a11,a1,100 # [10] gra_spill_temp_48 s32i a10,a1,128 # [11] gra_spill_temp_55 sub a15,a15,a8 # [12] bnez a15,.Lt_5_8706 # [13] .Lt_5_8194: # 0xa11 # Part of loop body line 201, head labeled .Lt_5_7938 l32i a13,a1,64 # [0] gra_spill_temp_39 l32i a10,a1,72 # [1] gra_spill_temp_41 l32i a9,a1,124 # [2] gra_spill_temp_54 l32i.n a8,a1,60 # [3] gra_spill_temp_38 l32i a12,a1,68 # [4] gra_spill_temp_40 l32i a15,a1,76 # [5] gra_spill_temp_42 addi.n a12,a12,1 # [6] s32i a12,a1,68 # [7] gra_spill_temp_40 sub a15,a15,a8 # [8] add.n a9,a9,a8 # [9] sub a10,a10,a8 # [10] s32i a10,a1,72 # [11] gra_spill_temp_41 s32i a9,a1,124 # [12] gra_spill_temp_54 s32i a15,a1,76 # [13] gra_spill_temp_42 sub a12,a12,a13 # [14] bnez a12,.Lt_5_7938 # [15] .Lt_5_7426: # 0xa3e retw.n # [0] .size esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3 ================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

	.text
	.literal_position

	# Program Unit: esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3
	.type	esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3, @function
	.align	4
	.global	esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3

// Depthwise convolution specialization: int16 data path, channel multiplier 1,
// 3x3 filter, no padding (per the function name — TODO confirm against the C
// reference in esp_nn_depthwise_conv_ansi.c).
//
// NOTE(review): this is compiler-scheduled windowed-ABI assembly (entry/retw.n).
// The stack frame layout below (qacc_scratch + gra_spill_temp_* slots) is relied
// on by exact byte offsets throughout; do not reorder instructions or change
// frame offsets in isolation.
esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3:  # 0xa42
	# qacc_scratch = 0
	# gra_spill_temp_71 = 48
	# gra_spill_temp_72 = 52
	# gra_spill_temp_73 = 56
	# gra_spill_temp_74 = 60
	# gra_spill_temp_75 = 64
	# gra_spill_temp_76 = 68
	# gra_spill_temp_77 = 72
	# gra_spill_temp_78 = 76
	# gra_spill_temp_79 = 80
	# gra_spill_temp_80 = 84
	# gra_spill_temp_81 = 88
	# gra_spill_temp_82 = 92
	# gra_spill_temp_83 = 96
	# gra_spill_temp_84 = 100
	# gra_spill_temp_85 = 104
	# gra_spill_temp_86 = 108
	# gra_spill_temp_87 = 112
	# gra_spill_temp_88 = 116
	# gra_spill_temp_89 = 120
	# gra_spill_temp_90 = 124
	# gra_spill_temp_91 = 128
	# gra_spill_temp_92 = 132
	# gra_spill_temp_93 = 136
	# gra_spill_temp_94 = 140
	# gra_spill_temp_95 = 144
	# gra_spill_temp_96 = 160
	# gra_spill_temp_97 = 176
	# gra_spill_temp_98 = 192
	# gra_spill_temp_99 = 208
	# gra_spill_temp_100 = 224
	# gra_spill_temp_101 = 240
	# gra_spill_temp_102 = 244
	# gra_spill_temp_103 = 248

	// registers:
	// a2: const int16_t *input_data
	// a3: const uint16_t input_wd
	// a4: const uint16_t input_ht
	// a5: const uint16_t channels
	// a6: const uint16_t stride_wd
	// a7: const uint16_t stride_ht

	// on stack:
	// const int16_t *filter_data
	// const int32_t *bias
	// int8_t *out_data
	// const uint16_t out_wd
	// const uint16_t out_ht
	// const int32_t out_offset
	// const int32_t *out_shift
	// const int32_t *out_mult
	// const int32_t activation_min
	// const int32_t activation_max

	entry	a1,288 #

	// Spill incoming register args; broadcast out_offset / activation_min /
	// activation_max into q-register spill slots for reuse in the store path.
	s32i	a2,a1,120	# [0] gra_spill_temp_89
	s32i.n	a3,a1,48	# [1] gra_spill_temp_71
	s32i	a5,a1,76	# [2] gra_spill_temp_78
	s32i	a6,a1,84	# [3] gra_spill_temp_80
	s32i.n	a7,a1,60	# [4] gra_spill_temp_74
	l32i	a12,a1,296	# [5] id:241 out_data+0x0
	addi	a14,a1,112	# [6]
	addmi	a10,a1,256	# [7]
	addmi	a13,a1,256	# [8]
	addmi	a15,a1,256	# [9]

	// height loop
	l16ui	a8,a1,304	# [10] id:242 out_ht+0x0
	s32i.n	a8,a1,56	# [11] gra_spill_temp_73
	addi	a15,a15,52	# [12]
	addi	a13,a13,64	# [13]
	addi	a10,a10,68	# [14]
	ee.vldbc.32	q0,a10	# [15] id:240 activation_max
	ee.vldbc.32	q1,a13	# [16] id:239 activation_min
	ee.vldbc.32	q2,a15	# [17] id:238 out_offset
	st.qr	q2,a14,64	# [18] gra_spill_temp_97-112
	st.qr	q1,a14,80	# [19] gra_spill_temp_98-112
	st.qr	q0,a14,96	# [20] gra_spill_temp_99-112
	beqz.n	a8,.Lt_6_6914	# [21]		// out_ht == 0 -> nothing to do

.LBB3_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xa83
	// One-time loop-invariant setup: row strides in bytes, loop counters,
	// channels-7 bound for the 8-channel vector loop.
	s32i	a1,a1,144	# [0] gra_spill_temp_95
	mul16u	a7,a3,a5	# [1]
	s32i	a4,a1,72	# [2] gra_spill_temp_77
	addi	a9,a5,-7	# [3]
	l16ui	a11,a1,300	# [4] id:247 out_wd+0x0
	l32i	a10,a1,292	# [5] id:243 bias+0x0
	l32i	a15,a1,288	# [6] id:244 filter_data+0x0
	l32i	a13,a1,316	# [7] id:246 out_mult+0x0
	l32i	a14,a1,312	# [8] id:245 out_shift+0x0
	s32i	a14,a1,88	# [9] gra_spill_temp_81
	s32i	a13,a1,92	# [10] gra_spill_temp_82
	s32i	a15,a1,124	# [11] gra_spill_temp_90
	s32i	a10,a1,116	# [12] gra_spill_temp_88
	s32i	a11,a1,96	# [13] gra_spill_temp_83
	s32i	a9,a1,136	# [14] gra_spill_temp_93
	addx2	a4,a5,a5	# [15]		// a4 = 3 * channels
	slli	a4,a4,1	# [16]		// ... in bytes (int16)
	slli	a7,a7,1	# [17]		// input row stride in bytes
	l32i.n	a9,a1,60	# [18] gra_spill_temp_74
	movi.n	a11,0	# [19]
	extui	a10,a10,0,4	# [20]		// low 4 bits of bias ptr, for sar_byte alignment
	movi.n	a15,0	# [21]
	slli	a5,a5,1	# [22]		// channel stride in bytes
	s32i	a15,a1,68	# [23] gra_spill_temp_76
	s32i	a10,a1,112	# [24] gra_spill_temp_87
	s32i	a11,a1,64	# [25] gra_spill_temp_75
	mul16u	a8,a3,a9	# [26]
	movi.n	a11,0	# [27]
	s32i	a11,a1,80	# [28] gra_spill_temp_79
	s32i.n	a8,a1,52	# [29] gra_spill_temp_72

.Lt_6_7426: # 0xad8
	// width_loop
	l32i	a8,a1,96	# [0] gra_spill_temp_83
	beqz.n	a8,.Lt_6_7682	# [2]		// out_wd == 0 -> skip row body

	// Clamp the filter-row count to min(remaining input height, 3).
	movi.n	a11,3	# [0]
	l32i	a10,a1,72	# [1] gra_spill_temp_77
	movi.n	a9,0	# [2]
	movi.n	a13,0	# [3]
	l32i.n	a14,a1,48	# [4] gra_spill_temp_71
	s32i	a14,a1,108	# [5] gra_spill_temp_86
	s32i	a13,a1,104	# [6] gra_spill_temp_85
	s32i	a9,a1,100	# [7] gra_spill_temp_84
	min	a10,a10,a11	# [8]
	s32i	a10,a1,128	# [9] gra_spill_temp_91

.Lt_6_8194: # 0xaf7
	l32i	a2,a1,88	# [0] gra_spill_temp_81
	l32i	a6,a1,92	# [1] gra_spill_temp_82
	l32i	a8,a1,116	# [2] gra_spill_temp_88

	// channel loop
	l32i	a15,a1,136	# [3] gra_spill_temp_93
	s32i	a8,a1,140	# [4] gra_spill_temp_94
	blti	a15,1,.Lt_6_8450	# [5]		// fewer than 8 channels -> skip vector loop

	movi.n	a11,0	# [0]
	movi.n	a10,0	# [1]
	l32i	a9,a1,76	# [2] gra_spill_temp_78
	l32i	a14,a1,80	# [3] gra_spill_temp_79
	movi.n	a8,3	# [4]
	l32i	a3,a1,108	# [5] gra_spill_temp_86
	l32i	a13,a1,104	# [6] gra_spill_temp_85
	min	a3,a3,a8	# [7]		// a3 = min(remaining width, 3) filter columns
	add.n	a13,a13,a14	# [8]
	mull	a9,a9,a13	# [9]
	s32i	a9,a1,132	# [10] gra_spill_temp_92

.Lt_6_8962: # 0xb26
	// Accumulate min(rows,3) x min(cols,3) taps for 8 channels into QACC.
	ee.zero.qacc	# [0]
	l32i	a9,a1,132	# [1] gra_spill_temp_92
	l32i	a13,a1,120	# [2] gra_spill_temp_89
	add.n	a9,a9,a10	# [3]
	addx2	a9,a9,a13	# [4]		// input pointer for this channel group
	l32i	a13,a1,124	# [5] gra_spill_temp_90
	l32i	a14,a1,128	# [6] gra_spill_temp_91
	add.n	a13,a11,a13	# [7]		// filter pointer for this channel group
	loopgtz	a14,.LBB30_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [8]	// zero-overhead filter-row loop

.Lt_6_9730: # 0xb3f
	# Loop body line 360, nesting depth: 4, estimated iterations: 100
	// Load first input/filter column; branch on column count (1, 2 or 3).
	mov.n	a14,a13	# [0]
	mov.n	a15,a9	# [1]
	ee.vld.128.xp	q0,a15,a5	# [2] id:249
	ee.vld.128.xp	q1,a14,a5	# [3] id:250
	add.n	a9,a9,a7	# [4]		// advance input by one row
	beqi	a3,2,.LBB15_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [5]

.Lt_6_9986: # 0xb4e
	beqi	a3,3,.LBB17_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [0]

.Lt_6_10498: # 0xb51
	add.n	a13,a13,a4	# [0]		// advance filter by one row (3*channels)
	ee.vmulas.s16.qacc	q0,q1	# [1]	// last column multiply-accumulate

.LBB30_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xb58
	// extract data
	// NOTE(review): QACC is spilled to qacc_scratch and its bytes are repacked
	// with the l8ui/s8i/l16ui/s16i shuffle below before ee.srcmb.s16.qacc
	// converts lanes to s16; the exact layout follows the PIE accumulator
	// format — confirm against the ESP32-S3 TRM before touching offsets.
	l32i	a15,a1,144	# [0] gra_spill_temp_95
	ee.st.qacc_l.l.128.ip	a15,16	# [2] id:258
	ee.st.qacc_l.h.32.ip	a15,0	# [3] id:259
	l8ui	a14,a1,15	# [4] qacc_scratch+15
	l8ui	a13,a1,16	# [5] qacc_scratch+16
	l8ui	a8,a1,5	# [6] qacc_scratch+5
	l8ui	a9,a1,6	# [7] qacc_scratch+6
	s8i	a9,a1,3	# [8] qacc_scratch+3
	s8i	a8,a1,2	# [9] qacc_scratch+2
	s8i	a13,a1,7	# [10] qacc_scratch+7
	s8i	a14,a1,6	# [11] qacc_scratch+6
	l16ui	a13,a1,10	# [12] qacc_scratch+10
	s16i	a13,a1,4	# [13] qacc_scratch+4
	ee.st.qacc_h.l.128.ip	a15,16	# [14] id:269
	ee.st.qacc_h.h.32.ip	a15,-32	# [15] id:270
	l8ui	a9,a1,32	# [16] qacc_scratch+32
	l8ui	a13,a1,22	# [17] qacc_scratch+22
	l8ui	a8,a1,31	# [18] qacc_scratch+31
	l16ui	a14,a1,26	# [19] qacc_scratch+26
	s16i	a14,a1,12	# [20] qacc_scratch+12
	s8i	a8,a1,14	# [21] qacc_scratch+14
	s8i	a13,a1,11	# [22] qacc_scratch+11
	s8i	a9,a1,15	# [23] qacc_scratch+15
	l32i	a13,a1,116	# [24] gra_spill_temp_88
	l8ui	a9,a1,21	# [25] qacc_scratch+21
	l16ui	a8,a1,16	# [26] qacc_scratch+16
	movi.n	a14,16	# [27]
	ee.srcmb.s16.qacc	q1,a14,0	# [28]
	s16i	a8,a1,8	# [29] qacc_scratch+8
	s8i	a9,a1,10	# [30] qacc_scratch+10
	ee.vld.128.ip	q0,a15,0	# [31] id:282
	s32i	a15,a1,144	# [32] gra_spill_temp_95
	ee.vzip.16	q0,q1	# [33]		// interleave to two q-regs of s32 accumulators
	bnez.n	a13,.LBB20_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [34]	// bias != NULL -> add bias

	// No bias: just preserve caller-saved state and the high accumulator half.
	s32i	a12,a1,240	# [0] gra_spill_temp_101
	s32i	a11,a1,244	# [1] gra_spill_temp_102
	s32i	a10,a1,248	# [2] gra_spill_temp_103
	addi	a14,a1,112	# [3]
	st.qr	q1,a14,48	# [4] gra_spill_temp_96-112
	j	.Lt_6_11266	# [5]

.LBB15_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xbce
	# Part of loop body line 360, head labeled .Lt_6_9730
	// 2-column case: fused multiply-accumulate + load of next column.
	ee.vmulas.s16.qacc.ld.xp	q0,a15,a5,q0,q1	# [0] id:251
	ee.vld.128.xp	q1,a14,a5	# [1] id:252
	bnei	a3,3,.Lt_6_10498	# [2]

.LBB17_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xbd8
	// 3-column case: two more fused MAC+load steps.
	ee.vmulas.s16.qacc.ld.xp	q3,a15,a5,q0,q1	# [0] id:253
	ee.vld.128.xp	q4,a14,a5	# [1] id:254
	ee.vld.128.xp	q1,a14,a5	# [2] id:256
	ee.vmulas.s16.qacc.ld.xp	q0,a15,a5,q3,q4	# [3] id:255
	j	.Lt_6_10498	# [4]

.LBB20_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xbe9
	# Part of loop body line 358, head labeled .Lt_6_8962
	// Add bias: unaligned 128-bit loads realigned via sar_byte + ee.src.q.qup.
	s32i	a12,a1,240	# [0] gra_spill_temp_101
	s32i	a11,a1,244	# [1] gra_spill_temp_102
	s32i	a10,a1,248	# [2] gra_spill_temp_103
	addi	a15,a1,112	# [3]
	l32i	a9,a1,112	# [4] gra_spill_temp_87
	l32i	a8,a1,140	# [5] gra_spill_temp_94
	wur.sar_byte	a9	# [6]
	ee.vld.128.ip	q6,a8,16	# [7] id:285
	ee.vld.128.ip	q3,a8,16	# [8] id:286
	ee.vld.128.ip	q7,a8,0	# [9] id:287
	s32i	a8,a1,140	# [10] gra_spill_temp_94
	ee.src.q.qup	q2,q6,q3	# [11]
	ee.vadds.s32	q0,q0,q2	# [12]
	ee.src.q.qup	q5,q6,q7	# [13]
	ee.vadds.s32	q1,q1,q5	# [14]
	st.qr	q1,a15,48	# [15] gra_spill_temp_96-112

.Lt_6_11266: # 0xc19
	# 423 q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
	// Requantize both 4-lane halves via helper (a10 = out_mult, a11 = out_shift).
	mov.n	a10,a6	# [0]
	mov.n	a11,a2	# [1]
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [2] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	addi	a11,a1,112	# [0]
	addi	a10,a6,16	# [1]
	st.qr	q0,a11,112	# [2] gra_spill_temp_100-112
	ld.qr	q0,a11,48	# [3] gra_spill_temp_96-112
	addi	a11,a2,16	# [4]
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [5] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	// out_mult_ptr += 8, out_shift_ptr += 8 (two groups of 4 done above);
	// then add out_offset, clamp to [activation_min, activation_max],
	// narrow s32 -> s8 and store 8 output bytes.
	addi	a6,a6,32	# [0]
	addi	a2,a2,32	# [1]
	l32i	a13,a1,136	# [2] gra_spill_temp_93
	l32i	a12,a1,240	# [3] gra_spill_temp_101
	l32i	a10,a1,248	# [4] gra_spill_temp_103
	l32i	a11,a1,244	# [5] gra_spill_temp_102
	addi	a9,a1,112	# [6]
	ld.qr	q6,a9,80	# [7] gra_spill_temp_98-112
	ld.qr	q7,a9,96	# [8] gra_spill_temp_99-112
	ld.qr	q5,a9,64	# [9] gra_spill_temp_97-112
	ld.qr	q4,a9,112	# [10] gra_spill_temp_100-112
	addi	a11,a11,16	# [11]
	addi.n	a10,a10,8	# [12]
	ee.vadds.s32	q4,q4,q5	# [13]
	ee.vadds.s32	q5,q0,q5	# [14]
	ee.vmin.s32	q4,q4,q7	# [15]
	ee.vmax.s32	q4,q4,q6	# [16]
	ee.vmin.s32	q5,q5,q7	# [17]
	ee.vmax.s32	q5,q5,q6	# [18]
	ee.vunzip.16	q4,q5	# [19]
	ee.vunzip.8	q4,q5	# [20]
	ee.vst.l.64.ip	q4,a12,8	# [21] id:290
	blt	a10,a13,.Lt_6_8962	# [22]	// next 8-channel group

.Lt_6_8450: # 0xc76
	# Part of loop body line 348, head labeled .Lt_6_8194
	// End of width iteration: bump out_x, advance base input column.
	l32i	a11,a1,96	# [0] gra_spill_temp_83
	l32i	a15,a1,104	# [1] gra_spill_temp_85
	l32i	a14,a1,84	# [2] gra_spill_temp_80
	l32i	a10,a1,100	# [3] gra_spill_temp_84
	l32i	a13,a1,108	# [4] gra_spill_temp_86
	addi.n	a10,a10,1	# [5]
	s32i	a10,a1,100	# [6] gra_spill_temp_84
	sub	a13,a13,a14	# [7]
	add.n	a15,a15,a14	# [8]
	s32i	a15,a1,104	# [9] gra_spill_temp_85
	s32i	a13,a1,108	# [10] gra_spill_temp_86
	sub	a10,a10,a11	# [11]
	bnez	a10,.Lt_6_8194	# [12]

.Lt_6_7682: # 0xc9b
	// End of height iteration: bump out_y, advance base input row.
	l32i.n	a9,a1,56	# [0] gra_spill_temp_73
	l32i	a15,a1,64	# [1] gra_spill_temp_75
	l32i.n	a14,a1,52	# [2] gra_spill_temp_72
	l32i	a13,a1,80	# [3] gra_spill_temp_79
	l32i.n	a11,a1,60	# [4] gra_spill_temp_74
	l32i	a8,a1,68	# [5] gra_spill_temp_76
	l32i	a10,a1,72	# [6] gra_spill_temp_77
	addi.n	a8,a8,1	# [7]
	s32i	a8,a1,68	# [8] gra_spill_temp_76
	sub	a10,a10,a11	# [9]
	add.n	a13,a13,a14	# [10]
	add.n	a15,a15,a11	# [11]
	s32i	a15,a1,64	# [12] gra_spill_temp_75
	s32i	a13,a1,80	# [13] gra_spill_temp_79
	s32i	a10,a1,72	# [14] gra_spill_temp_77
	sub	a8,a8,a9	# [15]
	bnez	a8,.Lt_6_7426	# [16]

.Lt_6_6914: # 0xcc8
	retw.n	# [0]

	.size	esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3

================================================
FILE: src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
	.text
	.literal_position

	# Program Unit: esp_nn_depthwise_conv_s16_mult1_esp32s3
	.type	esp_nn_depthwise_conv_s16_mult1_esp32s3, @function
	.align	4
	.global	esp_nn_depthwise_conv_s16_mult1_esp32s3

// Depthwise convolution: int16 data path, channel multiplier 1, generic
// filter size and padding (per the function name — TODO confirm against the
// C reference in esp_nn_depthwise_conv_ansi.c).
//
// Stack-frame arg offsets (entry a1,160): stride_wd@160, stride_ht@164,
// filter_data@168, filter_wd@172, filter_ht@176, bias@180, out_data@184,
// out_wd@188, out_ht@192, out_offset@196, out_shift@200, out_mult@204,
// activation_min@208, activation_max@212 — derived from the loads below.
//
// NOTE(review): compiler-scheduled windowed-ABI assembly; spill-slot offsets
// are hard-coded throughout — do not reorder instructions in isolation.
esp_nn_depthwise_conv_s16_mult1_esp32s3: # 0x4c8
	# scratch_buf = 0
	# gra_spill_temp_2 = 48
	# gra_spill_temp_22 = 52
	# gra_spill_temp_4 = 56
	# gra_spill_temp_23 = 60
	# gra_spill_temp_24 = 64
	# gra_spill_temp_7 = 68
	# gra_spill_temp_26 = 72
	# gra_spill_temp_27 = 76
	# gra_spill_temp_28 = 80
	# gra_spill_temp_29 = 84
	# gra_spill_temp_12 = 88
	# gra_spill_temp_13 = 92
	# gra_spill_temp_14 = 96
	# gra_spill_temp_15 = 100
	# gra_spill_temp_21 = 104
	# gra_spill_temp_17 = 108
	# gra_spill_temp_18 = 112
	# gra_spill_temp_20 = 116
	# gra_spill_temp_30 = 0
	# gra_spill_temp_34 = 16

	// in registers:
	// a2: *input_data
	// a3: input_wd
	// a4: input_ht
	// a5: channels
	// a6: pad_wd
	// a7: pad_ht

	// on stack:
	// stride_wd
	// stride_ht
	// *filter_data
	// filter_wd
	// filter_ht
	// *bias
	// *out_data
	// out_wd
	// out_ht
	// out_offset
	// *out_shift
	// *out_mult
	// activation_min
	// activation_max

	entry	a1,160 #

	l32i	a9,a1,184	# [7] id:237 out_data+0x0
	l16ui	a8,a1,192	# [8] id:238 out_ht+0x0
	s32i	a2,a1,52	# [0] gra_spill_temp_22
	s32i.n	a4,a1,56	# [1] gra_spill_temp_4
	s32i	a5,a1,60	# [2] gra_spill_temp_23
	s32i	a9,a1,112	# [10] gra_spill_temp_18
	beqz.n	a8,.Lt_4_7170	# [20]		// out_ht == 0 -> nothing to do

.LBB3_esp_nn_depthwise_conv_s16_mult1: # 0x508
	// Loop-invariant setup: negated pads (sign-extended to 16 bits),
	// channel byte-stride, channels-7 bound for the 8-channel vector loop.
	l16ui	a4,a1,172	# [0] id:240 filter_wd+0x0
	neg	a13,a7	# [2]
	neg	a12,a6	# [3]
	sext	a12,a12,15	# [16]
	sext	a13,a13,15	# [17]
	s32i	a13,a1,92	# [18] gra_spill_temp_13
	s32i.n	a12,a1,48	# [19] gra_spill_temp_2
	movi.n	a8,0	# [20]
	slli	a9,a5,1	# [21]
	addi	a10,a5,-7	# [22]
	s32i	a10,a1,100	# [23] gra_spill_temp_15
	s32i	a9,a1,64	# [24] gra_spill_temp_24
	s32i	a8,a1,68	# [25] gra_spill_temp_7
	j	.Lt_4_7682	# [30]

.Lt_4_7938: # 0x561
	// Height-loop increment: out_y++, base_y += stride_ht.
	l32i	a15,a1,192	# [0] out_ht
	l32i.n	a9,a1,164	# [1] stride_ht
	l32i	a14,a1,68	# [2] gra_spill_temp_7
	l32i	a8,a1,92	# [3] gra_spill_temp_13
	addi.n	a14,a14,1	# [4]
	s32i	a14,a1,68	# [5] gra_spill_temp_7
	add.n	a9,a8,a9	# [6]
	sub	a14,a14,a15	# [7]
	sext	a8,a9,15	# [8]
	s32i	a8,a1,92	# [9] gra_spill_temp_13
	beqz	a14,.Lt_4_7170	# [10]

.Lt_4_7682: # 0x57f
	# Loop body line 59, nesting depth: 1, estimated iterations: 100
	# 60 const int16_t base_y = (out_y * stride_ht) - pad_ht;
	# 61 for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
	l32i	a10,a1,188	# [0] out_width
	beqz.n	a10,.Lt_4_7938	# [2]

.LBB6_esp_nn_depthwise_conv_s16_mult1: # 0x584
	# Part of loop body line 59, head labeled .Lt_4_7682
	// Clip the filter window vertically against the input:
	// a7 = min(filter_ht, input_ht - base_y), start row = max(-base_y, 0).
	movi.n	a14,0	# [0]
	l32i.n	a7,a1,176	# [1] filter_ht
	l32i	a13,a1,92	# [2] gra_spill_temp_13
	l32i.n	a8,a1,56	# [3] gra_spill_temp_4
	movi.n	a11,0	# [4]
	l32i.n	a12,a1,48	# [5] gra_spill_temp_2
	s32i	a12,a1,84	# [6] gra_spill_temp_29
	s32i	a11,a1,88	# [7] gra_spill_temp_12
	sub	a8,a8,a13	# [8]
	min	a7,a7,a8	# [9]
	neg	a13,a13	# [10]
	max	a13,a13,a14	# [11]
	s32i	a13,a1,96	# [12] gra_spill_temp_14
	j	.Lt_4_8450	# [13]

.Lt_4_8706: # 0x5a9
	# Part of loop body line 61, head labeled .Lt_4_8450
	// Width-loop increment: out_x++, base_x += stride_wd.
	l32i	a10,a1,188	# [0] out_width
	l32i	a12,a1,160	# [1] stride_wd
	l32i	a9,a1,88	# [2] gra_spill_temp_12
	l32i	a11,a1,84	# [3] gra_spill_temp_29
	addi.n	a9,a9,1	# [4]
	s32i	a9,a1,88	# [5] gra_spill_temp_12
	add.n	a12,a11,a12	# [6]
	sext	a11,a12,15	# [7]
	s32i	a11,a1,84	# [8] gra_spill_temp_29
	beq	a9,a10,.Lt_4_7938	# [9]

.Lt_4_8450: # 0x5c5
	# Loop body line 61, nesting depth: 2, estimated iterations: 100
	# 69 uint32_t bias_ptr = (uint32_t) bias;
	# 70 const int32_t *out_mult_ptr = out_mult;
	# 71 const int32_t *out_shift_ptr = out_shift;
	# 72
	# 73 for (int ch_idx = 0; ch_idx < channels - 7; ch_idx += 8) {//channel_loop
	l32i	a13,a1,100	# [0] gra_spill_temp_15
	l32i	a14,a1,180	# [1] bias
	l32i	a15,a1,204	# [2] out_mult
	l32i	a8,a1,200	# [3] out_shift
	s32i	a8,a1,104	# [4] gra_spill_temp_21
	s32i	a15,a1,116	# [5] gra_spill_temp_20
	s32i	a14,a1,108	# [6] gra_spill_temp_17
	blti	a13,1,.Lt_4_8706	# [7]		// fewer than 8 channels -> skip vector loop

.LBB9_esp_nn_depthwise_conv_s16_mult1: # 0x5dd
	# Part of loop body line 61, head labeled .Lt_4_8450
	// Clip the filter window horizontally against the input:
	// a6 = max(-base_x, 0), a5 = min(filter_wd, input_wd - base_x).
	movi.n	a2,0	# [0]
	l32i	a5,a1,84	# [1] gra_spill_temp_29
	movi.n	a8,0	# [2]
	neg	a6,a5	# [3]
	max	a6,a6,a8	# [4]
	sub	a5,a3,a5	# [5]
	min	a5,a4,a5	# [6]
	sub	a9,a5,a6	# [7]
	s32i	a9,a1,72	# [8] gra_spill_temp_26
	j	.Lt_4_9218	# [9]

.Lt_4_9474: # 0x5f9
	// extract data
	// NOTE(review): QACC is spilled to scratch_buf and its bytes repacked
	// before ee.srcmb.s16.qacc; layout follows the PIE accumulator format —
	// confirm against the ESP32-S3 TRM before touching offsets.
	mov	a11,a1
	ee.st.qacc_l.l.128.ip	a11,16	# [2] id:252
	ee.st.qacc_l.h.32.ip	a11,0	# [3] id:253
	l8ui	a12,a1,15	# [4] scratch_buf+15
	l16ui	a10,a1,10	# [5] scratch_buf+10
	l8ui	a13,a1,5	# [6] scratch_buf+5
	l8ui	a14,a1,6	# [7] scratch_buf+6
	l8ui	a15,a1,16	# [8] scratch_buf+16
	s8i	a13,a1,2	# [11] scratch_buf+2
	s8i	a14,a1,3	# [10] scratch_buf+3
	s8i	a15,a1,7	# [9] scratch_buf+7
	s16i	a10,a1,4	# [12] scratch_buf+4
	s8i	a12,a1,6	# [13] scratch_buf+6
	movi.n	a10,16	# [14]
	ee.st.qacc_h.l.128.ip	a11,16	# [15] id:263
	ee.st.qacc_h.h.32.ip	a11,-32	# [16] id:264
	ee.srcmb.s16.qacc	q1,a10,0	# [17]
	l8ui	a8,a1,31	# [18] scratch_buf+31
	l8ui	a9,a1,32	# [19] scratch_buf+32
	l16ui	a12,a1,16	# [20] scratch_buf+16
	l8ui	a13,a1,21	# [21] scratch_buf+21
	l8ui	a14,a1,22	# [22] scratch_buf+22
	l16ui	a15,a1,26	# [23] scratch_buf+26
	s8i	a13,a1,10	# [26] scratch_buf+10
	s8i	a14,a1,11	# [25] scratch_buf+11
	s16i	a15,a1,12	# [24] scratch_buf+12
	s16i	a12,a1,8	# [27] scratch_buf+8
	s8i	a9,a1,15	# [28] scratch_buf+15
	s8i	a8,a1,14	# [29] scratch_buf+14
	l32i	a9,a1,180	# [30] bias
	ee.vld.128.ip	q0,a11,0	# [31] id:164
	ee.vzip.16	q0,q1	# [33]
	beqz.n	a9,.Lt_4_11522	# [34]	// skip bias

	// add bias (unaligned 128-bit loads realigned via sar_byte + ee.src.q)
	l32i	a9,a1,108	# [0] gra_spill_temp_17
	addi	a8,a1,112	# [1]
	extui	a10,a9,0,4	# [2]
	wur.sar_byte	a10	# [3]
	ee.vld.128.ip	q4,a9,16	# [4] id:279
	ee.vld.128.ip	q7,a9,16	# [5] id:168
	ee.vld.128.ip	q5,a9,0	# [6] id:281
	s32i	a9,a1,108	# [7] gra_spill_temp_17
	ee.src.q	q4,q4,q7	# [8]
	ee.src.q	q7,q7,q5	# [10]
	ee.vadds.s32	q0,q0,q4	# [9]
	ee.vadds.s32	q1,q1,q7	# [11]
	st.qr	q1,a1,0	# [12] gra_spill_temp_30-112

.Lt_4_11522: # 0x684
	// apply quantisation: esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
	l32i	a10,a1,116	# [1] gra_spill_temp_20
	l32i	a11,a1,104	# [3] gra_spill_temp_21
	st.qr	q1,a1,0	# [2] gra_spill_temp_30-112
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [4] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	l32i	a10,a1,116	# [2] gra_spill_temp_20
	l32i	a11,a1,104	# [0] gra_spill_temp_21
	st.qr	q0,a1,16	# [3] gra_spill_temp_34-112
	ld.qr	q0,a1,0	# [4] gra_spill_temp_30-112
	addi	a10,a10,16	# [5] // out_mult_ptr += 4
	addi	a11,a11,16	# [6] // out_shift_ptr += 4
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [7] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	// add offset, apply activation and store
	l32i	a13,a1,100	# [0] gra_spill_temp_15
	addi.n	a2,a2,8	# [1]
	l32i	a8,a1,112	# [2] gra_spill_temp_18
	l32i	a15,a1,116	# [3] gra_spill_temp_20
	l32i	a14,a1,104	# [4] gra_spill_temp_21
	addi	a12,a1,212
	ee.vldbc.32	q3,a12	# [14] id:236 activation_max
	addi	a12,a1,196
	ee.vldbc.32	q1,a12	# [16] id:234 out_offset
	addi	a12,a1,208
	ld.qr	q2,a1,16	# [8] gra_spill_temp_34-112
	addi	a14,a14,32	# [9]
	addi	a15,a15,32	# [10]
	s32i	a15,a1,116	# [11] gra_spill_temp_20
	ee.vadds.s32	q2,q2,q1	# [12]
	s32i	a14,a1,104	# [13] gra_spill_temp_21
	ee.vadds.s32	q1,q0,q1	# [14]
	ee.vmin.s32	q0,q2,q3	# [15]
	ee.vldbc.32	q2,a12	# [16] activation_min (a12 = a1+208; original "out_offset" comment was wrong)
	ee.vmin.s32	q1,q1,q3	# [17]
	ee.vmax.s32	q1,q1,q2	# [18]
	ee.vmax.s32	q0,q0,q2	# [19]
	ee.vunzip.16	q0,q1	# [20]
	ee.vunzip.8	q0,q1	# [21]
	ee.vst.l.64.ip	q0,a8,8	# [22] id:172
	s32i	a8,a1,112	# [23] gra_spill_temp_18
	bge	a2,a13,.Lt_4_8706	# [24]	// all 8-channel groups done

.Lt_4_9218: # 0x6f5
	ee.zero.qacc	# [0]
	l32i	a13,a1,96	# [1] gra_spill_temp_14
	s32i	a13,a1,80	# [2] gra_spill_temp_28
	bge	a13,a7,.Lt_4_9474	# [3]	// window fully clipped -> accumulator stays zero

.LBB12_esp_nn_depthwise_conv_s16_mult1: # 0x701
	// channel_loop
	mull	a15,a13,a4	# [0]
	l32i	a14,a1,92	# [1] gra_spill_temp_13
	add.n	a8,a15,a5	# [2]
	add.n	a14,a14,a13	# [3]
	mull	a14,a3,a14	# [4]
	s32i	a8,a1,76	# [5] gra_spill_temp_27
	bge	a6,a5,.Lt_4_10242	# [6]	// row fully clipped horizontally

.LBB15_esp_nn_depthwise_conv_s16_mult1: # 0x714
	// Compute input/filter pointers for this filter row, then run the
	// zero-overhead inner loop over the clipped filter columns.
	l32i	a12,a1,64	# [0] gra_spill_temp_24
	l32i	a9,a1,168	# [1] filter_data
	l32i	a10,a1,60	# [2] gra_spill_temp_23
	l32i	a11,a1,84	# [3] gra_spill_temp_29
	add.n	a8,a15,a6	# [4]
	add.n	a11,a11,a6	# [5]
	mull	a8,a8,a10	# [6]
	add.n	a11,a14,a11	# [7]
	mull	a10,a10,a11	# [8]
	add.n	a8,a2,a8	# [9]
	l32i	a11,a1,52	# [10] gra_spill_temp_22
	addx2	a8,a8,a9	# [11]
	add.n	a10,a2,a10	# [12]
	l32i	a9,a1,72	# [13] gra_spill_temp_26
	addx2	a10,a10,a11	# [14]
	loopgtz	a9,.LBB41_esp_nn_depthwise_conv_s16_mult1 # [15]
	// innermost loop
	ee.vld.128.xp	q0,a10,a12	# [0*II+3] id:249
	ee.vld.128.xp	q1,a8,a12	# [0*II+4] id:250
	ee.vmulas.s16.qacc	q0,q1	# [0*II+6]

.LBB41_esp_nn_depthwise_conv_s16_mult1: # 0x750
.Lt_4_10242: # 0x750
	// Next filter row; fall back into the row loop while rows remain.
	add.n	a14,a14,a3	# [0]
	add.n	a15,a15,a4	# [1]
	l32i	a9,a1,80	# [2] gra_spill_temp_28
	l32i	a10,a1,76	# [3] gra_spill_temp_27
	addi.n	a9,a9,1	# [4]
	add.n	a10,a10,a4	# [5]
	s32i	a10,a1,76	# [6] gra_spill_temp_27
	s32i	a9,a1,80	# [7] gra_spill_temp_28
	sub	a9,a7,a9	# [8]
	beqz	a9,.Lt_4_9474	# [9]
	blt	a6,a5,.LBB15_esp_nn_depthwise_conv_s16_mult1 # [0]
	j	.Lt_4_10242	# [0]

.Lt_4_7170: # 0x770
	retw.n	# [0]

	.size	esp_nn_depthwise_conv_s16_mult1_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_esp32s3

================================================
FILE: src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
	.text
	.literal_position

	# Program Unit: esp_nn_depthwise_conv_s16_mult4_esp32s3
	.type	esp_nn_depthwise_conv_s16_mult4_esp32s3, @function
	.align	4
	.global	esp_nn_depthwise_conv_s16_mult4_esp32s3

// Depthwise convolution: int16 data path, channel multiplier 4 (per the
// function name — TODO confirm against the C reference in
// esp_nn_depthwise_conv_ansi.c). Unlike the mult1 variants, each iteration
// of the inner output loop produces 4 s8 outputs (one input channel times
// ch_mult=4), stored with four s8i byte stores.
//
// NOTE(review): compiler-scheduled windowed-ABI assembly; spill-slot offsets
// are hard-coded throughout — do not reorder instructions in isolation.
esp_nn_depthwise_conv_s16_mult4_esp32s3: # 0x17c8
	# qacc_scratch = 0
	# gra_spill_temp_220 = 32
	# gra_spill_temp_221 = 36
	# gra_spill_temp_222 = 40
	# gra_spill_temp_223 = 44
	# gra_spill_temp_224 = 48
	# gra_spill_temp_225 = 52
	# gra_spill_temp_226 = 56
	# gra_spill_temp_227 = 60
	# gra_spill_temp_228 = 64
	# gra_spill_temp_229 = 68
	# gra_spill_temp_230 = 72
	# gra_spill_temp_231 = 76
	# gra_spill_temp_232 = 80
	# gra_spill_temp_233 = 84
	# gra_spill_temp_234 = 88
	# gra_spill_temp_235 = 92
	# gra_spill_temp_236 = 96
	# gra_spill_temp_237 = 100
	# gra_spill_temp_238 = 104
	# gra_spill_temp_239 = 108
	# gra_spill_temp_240 = 112
	# gra_spill_temp_241 = 116
	# gra_spill_temp_242 = 120
	# gra_spill_temp_243 = 124
	# gra_spill_temp_244 = 128
	# gra_spill_temp_245 = 132
	# gra_spill_temp_246 = 136
	# gra_spill_temp_247 = 140
	# gra_spill_temp_248 = 144
	# gra_spill_temp_249 = 148
	# gra_spill_temp_250 = 152
	# gra_spill_temp_251 = 156
	# gra_spill_temp_252 = 160
	# gra_spill_temp_253 = 164
	# gra_spill_temp_254 = 168
	# gra_spill_temp_255 = 172
	# gra_spill_temp_256 = 176
	# gra_spill_temp_257 = 192
	# gra_spill_temp_258 = 208
	# gra_spill_temp_259 = 224
	# gra_spill_temp_260 = 240

	// registers:
	// a2: const int16_t *input_data
	// a3: const uint16_t input_wd
	// a4: const uint16_t input_ht
	// a5: const uint16_t channels
	// a6: const uint16_t pad_wd
	// a7: const uint16_t pad_ht

	// on stack:
	// const uint16_t stride_wd
	// const uint16_t stride_ht
	// const uint16_t ch_mult
	// const int16_t *filter_data
	// const uint16_t filter_wd
	// const uint16_t filter_ht
	// const int32_t *bias
	// int8_t *out_data
	// const uint16_t out_wd
	// const uint16_t out_ht
	// const int32_t out_offset
	// const int32_t *out_shift
	// const int32_t *out_mult
	// const int32_t activation_min
	// const int32_t activation_max

	entry	a1,288 #

	// Spill incoming register args; broadcast out_offset / activation_max /
	// activation_min into q-register spill slots for reuse in the store path.
	s32i	a2,a1,136	# [0] gra_spill_temp_246
	s32i.n	a4,a1,40	# [1] gra_spill_temp_222
	s32i	a5,a1,164	# [2] gra_spill_temp_253
	addi	a12,a1,112	# [3]
	addmi	a10,a1,256	# [4]
	addmi	a11,a1,256	# [5]
	addmi	a13,a1,256	# [6]
	l16ui	a8,a1,324	# [7] id:216 out_ht+0x0
	s32i.n	a8,a1,48	# [8] gra_spill_temp_224
	addi	a13,a13,72	# [9]
	addi	a11,a11,88	# [10]
	addi	a10,a10,84	# [11]
	ee.vldbc.32	q0,a10	# [12] id:215 activation_min
	ee.vldbc.32	q1,a11	# [13] id:214 activation_max
	ee.vldbc.32	q2,a13	# [14] id:213 out_offset
	st.qr	q2,a12,80	# [15] gra_spill_temp_257-112
	st.qr	q1,a12,96	# [16] gra_spill_temp_258-112
	st.qr	q0,a12,112	# [17] gra_spill_temp_259-112
	beqz.n	a8,.Lt_10_8450	# [18]	// out_ht == 0 -> nothing to do

	// One-time loop-invariant setup: negated pads (sign-extended),
	// channel/filter strides, ch_mult-3 bound for the 4-wide vector loop.
	s32i	a1,a1,112	# [0] gra_spill_temp_240
	neg	a15,a6	# [1]
	neg	a4,a7	# [2]
	addmi	a8,a1,256	# [3]
	movi.n	a9,0	# [4]
	movi.n	a11,0	# [5]
	slli	a14,a5,1	# [6]
	l16ui	a13,a1,296	# [7] id:217 ch_mult+0x0
	l16ui	a10,a1,308	# [8] id:227 filter_ht+0x0
	s32i.n	a10,a1,36	# [9] gra_spill_temp_221
	s32i	a13,a1,76	# [10] gra_spill_temp_231
	s32i	a14,a1,148	# [11] gra_spill_temp_249
	s32i.n	a11,a1,52	# [12] gra_spill_temp_225
	s32i	a9,a1,116	# [13] gra_spill_temp_241
	st.qr	q4,a8,-16	# [14] gra_spill_temp_260-256
	sext	a4,a4,15	# [15]
	sext	a15,a15,15	# [16]
	s32i.n	a15,a1,32	# [17] gra_spill_temp_220
	mul16u	a12,a5,a13	# [18]		// channels * ch_mult = output depth
	s32i	a4,a1,92	# [19] gra_spill_temp_235
	l16ui	a8,a1,320	# [20] id:229 out_wd+0x0
	l16ui	a9,a1,292	# [21] id:228 stride_ht+0x0
	l32i	a11,a1,336	# [22] id:226 out_mult+0x0
	s32i	a11,a1,64	# [23] gra_spill_temp_228
	s32i.n	a9,a1,44	# [24] gra_spill_temp_223
	s32i	a8,a1,68	# [25] gra_spill_temp_229
	l32i	a4,a1,300	# [26] id:218 filter_data+0x0
	s32i	a12,a1,140	# [27] gra_spill_temp_247
	l32i	a15,a1,316	# [28] id:219 out_data+0x0
	s32i	a15,a1,96	# [29] gra_spill_temp_236
	slli	a12,a12,1	# [30]
	s32i	a4,a1,152	# [31] gra_spill_temp_250
	addi	a14,a13,-3	# [32]
	l16ui	a4,a1,304	# [33] id:223 filter_wd+0x0
	s32i	a14,a1,108	# [34] gra_spill_temp_239
	s32i	a12,a1,144	# [35] gra_spill_temp_248
	slli	a13,a13,2	# [36]
	s32i	a13,a1,80	# [37] gra_spill_temp_232
	l32i	a12,a1,332	# [38] id:225 out_shift+0x0
	l32i	a14,a1,312	# [39] id:222 bias+0x0
	s32i	a14,a1,104	# [40] gra_spill_temp_238
	s32i.n	a12,a1,60	# [41] gra_spill_temp_227
	l16ui	a13,a1,288	# [42] id:224 stride_wd+0x0
	s32i.n	a13,a1,56	# [43] gra_spill_temp_226
	j	.Lt_10_8962	# [44]

.Lt_10_9218: # 0x1880
	// Height-loop increment: out_y++, base_y += stride_ht.
	l32i.n	a9,a1,48	# [0] gra_spill_temp_224
	l32i.n	a11,a1,44	# [1] gra_spill_temp_223
	l32i.n	a8,a1,52	# [2] gra_spill_temp_225
	l32i	a10,a1,92	# [3] gra_spill_temp_235
	addi.n	a8,a8,1	# [4]
	s32i.n	a8,a1,52	# [5] gra_spill_temp_225
	add.n	a11,a10,a11	# [6]
	sub	a8,a8,a9	# [7]
	sext	a10,a11,15	# [8]
	s32i	a10,a1,92	# [9] gra_spill_temp_235
	beqz	a8,.Lt_10_8450	# [10]

.Lt_10_8962: # 0x189b
	# Loop body line 1223, nesting depth: 1, estimated iterations: 100
	#1224 const int16_t base_y = (out_y * stride_ht) - pad_ht;
	#1225 for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
	l32i	a12,a1,68	# [0] gra_spill_temp_229
	beqz.n	a12,.Lt_10_9218	# [2]

.LBB6_esp_nn_depthwise_conv_s16_mult4: # 0x18a0
	// Clip the filter window vertically against the input:
	// a7 = min(filter_ht, input_ht - base_y), start row = max(-base_y, 0).
	l32i.n	a7,a1,36	# [0] gra_spill_temp_221
	movi.n	a11,0	# [1]
	l32i.n	a8,a1,40	# [2] gra_spill_temp_222
	l32i	a9,a1,92	# [3] gra_spill_temp_235
	movi.n	a13,0	# [4]
	l32i.n	a14,a1,32	# [5] gra_spill_temp_220
	s32i	a14,a1,160	# [6] gra_spill_temp_252
	s32i	a13,a1,72	# [7] gra_spill_temp_230
	neg	a10,a9	# [8]
	sub	a8,a8,a9	# [9]
	max	a10,a10,a11	# [10]
	s32i	a10,a1,100	# [11] gra_spill_temp_237
	min	a7,a7,a8	# [12]
	j	.Lt_10_9730	# [13]

.Lt_10_9986: # 0x18c5
	// Width-loop increment: out_x++, base_x += stride_wd.
	l32i	a13,a1,68	# [0] gra_spill_temp_229
	l32i.n	a15,a1,56	# [1] gra_spill_temp_226
	l32i	a12,a1,72	# [2] gra_spill_temp_230
	l32i	a14,a1,160	# [3] gra_spill_temp_252
	addi.n	a12,a12,1	# [4]
	s32i	a12,a1,72	# [5] gra_spill_temp_230
	add.n	a15,a14,a15	# [6]
	sext	a14,a15,15	# [7]
	s32i	a14,a1,160	# [8] gra_spill_temp_252
	beq	a12,a13,.Lt_10_9218	# [9]

.Lt_10_9730: # 0x18e0
	l32i	a8,a1,164	# [0] gra_spill_temp_253
	l32i	a9,a1,64	# [1] gra_spill_temp_228
	l32i.n	a10,a1,60	# [2] gra_spill_temp_227
	s32i	a10,a1,132	# [3] gra_spill_temp_245
	s32i	a9,a1,128	# [4] gra_spill_temp_244
	beqz.n	a8,.Lt_10_9986	# [5]		// channels == 0 -> next out_x

	// Clip the filter window horizontally against the input:
	// a6 = max(-base_x, 0), a5 = min(filter_wd, input_wd - base_x).
	movi.n	a8,0	# [0]
	l32i	a5,a1,160	# [1] gra_spill_temp_252
	movi.n	a12,0	# [2]
	movi.n	a13,0	# [3]
	movi.n	a14,0	# [4]
	s32i	a14,a1,84	# [5] gra_spill_temp_233
	s32i	a13,a1,88	# [6] gra_spill_temp_234
	s32i	a12,a1,176	# [7] gra_spill_temp_256
	neg	a6,a5	# [8]
	max	a6,a6,a8	# [9]
	sub	a5,a3,a5	# [10]
	min	a5,a4,a5	# [11]
	sub	a11,a5,a6	# [12]
	s32i	a11,a1,156	# [13] gra_spill_temp_251
	j	.Lt_10_10498	# [14]

.Lt_10_10754: # 0x1919
	// Channel-loop increment: next input channel, advance filter/out bases.
	l32i	a10,a1,164	# [0] gra_spill_temp_253
	l32i	a14,a1,76	# [1] gra_spill_temp_231
	l32i	a13,a1,84	# [2] gra_spill_temp_233
	l32i	a12,a1,80	# [3] gra_spill_temp_232
	l32i	a9,a1,176	# [4] gra_spill_temp_256
	l32i	a11,a1,88	# [5] gra_spill_temp_234
	addi.n	a9,a9,1	# [6]
	s32i	a9,a1,176	# [7] gra_spill_temp_256
	add.n	a11,a11,a12	# [8]
	add.n	a13,a13,a14	# [9]
	s32i	a13,a1,84	# [10] gra_spill_temp_233
	s32i	a11,a1,88	# [11] gra_spill_temp_234
	beq	a9,a10,.Lt_10_9986	# [12]

.Lt_10_10498: # 0x193d
	l32i	a15,a1,108	# [0] gra_spill_temp_239
	blti	a15,1,.Lt_10_10754	# [2]	// ch_mult < 4 -> skip vector loop

	l32i	a2,a1,84	# [0] gra_spill_temp_233
	l32i	a10,a1,104	# [1] gra_spill_temp_238
	l32i	a9,a1,88	# [2] gra_spill_temp_234
	movi.n	a8,0	# [3]
	s32i	a8,a1,120	# [4] gra_spill_temp_242
	add.n	a9,a9,a10	# [5]
	s32i	a9,a1,124	# [6] gra_spill_temp_243
	j	.Lt_10_11266	# [7]

.Lt_10_11522: # 0x1959
	// Extract 4 lanes from QACC via the scratch area.
	// NOTE(review): byte repack before ee.srcmb.s16.qacc follows the PIE
	// accumulator format — confirm against the ESP32-S3 TRM before touching
	// offsets.
	addmi	a12,a1,256	# [0]
	l32i	a14,a1,112	# [1] gra_spill_temp_240
	movi.n	a13,16	# [2]
	ee.st.qacc_l.l.128.ip	a14,16	# [3] id:234
	ee.st.qacc_l.h.32.ip	a14,-16	# [4] id:235
	ee.srcmb.s16.qacc	q5,a13,0	# [5]
	l16ui	a15,a1,10	# [6] qacc_scratch+10
	l8ui	a8,a1,15	# [7] qacc_scratch+15
	l8ui	a9,a1,5	# [8] qacc_scratch+5
	l8ui	a11,a1,16	# [9] qacc_scratch+16
	l8ui	a10,a1,6	# [10] qacc_scratch+6
	s8i	a10,a1,3	# [11] qacc_scratch+3
	s8i	a11,a1,7	# [12] qacc_scratch+7
	s8i	a9,a1,2	# [13] qacc_scratch+2
	l32i	a11,a1,104	# [14] gra_spill_temp_238
	s8i	a8,a1,6	# [15] qacc_scratch+6
	s16i	a15,a1,4	# [16] qacc_scratch+4
	ee.vld.l.64.ip	q0,a14,0	# [17] id:245
	s32i	a14,a1,112	# [18] gra_spill_temp_240
	ee.vzip.16	q0,q5	# [19]
	st.qr	q5,a12,-16	# [20] gra_spill_temp_260-256
	beqz.n	a11,.Lt_10_13570	# [21]	// skip_bias

	// add bias (unaligned 128-bit loads realigned via sar_byte + ee.src.q.qup)
	l32i	a13,a1,124	# [0] gra_spill_temp_243
	extui	a12,a13,0,4	# [2]
	ee.vld.128.ip	q7,a13,16	# [3] id:248
	ee.vld.128.ip	q1,a13,0	# [4] id:249
	wur.sar_byte	a12	# [5]
	ee.src.q.qup	q6,q7,q1	# [6]
	ee.vadds.s32	q0,q0,q6	# [7]

.Lt_10_13570: # 0x19ae
	#1287 q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
	l32i	a10,a1,128	# [0] gra_spill_temp_244
	l32i	a11,a1,132	# [1] gra_spill_temp_245
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [2] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	// Add out_offset, clamp to [activation_min, activation_max], then move
	// the 4 s32 lanes to scalars and store them as 4 s8 output bytes.
	addi.n	a2,a2,4	# [0]
	l32i	a13,a1,96	# [1] gra_spill_temp_236
	l32i	a11,a1,128	# [2] gra_spill_temp_244
	l32i	a10,a1,132	# [3] gra_spill_temp_245
	addi	a8,a1,112	# [4]
	ld.qr	q1,a8,96	# [5] gra_spill_temp_258-112
	ld.qr	q2,a8,80	# [6] gra_spill_temp_257-112
	addi	a10,a10,16	# [7]
	addi	a11,a11,16	# [8]
	s32i	a11,a1,128	# [9] gra_spill_temp_244
	ee.vadds.s32	q0,q0,q2	# [10]
	s32i	a10,a1,132	# [11] gra_spill_temp_245
	ee.vmin.s32	q0,q0,q1	# [12]
	ld.qr	q1,a8,112	# [13] gra_spill_temp_259-112
	l32i	a8,a1,116	# [14] gra_spill_temp_241
	ee.vmax.s32	q0,q0,q1	# [15]
	ee.movi.32.a	q0,a14,2	# [16]
	ee.movi.32.a	q0,a15,1	# [17]
	ee.movi.32.a	q0,a9,0	# [18]
	add.n	a13,a8,a13	# [19]
	ee.movi.32.a	q0,a12,3	# [20]
	addi.n	a8,a8,4	# [21]
	s8i	a12,a13,3	# [22] id:254
	s32i	a8,a1,116	# [23] gra_spill_temp_241
	s8i	a9,a13,0	# [24] id:251
	s8i	a15,a13,1	# [25] id:252
	s8i	a14,a13,2	# [26] id:253
	l32i	a15,a1,108	# [27] gra_spill_temp_239
	l32i	a14,a1,120	# [28] gra_spill_temp_242
	l32i	a9,a1,124	# [29] gra_spill_temp_243
	addi.n	a14,a14,4	# [30]
	addi	a9,a9,16	# [31]
	s32i	a9,a1,124	# [32] gra_spill_temp_243
	s32i	a14,a1,120	# [33] gra_spill_temp_242
	bge	a14,a15,.Lt_10_10754	# [34]	// all ch_mult groups of 4 done

.Lt_10_11266: # 0x1a1c
	# Loop body line 1230, nesting depth: 4, estimated iterations: 100
	ee.zero.qacc	# [0]
	l32i	a9,a1,100	# [1] gra_spill_temp_237
	s32i	a9,a1,172	# [2] gra_spill_temp_255
	bge	a9,a7,.Lt_10_11522	# [3]	// window fully clipped -> accumulator stays zero

	mull	a15,a9,a4	# [0]
	l32i	a14,a1,92	# [1] gra_spill_temp_235
	add.n	a11,a15,a5	# [2]
	add.n	a14,a14,a9	# [3]
	mull	a14,a3,a14	# [4]
	s32i	a11,a1,168	# [5] gra_spill_temp_254
	bge	a6,a5,.Lt_10_12290	# [6]	// row fully clipped horizontally

.LBB18_esp_nn_depthwise_conv_s16_mult4: # 0x1a3b
	// Compute input/filter pointers for this filter row, then run the
	// zero-overhead inner loop: broadcast one input sample (ee.vldbc.16)
	// against 4 filter values per tap.
	l32i	a10,a1,176	# [0] gra_spill_temp_256
	l32i	a11,a1,164	# [1] gra_spill_temp_253
	l32i	a12,a1,160	# [2] gra_spill_temp_252
	add.n	a9,a15,a6	# [3]
	l32i	a8,a1,140	# [4] gra_spill_temp_247
	addmi	a13,a1,256	# [5]
	ld.qr	q1,a13,-16	# [6] gra_spill_temp_260-256
	mull	a8,a8,a9	# [7]
	add.n	a12,a12,a6	# [8]
	l32i	a9,a1,152	# [9] gra_spill_temp_250
	add.n	a12,a14,a12	# [10]
	mull	a11,a11,a12	# [11]
	add.n	a8,a2,a8	# [12]
	l32i	a12,a1,148	# [13] gra_spill_temp_249
	addx2	a8,a8,a9	# [14]
	add.n	a10,a10,a11	# [15]
	l32i	a11,a1,136	# [16] gra_spill_temp_246
	l32i	a9,a1,156	# [17] gra_spill_temp_251
	addx2	a10,a10,a11	# [18]
	l32i	a11,a1,144	# [19] gra_spill_temp_248
	loopgtz	a9,.LBB45_esp_nn_depthwise_conv_s16_mult4 # [20]
	mov.n	a9,a8	# [0*II+0]
	ee.vldbc.16	q0,a10	# [0*II+1] id:232
	add.n	a10,a10,a12	# [0*II+2]
	ee.vld.l.64.ip	q1,a9,0	# [0*II+3] id:231
	add.n	a8,a8,a11	# [0*II+4]
	ee.vmulas.s16.qacc	q0,q1	# [0*II+5]

.LBB45_esp_nn_depthwise_conv_s16_mult4: # 0x1a84
	addmi	a10,a1,256	# [0]
	st.qr	q1,a10,-16	# [1] gra_spill_temp_260-256

.Lt_10_12290: # 0x1a8a
	// Next filter row; fall back into the row loop while rows remain.
	add.n	a14,a14,a3	# [0]
	add.n	a15,a15,a4	# [1]
	l32i	a11,a1,172	# [2] gra_spill_temp_255
	l32i	a12,a1,168	# [3] gra_spill_temp_254
	addi.n	a11,a11,1	# [4]
	add.n	a12,a12,a4	# [5]
	s32i	a12,a1,168	# [6] gra_spill_temp_254
	s32i	a11,a1,172	# [7] gra_spill_temp_255
	sub	a11,a7,a11	# [8]
	beqz	a11,.Lt_10_11522	# [9]
	blt	a6,a5,.LBB18_esp_nn_depthwise_conv_s16_mult4 # [0]
	j	.Lt_10_12290	# [0]

.Lt_10_8450: # 0x1aaa
	retw.n	# [0]

	.size	esp_nn_depthwise_conv_s16_mult4_esp32s3, .
- esp_nn_depthwise_conv_s16_mult4_esp32s3
================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

    .text
    .literal_position

// -----------------------------------------------------------------------------
// esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3
//
// Depthwise convolution over 16-bit (widened s8) input data, specialised for a
// 3x3 filter window (the filter-height clamp below uses the constant 3), and
// producing the channel-multiplier dimension 8 output channels per iteration
// (inner loop runs "for ch_mult_idx < ch_mult - 7; += 8" — see the #942
// pseudo-source line). Only full groups of 8 are produced here; presumably the
// caller guarantees ch_mult is a multiple of 8 — confirm against the C
// dispatcher in esp_nn_depthwise_conv_s8_esp32s3.c.
//
// This is compiler-generated, hand-kept assembly:
//   * "# [n]" trailing comments are instruction-slot/scheduling annotations.
//   * "# NNN ..." comments are the interleaved original C source lines.
//   * "gra_spill_temp_*" names map stack-frame offsets (listed below) to the
//     register-allocator spill slots used throughout.
// Per-group accumulation is done in the QACC accumulator (ee.vmulas.s16.qacc),
// then spilled to the on-stack qacc_scratch area and byte-repacked to extract
// the eight 32-bit sums, requantized via
// esp_nn_multiply_by_quantized_mult_ver1_esp32s3, offset/clamped and stored as
// eight s8 outputs.
// -----------------------------------------------------------------------------

# Program Unit: esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3
    .type esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3, @function
    .align 4
    .global esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3

esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3: # 0x11b3
        // Stack-frame layout: scratch area for QACC extraction at a1+0,
        // register-spill slots from a1+48 upward.
        # qacc_scratch = 0
        # gra_spill_temp_142 = 48
        # gra_spill_temp_143 = 52
        # gra_spill_temp_144 = 56
        # gra_spill_temp_145 = 60
        # gra_spill_temp_146 = 64
        # gra_spill_temp_147 = 68
        # gra_spill_temp_148 = 72
        # gra_spill_temp_149 = 76
        # gra_spill_temp_150 = 80
        # gra_spill_temp_151 = 84
        # gra_spill_temp_152 = 88
        # gra_spill_temp_153 = 92
        # gra_spill_temp_154 = 96
        # gra_spill_temp_155 = 100
        # gra_spill_temp_156 = 104
        # gra_spill_temp_157 = 108
        # gra_spill_temp_158 = 112
        # gra_spill_temp_159 = 116
        # gra_spill_temp_160 = 120
        # gra_spill_temp_161 = 124
        # gra_spill_temp_162 = 128
        # gra_spill_temp_163 = 132
        # gra_spill_temp_164 = 136
        # gra_spill_temp_165 = 140
        # gra_spill_temp_166 = 144
        # gra_spill_temp_167 = 148
        # gra_spill_temp_168 = 152
        # gra_spill_temp_169 = 156
        # gra_spill_temp_170 = 160
        # gra_spill_temp_171 = 164
        # gra_spill_temp_172 = 168
        # gra_spill_temp_173 = 172
        # gra_spill_temp_174 = 176
        # gra_spill_temp_175 = 180
        # gra_spill_temp_176 = 184
        # gra_spill_temp_177 = 188
        # gra_spill_temp_178 = 192
        # gra_spill_temp_179 = 208
        # gra_spill_temp_180 = 224
        # gra_spill_temp_181 = 240
        # gra_spill_temp_182 = 256

    // registers:
    // a2: const int16_t *input_data
    // a3: const uint16_t input_wd
    // a4: const uint16_t input_ht
    // a5: const uint16_t channels
    // a6: const uint16_t pad_wd
    // a7: const uint16_t pad_ht
    // (remaining arguments are loaded from the caller's stack, at a1+304..)
    // const uint16_t stride_wd
    // const uint16_t stride_ht
    // const uint16_t ch_mult
    // const int16_t *filter_data
    // const int32_t *bias
    // int8_t *out_data
    // const uint16_t out_wd
    // const uint16_t out_ht
    // const int32_t out_offset
    // const int32_t *out_shift
    // const int32_t *out_mult
    // const int32_t activation_min
    // const int32_t activation_max

    entry a1,304 #

    // Prologue: spill register args, broadcast the scalar quantization
    // parameters (activation_min/max, out_offset) into q-registers and park
    // them in spill slots for the store path.
    s32i a2,a1,116 # [0] gra_spill_temp_159
    s32i a3,a1,120 # [1] gra_spill_temp_160
    s32i a5,a1,144 # [2] gra_spill_temp_166
    s32i.n a6,a1,60 # [3] gra_spill_temp_145
    addmi a9,a1,256 # [4]
    addi a12,a1,112 # [5]
    addmi a10,a1,256 # [6]
    addmi a11,a1,256 # [7]
    addmi a13,a1,256 # [8]
    // height loop
    l16ui a8,a1,332 # [9] id:261 out_ht+0x0
    l32i a14,a1,324 # [10] id:257 out_data+0x0
    s32i a14,a1,176 # [11] gra_spill_temp_174
    s32i a8,a1,68 # [12] gra_spill_temp_147
    addi a13,a13,80 # [13]
    addi a11,a11,96 # [14]
    addi a10,a10,92 # [15]
    ee.vldbc.32 q0,a10 # [16] id:260 activation_min
    ee.vldbc.32 q1,a11 # [17] id:259 activation_max
    ee.vldbc.32 q2,a13 # [18] id:258 out_offset
    st.qr q2,a12,96 # [19] gra_spill_temp_179-112
    st.qr q1,a12,112 # [20] gra_spill_temp_180-112
    st.qr q0,a9,-16 # [21] gra_spill_temp_181-256
    beqz.n a8,.Lt_8_8194 # [22]           // out_ht == 0 -> nothing to do

.LBB3_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x11f9
    // One-time loop-invariant setup: strides in elements/bytes, negated pads,
    // ch_mult-7 as the unrolled-loop bound, etc.
    s32i a1,a1,180 # [0] gra_spill_temp_175
    mul16u a6,a3,a5 # [1]
    s32i a7,a1,76 # [2] gra_spill_temp_149
    l32i a9,a1,316 # [3] id:264 filter_data+0x0
    l32i a15,a1,320 # [4] id:262 bias+0x0
    l16ui a10,a1,312 # [5] id:263 ch_mult+0x0
    slli a11,a5,1 # [6]
    l16ui a12,a1,308 # [7] id:268 stride_ht+0x0
    l32i a13,a1,344 # [8] id:267 out_mult+0x0
    l32i a14,a1,340 # [9] id:266 out_shift+0x0
    s32i a14,a1,88 # [10] gra_spill_temp_152
    s32i a13,a1,92 # [11] gra_spill_temp_153
    s32i a12,a1,64 # [12] gra_spill_temp_146
    s32i a11,a1,124 # [13] gra_spill_temp_161
    s32i a10,a1,108 # [14] gra_spill_temp_157
    s32i a15,a1,160 # [15] gra_spill_temp_170
    s32i a9,a1,128 # [16] gra_spill_temp_162
    neg a7,a7 # [17]
    slli a6,a6,1 # [18]
    s32i a7,a1,136 # [19] gra_spill_temp_164
    movi.n a9,0 # [20]
    extui a15,a15,0,4 # [21]              // bias alignment (low 4 bits) for ee.src.q.qup
    s32i a15,a1,152 # [22] gra_spill_temp_168
    s32i a9,a1,72 # [23] gra_spill_temp_148
    sub a7,a4,a7 # [24]
    l32i.n a9,a1,60 # [25] gra_spill_temp_145
    s32i a7,a1,80 # [26] gra_spill_temp_150
    l16ui a4,a1,328 # [27] id:269 out_wd+0x0
    s32i a4,a1,96 # [28] gra_spill_temp_154
    l16ui a7,a1,304 # [29] id:265 stride_wd+0x0
    s32i a7,a1,84 # [30] gra_spill_temp_151
    mul16u a4,a5,a10 # [31]
    neg a9,a9 # [32]
    s32i.n a9,a1,52 # [33] gra_spill_temp_143
    sub a8,a3,a9 # [34]
    addi a10,a10,-7 # [35]                // ch_mult - 7: bound of the 8-wide loop
    s32i a10,a1,164 # [36] gra_spill_temp_171
    s32i.n a8,a1,56 # [37] gra_spill_temp_144
    addx2 a7,a4,a4 # [38]
    slli a7,a7,1 # [39]
    j .Lt_8_8706 # [40]

.Lt_8_8962: # 0x1270
    // height-loop tail: advance out_y and the y-dependent running terms
    # Part of loop body line 933, head labeled .Lt_8_8706
    l32i a10,a1,68 # [0] gra_spill_temp_147
    l32i a14,a1,76 # [1] gra_spill_temp_149
    l32i a13,a1,136 # [2] gra_spill_temp_164
    l32i a12,a1,64 # [3] gra_spill_temp_146
    l32i a9,a1,72 # [4] gra_spill_temp_148
    l32i a11,a1,80 # [5] gra_spill_temp_150
    addi.n a9,a9,1 # [6]
    s32i a9,a1,72 # [7] gra_spill_temp_148
    sub a11,a11,a12 # [8]
    add.n a13,a13,a12 # [9]
    sub a14,a14,a12 # [10]
    s32i a14,a1,76 # [11] gra_spill_temp_149
    s32i a13,a1,136 # [12] gra_spill_temp_164
    s32i a11,a1,80 # [13] gra_spill_temp_150
    sub a9,a9,a10 # [14]
    beqz a9,.Lt_8_8194 # [15]

.Lt_8_8706: # 0x129e
    # Loop body line 933, nesting depth: 1, estimated iterations: 100
    # 934  const int32_t base_y = (out_y * stride_ht) - pad_ht;
    # 935  for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
    l32i a15,a1,96 # [0] gra_spill_temp_154
    beqz.n a15,.Lt_8_8962 # [2]           // out_wd == 0 -> next row

.LBB6_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x12a3
    // width-loop preamble: clamp the 3x3 filter window against the top/bottom
    // image edges (movi 3 below is the fixed filter height).
    # Part of loop body line 933, head labeled .Lt_8_8706
    l32i.n a3,a1,56 # [0] gra_spill_temp_144
    l32i a8,a1,80 # [1] gra_spill_temp_150
    movi.n a10,0 # [2]
    l32i a9,a1,76 # [3] gra_spill_temp_149
    movi.n a11,0 # [4]
    l32i.n a12,a1,52 # [5] gra_spill_temp_143
    l32i.n a13,a1,60 # [6] gra_spill_temp_145
    s32i a13,a1,104 # [7] gra_spill_temp_156
    s32i a12,a1,140 # [8] gra_spill_temp_165
    s32i a11,a1,100 # [9] gra_spill_temp_155
    max a9,a9,a10 # [10]                  // filter_y_start = max(0, -base_y)
    movi.n a10,3 # [11]
    s32i a9,a1,172 # [12] gra_spill_temp_173
    min a8,a8,a10 # [13]                  // filter_y_end = min(3, input_ht - base_y)
    s32i a8,a1,156 # [14] gra_spill_temp_169
    sub a8,a8,a9 # [15]
    s32i a8,a1,132 # [16] gra_spill_temp_163
    j .Lt_8_9474 # [17]

.Lt_8_9730: # 0x12d3
    // width-loop tail: advance out_x and the x-dependent running terms
    # Part of loop body line 935, head labeled .Lt_8_9474
    l32i a15,a1,96 # [0] gra_spill_temp_154
    l32i a10,a1,104 # [1] gra_spill_temp_156
    l32i a9,a1,140 # [2] gra_spill_temp_165
    l32i a8,a1,84 # [3] gra_spill_temp_151
    l32i a14,a1,100 # [4] gra_spill_temp_155
    sub a3,a3,a8 # [5]
    addi.n a14,a14,1 # [6]
    s32i a14,a1,100 # [7] gra_spill_temp_155
    add.n a9,a9,a8 # [8]
    sub a10,a10,a8 # [9]
    s32i a10,a1,104 # [10] gra_spill_temp_156
    s32i a9,a1,140 # [11] gra_spill_temp_165
    beq a14,a15,.Lt_8_8962 # [12]

.Lt_8_9474: # 0x12f8
    # 936  const int32_t base_x = (out_x * stride_wd) - pad_wd;
    # 937  const int32_t *out_mult_ptr = out_mult;
    # 938  const int32_t *out_shift_ptr = out_shift;
    l32i a2,a1,88 # [0] gra_spill_temp_152
    l32i a10,a1,92 # [1] gra_spill_temp_153
    # 939  uint32_t bias_ptr = (uint32_t) (bias);
    l32i a12,a1,160 # [2] gra_spill_temp_170
    # 940
    # 941  for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
    l32i a11,a1,144 # [3] gra_spill_temp_166
    s32i a12,a1,168 # [4] gra_spill_temp_172
    beqz.n a11,.Lt_8_9730 # [5]

.LBB9_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x1309
    // channel-loop preamble: clamp the window against the left image edge
    # Part of loop body line 935, head labeled .Lt_8_9474
    movi.n a8,0 # [0]
    l32i a5,a1,104 # [1] gra_spill_temp_156
    movi.n a13,0 # [2]
    movi.n a9,0 # [3]
    s32i a9,a1,112 # [4] gra_spill_temp_158
    s32i a13,a1,148 # [5] gra_spill_temp_167
    max a5,a5,a8 # [6]
    j .Lt_8_10242 # [7]

.Lt_8_10498: # 0x131e
    // channel-loop tail: next ch_idx, advance output-channel base index
    # Part of loop body line 941, head labeled .Lt_8_10242
    l32i a12,a1,144 # [0] gra_spill_temp_166
    l32i a14,a1,108 # [1] gra_spill_temp_157
    l32i a11,a1,148 # [2] gra_spill_temp_167
    l32i a13,a1,112 # [3] gra_spill_temp_158
    addi.n a11,a11,1 # [4]
    s32i a11,a1,148 # [5] gra_spill_temp_167
    add.n a13,a13,a14 # [6]
    s32i a13,a1,112 # [7] gra_spill_temp_158
    beq a11,a12,.Lt_8_9730 # [8]

.Lt_8_10242: # 0x1337
    # 942  for (int ch_mult_idx = 0; ch_mult_idx < ch_mult - 7; ch_mult_idx += 8) {
    l32i a15,a1,164 # [0] gra_spill_temp_171
    blti a15,1,.Lt_8_10498 # [2]          // ch_mult < 8 -> nothing for this kernel
    movi.n a8,0 # [0]
    l32i a9,a1,112 # [1] gra_spill_temp_158
    s32i a9,a1,188 # [2] gra_spill_temp_177
    s32i a8,a1,184 # [3] gra_spill_temp_176
    j .Lt_8_11010 # [4]

.LBB23_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x134b
    // bias path: load 8 consecutive int32 bias values through the unaligned
    // load + ee.src.q.qup funnel and add them to the two 4x32 accumulators
    // (q0 = lanes 0..3, q1 = lanes 4..7).
    s32i.n a10,a1,48 # [0] gra_spill_temp_142
    addi a11,a1,112 # [1]
    l32i a13,a1,152 # [2] gra_spill_temp_168
    l32i a12,a1,168 # [3] gra_spill_temp_172
    wur.sar_byte a13 # [4]
    ee.vld.128.ip q4,a12,16 # [5] id:307
    ee.vld.128.ip q7,a12,16 # [6] id:308
    ee.vld.128.ip q5,a12,0 # [7] id:309
    s32i a12,a1,168 # [8] gra_spill_temp_172
    ee.src.q.qup q6,q4,q7 # [9]
    ee.vadds.s32 q0,q0,q6 # [10]
    ee.src.q.qup q3,q4,q5 # [11]
    ee.vadds.s32 q1,q1,q3 # [12]
    st.qr q1,a11,80 # [13] gra_spill_temp_178-112

.Lt_8_13314: # 0x1374
    // requantize both halves (4 channels per call)
    #1025  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
    l32i.n a10,a1,48 # [0] gra_spill_temp_142
    mov.n a11,a2 # [1]
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    #1026  out_mult_ptr += 4;
    #1027  out_shift_ptr += 4;
    #1028
    #1029  q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);
    l32i.n a10,a1,48 # [0] gra_spill_temp_142
    addmi a12,a1,256 # [1]
    addi a11,a1,112 # [2]
    st.qr q0,a12,0 # [3] gra_spill_temp_182-256
    ld.qr q0,a11,80 # [4] gra_spill_temp_178-112
    addi a10,a10,16 # [5]
    addi a11,a2,16 # [6]
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    # Part of loop body line 942, head labeled .Lt_8_11010
    #1030  out_mult_ptr += 4;
    #1031  out_shift_ptr += 4;
    // post-requant: add out_offset, clamp to [activation_min, activation_max],
    // narrow 2x(4x32) -> 8x8 via vunzip and store 8 output bytes.
    addi a2,a2,32 # [0]
    l32i a14,a1,164 # [1] gra_spill_temp_171
    l32i a8,a1,176 # [2] gra_spill_temp_174
    l32i a15,a1,188 # [3] gra_spill_temp_177
    l32i a13,a1,184 # [4] gra_spill_temp_176
    l32i.n a10,a1,48 # [5] gra_spill_temp_142
    addmi a11,a1,256 # [6]
    addi a12,a1,112 # [7]
    ld.qr q3,a12,112 # [8] gra_spill_temp_180-112
    ld.qr q1,a12,96 # [9] gra_spill_temp_179-112
    ld.qr q2,a11,0 # [10] gra_spill_temp_182-256
    addi a10,a10,32 # [11]
    addi.n a13,a13,8 # [12]
    addi.n a15,a15,8 # [13]
    s32i a15,a1,188 # [14] gra_spill_temp_177
    ee.vadds.s32 q2,q2,q1 # [15]
    s32i a13,a1,184 # [16] gra_spill_temp_176
    ee.vadds.s32 q1,q0,q1 # [17]
    ee.vmin.s32 q0,q2,q3 # [18]
    ld.qr q2,a11,-16 # [19] gra_spill_temp_181-256
    ee.vmin.s32 q1,q1,q3 # [20]
    ee.vmax.s32 q1,q1,q2 # [21]
    ee.vmax.s32 q0,q0,q2 # [22]
    ee.vunzip.16 q0,q1 # [23]
    ee.vunzip.8 q0,q1 # [24]
    ee.vst.l.64.ip q0,a8,8 # [25] id:312
    s32i a8,a1,176 # [26] gra_spill_temp_174
    bge a13,a14,.Lt_8_10498 # [27]

.Lt_8_11010: # 0x13e3
    // accumulation for the next group of 8: zero QACC, then MAC over the
    // edge-clamped 3x3 window (rows handled by the loopgtz body below).
    # Loop body line 942, nesting depth: 4, estimated iterations: 100
    l32i a14,a1,156 # [0] gra_spill_temp_169
    l32i a13,a1,172 # [1] gra_spill_temp_173
    ee.zero.qacc # [2]
    bge a13,a14,.Lt_8_11266 # [3]

.LBB15_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x13ef
    // compute input/filter base pointers for this (out_y, out_x, ch_idx, group)
    # Part of loop body line 942, head labeled .Lt_8_11010
    l32i a12,a1,124 # [0] gra_spill_temp_161
    l32i a8,a1,140 # [1] gra_spill_temp_165
    l32i a11,a1,120 # [2] gra_spill_temp_160
    l32i a14,a1,188 # [3] gra_spill_temp_177
    l32i a9,a1,136 # [4] gra_spill_temp_164
    mull a15,a4,a13 # [5]
    add.n a9,a9,a13 # [6]
    addx2 a15,a15,a15 # [7]
    l32i a13,a1,148 # [8] gra_spill_temp_167
    add.n a14,a14,a15 # [9]
    mull a9,a9,a11 # [10]
    l32i a15,a1,144 # [11] gra_spill_temp_166
    add.n a8,a8,a9 # [12]
    mull a15,a15,a8 # [13]
    l32i a8,a1,128 # [14] gra_spill_temp_162
    add.n a13,a13,a15 # [15]
    l32i a15,a1,116 # [16] gra_spill_temp_159
    addx2 a14,a14,a8 # [17]
    addx2 a13,a13,a15 # [18]
    add.n a11,a12,a13 # [19]
    l32i a15,a1,132 # [20] gra_spill_temp_163
    add.n a12,a12,a11 # [21]
    loopgtz a15,.LBB34_esp_nn_depthwise_conv_s16_mult8_3x3 # [22]  // zero-overhead row loop

.Lt_8_11778: # 0x142e
    // one filter row: up to three taps, each broadcasting the input sample
    // (ee.vldbc.16) against 8 filter values (ee.vld.128) into QACC.
    mov.n a15,a14 # [0]
    mov.n a9,a14 # [1]
    bnez.n a5,.Lt_8_12034 # [2]           // left-edge clip: skip first tap
    ee.vldbc.16 q3,a13 # [0] id:271
    mov.n a9,a14 # [1]
    ee.vld.128.ip q4,a9,0 # [2] id:272
    ee.vmulas.s16.qacc q3,q4 # [4]
.Lt_8_12034: # 0x143f
    ee.vldbc.16 q5,a11 # [0] id:274
    addx2 a9,a4,a9 # [1]
    ee.vld.128.ip q6,a9,0 # [2] id:275
    add.n a13,a13,a6 # [3]
    ee.vmulas.s16.qacc q5,q6 # [4]
    blti a3,3,.Lt_8_12546 # [5]           // right-edge clip: skip third tap
    ee.vldbc.16 q7,a12 # [0] id:277
    addx2 a14,a4,a9 # [1]
    ee.vld.128.ip q0,a14,0 # [2] id:278
    ee.vmulas.s16.qacc q7,q0 # [4]
.Lt_8_12546: # 0x145c
    # Part of loop body line 953, head labeled .Lt_8_11778
    add.n a11,a11,a6 # [0]
    add.n a12,a12,a6 # [1]
    add.n a14,a7,a15 # [2]
.LBB34_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x1464

.Lt_8_11266: # 0x1464
    // Spill QACC (low then high half) to the on-stack scratch area and
    // byte-repack it so the packed accumulator lanes line up as contiguous
    // values, then shift/saturate (ee.srcmb.s16.qacc) and reload as 8x16 ->
    // zipped to 2x(4x32) in q0/q1. Lane layout repack follows the
    // EE.ST.QACC_*/EE.SRCMB.S16.QACC packed-accumulator format.
    l32i a8,a1,180 # [0] gra_spill_temp_175
    ee.st.qacc_l.l.128.ip a8,16 # [2] id:280
    ee.st.qacc_l.h.32.ip a8,0 # [3] id:281
    l16ui a9,a1,10 # [4] qacc_scratch+10
    l8ui a11,a1,15 # [5] qacc_scratch+15
    l8ui a12,a1,5 # [6] qacc_scratch+5
    l8ui a13,a1,6 # [7] qacc_scratch+6
    l8ui a14,a1,16 # [8] qacc_scratch+16
    s8i a14,a1,7 # [9] qacc_scratch+7
    s8i a13,a1,3 # [10] qacc_scratch+3
    s8i a12,a1,2 # [11] qacc_scratch+2
    s8i a11,a1,6 # [12] qacc_scratch+6
    s16i a9,a1,4 # [13] qacc_scratch+4
    ee.st.qacc_h.l.128.ip a8,16 # [14] id:291
    ee.st.qacc_h.h.32.ip a8,-32 # [15] id:292
    l16ui a9,a1,16 # [16] qacc_scratch+16
    l8ui a15,a1,32 # [17] qacc_scratch+32
    l8ui a12,a1,22 # [18] qacc_scratch+22
    l8ui a11,a1,21 # [19] qacc_scratch+21
    l8ui a14,a1,31 # [20] qacc_scratch+31
    l16ui a13,a1,26 # [21] qacc_scratch+26
    s16i a13,a1,12 # [22] qacc_scratch+12
    s8i a14,a1,14 # [23] qacc_scratch+14
    s8i a11,a1,10 # [24] qacc_scratch+10
    s8i a12,a1,11 # [25] qacc_scratch+11
    s8i a15,a1,15 # [26] qacc_scratch+15
    s16i a9,a1,8 # [27] qacc_scratch+8
    l32i a15,a1,160 # [28] gra_spill_temp_170
    movi.n a9,16 # [29]
    ee.srcmb.s16.qacc q1,a9,0 # [30]
    ee.vld.128.ip q0,a8,0 # [31] id:304
    s32i a8,a1,180 # [32] gra_spill_temp_175
    ee.vzip.16 q0,q1 # [33]
    bnez.n a15,.LBB23_esp_nn_depthwise_conv_s16_mult8_3x3 # [34]  // bias != NULL
    // no-bias path: stash the high half and go straight to requantization
    s32i.n a10,a1,48 # [0] gra_spill_temp_142
    addi a15,a1,112 # [1]
    st.qr q1,a15,80 # [2] gra_spill_temp_178-112
    j .Lt_8_13314 # [3]

.Lt_8_8194: # 0x14d3
    retw.n # [0]

    .size esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3, . - esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3

================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
    .text
    .literal_position

// -----------------------------------------------------------------------------
// esp_nn_depthwise_conv_s16_mult8_esp32s3
//
// Depthwise convolution over 16-bit (widened s8) input data for an arbitrary
// filter_wd x filter_ht window, producing the channel-multiplier dimension 8
// output channels per iteration (inner loop runs
// "for ch_mult_idx < ch_mult - 7; += 8" — see the #1091 pseudo-source line).
// Only full groups of 8 are produced here; presumably the caller guarantees
// ch_mult is a multiple of 8 — confirm against the C dispatcher in
// esp_nn_depthwise_conv_s8_esp32s3.c.
//
// Same conventions as the 3x3-specialised sibling in this directory:
//   * "# [n]" trailing comments are instruction-slot/scheduling annotations.
//   * "# NNN ..." comments are the interleaved original C source lines.
//   * "gra_spill_temp_*" names map stack-frame offsets (listed below) to
//     register-allocator spill slots.
// Accumulation happens in QACC (ee.vmulas.s16.qacc), is spilled to the
// on-stack qacc_scratch area and byte-repacked, requantized via
// esp_nn_multiply_by_quantized_mult_ver1_esp32s3, offset/clamped and stored
// as eight s8 outputs per group.
// -----------------------------------------------------------------------------

# Program Unit: esp_nn_depthwise_conv_s16_mult8_esp32s3
    .type esp_nn_depthwise_conv_s16_mult8_esp32s3, @function
    .align 4
    .global esp_nn_depthwise_conv_s16_mult8_esp32s3

esp_nn_depthwise_conv_s16_mult8_esp32s3: # 0x14d7
        // Stack-frame layout: QACC extraction scratch at a1+0, spill slots above.
        # qacc_scratch = 0
        # gra_spill_temp_183 = 48
        # gra_spill_temp_184 = 52
        # gra_spill_temp_185 = 56
        # gra_spill_temp_186 = 60
        # gra_spill_temp_187 = 64
        # gra_spill_temp_188 = 68
        # gra_spill_temp_189 = 72
        # gra_spill_temp_190 = 76
        # gra_spill_temp_191 = 80
        # gra_spill_temp_192 = 84
        # gra_spill_temp_193 = 88
        # gra_spill_temp_194 = 92
        # gra_spill_temp_195 = 96
        # gra_spill_temp_196 = 100
        # gra_spill_temp_197 = 104
        # gra_spill_temp_198 = 108
        # gra_spill_temp_199 = 112
        # gra_spill_temp_200 = 116
        # gra_spill_temp_201 = 120
        # gra_spill_temp_202 = 124
        # gra_spill_temp_203 = 128
        # gra_spill_temp_204 = 132
        # gra_spill_temp_205 = 136
        # gra_spill_temp_206 = 140
        # gra_spill_temp_207 = 144
        # gra_spill_temp_208 = 148
        # gra_spill_temp_209 = 152
        # gra_spill_temp_210 = 156
        # gra_spill_temp_211 = 160
        # gra_spill_temp_212 = 164
        # gra_spill_temp_213 = 168
        # gra_spill_temp_214 = 172
        # gra_spill_temp_215 = 176
        # gra_spill_temp_216 = 180
        # gra_spill_temp_217 = 184
        # gra_spill_temp_218 = 192
        # gra_spill_temp_219 = 208

    // registers:
    // a2: const int16_t *input_data
    // a3: const uint16_t input_wd
    // a4: const uint16_t input_ht
    // a5: const uint16_t channels
    // a6: const uint16_t pad_wd
    // a7: const uint16_t pad_ht
    // on stack:
    // const uint16_t stride_wd
    // const uint16_t stride_ht
    // const uint16_t ch_mult
    // const int16_t *filter_data
    // const uint16_t filter_wd
    // const uint16_t filter_ht
    // const int32_t *bias
    // int8_t *out_data
    // const uint16_t out_wd
    // const uint16_t out_ht
    // const int32_t out_offset
    // const int32_t *out_shift
    // const int32_t *out_mult
    // const int32_t activation_min
    // const int32_t activation_max

    entry a1,256 #

    // Prologue: spill register args; bail out immediately when out_ht == 0.
    s32i a2,a1,144 # [0] gra_spill_temp_207
    s32i.n a4,a1,56 # [1] gra_spill_temp_185
    s32i a5,a1,172 # [2] gra_spill_temp_214
    l32i a9,a1,284 # [3] id:241 out_data+0x0
    l16ui a8,a1,292 # [4] id:242 out_ht+0x0
    s32i a8,a1,64 # [5] gra_spill_temp_187
    s32i a9,a1,124 # [6] gra_spill_temp_202
    beqz.n a8,.Lt_9_8450 # [7]
    // One-time loop-invariant setup: fetch the stack arguments, precompute
    // strides, negated pads (sign-extended to 16 bits) and ch_mult-7 as the
    // bound of the 8-wide inner loop.
    s32i a1,a1,128 # [0] gra_spill_temp_203
    neg a13,a7 # [1]
    movi.n a4,0 # [2]
    neg a12,a6 # [3]
    l32i a9,a1,280 # [4] id:243 bias+0x0
    slli a11,a5,1 # [5]
    l16ui a10,a1,264 # [6] id:244 ch_mult+0x0
    l32i a14,a1,268 # [7] id:245 filter_data+0x0
    s32i a14,a1,160 # [8] gra_spill_temp_211
    s32i a10,a1,92 # [9] gra_spill_temp_194
    s32i a11,a1,156 # [10] gra_spill_temp_210
    s32i a9,a1,112 # [11] gra_spill_temp_199
    sext a12,a12,15 # [12]
    s32i a4,a1,68 # [13] gra_spill_temp_188
    sext a13,a13,15 # [14]
    l16ui a4,a1,272 # [15] id:246 filter_wd+0x0
    s32i a13,a1,100 # [16] gra_spill_temp_196
    s32i.n a12,a1,48 # [17] gra_spill_temp_183
    mul16u a8,a5,a10 # [18]
    extui a9,a9,0,4 # [19]                // bias alignment (low 4 bits) for ee.src.q.qup
    l32i a11,a1,304 # [20] id:249 out_mult+0x0
    s32i a11,a1,80 # [21] gra_spill_temp_191
    s32i a9,a1,104 # [22] gra_spill_temp_197
    s32i a8,a1,148 # [23] gra_spill_temp_208
    addi a10,a10,-7 # [24]                // ch_mult - 7: bound of the 8-wide loop
    l32i a12,a1,300 # [25] id:248 out_shift+0x0
    l16ui a13,a1,256 # [26] id:247 stride_wd+0x0
    s32i a13,a1,72 # [27] gra_spill_temp_189
    s32i a12,a1,76 # [28] gra_spill_temp_190
    s32i a10,a1,116 # [29] gra_spill_temp_200
    slli a8,a8,1 # [30]
    l16ui a9,a1,260 # [31] id:251 stride_ht+0x0
    s32i.n a9,a1,60 # [32] gra_spill_temp_186
    s32i a8,a1,152 # [33] gra_spill_temp_209
    l16ui a10,a1,276 # [34] id:250 filter_ht+0x0
    s32i.n a10,a1,52 # [35] gra_spill_temp_184
    l16ui a8,a1,288 # [36] id:252 out_wd+0x0
    s32i a8,a1,84 # [37] gra_spill_temp_192
    j .Lt_9_8962 # [38]

.Lt_9_9218: # 0x1561
    // height-loop tail: advance out_y and the y-dependent base offset
    # Part of loop body line 1083, head labeled .Lt_9_8962
    l32i a15,a1,64 # [0] gra_spill_temp_187
    l32i.n a9,a1,60 # [1] gra_spill_temp_186
    l32i a14,a1,68 # [2] gra_spill_temp_188
    l32i a8,a1,100 # [3] gra_spill_temp_196
    addi.n a14,a14,1 # [4]
    s32i a14,a1,68 # [5] gra_spill_temp_188
    add.n a9,a8,a9 # [6]
    sub a14,a14,a15 # [7]
    sext a8,a9,15 # [8]
    s32i a8,a1,100 # [9] gra_spill_temp_196
    beqz a14,.Lt_9_8450 # [10]

.Lt_9_8962: # 0x157f
    l32i a10,a1,84 # [0] gra_spill_temp_192
    beqz.n a10,.Lt_9_9218 # [2]           // out_wd == 0 -> next row
    // width-loop preamble: clamp the filter window vertically
    // (filter_y_start = max(0, -base_y), filter_y_end = min(filter_ht, ...)).
    l32i.n a7,a1,52 # [0] gra_spill_temp_184
    movi.n a11,0 # [1]
    l32i.n a8,a1,56 # [2] gra_spill_temp_185
    l32i a9,a1,100 # [3] gra_spill_temp_196
    l32i.n a12,a1,48 # [4] gra_spill_temp_183
    s32i a12,a1,168 # [5] gra_spill_temp_213
    neg a10,a9 # [6]
    sub a8,a8,a9 # [7]
    max a10,a10,a11 # [8]
    s32i a10,a1,108 # [9] gra_spill_temp_198
    min a7,a7,a8 # [10]
    movi.n a11,0 # [11]
    s32i a11,a1,88 # [12] gra_spill_temp_193
    j .Lt_9_9730 # [13]

.Lt_9_9986: # 0x15a9
    // width-loop tail: advance out_x and the x base (kept as int16 via sext)
    # Part of loop body line 1085, head labeled .Lt_9_9730
    l32i a13,a1,84 # [0] gra_spill_temp_192
    l32i a15,a1,72 # [1] gra_spill_temp_189
    l32i a12,a1,88 # [2] gra_spill_temp_193
    l32i a14,a1,168 # [3] gra_spill_temp_213
    addi.n a12,a12,1 # [4]
    s32i a12,a1,88 # [5] gra_spill_temp_193
    add.n a15,a14,a15 # [6]
    sext a14,a15,15 # [7]
    s32i a14,a1,168 # [8] gra_spill_temp_213
    beq a12,a13,.Lt_9_9218 # [9]

.Lt_9_9730: # 0x15c5
    # Loop body line 1085, nesting depth: 2, estimated iterations: 100
    #1086  const int16_t base_x = (out_x * stride_wd) - pad_wd;
    #1087  const int32_t *out_mult_ptr = out_mult;
    #1088  const int32_t *out_shift_ptr = out_shift;
    #1089  uint32_t bias_ptr = (uint32_t) (bias);
    #1090  for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
    l32i a8,a1,172 # [0] gra_spill_temp_214
    l32i a9,a1,80 # [1] gra_spill_temp_191
    l32i a10,a1,76 # [2] gra_spill_temp_190
    l32i a11,a1,112 # [3] gra_spill_temp_199
    s32i a11,a1,120 # [4] gra_spill_temp_201
    s32i a10,a1,140 # [5] gra_spill_temp_206
    s32i a9,a1,136 # [6] gra_spill_temp_205
    beqz.n a8,.Lt_9_9986 # [7]

.LBB9_esp_nn_depthwise_conv_s16_mult8: # 0x15dc
    // channel-loop preamble: clamp the filter window horizontally
    # Part of loop body line 1085, head labeled .Lt_9_9730
    movi.n a8,0 # [0]
    l32i a5,a1,168 # [1] gra_spill_temp_213
    movi.n a13,0 # [2]
    movi.n a14,0 # [3]
    s32i a14,a1,96 # [4] gra_spill_temp_195
    s32i a13,a1,184 # [5] gra_spill_temp_217
    neg a6,a5 # [6]
    max a6,a6,a8 # [7]                    // filter_x_start = max(0, -base_x)
    sub a5,a3,a5 # [8]
    min a5,a4,a5 # [9]                    // filter_x_end = min(filter_wd, input_wd - base_x)
    sub a12,a5,a6 # [10]
    s32i a12,a1,164 # [11] gra_spill_temp_212
    j .Lt_9_10498 # [12]

.Lt_9_10754: # 0x1600
    // channel-loop tail: next ch_idx, advance output-channel base index
    # Part of loop body line 1090, head labeled .Lt_9_10498
    l32i a10,a1,172 # [0] gra_spill_temp_214
    l32i a12,a1,92 # [1] gra_spill_temp_194
    l32i a9,a1,184 # [2] gra_spill_temp_217
    l32i a11,a1,96 # [3] gra_spill_temp_195
    addi.n a9,a9,1 # [4]
    s32i a9,a1,184 # [5] gra_spill_temp_217
    add.n a11,a11,a12 # [6]
    s32i a11,a1,96 # [7] gra_spill_temp_195
    beq a9,a10,.Lt_9_9986 # [8]

.Lt_9_10498: # 0x1619
    # Loop body line 1090, nesting depth: 3, estimated iterations: 100
    #1091  for (int ch_mult_idx = 0; ch_mult_idx < ch_mult - 7; ch_mult_idx += 8) {
    l32i a13,a1,116 # [0] gra_spill_temp_200
    blti a13,1,.Lt_9_10754 # [2]          // ch_mult < 8 -> nothing for this kernel

.LBB12_esp_nn_depthwise_conv_s16_mult8: # 0x161f
    # Part of loop body line 1090, head labeled .Lt_9_10498
    l32i a2,a1,96 # [0] gra_spill_temp_195
    movi.n a14,0 # [1]
    s32i a14,a1,132 # [2] gra_spill_temp_204
    j .Lt_9_11266 # [3]

.Lt_9_11522: # 0x162a
    // Spill QACC (low then high half) to the on-stack scratch area and
    // byte-repack the packed accumulator lanes, then shift/saturate with
    // ee.srcmb.s16.qacc and reload -> q0/q1 hold 2x(4x32) sums after vzip.
    // Lane layout repack follows the EE.ST.QACC_*/EE.SRCMB.S16.QACC format.
    l32i a9,a1,128 # [0] gra_spill_temp_203
    ee.st.qacc_l.l.128.ip a9,16 # [2] id:257
    ee.st.qacc_l.h.32.ip a9,0 # [3] id:258
    l8ui a10,a1,15 # [4] qacc_scratch+15
    l16ui a8,a1,10 # [5] qacc_scratch+10
    l8ui a13,a1,16 # [6] qacc_scratch+16
    l8ui a12,a1,6 # [7] qacc_scratch+6
    l8ui a11,a1,5 # [8] qacc_scratch+5
    s8i a11,a1,2 # [9] qacc_scratch+2
    s8i a12,a1,3 # [10] qacc_scratch+3
    s8i a13,a1,7 # [11] qacc_scratch+7
    s16i a8,a1,4 # [12] qacc_scratch+4
    s8i a10,a1,6 # [13] qacc_scratch+6
    movi.n a8,16 # [14]
    ee.st.qacc_h.l.128.ip a9,16 # [15] id:268
    ee.st.qacc_h.h.32.ip a9,-32 # [16] id:269
    ee.srcmb.s16.qacc q1,a8,0 # [17]
    l16ui a13,a1,26 # [18] qacc_scratch+26
    l8ui a15,a1,32 # [19] qacc_scratch+32
    l8ui a12,a1,22 # [20] qacc_scratch+22
    l8ui a11,a1,21 # [21] qacc_scratch+21
    l16ui a10,a1,16 # [22] qacc_scratch+16
    l8ui a14,a1,31 # [23] qacc_scratch+31
    s8i a14,a1,14 # [24] qacc_scratch+14
    s16i a10,a1,8 # [25] qacc_scratch+8
    s8i a11,a1,10 # [26] qacc_scratch+10
    s8i a12,a1,11 # [27] qacc_scratch+11
    s8i a15,a1,15 # [28] qacc_scratch+15
    s16i a13,a1,12 # [29] qacc_scratch+12
    #1138  EE_VZIP_16(q0, q1); /* 4x32 */
    #1139
    #1140  if (bias) {
    l32i a15,a1,112 # [30] gra_spill_temp_199
    ee.vld.128.ip q0,a9,0 # [31] id:281
    s32i a9,a1,128 # [32] gra_spill_temp_203
    ee.vzip.16 q0,q1 # [33]
    beqz.n a15,.Lt_9_13570 # [34]         // bias == NULL -> skip bias add

.LBB23_esp_nn_depthwise_conv_s16_mult8: # 0x168e
    // bias path: load 8 consecutive int32 bias values through the unaligned
    // load + ee.src.q.qup funnel and add to the two 4x32 accumulators.
    # Part of loop body line 1091, head labeled .Lt_9_11266
    addi a14,a1,112 # [0]
    l32i a8,a1,104 # [1] gra_spill_temp_197
    l32i a15,a1,120 # [2] gra_spill_temp_201
    wur.sar_byte a8 # [3]
    ee.vld.128.ip q3,a15,16 # [4] id:284
    ee.vld.128.ip q6,a15,16 # [5] id:285
    ee.vld.128.ip q4,a15,0 # [6] id:286
    s32i a15,a1,120 # [7] gra_spill_temp_201
    ee.src.q.qup q5,q3,q6 # [8]
    ee.vadds.s32 q0,q0,q5 # [9]
    ee.src.q.qup q2,q3,q4 # [10]
    ee.vadds.s32 q1,q1,q2 # [11]
    st.qr q1,a14,96 # [12] gra_spill_temp_219-112

.Lt_9_13570: # 0x16b5
    // requantize both halves (4 channels per call)
    #1158  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
    l32i a10,a1,136 # [0] gra_spill_temp_205
    l32i a11,a1,140 # [1] gra_spill_temp_206
    addi a9,a1,112 # [2]
    st.qr q1,a9,96 # [3] gra_spill_temp_219-112
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    #1159  out_mult_ptr += 4;
    #1160  out_shift_ptr += 4;
    #1161
    #1162  q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);
    l32i a11,a1,140 # [0] gra_spill_temp_206
    addi a12,a1,112 # [1]
    l32i a10,a1,136 # [2] gra_spill_temp_205
    st.qr q0,a12,80 # [3] gra_spill_temp_218-112
    ld.qr q0,a12,96 # [4] gra_spill_temp_219-112
    addi a10,a10,16 # [5]
    addi a11,a11,16 # [6]
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    // post-requant: add out_offset, clamp to [activation_min, activation_max],
    // narrow 2x(4x32) -> 8x8 via vunzip and store 8 output bytes.
    addi.n a2,a2,8 # [0]
    l32i a14,a1,116 # [1] gra_spill_temp_200
    l32i a15,a1,124 # [2] gra_spill_temp_202
    l32i a13,a1,132 # [3] gra_spill_temp_204
    l32i a10,a1,140 # [4] gra_spill_temp_206
    l32i a11,a1,136 # [5] gra_spill_temp_205
    addmi a9,a1,256 # [6]
    addi a8,a1,112 # [7]
    ld.qr q7,a8,80 # [8] gra_spill_temp_218-112
    addi a9,a9,56 # [9]
    ee.vldbc.32 q2,a9 # [10] id:290 activation_max
    addi a11,a11,32 # [11]
    addi a10,a10,32 # [12]
    addi.n a13,a13,8 # [13]
    s32i a13,a1,132 # [14] gra_spill_temp_204
    s32i a10,a1,140 # [15] gra_spill_temp_206
    s32i a11,a1,136 # [16] gra_spill_temp_205
    addmi a10,a1,256 # [17]
    addmi a11,a1,256 # [18]
    addi a11,a11,52 # [19]
    addi a10,a10,40 # [20]
    ee.vldbc.32 q3,a10 # [21] id:289 out_offset
    ee.vldbc.32 q1,a11 # [22] id:291 activation_min
    ee.vadds.s32 q0,q0,q3 # [23]
    ee.vadds.s32 q7,q7,q3 # [24]
    ee.vmin.s32 q7,q7,q2 # [25]
    ee.vmin.s32 q0,q0,q2 # [26]
    ee.vmax.s32 q0,q0,q1 # [27]
    ee.vmax.s32 q7,q7,q1 # [28]
    ee.vunzip.16 q7,q0 # [29]
    ee.vunzip.8 q7,q0 # [30]
    ee.vst.l.64.ip q7,a15,8 # [31] id:292
    s32i a15,a1,124 # [32] gra_spill_temp_202
    bge a13,a14,.Lt_9_10754 # [33]

.Lt_9_11266: # 0x1740
    // accumulation for the next group of 8: zero QACC, then MAC over the
    // edge-clamped filter window (rows via the outer checks, columns via the
    // zero-overhead loopgtz body below).
    ee.zero.qacc # [0]
    l32i a12,a1,108 # [1] gra_spill_temp_198
    s32i a12,a1,180 # [2] gra_spill_temp_216
    bge a12,a7,.Lt_9_11522 # [3]          // empty window -> extract straight away
    mull a15,a12,a4 # [0]
    l32i a14,a1,100 # [1] gra_spill_temp_196
    add.n a8,a15,a5 # [2]
    add.n a14,a14,a12 # [3]
    mull a14,a3,a14 # [4]
    s32i a8,a1,176 # [5] gra_spill_temp_215
    bge a6,a5,.Lt_9_12290 # [6]

.LBB18_esp_nn_depthwise_conv_s16_mult8: # 0x175f
    // compute input/filter pointers for this row, then MAC filter_x_end -
    // filter_x_start taps: broadcast input sample (ee.vldbc.16) against 8
    // filter values (ee.vld.128) into QACC per iteration.
    # Part of loop body line 1091, head labeled .Lt_9_11266
    l32i a10,a1,184 # [0] gra_spill_temp_217
    l32i a11,a1,172 # [1] gra_spill_temp_214
    l32i a12,a1,168 # [2] gra_spill_temp_213
    l32i a8,a1,148 # [3] gra_spill_temp_208
    add.n a9,a15,a6 # [4]
    mull a8,a8,a9 # [5]
    add.n a12,a12,a6 # [6]
    l32i a9,a1,160 # [7] gra_spill_temp_211
    add.n a12,a14,a12 # [8]
    mull a11,a11,a12 # [9]
    add.n a8,a2,a8 # [10]
    l32i a12,a1,156 # [11] gra_spill_temp_210
    addx2 a8,a8,a9 # [12]
    add.n a10,a10,a11 # [13]
    l32i a11,a1,144 # [14] gra_spill_temp_207
    l32i a9,a1,164 # [15] gra_spill_temp_212
    addx2 a10,a10,a11 # [16]
    l32i a11,a1,152 # [17] gra_spill_temp_209
    loopgtz a9,.LBB45_esp_nn_depthwise_conv_s16_mult8 # [18]  // zero-overhead column loop
    mov.n a9,a8 # [0*II+0]
    ee.vldbc.16 q0,a10 # [0*II+1] id:255
    ee.vld.128.ip q1,a9,0 # [0*II+2] id:254
    add.n a10,a10,a12 # [0*II+3]
    add.n a8,a8,a11 # [0*II+4]
    ee.vmulas.s16.qacc q0,q1 # [0*II+5]
.LBB45_esp_nn_depthwise_conv_s16_mult8: # 0x17a2

.Lt_9_12290: # 0x17a2
    // row-loop tail: advance to the next filter row
    add.n a14,a14,a3 # [0]
    add.n a15,a15,a4 # [1]
    l32i a10,a1,180 # [2] gra_spill_temp_216
    l32i a11,a1,176 # [3] gra_spill_temp_215
    addi.n a10,a10,1 # [4]
    add.n a11,a11,a4 # [5]
    s32i a11,a1,176 # [6] gra_spill_temp_215
    s32i a10,a1,180 # [7] gra_spill_temp_216
    sub a10,a7,a10 # [8]
    beqz a10,.Lt_9_11522 # [9]
.Lt_9_12034: # 0x17bc
    blt a6,a5,.LBB18_esp_nn_depthwise_conv_s16_mult8 # [0]
    j .Lt_9_12290 # [0]

.Lt_9_8450: # 0x17c2
    retw.n # [0]

    .size esp_nn_depthwise_conv_s16_mult8_esp32s3, . - esp_nn_depthwise_conv_s16_mult8_esp32s3

================================================ FILE: src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c ================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/* NOTE(review): the three include targets below were lost in extraction
 * (angle-bracket names stripped) — restore the original header names from
 * the repository before compiling. */
#include
#include
#include

/* Scratch area (s8 -> s16 widened input staging) shared by the dispatch
 * wrappers in this file; set elsewhere in this translation unit. */
static int16_t *scratch_buffer = NULL;

extern void esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(const int16_t *input_data,
                                                        const uint16_t input_wd,
                                                        const uint16_t input_ht,
                                                        const uint16_t channels,
                                                        const uint16_t pad_wd,
                                                        const uint16_t pad_ht,
                                                        const uint16_t stride_wd,
                                                        const uint16_t stride_ht,
                                                        const uint16_t ch_mult,
                                                        const int16_t *filter_data,
                                                        const int32_t *bias,
                                                        int8_t *out_data,
                                                        const uint16_t out_wd,
                                                        const uint16_t out_ht,
                                                        const int32_t out_offset,
                                                        const int32_t *out_shift,
                                                        const int32_t *out_mult,
                                                        const int32_t activation_min,
                                                        const int32_t activation_max);

extern void esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(const int8_t *input_data,
                                                              const uint16_t input_wd,
                                                              const uint16_t input_ht,
                                                              const uint16_t channels,
                                                              const int32_t input_offset,
                                                              const uint16_t stride_wd,
                                                              const uint16_t stride_ht,
                                                              const int8_t *filter_data,
                                                              const int32_t *bias,
                                                              int8_t *out_data,
                                                              const uint16_t out_wd,
                                                              const uint16_t out_ht,
                                                              const int32_t out_offset,
                                                              const int32_t *out_shift,
                                                              const int32_t *out_mult,
                                                              const int32_t activation_min,
                                                              const int32_t activation_max);

extern void esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3(const int16_t *input_data,
                                                               const uint16_t
input_wd, const uint16_t input_ht,
                                                               const uint16_t channels,
                                                               const uint16_t stride_wd, const uint16_t stride_ht,
                                                               const int16_t *filter_data, const int32_t *bias,
                                                               int8_t *out_data,
                                                               const uint16_t out_wd, const uint16_t out_ht,
                                                               const int32_t out_offset,
                                                               const int32_t *out_shift, const int32_t *out_mult,
                                                               const int32_t activation_min,
                                                               const int32_t activation_max);

/* s16 assembly kernel: ch_mult multiple of 8, generic filter size, explicit padding args. */
extern void esp_nn_depthwise_conv_s16_mult8_esp32s3(const int16_t *input_data,
                                                    const uint16_t input_wd, const uint16_t input_ht,
                                                    const uint16_t channels,
                                                    const uint16_t pad_wd, const uint16_t pad_ht,
                                                    const uint16_t stride_wd, const uint16_t stride_ht,
                                                    const uint16_t ch_mult,
                                                    const int16_t *filter_data,
                                                    const uint16_t filter_wd, const uint16_t filter_ht,
                                                    const int32_t *bias,
                                                    int8_t *out_data,
                                                    const uint16_t out_wd, const uint16_t out_ht,
                                                    const int32_t out_offset,
                                                    const int32_t *out_shift, const int32_t *out_mult,
                                                    const int32_t activation_min,
                                                    const int32_t activation_max);

/* s16 assembly kernel: ch_mult multiple of 4, generic filter size. */
extern void esp_nn_depthwise_conv_s16_mult4_esp32s3(const int16_t *input_data,
                                                    const uint16_t input_wd, const uint16_t input_ht,
                                                    const uint16_t channels,
                                                    const uint16_t pad_wd, const uint16_t pad_ht,
                                                    const uint16_t stride_wd, const uint16_t stride_ht,
                                                    const uint16_t ch_mult,
                                                    const int16_t *filter_data,
                                                    const uint16_t filter_wd, const uint16_t filter_ht,
                                                    const int32_t *bias,
                                                    int8_t *out_data,
                                                    const uint16_t out_wd, const uint16_t out_ht,
                                                    const int32_t out_offset,
                                                    const int32_t *out_shift, const int32_t *out_mult,
                                                    const int32_t activation_min,
                                                    const int32_t activation_max);

/* s16 assembly kernel: ch_mult == 1, fixed 3x3 filter, with padding support. */
extern void esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(const int16_t *input_data,
                                                        const uint16_t input_wd, const uint16_t input_ht,
                                                        const uint16_t channels,
                                                        const uint16_t pad_wd, const uint16_t pad_ht,
                                                        const uint16_t stride_wd, const uint16_t stride_ht,
                                                        const int16_t *filter_data,
                                                        const int32_t *bias,
                                                        int8_t *out_data,
                                                        const uint16_t out_wd, const uint16_t out_ht,
                                                        const int32_t out_offset,
                                                        const int32_t *out_shift, const int32_t *out_mult,
                                                        const int32_t activation_min,
                                                        const int32_t activation_max);

/* s16 assembly kernel: ch_mult == 1, generic filter size. */
extern void esp_nn_depthwise_conv_s16_mult1_esp32s3(const int16_t *input_data,
                                                    const uint16_t input_wd, const uint16_t input_ht,
                                                    const uint16_t channels,
                                                    const uint16_t pad_wd, const uint16_t pad_ht,
                                                    const uint16_t stride_wd, const uint16_t stride_ht,
                                                    const int16_t *filter_data,
                                                    const uint16_t filter_wd, const uint16_t filter_ht,
                                                    const int32_t *bias,
                                                    int8_t *out_data,
                                                    const uint16_t out_wd, const uint16_t out_ht,
                                                    const int32_t out_offset,
                                                    const int32_t *out_shift, const int32_t *out_mult,
                                                    const int32_t activation_min,
                                                    const int32_t activation_max);

/* Widen s8 buffer to s16 (assembly helper). */
extern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size);

/* Widen s8 buffer to s16 while adding `offset` to every element (assembly helper). */
extern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst,
                                                         const int size, const int32_t offset);

/**
 * Reference C depthwise convolution, manually unrolled by 4 over the channel
 * multiplier (ch_mult). Per-output-channel quantization: each out_ch_idx has
 * its own out_mult/out_shift pair. Output layout is NHWC with
 * out_channels == channels * ch_mult.
 *
 * NOTE(review): appears unused by the dispatcher below (it falls back to
 * esp_nn_depthwise_conv_s8_opt instead) — kept for reference; confirm before removal.
 */
static void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data,
                                              const uint16_t input_wd,
                                              const uint16_t input_ht,
                                              const uint16_t channels,
                                              const int32_t input_offset,
                                              const uint16_t pad_wd,
                                              const uint16_t pad_ht,
                                              const uint16_t stride_wd,
                                              const uint16_t stride_ht,
                                              const uint16_t ch_mult,
                                              const int8_t *filter_data,
                                              const uint16_t filter_wd,
                                              const uint16_t filter_ht,
                                              const int32_t *bias,
                                              int8_t *out_data,
                                              const uint16_t out_wd,
                                              const uint16_t out_ht,
                                              const int32_t out_offset,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t activation_min,
                                              const int32_t activation_max)
{
    int out_idx = 0;
    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop
        const int16_t base_y = (out_y * stride_ht) - pad_ht;
        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
            const int16_t base_x = (out_x * stride_wd) - pad_wd;
            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
                int ch_mult_idx = 0;
                /* Main unrolled loop: 4 output channels per iteration.
                 * (ch_mult - 3 promotes to int, so ch_mult < 4 skips this loop safely.) */
                for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) {
                    int32_t result0 = 0, result1 = 0, result2 = 0, result3 = 0;
                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;

                    /* Select filter so as the point doesn't lie outside block */
                    int filter_y_start = max(0, -base_y);
                    int filter_x_start = max(0, -base_x);
                    int filter_y_end = min(filter_ht, input_ht - base_y);
                    int filter_x_end = min(filter_wd, input_wd - base_x);

                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
                        const int32_t idx_y = base_y + filter_y_idx;
                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
                            const int32_t idx_x = base_x + filter_x_idx;
                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
                            /* filter layout: [filter_ht][filter_wd][channels * ch_mult] */
                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx)
                                                   * (channels * ch_mult) + out_ch_idx;
                            int32_t input_val = input_data[input_index] + input_offset;
                            int32_t filter_val0 = filter_data[filter_index + 0];
                            int32_t filter_val1 = filter_data[filter_index + 1];
                            int32_t filter_val2 = filter_data[filter_index + 2];
                            int32_t filter_val3 = filter_data[filter_index + 3];
                            result0 += input_val * filter_val0;
                            result1 += input_val * filter_val1;
                            result2 += input_val * filter_val2;
                            result3 += input_val * filter_val3;
                        }
                    }
                    if (bias) {
                        result0 += bias[out_ch_idx + 0];
                        result1 += bias[out_ch_idx + 1];
                        result2 += bias[out_ch_idx + 2];
                        result3 += bias[out_ch_idx + 3];
                    }
                    /* Per-channel requantization, then output offset and activation clamp. */
                    result0 = esp_nn_multiply_by_quantized_mult(result0, out_mult[out_ch_idx + 0], out_shift[out_ch_idx + 0]);
                    result1 = esp_nn_multiply_by_quantized_mult(result1, out_mult[out_ch_idx + 1], out_shift[out_ch_idx + 1]);
                    result2 = esp_nn_multiply_by_quantized_mult(result2, out_mult[out_ch_idx + 2], out_shift[out_ch_idx + 2]);
                    result3 = esp_nn_multiply_by_quantized_mult(result3, out_mult[out_ch_idx + 3], out_shift[out_ch_idx + 3]);

                    result0 += out_offset;
                    result1 += out_offset;
                    result2 += out_offset;
                    result3 += out_offset;

                    result0 = max(result0, activation_min);
                    result1 = max(result1, activation_min);
                    result2 = max(result2, activation_min);
                    result3 = max(result3, activation_min);
                    result0 = min(result0, activation_max);
                    result1 = min(result1, activation_max);
                    result2 = min(result2, activation_max);
                    result3 = min(result3, activation_max);

                    out_data[out_idx++] = result0;
                    out_data[out_idx++] = result1;
                    out_data[out_idx++] = result2;
                    out_data[out_idx++] = result3;
                }

                /* left-over */
                for (; ch_mult_idx < ch_mult; ch_mult_idx++) {
                    int32_t result = 0;
                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;

                    /* Select filter so as the point doesn't lie outside block */
                    int filter_y_start = max(0, -base_y);
                    int filter_x_start = max(0, -base_x);
                    int filter_y_end = min(filter_ht, input_ht - base_y);
                    int filter_x_end = min(filter_wd, input_wd - base_x);

                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
                        const int32_t idx_y = base_y + filter_y_idx;
                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
                            const int32_t idx_x = base_x + filter_x_idx;
                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx)
                                                   * (channels * ch_mult) + out_ch_idx;
                            int32_t input_val = input_data[input_index] + input_offset;
                            int32_t filter_val = filter_data[filter_index];
                            result += input_val * filter_val;
                        }
                    }
                    if (bias) {
                        result += bias[out_ch_idx];
                    }
                    result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]);
                    result += out_offset;
                    result = max(result, activation_min);
                    result = min(result, activation_max);
                    out_data[out_idx++] = result;
                }
            }
        }
    }
}

/**
 * Reference C depthwise convolution specialized for ch_mult == 1
 * (output channel count equals input channel count).
 */
void esp_nn_depthwise_conv_s8_ch_mult1(const int8_t *input_data,
                                       const uint16_t input_wd,
                                       const uint16_t input_ht,
                                       const uint16_t channels,
                                       const int32_t input_offset,
                                       const uint16_t pad_wd,
                                       const uint16_t pad_ht,
                                       const uint16_t stride_wd,
                                       const uint16_t stride_ht,
                                       const int8_t *filter_data,
                                       const uint16_t filter_wd,
                                       const uint16_t filter_ht,
                                       const int32_t *bias,
                                       int8_t *out_data,
                                       const uint16_t out_wd,
                                       const uint16_t out_ht,
                                       const int32_t out_offset,
                                       const int32_t *out_shift,
                                       const int32_t *out_mult,
                                       const int32_t activation_min,
                                       const int32_t activation_max)
{
    int out_idx = 0;
    for (int out_y = 0; out_y < out_ht; out_y++)
{ //height loop
        const int16_t base_y = (out_y * stride_ht) - pad_ht;
        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
            const int16_t base_x = (out_x * stride_wd) - pad_wd;
            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
                int32_t result = 0;

                /* Select filter so as the point doesn't lie outside block */
                int filter_y_start = max(0, -base_y);
                int filter_x_start = max(0, -base_x);
                int filter_y_end = min(filter_ht, input_ht - base_y);
                int filter_x_end = min(filter_wd, input_wd - base_x);

                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
                    const int32_t idx_y = base_y + filter_y_idx;
                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
                        const int32_t idx_x = base_x + filter_x_idx;
                        int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
                        /* ch_mult == 1, so filter and input share channel indexing */
                        int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * channels + ch_idx;
                        int32_t input_val = input_data[input_index] + input_offset;
                        int32_t filter_val = filter_data[filter_index];
                        result += input_val * filter_val;
                    }
                }
                if (bias) {
                    result += bias[ch_idx];
                }
                /* Per-channel requantize, add output zero-point, clamp to activation range. */
                result = esp_nn_multiply_by_quantized_mult(result, out_mult[ch_idx], out_shift[ch_idx]);
                result += out_offset;
                result = max(result, activation_min);
                result = min(result, activation_max);
                out_data[out_idx++] = result;
            }
        }
    }
}

/**
 * Returns the scratch buffer size (bytes) required by
 * esp_nn_depthwise_conv_s8_esp32s3() for the given problem dimensions.
 *
 * The branch structure here mirrors the dispatch logic of the conv function:
 * each case sizes exactly the buffers (widened filter/input copies, padded
 * layouts, tile strips, padded quant arrays) that the corresponding path uses.
 * Keep the two functions in sync when changing either.
 */
int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
                                                   const data_dims_t *filter_dims,
                                                   const data_dims_t *output_dims,
                                                   const dw_conv_params_t *conv_params)
{
    const uint16_t input_wd = input_dims->width;
    const uint16_t input_ht = input_dims->height;
    const uint16_t channels = input_dims->channels;
    const uint16_t filter_wd = filter_dims->width;
    const uint16_t filter_ht = filter_dims->height;
    const uint16_t ch_mult = conv_params->ch_mult;
    const uint16_t out_wd = output_dims->width;
    const uint16_t out_ht = output_dims->height;
    const uint16_t pad_wd = conv_params->padding.width;
    const uint16_t pad_ht = conv_params->padding.height;
    const uint16_t stride_wd = conv_params->stride.width;
    const uint16_t stride_ht = conv_params->stride.height;
    int filter_size = filter_wd * filter_ht * channels * ch_mult;
    int pad_width = 0, pad_height = 0;

    if ((ch_mult == 1) && (channels % 8 == 0)) {
        if (filter_wd == 3 && filter_ht == 3) {
            if (channels % 16 == 0) {
                if (pad_wd || pad_ht) {
                    pad_width = pad_wd * 2;
                    pad_height = pad_ht * 2;
                } else {
                    /* implicit right/bottom padding needed to cover the last window */
                    pad_width = (out_wd * stride_wd + filter_wd - 1) - input_wd;
                    pad_height = (out_ht * stride_ht + filter_ht - 1) - input_ht;
                }
                if (pad_width || pad_height) {
                    int full_input = (input_wd + pad_width) * (input_ht + pad_height) * channels;
                    if (full_input <= 40 * 1024) {
                        return filter_size + full_input + 16;
                    } else {
                        /* Tiled: only need filter + strip buffer (filter_ht rows) */
                        int strip = (input_wd + pad_width) * filter_ht * channels;
                        return filter_size + strip + 16;
                    }
                } else {
                    return filter_size + 16;
                }
            } else if (channels >= 12) {
                /* ch % 8 == 0, not % 16, ch >= 12: pad channels to 16, s8 path + compaction */
                int new_ch = (channels + 15) & ~15;
                int new_filter_size = 9 * new_ch;
                int total_pad_wd = pad_wd * 2 + max(0, (out_wd * stride_wd + 2) - input_wd);
                int total_pad_ht = pad_ht * 2 + max(0, (out_ht * stride_ht + 2) - input_ht);
                int new_input_size = (input_wd + total_pad_wd) * (input_ht + total_pad_ht) * new_ch;
                int out_buf_size = out_wd * out_ht * new_ch;
                return new_filter_size + new_input_size + out_buf_size + 64;
            } else {
                /* ch=8: s16 path is more efficient (no channel padding overhead) */
                int input_s = input_wd * input_ht * channels;
                return 2 * (filter_size + input_s) + 32;
            }
        } else {
            int input_size = input_wd * input_ht * channels;
            int total_s16 = 2 * (filter_size + input_size);
            if (total_s16 <= 48 * 1024) {
                return total_s16 + 32;
            } else {
                /* Tiled: only need filter_s16 + tile buffer (filter_ht rows of input s16) */
                int tile_rows = filter_ht;
                int tile_s16 = 2 * input_wd * tile_rows * channels;
                return 2 * filter_size + tile_s16 + 32;
            }
        }
    } else if ((ch_mult == 1) && (channels > 3)) {
        // ch_mult=1, channels>3 case: pad channels to multiple of 8 for mult1
        int padded_channels = (channels + 7) & ~7;
        int padded_input_size = input_wd * input_ht * padded_channels;
        int padded_filter_size = filter_wd * filter_ht * padded_channels;

        // Calculate actual memory layout with 16-byte alignments (matching usage)
        size_t filter_bytes = padded_filter_size * sizeof(int16_t);
        size_t input_start = (filter_bytes + 15) & ~15;
        size_t input_bytes = padded_input_size * sizeof(int16_t);
        size_t out_start = (input_start + input_bytes + 15) & ~15;
        size_t out_bytes = out_wd * out_ht * padded_channels * sizeof(int8_t);
        size_t bias_start = (out_start + out_bytes + 15) & ~15;
        size_t bias_bytes = padded_channels * sizeof(int32_t);
        size_t shift_bytes = padded_channels * sizeof(int32_t);
        size_t mult_bytes = padded_channels * sizeof(int32_t);
        size_t total_size = bias_start + bias_bytes + shift_bytes + mult_bytes;
        return total_size + 16; // 16 for margin
    } else if (ch_mult % 4 == 0) {
        int input_size = input_wd * input_ht * channels;
        return 2 * (filter_size + input_size) + 32; // 32 for alignment
    }
    // Default fallback
    return 32;
}

/* Stash the caller-provided scratch buffer for subsequent conv calls. */
void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf)
{
    scratch_buffer = (int16_t *) buf;
}

/**
 * ESP32-S3 optimized depthwise convolution implementation.
* * This function dispatches to various optimized implementations based on: * - Channel multiplier (ch_mult) * - Number of channels * - Filter dimensions * - Padding requirements * * For cases that don't have direct optimized implementations, the function * uses data padding techniques to leverage existing optimized functions: * - ch_mult % 4 != 0: Pad ch_mult to next multiple of 4, use mult4 functions * - ch_mult == 1, channels % 8 != 0: Fallback to C implementation for correctness * * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dilation width = 1 */ #include "esp_nn_generic_opt.h" void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const uint16_t ch_mult = conv_params->ch_mult; int filter_size = filter_wd * filter_ht * channels * ch_mult; int align_len = 16 - (filter_size & 15); int input_size = input_wd * input_ht * channels; int16_t 
*filter_data16 = scratch_buffer; int16_t *input_data16 = scratch_buffer + filter_size + align_len; if (scratch_buffer == NULL) { printf("esp_nn_depthwise_conv error! scratch_buffer not set!\n"); return; } if ((ch_mult == 1) && (channels % 8 == 0)) { if ((filter_wd == 3) && (filter_ht == 3)) { if ((channels % 16 == 0) && (pad_wd == 1) && (pad_ht == 1)) { /* process in 8 bits with s8 padded assembly */ int8_t *filter_aligned = (int8_t *) scratch_buffer; int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; memcpy(filter_aligned, filter_data, filter_size); int padded_input_size = (input_wd + 2*pad_wd) * (input_ht + 2*pad_ht) * channels; if (padded_input_size <= 40 * 1024) { /* Small enough — full padding, single assembly call */ esp_nn_aligned_s8_pad_with_value(input_data, input_padded, input_wd, input_ht, channels, -input_offset, pad_wd, pad_ht); esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + 2 * pad_wd, input_ht + 2 * pad_ht, channels, input_offset, stride_wd, stride_ht, filter_aligned, bias, out_data, out_wd, out_ht, out_offset, out_shift, out_mult, activation_min, activation_max); } else { /* Large input: row-tiled processing to reduce cache pressure. * Pad and process a strip of output rows at a time. 
*/ int padded_wd = input_wd + 2 * pad_wd; int8_t pad_val = (int8_t)(-input_offset); for (int out_y = 0; out_y < out_ht; out_y++) { int in_y_start = out_y * stride_ht; /* in padded coords (pad_ht already accounted) */ /* Pad filter_ht rows of input into scratch */ int8_t *tile = input_padded; for (int fy = 0; fy < filter_ht; fy++) { int src_y = in_y_start + fy - pad_ht; /* original input row */ if (src_y < 0 || src_y >= input_ht) { /* Padding row */ memset(tile, pad_val, padded_wd * channels); } else { /* Left pad */ memset(tile, pad_val, pad_wd * channels); /* Copy input row */ memcpy(tile + pad_wd * channels, input_data + src_y * input_wd * channels, input_wd * channels); /* Right pad */ memset(tile + (pad_wd + input_wd) * channels, pad_val, pad_wd * channels); } tile += padded_wd * channels; } /* Process one output row */ esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3( input_padded, padded_wd, filter_ht, channels, input_offset, stride_wd, 1, filter_aligned, bias, out_data + out_y * out_wd * channels, out_wd, 1, out_offset, out_shift, out_mult, activation_min, activation_max); } } } else if ((channels % 16 == 0) && (pad_wd == 0) && (pad_ht == 0)) { /* process in 8 bits */ int8_t *filter_aligned = (int8_t *) scratch_buffer; int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; // check if we need to pad additionally int pad_right = (out_wd * stride_wd + filter_wd - 1) - input_wd; int pad_bottom = (out_ht * stride_ht + filter_ht - 1) - input_ht; if (pad_right || pad_bottom) { // pad right and bottom esp_nn_aligned_s8_pad_end_with_value(input_data, input_padded, input_wd, input_ht, channels, -input_offset, pad_right, pad_bottom); } else { input_padded = (int8_t *) input_data; } memcpy(filter_aligned, filter_data, filter_size); esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + pad_right, input_ht + pad_bottom, channels, input_offset, stride_wd, stride_ht, filter_aligned, bias, out_data, out_wd, out_ht, out_offset, 
out_shift, out_mult, activation_min, activation_max); } else if (channels >= 12) { /* channels % 8 == 0, not % 16, channels >= 12: pad to 16 is worthwhile * (overhead <= 33%). For ch=8, padding to 16 doubles data — use s16 instead */ int new_ch = (channels + 15) & ~15; int8_t pad_val = (int8_t)(-input_offset); /* Pad filter: 3x3 x new_ch */ int new_filter_size = 9 * new_ch; int8_t *filter_padded = (int8_t *) scratch_buffer; memset(filter_padded, 0, new_filter_size); for (int f = 0; f < 9; f++) { memcpy(filter_padded + f * new_ch, filter_data + f * channels, channels); } /* Pad input: (input_wd + 2*pad) x (input_ht + 2*pad) x new_ch */ int new_input_wd = input_wd + 2 * pad_wd; int new_input_ht = input_ht + 2 * pad_ht; int pad_right = max(0, (out_wd * stride_wd + 3 - 1) - (input_wd + 2 * pad_wd)); int pad_bottom = max(0, (out_ht * stride_ht + 3 - 1) - (input_ht + 2 * pad_ht)); new_input_wd += pad_right; new_input_ht += pad_bottom; int8_t *input_padded = filter_padded + new_filter_size + 16; int padded_input_total = new_input_wd * new_input_ht * new_ch; /* Fill entire padded input with pad_val first */ memset(input_padded, pad_val, padded_input_total); /* Copy actual input data into correct positions */ for (int y = 0; y < input_ht; y++) { for (int x = 0; x < input_wd; x++) { int dst_y = y + pad_ht; int dst_x = x + pad_wd; memcpy(input_padded + (dst_y * new_input_wd + dst_x) * new_ch, input_data + (y * input_wd + x) * channels, channels); } } /* Padded output buffer */ int8_t *out_padded = input_padded + padded_input_total; /* Pad quant arrays */ int32_t shift_pad[new_ch], mult_pad[new_ch], bias_pad[new_ch]; memcpy(shift_pad, out_shift, channels * sizeof(int32_t)); memcpy(mult_pad, out_mult, channels * sizeof(int32_t)); memset(shift_pad + channels, 0, (new_ch - channels) * sizeof(int32_t)); memset(mult_pad + channels, 0, (new_ch - channels) * sizeof(int32_t)); if (bias) { memcpy(bias_pad, bias, channels * sizeof(int32_t)); memset(bias_pad + channels, 0, (new_ch - 
channels) * sizeof(int32_t)); } esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3( input_padded, new_input_wd, new_input_ht, new_ch, input_offset, stride_wd, stride_ht, filter_padded, bias ? bias_pad : NULL, out_padded, out_wd, out_ht, out_offset, shift_pad, mult_pad, activation_min, activation_max); /* Compact output: strip padding channels */ for (int pos = 0; pos < out_wd * out_ht; pos++) { memcpy(out_data + pos * channels, out_padded + pos * new_ch, channels); } } else { /* ch < 12 (e.g., ch=8), 3x3: use s16 mult1 3x3 path */ esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(input_data16, input_wd, input_ht, channels, pad_wd, pad_ht, stride_wd, stride_ht, filter_data16, bias, out_data, out_wd, out_ht, out_offset, out_shift, out_mult, activation_min, activation_max); } } else { // all other ch_mult == 1, channels % 8 == 0 /* Tiled s16 processing: convert filter once, process input in row strips * to keep working set within DCache (64KB) */ esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); /* Check if full conversion fits comfortably in cache */ int total_s16_size = 2 * (filter_size + input_size); if (total_s16_size <= 48 * 1024) { /* Small enough — full conversion is fine */ esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); esp_nn_depthwise_conv_s16_mult1_esp32s3(input_data16, input_wd, input_ht, channels, pad_wd, pad_ht, stride_wd, stride_ht, filter_data16, filter_wd, filter_ht, bias, out_data, out_wd, out_ht, out_offset, out_shift, out_mult, activation_min, activation_max); } else { /* Large input: process in row tiles to reduce cache pressure. * Convert only the input rows needed for each output row strip. 
*/ int16_t *tile_buf = input_data16; /* reuse scratch for tile */ for (int out_row = 0; out_row < out_ht; out_row++) { int in_row_start = out_row * stride_ht - pad_ht; int in_row_end = in_row_start + filter_ht; /* Fill tile: pad rows that are outside input bounds */ int16_t *dst = tile_buf; for (int r = in_row_start; r < in_row_end; r++) { if (r < 0 || r >= input_ht) { /* Padding row: fill with input_offset */ for (int i = 0; i < input_wd * channels; i++) { dst[i] = (int16_t)input_offset; } } else { /* Valid row: convert s8 to s16 with offset */ const int8_t *src = input_data + r * input_wd * channels; for (int i = 0; i < input_wd * channels; i++) { dst[i] = (int16_t)src[i] + (int16_t)input_offset; } } dst += input_wd * channels; } /* Process one output row */ esp_nn_depthwise_conv_s16_mult1_esp32s3(tile_buf, input_wd, filter_ht, channels, pad_wd, 0, stride_wd, 1, filter_data16, filter_wd, filter_ht, bias, out_data + out_row * out_wd * channels, out_wd, 1, out_offset, out_shift, out_mult, activation_min, activation_max); } } } } else if ((ch_mult == 1) && (channels > 3)) { // For ch_mult=1, pad channels to multiple of 8 for optimized mult1 function int padded_channels = (channels + 7) & ~7; // Round up to multiple of 8 int padded_input_size = input_wd * input_ht * padded_channels; int padded_filter_size = filter_wd * filter_ht * padded_channels; // Use scratch buffer for padded data (ensure 16-byte alignment for SIMD) int16_t *padded_filter_data16 = (int16_t*)scratch_buffer; size_t input_start = (size_t)(padded_filter_data16 + padded_filter_size); int16_t *padded_input_data16 = (int16_t*)((input_start + 15) & ~15); size_t out_start = (size_t)(padded_input_data16 + padded_input_size); int8_t *padded_out_data = (int8_t*)((out_start + 15) & ~15); // Create padded parameter arrays size_t bias_start = (size_t)(padded_out_data + out_wd * out_ht * padded_channels); int32_t *padded_bias = (int32_t*)((bias_start + 15) & ~15); int32_t *padded_shift = padded_bias + 
padded_channels;
        int32_t *padded_mult = padded_shift + padded_channels;

        // Initialize padded parameters - copy valid values, set padded ones to safe defaults
        memset(padded_bias, 0, padded_channels * sizeof(int32_t));
        memset(padded_shift, 0, padded_channels * sizeof(int32_t));
        memset(padded_mult, 0, padded_channels * sizeof(int32_t));
        if (bias) {
            memcpy(padded_bias, bias, channels * sizeof(int32_t));
        }
        if (out_shift) {
            memcpy(padded_shift, out_shift, channels * sizeof(int32_t));
        }
        if (out_mult) {
            memcpy(padded_mult, out_mult, channels * sizeof(int32_t));
        }

        // Convert filter data to padded layout (zero out extra channels)
        memset(padded_filter_data16, 0, padded_filter_size * sizeof(int16_t));
        for (int c = 0; c < channels; c++) {
            for (int fy = 0; fy < filter_ht; fy++) {
                for (int fx = 0; fx < filter_wd; fx++) {
                    int orig_idx = (fy * filter_wd + fx) * channels + c;
                    int padded_idx = (fy * filter_wd + fx) * padded_channels + c;
                    padded_filter_data16[padded_idx] = (int16_t) filter_data[orig_idx];
                }
            }
        }

        // Convert input data to padded layout (zero out extra channels, apply offset)
        memset(padded_input_data16, 0, padded_input_size * sizeof(int16_t));
        for (int h = 0; h < input_ht; h++) {
            for (int w = 0; w < input_wd; w++) {
                for (int c = 0; c < channels; c++) {
                    int orig_idx = (h * input_wd + w) * channels + c;
                    int padded_idx = (h * input_wd + w) * padded_channels + c;
                    padded_input_data16[padded_idx] = (int16_t) input_data[orig_idx] + input_offset;
                }
            }
        }

        // Call mult1 with padded data
        esp_nn_depthwise_conv_s16_mult1_esp32s3(padded_input_data16, input_wd, input_ht,
                                                padded_channels, pad_wd, pad_ht,
                                                stride_wd, stride_ht,
                                                padded_filter_data16, filter_wd, filter_ht,
                                                padded_bias, padded_out_data, out_wd, out_ht,
                                                out_offset, padded_shift, padded_mult,
                                                activation_min, activation_max);

        // Copy back only valid channels
        for (int h = 0; h < out_ht; h++) {
            for (int w = 0; w < out_wd; w++) {
                for (int c = 0; c < channels; c++) {
                    int out_idx = (h * out_wd + w) * channels + c;
                    int padded_idx = (h * out_wd + w) * padded_channels + c;
                    out_data[out_idx] = padded_out_data[padded_idx];
                }
            }
        }
    } else if (ch_mult % 8 == 0) {
        // Channel multiplier is optimized multiple - use direct s16 functions
        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);
        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);
        if (filter_wd == 3 && filter_ht == 3) {
            // dedicated 3x3 kernel
            esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(input_data16, input_wd, input_ht,
                                                        channels, pad_wd, pad_ht,
                                                        stride_wd, stride_ht, ch_mult,
                                                        filter_data16, bias,
                                                        out_data, out_wd, out_ht,
                                                        out_offset, out_shift, out_mult,
                                                        activation_min, activation_max);
        } else {
            // generic filter size kernel
            esp_nn_depthwise_conv_s16_mult8_esp32s3(input_data16, input_wd, input_ht,
                                                    channels, pad_wd, pad_ht,
                                                    stride_wd, stride_ht, ch_mult,
                                                    filter_data16, filter_wd, filter_ht,
                                                    bias, out_data, out_wd, out_ht,
                                                    out_offset, out_shift, out_mult,
                                                    activation_min, activation_max);
        }
    } else if (ch_mult % 4 == 0) {
        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);
        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);
        esp_nn_depthwise_conv_s16_mult4_esp32s3(input_data16, input_wd, input_ht,
                                                channels, pad_wd, pad_ht,
                                                stride_wd, stride_ht, ch_mult,
                                                filter_data16, filter_wd, filter_ht,
                                                bias, out_data, out_wd, out_ht,
                                                out_offset, out_shift, out_mult,
                                                activation_min, activation_max);
    } else {
        // No aligned fast path applies: fall back to the generic optimized C kernel
        esp_nn_depthwise_conv_s8_opt(input_dims, input_data, filter_dims, filter_data,
                                     bias, output_dims, out_data, conv_params, quant_data);
    }
}

================================================ FILE: src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .literal_position // processes multiple of 16 channels // already padded version. no additional padding needed // simply keep sliding filter window by stride_size # Program Unit: esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3 .type esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3, @function .align 4 .global esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3 esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3: # 0xccc # qacc_scratch = 0 # gra_spill_temp_103 = 40 // stride_wd*channels # gra_spill_temp_104 = 44 // bias_align # gra_spill_temp_107 = 48 // input_offset # gra_spill_temp_105 = 52 // out_mult_ptr # gra_spill_temp_106 = 56 // out_shift_ptr # gra_spill_temp_108 = 60 // ch_idx # gra_spill_temp_109 = 64 // out_ch # gra_spill_temp_110 = 68 // bias_ptr # gra_spill_temp_111 = 72 // 2 * (input_wd * channels) # gra_spill_temp_112 = 76 // input_data # gra_spill_temp_118 = 96 # gra_spill_temp_119 = 100 # gra_spill_temp_120 = 104 # gra_spill_temp_121 = 108 # gra_spill_temp_113 = 112 // input_wd * channels # gra_spill_temp_114 = 116 // input_wd # gra_spill_temp_130 = 120 # gra_spill_temp_141 = 0 # gra_spill_temp_120 = 16 # gra_spill_temp_137 = 80 // offset+bias factor # gra_spill_temp_134 = 128 //256-128 # gra_spill_temp_135 = 144 //256-112 # gra_spill_temp_133 = 160 //256-96 # gra_spill_temp_132 = 176 //256-80 // registers: // a2: input_data // a3: input_wd // a4: input_ht // a5: channels // a6: input_offset // a7: stride_wd // on stack: // 320: stride_ht // 324: filter_data // 328: *bias // 332: *out_data // 336: out_wd // 340: out_ht // 344: out_offset 
// 348: *out_shift // 352: *out_mult // 356: activation_min // 360: activation_max entry a1,320 # mul16u a7,a7,a5 s32i a3,a1,116 # [0] gra_spill_temp_114, input_wd s32i a6,a1,48 # [1] gra_spill_temp_107, input_offset s32i a7,a1,40 # gra_spill_temp_103, stride_wd*channels addi a8,a5,-15 # [2] s32i a2,a1,76 # [3] gra_spill_temp_112, input_data l32i a9,a1,328 # [4] id:664 bias+0x0 mov.n a2,a5 # [5] s32i a8,a1,64 # [7] gra_spill_temp_109 s32i a9,a1,68 # [8] gra_spill_temp_110, bias_ptr blti a8,1,.Lt_7_4610 # [9] l32i a12,a1,348 # [4] id:666 out_shift+0x0 mul16u a15,a3,a5 # [1] movi.n a9,0 # [13] s32i a12,a1,56 # [9] gra_spill_temp_106 // out_shift_ptr s32i a9,a1,60 # [14] gra_spill_temp_108, ch_idx s32i a15,a1,112 # [12] gra_spill_temp_113, input_wd*channels l32i a9,a1,352 # [24] id:665 out_mult+0x0 slli a15,a15,1 # [15] s32i a15,a1,72 # [23] gra_spill_temp_111, 2 * (input_wd * channels) s32i a9,a1,52 # [25] gra_spill_temp_105, out_mult_ptr // outer most out_ch loop .Lt_7_5122: # 0xd57 l32i a13,a1,324 # [1] filter_data l32i a6,a1,60 # [2] gra_spill_temp_108, ch_idx l32i a9,a1,48 # [0] gra_spill_temp_107, input_offset ee.zero.q q2 # [3] add.n a13,a6,a13 # [4] s32i a13,a1,108 # [5] gra_spill_temp_121 // multiply accumulate filter points ee.vld.128.xp q1,a13,a2 # [6] id:673 ee.vld.128.xp q3,a13,a2 # [7] id:674 ee.vcmp.lt.s8 q0,q1,q2 # [8] ee.vcmp.lt.s8 q4,q3,q2 # [9] ee.vzip.8 q1,q0 # [10] ee.vzip.8 q3,q4 # [11] ee.vadds.s16 q0,q0,q4 # [12] ee.vld.128.xp q4,a13,a2 # [13] id:675 ee.vadds.s16 q1,q1,q3 # [14] ee.vcmp.lt.s8 q3,q4,q2 # [15] ee.vzip.8 q4,q3 # [16] ee.vadds.s16 q1,q1,q4 # [17] ee.vld.128.xp q4,a13,a2 # [18] id:676 ee.vadds.s16 q0,q0,q3 # [19] ee.vcmp.lt.s8 q3,q4,q2 # [20] ee.vzip.8 q4,q3 # [21] ee.vadds.s16 q0,q0,q3 # [22] ee.vld.128.xp q3,a13,a2 # [23] id:677 ee.vadds.s16 q1,q1,q4 # [24] ee.vcmp.lt.s8 q4,q3,q2 # [25] ee.vzip.8 q3,q4 # [26] ee.vadds.s16 q1,q1,q3 # [27] ee.vld.128.xp q3,a13,a2 # [28] id:678 ee.vadds.s16 q0,q0,q4 # [29] ee.vcmp.lt.s8 q4,q3,q2 # 
[30] ee.vzip.8 q3,q4 # [31] ee.vadds.s16 q0,q0,q4 # [32] ee.vld.128.xp q4,a13,a2 # [33] id:679 ee.vadds.s16 q1,q1,q3 # [34] ee.vcmp.lt.s8 q3,q4,q2 # [35] ee.vzip.8 q4,q3 # [36] ee.vadds.s16 q1,q1,q4 # [37] ee.vld.128.xp q4,a13,a2 # [38] id:680 ee.vadds.s16 q0,q0,q3 # [39] ee.vcmp.lt.s8 q3,q4,q2 # [40] ee.vzip.8 q4,q3 # [41] ee.vadds.s16 q0,q0,q3 # [42] ee.vld.128.xp q3,a13,a2 # [44] id:681 ee.vadds.s16 q1,q1,q4 # [43] ee.vcmp.lt.s8 q2,q3,q2 # [47] ee.vzip.8 q3,q2 # [48] ee.vadds.s16 q0,q0,q2 # [49] ee.vadds.s16 q1,q1,q3 # [50] ee.movi.32.a q1,a15,1 # [51] ee.movi.32.a q1,a8,3 # [52] ee.movi.32.a q0,a10,3 # [54] ee.movi.32.a q0,a13,1 # [55] srai a11,a10,16 # [56] srai a12,a8,16 # [57] mull a12,a9,a12 # [58] mull a11,a9,a11 # [59] sext a8,a8,15 # [328] sext a10,a10,15 # [61] srai a14,a13,16 # [62] mull a14,a9,a14 # [63] mull a10,a9,a10 # [64] mull a8,a9,a8 # [65] sext a13,a13,15 # [66] mull a13,a9,a13 # [67] ee.movi.32.q q3,a11,3 # [68] ee.movi.32.q q4,a12,3 # [69] ee.movi.32.q q4,a8,2 # [70] ee.movi.32.q q3,a10,2 # [71] ee.movi.32.a q1,a11,2 # [72] srai a12,a11,16 # [74] srai a8,a15,16 # [75] mull a8,a9,a8 # [76] mull a12,a9,a12 # [77] sext a15,a15,15 # [78] sext a11,a11,15 # [79] mull a11,a9,a11 # [80] mull a15,a9,a15 # [81] ee.movi.32.q q4,a12,1 # [82] ee.movi.32.q q1,a8,3 # [83] ee.movi.32.q q1,a15,2 # [84] ee.movi.32.q q4,a11,0 # [85] ee.movi.32.a q0,a15,2 # [86] ee.movi.32.q q0,a14,3 # [88] ee.movi.32.q q0,a13,2 # [91] srai a8,a15,16 # [89] mull a8,a9,a8 # [90] sext a15,a15,15 # [92] mull a15,a9,a15 # [93] # 526 MUL_IN_OFFSET_EXPAND(q_sum2, 0, q_sum2, 0); ee.movi.32.a q0,a11,0 # [94] srai a13,a11,16 # [95] ee.movi.32.q q3,a8,1 # [96] ee.movi.32.q q3,a15,0 # [100] sext a11,a11,15 # [97] mull a13,a9,a13 # [98] l32i a8,a1,332 # [99] ee.movi.32.a q1,a10,0 # [103] ee.movi.32.q q0,a13,1 # [100] srai a12,a10,16 # [105] sext a10,a10,15 # [106] mull a12,a9,a12 # [107] mull a10,a9,a10 # [108] mull a9,a9,a11 # [109] ee.movi.32.q q1,a12,1 # [110] ee.movi.32.q q1,a10,0 # 
[111] l32i a11,a1,328 // load bias add.n a6,a6,a8 # [102] ee.movi.32.q q0,a9,0 # [113] beqz.n a11,.Lt_7_5378 # [114] // add bias l32i a8,a1,68 # [0] gra_spill_temp_110, bias_ptr extui a11,a11,0,4 # [2] // bias_align wur.sar_byte a11 # [4] ee.vld.128.ip q5,a8,16 # [5] id:683 ee.vld.128.ip q6,a8,16 # [6] id:684 ee.vld.128.ip q7,a8,16 # [7] id:685 addmi a10,a1,256 # [2] ee.src.q.ld.ip q2,a8,16,q5,q6 # [9] ee.vadds.s32 q1,q1,q5 # [12] ee.src.q.ld.ip q5,a8,0,q6,q7 # [13] s32i a8,a1,68 # [11] gra_spill_temp_110, bias_ptr ee.vadds.s32 q4,q4,q6 # [18] ee.src.q q7,q7,q2 # [9] ee.src.q q2,q2,q5 # [13] ee.vadds.s32 q0,q0,q7 # [12] ee.vadds.s32 q3,q3,q2 # [12] .Lt_7_5378: # 0xeef // store offset+bias factor (q1,q4,q0,q3) st.qr q4,a10,-112 # [17] gra_spill_temp_135-256 st.qr q3,a10,-128 # [21] gra_spill_temp_134-256 st.qr q1,a10,-96 # [7] gra_spill_temp_133-256 st.qr q0,a10,-80 # [8] gra_spill_temp_132-256 // prepare height loop movi.n a15,0 # [1] movi.n a8,0 # [2] movi.n a9,0 # [3] s32i a9,a1,100 # [4] gra_spill_temp_119 s32i a8,a1,104 # [5] gra_spill_temp_120 s32i a15,a1,96 # [6] gra_spill_temp_118 // height loop .Lt_7_6402: # 0xf0c l32i a4,a1,104 # [2] gra_spill_temp_120 // out_y * (input_wd * stride_ht) * channels) l32i a8,a1,100 # [3] gra_spill_temp_119 // initialised to 0 before height loop l32i a5,a1,76 # [1] gra_spill_temp_112, input_data l32i a3,a1,60 # [0] gra_spill_temp_108, ch_idx l32i a7,a1,112 # [1] gra_spill_temp_113, input_wd*channels l32i a10,a1,336 # [0] out_wd add.n a4,a4,a5 # [4] // input_data + (out_y * stride_ht) * input_wd * channels mov.n a5,a8 # [5] // index add.n a3,a3,a4 # [6] // input_row0 l32i a4,a1,72 # [9] gra_spill_temp_111, 2 * (input_wd * channels) add.n a7,a7,a3 # [7] // input_row1 = (input_wd * channels) add.n a8,a8,a10 # [8] s32i a8,a1,120 # [10] gra_spill_temp_130 add.n a4,a4,a3 # [11] // input_row2 // width loop .Lt_7_7170: # 0xf32 l32i a9,a1,108 # [3] gra_spill_temp_121, filter_ptr ee.zero.qacc # [2] mov.n a12,a3 # [4] mov.n a11,a7 # [1] 
mov.n a10,a4 # [0] ee.vld.128.xp q0,a12,a2 # [5] id:693 ee.vld.128.xp q6,a12,a2 # [6] id:695 ee.vld.128.xp q1,a9,a2 # [7] id:694 ee.vld.128.xp q7,a9,a2 # [8] id:696 ee.vld.128.xp q5,a9,a2 # [9] id:698 ee.vld.128.xp q3,a9,a2 # [10] id:700 ee.vmulas.s8.qacc.ld.xp q4,a12,a2,q0,q1 # [11] id:697 ee.vmulas.s8.qacc.ld.xp q2,a11,a2,q6,q7 # [13] id:699 ee.vld.128.xp q1,a9,a2 # [14] id:702 ee.vmulas.s8.qacc.ld.xp q0,a11,a2,q4,q5 # [15] id:701 ee.vmulas.s8.qacc.ld.xp q6,a11,a2,q2,q3 # [16] id:703 ee.vld.128.xp q7,a9,a2 # [17] id:704 ee.vld.128.xp q3,a9,a2 # [18] id:706 ee.vmulas.s8.qacc.ld.xp q0,a10,a2,q0,q1 # [19] id:705 ee.vmulas.s8.qacc.ld.xp q1,a10,a2,q6,q7 # [20] id:707 ee.vmulas.s8.qacc.ld.xp q4,a10,a2,q0,q3 # [21] id:709 ee.vld.128.xp q6,a9,a2 # [22] id:708 ee.vld.128.xp q5,a9,a2 # [23] id:710 ee.vmulas.s8.qacc q1,q6 # [24] ee.vmulas.s8.qacc q4,q5 # [25] // extract data mov a12,a1 //// scratch ee.st.qacc_l.l.128.ip a12,16 # [27] id:713 ee.st.qacc_l.h.32.ip a12,-16 # [28] id:714 l32i.n a9,a1,8 # [29] qacc_scratch+8 l32i.n a11,a1,4 # [30] qacc_scratch+4 l32i.n a15,a1,0 # [31] qacc_scratch slli a14,a11,24 # [32] sext a8,a15,19 # [33] slli a10,a9,16 # [34] slli a13,a11,4 # [35] extui a9,a9,16,16 # [36] srai a13,a13,12 # [37] extui a15,a15,20,12 # [39] srai a14,a14,12 # [40] srai a10,a10,12 # [41] extui a11,a11,28,4 # [42] or a10,a10,a11 # [43] or a14,a14,a15 # [44] // insert to q0 ee.movi.32.q q0,a8,0 # [38] ee.movi.32.q q0,a14,1 # [45] ee.movi.32.q q0,a13,2 # [48] ee.movi.32.q q0,a10,3 # [49] l32i.n a11,a1,16 # [46] qacc_scratch+16 l32i.n a14,a1,12 # [47] qacc_scratch+12 slli a13,a11,20 # [50] ee.st.qacc_h.l.128.ip a12,16 # [51] id:720 ee.st.qacc_h.h.32.ip a12,-16 # [55] id:721 srai a11,a11,12 # [52] srai a13,a13,12 # [53] slli a8,a14,28 # [54] slli a15,a14,8 # [56] srai a15,a15,12 # [57] srai a8,a8,12 # [59] l32i.n a12,a1,8 # [328] qacc_scratch+8 or a8,a8,a9 # [61] extui a14,a14,24,8 # [62] l32i.n a9,a1,0 # [63] qacc_scratch or a13,a13,a14 # [64] //insert to q3 
ee.movi.32.q q3,a8,0 # [65] ee.movi.32.q q3,a15,1 # [67] ee.movi.32.q q3,a13,2 # [69] ee.movi.32.q q3,a11,3 # [70] l32i.n a14,a1,4 # [66] qacc_scratch+4 sext a10,a9,19 # [68] extui a9,a9,20,12 # [72] slli a13,a12,16 # [73] slli a8,a14,24 # [74] extui a12,a12,16,16 # [75] srai a13,a13,12 # [76] srai a8,a8,12 # [77] slli a15,a14,4 # [78] srai a15,a15,12 # [79] or a8,a8,a9 # [80] extui a14,a14,28,4 # [81] l32i.n a9,a1,12 # [82] qacc_scratch+12 or a13,a13,a14 # [83] // insert to q1 ee.movi.32.q q1,a10,0 # [71] ee.movi.32.q q1,a8,1 # [84] ee.movi.32.q q1,a15,2 # [85] ee.movi.32.q q1,a13,3 # [88] // load in_offset+bias factor addmi a14,a1,256 # [86] ld.qr q7,a14,-128 # [87] gra_spill_temp_134-256 ld.qr q4,a14,-112 # [89] gra_spill_temp_135-256 l32i.n a15,a1,16 # [90] qacc_scratch+16 ld.qr q2,a14,-96 # [91] gra_spill_temp_133-256 slli a11,a9,28 # [92] slli a10,a9,8 # [93] srai a10,a10,12 # [94] srai a11,a11,12 # [95] extui a9,a9,24,8 # [96] or a11,a11,a12 # [97] ee.vadds.s32 q0,q0,q2 # [98] slli a8,a15,20 # [99] ee.vadds.s32 q3,q3,q4 # [100] st.qr q3,a1,80 # [101] gra_spill_temp_137-256 srai a15,a15,12 # [102] ld.qr q2,a14,-80 # [103] gra_spill_temp_132-256 srai a8,a8,12 # [105] or a8,a8,a9 # [108] // insert to q6 ee.movi.32.q q6,a11,0 # [100] ee.movi.32.q q6,a10,1 # [107] ee.movi.32.q q6,a8,2 # [112] ee.movi.32.q q6,a15,3 # [113] ee.vadds.s32 q1,q1,q2 # [110] ee.vadds.s32 q6,q6,q7 # [114] st.qr q1,a1,16 # [111] gra_spill_temp_120 s32i.n a7,a1,32 # [0] // tmp s32i.n a6,a1,36 # [106] // tmp l32i a7,a1,52 # [109] gra_spill_temp_105, out_mult_ptr l32i a6,a1,56 # [106] gra_spill_temp_106, out_shift_ptr addi.n a10,a7,0 addi.n a11,a6,0 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [116] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 mv.qr q5,q0 ld.qr q0,a1,80 # [4] gra_spill_temp_137-256 addi.n a10,a7,16 addi.n a11,a6,16 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [5] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 mv.qr q4,q0 ld.qr q0,a1,16 # [5] 
gra_spill_temp_120 addi.n a10,a7,32 addi.n a11,a6,32 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [6] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 st.qr q0,a1,0 # [3] gra_spill_temp_141 mv.qr q0,q6 addi.n a10,a7,48 addi.n a11,a6,48 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [6] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 l32i.n a6,a1,36 # [106] // tmp l32i.n a7,a1,32 # [0] // tmp l32i a15,a1,40 # gra_spill_temp_103, stride_wd * channels l32i a11,a1,120 # [3] gra_spill_temp_130 add.n a3,a3,a15 # [0] add.n a4,a4,a15 # [1] add.n a7,a7,a15 # [2] addi.n a5,a5,1 # [4] // add offset, apply activation and store addmi a13,a1,256 # [8] ld.qr q3,a1,0 # [10] gra_spill_temp_141 mv.qr q2,q5 addi a8,a13,88 # [14] addi a9,a13,100 # [15] addi a15,a13,104 # [13] ee.vldbc.32 q6,a9 # [17] id:723 activation_min ee.vldbc.32 q1,a8 # [18] id:722 out_offset ee.vldbc.32 q7,a15 # [19] id:724 activation_max ee.vadds.s32 q4,q4,q1 # [20] ee.vadds.s32 q2,q2,q1 # [21] ee.vadds.s32 q5,q0,q1 # [22] ee.vadds.s32 q3,q3,q1 # [23] ee.vmin.s32 q3,q3,q7 # [24] ee.vmin.s32 q5,q5,q7 # [25] ee.vmin.s32 q2,q2,q7 # [26] ee.vmin.s32 q4,q4,q7 # [27] ee.vmax.s32 q4,q4,q6 # [28] ee.vmax.s32 q2,q2,q6 # [29] ee.vmax.s32 q5,q5,q6 # [30] ee.vmax.s32 q3,q3,q6 # [31] ee.vunzip.16 q3,q5 # [32] ee.vunzip.16 q2,q4 # [33] ee.vunzip.8 q2,q3 # [34] ee.vst.128.xp q2,a6,a2 # [35] id:725 bne a5,a11,.Lt_7_7170 # [36] .Lt_7_6658: # 0x112f # Part of loop body line 548, head labeled .Lt_7_6402 l32i a15,a1,112 # [3] gra_spill_temp_113, input_wd*channels l32i a10,a1,320 # gra_spill_temp_103 l32i a13,a1,340 # [0] // out_ht l32i a9,a1,116 # [1] gra_spill_temp_114, input_wd l32i a12,a1,96 # [4] gra_spill_temp_118 mull a15,a10,a15 # // (input_wd * stride_ht) * channels l32i a14,a1,104 # [5] gra_spill_temp_120 l32i a8,a1,100 # [2] gra_spill_temp_119 addi.n a12,a12,1 # [6] s32i a12,a1,96 # [7] gra_spill_temp_118 add.n a14,a14,a15 # [8] add.n a8,a8,a9 # [9] s32i a8,a1,100 # [10] gra_spill_temp_119 s32i a14,a1,104 
# [11] gra_spill_temp_120, (input_wd * stride_wd) * channels bne a12,a13,.Lt_7_6402 # [13] // iterate over height loop # Part of loop body line 348, head labeled .Lt_7_5122 l32i a11,a1,56 # [6] gra_spill_temp_106 // out_shift_ptr l32i a15,a1,52 # [2] gra_spill_temp_105, out_mult_ptr l32i a10,a1,60 # [24] gra_spill_temp_108, ch_idx addi a11,a11,64 # [8] addi a15,a15,64 # [13] s32i a11,a1,56 # [23] gra_spill_temp_106 s32i a15,a1,52 # [18] gra_spill_temp_105, out_mult_ptr l32i a11,a1,64 # [25] gra_spill_temp_109 addi a10,a10,16 # [26] s32i a10,a1,60 # [27] gra_spill_temp_108, ch_idx blt a10,a11,.Lt_7_5122 # [28] // iterate over outer most out_ch loop .Lt_7_4610: # 0x11ad retw.n # [0] .size esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3, . - esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3 ================================================ FILE: src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S ================================================ // // SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD // // SPDX-License-Identifier: Apache-2.0 // // // s8 dot product for FC with 2x loop unrolling and QUP for unaligned filter. // Pattern adapted from esp-dsp dspi_dotprod_s8_aes3.S. // Input must be 16-byte aligned. Filter can be unaligned. 
//
.text
.align 4
.type esp_nn_fc_s8_mac16_esp32s3, @function
.align 4
.global esp_nn_fc_s8_mac16_esp32s3

// a2: input_data (16-byte aligned)
// a3: filter_data (may be unaligned)
// a4: row_len_div16 (>= 1)
// Returns: int32_t dot product in a2
//
// NOTE(review): for even row_len_div16 >= 2 the pipelined loop still
// pre-loads one 16-byte chunk past the end of both buffers (the loads are
// discarded, only the MACs count). Caller must guarantee those bytes are
// readable — TODO confirm buffer padding at call sites.
esp_nn_fc_s8_mac16_esp32s3:
    entry a1, 32
    ee.zero.accx
    beqz a4, .Ldone

    // Prime: first unaligned filter load (sets SAR_BYTE)
    ee.ld.128.usar.ip q0, a3, 16            // filter chunk 0

    // Check if we can do 2x unrolled (need >= 2 iterations)
    srai a5, a4, 1                          // a5 = row_len_div16 / 2
    beqz a5, .Lsingle                       // exactly one chunk total

    // Load first input + filter pair for unrolled loop
    ee.vld.128.ip q1, a2, 16                // input[0]
    ee.ld.128.usar.ip q2, a3, 16            // filter chunk 1

    // 2x unrolled main loop: 2 MACs per iteration
    loopgtz a5, .Lloop2_end
    ee.src.q.qup q4, q0, q2                 // align filter[i]
    ee.vld.128.ip q3, a2, 16                // input[i+1]
    ee.vmulas.s8.accx q4, q1                // MAC filter[i] * input[i]
    ee.ld.128.usar.ip q0, a3, 16            // filter chunk[i+2]
    ee.src.q.qup q5, q2, q0                 // align filter[i+1]
    ee.vld.128.ip q1, a2, 16                // input[i+2] (primed for next)
    ee.vmulas.s8.accx q5, q3                // MAC filter[i+1] * input[i+1]
    ee.ld.128.usar.ip q2, a3, 16            // filter chunk[i+3]
.Lloop2_end:

    bbci a4, 0, .Ldone_mac                  // even count: everything consumed

    // Odd remainder after the 2x loop: the final input chunk is ALREADY in
    // q1 and its two straddling filter chunks are in q0/q2 (primed by the
    // last loop iteration). The previous code fell into .Lsingle here and
    // reloaded — consuming data one chunk PAST the remainder and aligning
    // filter chunks k and k+2 instead of k and k+1, i.e. a wrong result for
    // any odd row_len_div16 >= 3. Just align and MAC the primed registers.
    ee.src.q.qup q4, q0, q2                 // align last filter chunk
    ee.vmulas.s8.accx q4, q1                // MAC last chunk
    j .Ldone_mac

.Lsingle:
    // Exactly one 16-byte chunk: load input + second filter chunk, QUP, MAC
    ee.vld.128.ip q1, a2, 16                // input
    ee.ld.128.usar.ip q2, a3, 16            // next filter chunk
    ee.src.q.qup q4, q0, q2                 // align filter
    ee.vmulas.s8.accx q4, q1                // MAC

.Ldone_mac:
.Ldone:
    // 2-cycle gap before ACCX read
    movi.n a3, 0
    nop
    ee.srs.accx a2, a3, 0
    retw.n
    .size esp_nn_fc_s8_mac16_esp32s3, .
- esp_nn_fc_s8_mac16_esp32s3 ================================================ FILE: src/fully_connected/esp_nn_fully_connected_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max) { for (int32_t out_c = 0; out_c < out_channels; ++out_c) { int32_t result = 0; for (int32_t data_idx = 0; data_idx < row_len; data_idx++) { int32_t filter_index = row_len * out_c + data_idx; int32_t input_val = input_data[data_idx]; int32_t filter_val = filter_data[filter_index]; result += (filter_val + filter_offset) * (input_val + input_offset); } if (bias) { result += bias[out_c]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_c] = (int8_t) result; } } void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t 
*out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t* out_shift, const int32_t* out_mult, const int32_t activation_min, const int32_t activation_max) { for (int32_t out_c = 0; out_c < out_channels; ++out_c) { int32_t result = 0; for (int32_t data_idx = 0; data_idx < row_len; data_idx++) { int32_t filter_index = row_len * out_c + data_idx; int32_t input_val = input_data[data_idx]; int32_t filter_val = filter_data[filter_index]; result += (filter_val + filter_offset) * (input_val + input_offset); } if (bias) { result += bias[out_c]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_c], out_shift[out_c]); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_c] = (int8_t) result; } } ================================================ FILE: src/fully_connected/esp_nn_fully_connected_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * FC multi-path dispatcher for ESP32-S3. 
* - Pre-computes offset corrections per channel in C * - Dispatches to s8 MAC assembly (aligned, large row_len) or s16 assembly (fallback) */ #include #include #include #include /* Original s16 assembly (renamed) */ extern void esp_nn_fc_s16_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); extern void esp_nn_fc_per_ch_s16_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max); /* Shared s8 dot product from common — handles unaligned filter via USAR+QUP */ extern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16); void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max) { /* Quick check: s8 fast path only for aligned, row_len%16, no filter_offset */ if (__builtin_expect(filter_offset != 0 || row_len < 16 || ((uintptr_t)input_data & 15), 0)) { /* Fallback to original s16 assembly — tail call, no extra overhead */ esp_nn_fc_s16_esp32s3(input_data, input_offset, row_len, filter_data, filter_offset, bias, out_data, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); return; } { int32_t row_len_div16 = row_len 
>> 4; /* Pre-compute per-channel corrections once */ int32_t corrections[out_channels]; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t corr = 0; if (input_offset != 0) { int32_t filter_sum = 0; for (int i = 0; i < row_len; i++) { filter_sum += f_ptr[i]; } corr = filter_sum * input_offset; } if (bias) { corr += bias[ch]; } corrections[ch] = corr; } int32_t row_len_rem = row_len & 15; int32_t simd_bytes = row_len_div16 << 4; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t acc = esp_nn_dot_s8_unaligned_esp32s3(input_data, f_ptr, row_len_div16); /* Scalar remainder for non-multiple-of-16 row_len */ for (int i = 0; i < row_len_rem; i++) { acc += (int32_t)input_data[simd_bytes + i] * (int32_t)f_ptr[simd_bytes + i]; } acc += corrections[ch]; acc = esp_nn_multiply_by_quantized_mult(acc, out_mult, out_shift); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[ch] = (int8_t)acc; } } } void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max) { if (__builtin_expect(filter_offset != 0 || row_len < 16 || ((uintptr_t)input_data & 15), 0)) { esp_nn_fc_per_ch_s16_esp32s3(input_data, input_offset, row_len, filter_data, filter_offset, bias, out_data, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); return; } { int32_t row_len_div16 = row_len >> 4; /* Pre-compute per-channel corrections once */ int32_t corrections[out_channels]; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t corr = 0; if (input_offset != 0) { int32_t filter_sum = 
0; for (int i = 0; i < row_len; i++) { filter_sum += f_ptr[i]; } corr = filter_sum * input_offset; } if (bias) { corr += bias[ch]; } corrections[ch] = corr; } int32_t row_len_rem = row_len & 15; int32_t simd_bytes = row_len_div16 << 4; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t acc = esp_nn_dot_s8_unaligned_esp32s3(input_data, f_ptr, row_len_div16); for (int i = 0; i < row_len_rem; i++) { acc += (int32_t)input_data[simd_bytes + i] * (int32_t)f_ptr[simd_bytes + i]; } acc += corrections[ch]; acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[ch], out_shift[ch]); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[ch] = (int8_t)acc; } } } ================================================ FILE: src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S ================================================ // // SPDX-FileCopyrightText: 2025-2026 Espressif Systems (Shanghai) CO LTD // // SPDX-License-Identifier: Apache-2.0 // .text .align 4 .literal_position .literal .LC3_26_101, 1073741824 // nudge (1 << 30) # Program Unit: esp_nn_fc_per_ch_s16_esp32s3 .type esp_nn_fc_per_ch_s16_esp32s3, @function .align 4 .global esp_nn_fc_per_ch_s16_esp32s3 // a2: input_data // a3: input_offset // a4: row_len // a5: filter_data // a6: filter_offset // a7: bias // on stack: out_data // on stack: out_channels // on stack: out_offset // on stack: out_shift // on stack: out_mult // on stack: activation_min // on stack: activation_max esp_nn_fc_per_ch_s16_esp32s3: # 0x4 # qacc_scratch = 0 // 40, filter_offset // 44, input_offset # gra_spill_temp_7 = 48 # gra_spill_temp_2 = 60 # gra_spill_temp_3 = 64 # gra_spill_temp_4 = 68 # gra_spill_temp_5 = 72 # gra_spill_temp_6 = 76 # gra_spill_temp_8 = 80 # gra_spill_temp_9 = 84 entry a1,112 # s32i.n a5,a1,60 # [0] gra_spill_temp_2, filter_data s32i a7,a1,48 # [1] gra_spill_temp_7, bias s32i a6,a1,40 # [2] id:252 filter_offset+0x0 s32i a3,a1,44 # [3] 
id:251 input_offset+0x0 mov.n a13,a2 # [5] mov.n a12,a4 # [6] // out_channel loop l16ui a2,a1,116 # [7] id:255 out_channels+0x0 addi a4,a1,40 # [8] addi a8,a1,44 # [9] ee.vldbc.16 q5,a8 # [10] id:253 input_offset ee.vldbc.16 q6,a4 # [12] id:254 filter_offset beqz.n a2,.Lt_0_7938 # [13] ee.zero.q q7 # [0] srai a11,a12,3 # [2] l32i a8,a1,112 # [6] id:259 out_data+0x0 addi a9,a12,-7 # [7] s32i a9,a1,76 # [8] gra_spill_temp_6 s32i a8,a1,72 # [9] gra_spill_temp_5 s32i a11,a1,64 # [14] gra_spill_temp_3 slli a11,a11,3 # [16] s32i a11,a1,68 # [18] gra_spill_temp_4 movi.n a15,0 # [17] mov.n a14,a7 # [15] mov.n a11,a5 # [31] l32i a10,a1,124 # out_shift l32i a2,a1,128 # out_mult s32i a10,a1,80 # gra_spill_temp_8 s32i a2,a1,84 # gra_spill_temp_9 movi.n a10,0 # [32] mov.n a2,a11 # [33] .Lt_0_8450: # 0x12b l32i a9,a1,76 # [2] gra_spill_temp_6 extui a5,a11,0,3 # [34] ee.zero.accx slli a5,a5,1 # [3] bgei a9,0,.LBB6_esp_nn_fc_per_ch_s16_esp32s3 # [9] mov.n a5,a10 # [6] movi.n a2,0 # [0] j .Lt_0_8706 # [1] .LBB6_esp_nn_fc_per_ch_s16_esp32s3: # 0x147 wur.sar_byte a5 # [5] ee.vld.l.64.ip q4,a2,8 # [4] id:267 l32i a4,a1,64 # [0] gra_spill_temp_3 mov.n a3,a13 # [1] addx8 a5,a4,a10 # [2] ee.vcmp.lt.s8 q2,q4,q7 # [7] ee.vzip.8 q4,q2 # [8] loopgtz a4,.LBB45_esp_nn_fc_per_ch_s16_esp32s3 # [3] ee.vld.l.64.ip q0,a2,8 # [0*II+0] id:268 ee.vld.l.64.ip q1,a3,8 # [0*II+1] id:270 ee.vcmp.lt.s8 q2,q0,q7 # [0*II+2] ee.vcmp.lt.s8 q3,q1,q7 # [0*II+3] ee.vzip.8 q0,q2 # [0*II+4] ee.vzip.8 q1,q3 # [0*II+5] ee.vadds.s16 q1,q1,q5 # [0*II+6] ee.src.q.qup q2,q4,q0 # [0*II+7] ee.vadds.s16 q2,q2,q6 # [0*II+8] ee.vmulas.s16.accx q1,q2 # [0*II+9] .LBB45_esp_nn_fc_per_ch_s16_esp32s3: # 0x170 l32i a2,a1,68 # [0] gra_spill_temp_4 .Lt_0_8706: # 0x173 movi a9, 0 ee.srs.accx a6, a9, 0 bge a2,a12,.Lt_0_9730 # [38] // prepare remaining loop l32i a8,a1,44 # [0] id:251 input_offset+0x0 l32i a7,a1,40 # [1] id:252 filter_offset+0x0 sub a3,a12,a2 # [2] l32i.n a4,a1,60 # [3] gra_spill_temp_2 add.n a2,a2,a13 # [4] add.n 
a4,a4,a5 # [5] loopgtz a3,.LBB60_esp_nn_fc_per_ch_s16_esp32s3 # [6] // remaining c loop l8ui a3,a2,0 # [0*II+0] id:299 l8ui a5,a4,0 # [0*II+1] id:300 sext a3,a3,7 # [0*II+2] sext a5,a5,7 # [0*II+3] add.n a5,a5,a7 # [0*II+5] add.n a3,a3,a8 # [0*II+6] mull a3,a3,a5 # [0*II+7] addi.n a2,a2,1 # [0*II+8] addi.n a4,a4,1 # [0*II+4] add.n a6,a6,a3 # [0*II+9] .LBB60_esp_nn_fc_per_ch_s16_esp32s3: # 0x20f // add bias .Lt_0_9730: # 0x20f l32i a8,a1,48 # [0] gra_spill_temp_7, bias beqz.n a8,.Lt_0_10754 # [2], skip_bias l32i.n a9,a14,0 # [0] id:301 add.n a6,a6,a9 # [2] // apply quantization .Lt_0_10754: # 0x218 movi a4,0 l32i a5,a1,80 # [25] id:256 gra_spill_temp_8, out_shift+0x0 l32i a5,a5,0 max a2,a5,a4 // left_shift sub a5,a2,a5 // right_shift ssl a2 # [3] sll a6,a6 # [5] // x * (1 << left_shift) l32i a4,a1,84 # [2] gra_spill_temp_9 //out_mult l32r a3,.LC3_26_101 # [0] add.n a10,a10,a12 # [0] addi.n a14,a14,4 # [1] l32i a4,a4,0 add.n a11,a11,a12 # [6] // multiply add nudge and pick high32 ssai 31 mulsh a7,a4,a6 # [4] mull a4,a4,a6 # [5] mov.n a2,a11 # [27] add a4,a4,a3 saltu a8,a4,a3 add.n a7,a7,a8 src a3,a7,a4 // divide_by_power_of2_step blti a5,1,.skip_divide_by2 movi.n a8,1 # [28] addi a4,a5,-1 ssl a4 // load left_shift sll a8,a8 // to_add factor ( 1 << (exponent - 1)) extui a6,a3,31,1 # [33] sub a8,a8,a6 // modified to_add factor ( 1 << (exponent - 1) - (val < 0)) add a3,a3,a8 // val + to_add ssr a5 # [29] //load right_shift sra a3,a3 # [31] .skip_divide_by2: l32i a8,a1,120 # [41] out_offset l32i a7,a1,132 # [44] // activation_min l32i a4,a1,136 # [45] // activation_max add.n a8,a8,a3 # [46] // add out_offset l32i a6,a1,72 # [47] gra_spill_temp_5 l32i.n a3,a1,116 # [48] out_channels max a7,a7,a8 # [49] add.n a6,a15,a6 # [50] min a4,a4,a7 # [51] addi.n a15,a15,1 # [52] l32i a7,a1,84 # gra_spill_temp_9 l32i a8,a1,80 # gra_spill_temp_8 s8i a4,a6,0 # store output addi.n a7,a7,4 # increment mult pointer addi.n a8,a8,4 # increment mult pointer s32i a7,a1,84 # gra_spill_temp_9 
s32i a8,a1,80 # gra_spill_temp_8 bne a3,a15,.Lt_0_8450 # [55] .Lt_0_7938: # 0x25c retw.n # [0] .size esp_nn_fc_per_ch_s16_esp32s3, . - esp_nn_fc_per_ch_s16_esp32s3 ================================================ FILE: src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /** * Fully connected layer for s8 using ESP32-P4 PIE SIMD. * * Uses esp.vmulas.s8.xacc.ld.ip for fused 16-wide s8 MAC + load. * Pre-computes filter_sum * input_offset (like conv) so PIE path * works even with non-zero input_offset. * * Inner loop is software-pipelined: * iteration N: MAC(q0,q1) + load_next_input(q0) * load_next_filter(q1) <- hides MAC latency * counter_update <- independent of above */ /* Core dot product: PIE-accelerated when row_len >= 16 */ static inline __attribute__((always_inline)) int32_t fc_dot_s8_pie(const int8_t *input, const int8_t *filter, int32_t row_len) { int32_t result = 0; int32_t idx = 0; if (row_len >= 32) { /* Double-pumped: process 32 elements per iteration * Uses q0/q1 for first pair, q2/q3 for second pair */ asm volatile ( "esp.zero.xacc \n\t" "mv x30, %[in] \n\t" "mv x31, %[flt] \n\t" "li %[idx], 32 \n\t" "addi s7, %[len], -31 \n\t" /* Prime the pipeline: load first 32 bytes */ "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q2, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vld.128.ip q3, x31, 16 \n\t" "j 2f \n\t" "1: \n\t" /* MAC pair 1 + load next input[0:16] */ "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" /* Load next filter[0:16] while MAC settles */ "esp.vld.128.ip q1, x31, 16 \n\t" /* MAC pair 2 + load next input[16:32] */ "esp.vmulas.s8.xacc.ld.ip q2, x30, 16, q2, q3 \n\t" /* Load next filter[16:32] - interleaved with counter */ "esp.vld.128.ip q3, x31, 16 \n\t" "addi %[idx], %[idx], 32 \n\t" "2: \n\t" "blt %[idx], s7, 1b \n\t" /* Drain pipeline: final 
two MACs */
                "esp.vmulas.s8.xacc q0, q1 \n\t"
                "esp.vmulas.s8.xacc q2, q3 \n\t"
                /* Handle 16-element remainder if any (idx+16 <= row_len) */
                "addi s7, %[len], -15 \n\t"
                "bge %[idx], s7, 3f \n\t"
                "esp.vld.128.ip q0, x30, 16 \n\t"
                "esp.vld.128.ip q1, x31, 16 \n\t"
                "esp.vmulas.s8.xacc q0, q1 \n\t"
                "addi %[idx], %[idx], 16 \n\t"
                "3: \n\t"
                /* Read back the low word of the XACC accumulator */
                "esp.movx.r.xacc.l x30 \n\t"
                "mv %[res], x30 \n\t"
                : [idx] "+r"(idx), [res] "=r"(result)
                : [in] "r"(input), [flt] "r"(filter), [len] "r"(row_len)
                : "x30", "x31", "s7"
        );
    } else if (row_len >= 16) {
        /* Single-pumped for 16-31 element rows */
        asm volatile (
            "esp.zero.xacc \n\t"
            "mv x30, %[in] \n\t"
            "mv x31, %[flt] \n\t"
            "li %[idx], 16 \n\t"
            "addi s7, %[len], -15 \n\t"
            /* Prime q0/q1 with the first 16 input/filter bytes */
            "esp.vld.128.ip q0, x30, 16 \n\t"
            "esp.vld.128.ip q1, x31, 16 \n\t"
            "j 5f \n\t"
            "4: \n\t"
            /* Fused MAC + next input load; filter loaded separately */
            "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t"
            "esp.vld.128.ip q1, x31, 16 \n\t"
            "addi %[idx], %[idx], 16 \n\t"
            "5: \n\t"
            "blt %[idx], s7, 4b \n\t"
            /* MAC for the last primed pair */
            "esp.vmulas.s8.xacc q0, q1 \n\t"
            "esp.movx.r.xacc.l x30 \n\t"
            "mv %[res], x30 \n\t"
            : [idx] "+r"(idx), [res] "=r"(result)
            : [in] "r"(input), [flt] "r"(filter), [len] "r"(row_len)
            : "x30", "x31", "s7"
        );
    }
    /* Scalar remainder (fewer than 16 elements left, or row_len < 16) */
    for (; idx < row_len; idx++) {
        result += (int32_t)input[idx] * (int32_t)filter[idx];
    }
    return result;
}

/**
 * Fully-connected (dense) layer: int8 input/filter/output, per-tensor
 * quantization (single out_mult/out_shift for all output channels).
 *
 * For each output channel: dot(input_row, filter_row) + bias, then
 * requantize, add out_offset and clamp to [activation_min, activation_max].
 * When both offsets are zero the PIE SIMD dot product is used; otherwise a
 * scalar path applies input_offset/filter_offset per element.
 */
void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,
                                       const int32_t input_offset,
                                       const uint16_t row_len,
                                       const int8_t *filter_data,
                                       const int32_t filter_offset,
                                       const int32_t *bias,
                                       int8_t *out_data,
                                       const uint16_t out_channels,
                                       const int32_t out_offset,
                                       const int32_t out_shift,
                                       const int32_t out_mult,
                                       const int32_t activation_min,
                                       const int32_t activation_max)
{
    /* Enable PIE once for all channels */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {
        const int8_t *filter_row = filter_data + (int32_t)row_len * out_c;
        int32_t result;
        if (input_offset == 0 && filter_offset == 0) {
            /* Fast PIE path: pure s8 dot product */
            result = fc_dot_s8_pie(input_data, filter_row, row_len);
        } else {
            /* Scalar path with offsets */
            result = 0;
            for (int32_t i = 0; i < row_len; i++) {
                result += ((int32_t)input_data[i] + input_offset) *
                          ((int32_t)filter_row[i] + filter_offset);
            }
        }
        if (bias) {
            result += bias[out_c];
        }
        result = esp_nn_requantize(result, out_mult, out_shift);
        result += out_offset;
        result = max(result, activation_min);
        result = min(result, activation_max);
        out_data[out_c] = (int8_t) result;
    }
}

/**
 * Fully-connected layer: int8 input/filter/output, per-channel quantization.
 *
 * Identical to esp_nn_fully_connected_s8_esp32p4 except that out_mult and
 * out_shift are arrays indexed by output channel.
 */
void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,
                                              const int32_t input_offset,
                                              const uint16_t row_len,
                                              const int8_t *filter_data,
                                              const int32_t filter_offset,
                                              const int32_t *bias,
                                              int8_t *out_data,
                                              const uint16_t out_channels,
                                              const int32_t out_offset,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t activation_min,
                                              const int32_t activation_max)
{
    /* Enable PIE once for all channels */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {
        const int8_t *filter_row = filter_data + (int32_t)row_len * out_c;
        int32_t result;
        if (input_offset == 0 && filter_offset == 0) {
            result = fc_dot_s8_pie(input_data, filter_row, row_len);
        } else {
            result = 0;
            for (int32_t i = 0; i < row_len; i++) {
                result += ((int32_t)input_data[i] + input_offset) *
                          ((int32_t)filter_row[i] + filter_offset);
            }
        }
        if (bias) {
            result += bias[out_c];
        }
        /* Per-channel requantization parameters */
        result = esp_nn_requantize(result, out_mult[out_c], out_shift[out_c]);
        result += out_offset;
        result = max(result, activation_min);
        result = min(result, activation_max);
        out_data[out_c] = (int8_t) result;
    }
}


================================================
FILE: src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

	.text
	.align	4
	.literal_position
	.literal
.LC3_26_101, 1073741824 // nudge (1 << 30)

# Program Unit: esp_nn_fc_s16_esp32s3
	.type	esp_nn_fc_s16_esp32s3, @function
	.align	4
	.global	esp_nn_fc_s16_esp32s3

// a2: input_data
// a3: input_offset
// a4: row_len
// a5: filter_data
// a6: filter_offset
// a7: bias
// on stack: out_data
// on stack: out_channels
// on stack: out_offset
// on stack: out_shift
// on stack: out_mult
// on stack: activation_min
// on stack: activation_max

esp_nn_fc_s16_esp32s3: # 0x4
	# qacc_scratch = 0
	// 40, filter_offset
	// 44, input_offset
	# gra_spill_temp_7 = 48
	# gra_spill_temp_0 = 52
	# gra_spill_temp_1 = 56
	# gra_spill_temp_2 = 60
	# gra_spill_temp_3 = 64
	# gra_spill_temp_4 = 68
	# gra_spill_temp_5 = 72
	# gra_spill_temp_6 = 76

	entry	a1,112 #
	s32i.n	a5,a1,60 # [0] gra_spill_temp_2, filter_data
	s32i	a7,a1,48 # [1] gra_spill_temp_7, bias
	s32i	a6,a1,40 # [2] id:252 filter_offset+0x0
	s32i	a3,a1,44 # [3] id:251 input_offset+0x0
	mov.n	a13,a2 # [5]
	mov.n	a12,a4 # [6]

// out_channel loop
	l16ui	a2,a1,116 # [7] id:255 out_channels+0x0
	addi	a4,a1,40 # [8]
	addi	a8,a1,44 # [9]
	// broadcast input/filter offsets into q5/q6 as 16-bit lanes
	ee.vldbc.16	q5,a8 # [10] id:253 input_offset
	ee.vldbc.16	q6,a4 # [12] id:254 filter_offset
	beqz.n	a2,.Lt_0_7938 # [13]
	ee.zero.q	q7 # [0]
	srai	a11,a12,3 # [2]
	// NOTE(review): a10 is overwritten by the out_shift load at [25]
	// before being used — this out_mult load looks dead (out_mult is
	// re-loaded later in the quantization step).
	l32i	a10,a1,128 # [5] id:257 out_mult+0x0
	l32i	a8,a1,112 # [6] id:259 out_data+0x0
	addi	a9,a12,-7 # [7]
	s32i	a9,a1,76 # [8] gra_spill_temp_6
	s32i	a8,a1,72 # [9] gra_spill_temp_5
	s32i	a11,a1,64 # [14] gra_spill_temp_3
	slli	a11,a11,3 # [16]
	s32i	a11,a1,68 # [18] gra_spill_temp_4
	l32i	a10,a1,124 # [25] id:256 out_shift+0x0
	movi.n	a15,0 # [17]
	mov.n	a14,a7 # [15]
	max	a11,a10,a15 # [29]
	s32i	a11,a1,52 # [30] gra_spill_temp_0 // left_shift
	sub	a10,a11,a10 # // right_shift
	s32i.n	a10,a1,56 # [28] gra_spill_temp_1 // right_shift
	mov.n	a11,a5 # [31]
	movi.n	a10,0 # [32]
	mov.n	a2,a11 # [33]

.Lt_0_8450: # 0x12b
	l32i	a9,a1,76 # [2] gra_spill_temp_6
	extui	a5,a11,0,3 # [34]
	ee.zero.accx
	slli	a5,a5,1 # [3]
	bgei	a9,0,.LBB6_esp_nn_fc_s16_esp32s3 # [9]
	mov.n	a5,a10 # [6]
	movi.n	a2,0 # [0]
	j	.Lt_0_8706 # [1]

.LBB6_esp_nn_fc_s16_esp32s3: # 0x147
	wur.sar_byte	a5 # [5]
	ee.vld.l.64.ip	q4,a2,8 # [4] id:267
	l32i	a4,a1,64 # [0] gra_spill_temp_3
	mov.n	a3,a13 # [1]
	addx8	a5,a4,a10 # [2]
	// sign-extend s8 lanes to s16 by zipping with a compare-against-zero mask
	ee.vcmp.lt.s8	q2,q4,q7 # [7]
	ee.vzip.8	q4,q2 # [8]
	loopgtz	a4,.LBB45_esp_nn_fc_s16_esp32s3 # [3]

	ee.vld.l.64.ip	q0,a2,8 # [0*II+0] id:268
	ee.vld.l.64.ip	q1,a3,8 # [0*II+1] id:270
	ee.vcmp.lt.s8	q2,q0,q7 # [0*II+2]
	ee.vcmp.lt.s8	q3,q1,q7 # [0*II+3]
	ee.vzip.8	q0,q2 # [0*II+4]
	ee.vzip.8	q1,q3 # [0*II+5]
	ee.vadds.s16	q1,q1,q5 # [0*II+6]
	ee.src.q.qup	q2,q4,q0 # [0*II+7]
	ee.vadds.s16	q2,q2,q6 # [0*II+8]
	ee.vmulas.s16.accx	q1,q2 # [0*II+9]
.LBB45_esp_nn_fc_s16_esp32s3: # 0x170
	l32i	a2,a1,68 # [0] gra_spill_temp_4

.Lt_0_8706: # 0x173
	movi	a9, 0
	ee.srs.accx	a6, a9, 0
	bge	a2,a12,.Lt_0_9730 # [38]

// prepare remaining loop
	l32i	a8,a1,44 # [0] id:251 input_offset+0x0
	l32i	a7,a1,40 # [1] id:252 filter_offset+0x0
	sub	a3,a12,a2 # [2]
	l32i.n	a4,a1,60 # [3] gra_spill_temp_2
	add.n	a2,a2,a13 # [4]
	add.n	a4,a4,a5 # [5]
	loopgtz	a3,.LBB60_esp_nn_fc_s16_esp32s3 # [6]

// remaining c loop
	l8ui	a3,a2,0 # [0*II+0] id:299
	l8ui	a5,a4,0 # [0*II+1] id:300
	sext	a3,a3,7 # [0*II+2]
	sext	a5,a5,7 # [0*II+3]
	add.n	a5,a5,a7 # [0*II+5]
	add.n	a3,a3,a8 # [0*II+6]
	mull	a3,a3,a5 # [0*II+7]
	addi.n	a2,a2,1 # [0*II+8]
	addi.n	a4,a4,1 # [0*II+4]
	add.n	a6,a6,a3 # [0*II+9]
.LBB60_esp_nn_fc_s16_esp32s3: # 0x20f

// add bias
.Lt_0_9730: # 0x20f
	l32i	a8,a1,48 # [0] gra_spill_temp_7, bias
	beqz.n	a8,.Lt_0_10754 # [2], skip_bias
	l32i.n	a9,a14,0 # [0] id:301
	add.n	a6,a6,a9 # [2]

// apply quantization
.Lt_0_10754: # 0x218
	l32i	a2,a1,52 # [1] gra_spill_temp_0 // left_shift
	l32i	a5,a1,56 # [2] gra_spill_temp_1 // right_shift
	ssl	a2 # [3]
	sll	a6,a6 # [5] // x * (1 << left_shift)
	l32r	a3,.LC3_26_101 # [0]
	add.n	a10,a10,a12 # [0]
	addi.n	a14,a14,4 # [1]
	l32i	a4,a1,128 # [2] gra_spill_temp_10 //out_mult
	add.n	a11,a11,a12 # [6]

// multiply add nudge and pick high32
	ssai	31
	mulsh	a7,a4,a6 # [4]
	mull	a4,a4,a6 # [5]
	mov.n	a2,a11 # [27]
	add
a4,a4,a3 saltu a8,a4,a3 add.n a7,a7,a8 src a3,a7,a4 // divide_by_power_of2_step blti a5,1,.skip_divide_by2 movi.n a8,1 # [28] addi a4,a5,-1 ssl a4 // load left_shift sll a8,a8 // to_add factor ( 1 << (exponent - 1)) extui a6,a3,31,1 # [33] sub a8,a8,a6 // modified to_add factor ( 1 << (exponent - 1) - (val < 0)) add a3,a3,a8 // val + to_add ssr a5 # [29] //load right_shift sra a3,a3 # [31] .skip_divide_by2: l32i a8,a1,120 # [41] out_offset l32i a7,a1,132 # [44] // activation_min l32i a4,a1,136 # [45] // activation_max add.n a8,a8,a3 # [46] // add out_offset l32i a6,a1,72 # [47] gra_spill_temp_5 l32i.n a3,a1,116 # [48] out_channels max a7,a7,a8 # [49] add.n a6,a15,a6 # [50] min a4,a4,a7 # [51] addi.n a15,a15,1 # [52] s8i a4,a6,0 # [53] id:302 bne a3,a15,.Lt_0_8450 # [55] .Lt_0_7938: # 0x25c retw.n # [0] .size esp_nn_fc_s16_esp32s3, . - esp_nn_fc_s16_esp32s3 ================================================ FILE: src/logistic/esp_nn_logistic_ansi.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /* * LUT-based int8 logistic (sigmoid) for quantized inference. * * For int8, there are only 256 possible input values. We precompute sigmoid * for all of them during Prepare() and store as a 256-byte LUT. * Eval() then becomes a trivial table lookup — O(1) per element. * * Output quantization is fixed: scale = 1/256, zero_point = -128. * This matches TFLite's convention for int8 logistic output. */ int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void) { return 256; /* LUT: one int8 output per possible int8 input */ } void esp_nn_logistic_s8_prepare_ansi(int8_t *lut, int32_t input_zero_point, float input_scale) { /* Build LUT: for each possible int8 input value (-128..127), * compute sigmoid and quantize to output int8. 
* * Output quant: scale=1/256, zero_point=-128 * So output_int8 = clamp(round(sigmoid * 256) - 128, -128, 127) * Which simplifies to: output_int8 = clamp(round(sigmoid * 256) - 128, -128, 127) */ for (int i = 0; i < 256; i++) { /* Index matches (uint8_t) cast of int8: i=0→int8(0), i=128→int8(-128) */ int8_t input_val = (int8_t)i; float dequant = (input_val - input_zero_point) * input_scale; float sigmoid = 1.0f / (1.0f + expf(-dequant)); /* Quantize to output: scale=1/256, zp=-128 */ int32_t out_q = (int32_t)roundf(sigmoid * 256.0f) - 128; if (out_q < -128) out_q = -128; if (out_q > 127) out_q = 127; lut[i] = (int8_t)out_q; } } void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output, int32_t size, const int8_t *lut) { for (int i = 0; i < size; i++) { output[i] = lut[(uint8_t)input[i]]; } } ================================================ FILE: src/pooling/esp_nn_avg_pool_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
/* NOTE(review): the header names of the two includes below appear to have
 * been stripped during extraction — verify against the original sources. */
#include
#include

/**
 * Reference (ANSI C) average pooling for int8, NHWC layout.
 *
 * For every output (y, x, channel): averages the input values inside the
 * filter window clipped to the input box, rounds the division away from
 * zero, then clamps to [activation_min, activation_max].
 */
void esp_nn_avg_pool_s8_ansi(const int8_t *input,
                             const uint16_t input_wd,
                             const uint16_t input_ht,
                             int8_t *output,
                             const uint16_t output_wd,
                             const uint16_t output_ht,
                             const uint16_t stride_wd,
                             const uint16_t stride_ht,
                             const uint16_t filter_wd,
                             const uint16_t filter_ht,
                             const uint16_t pad_wd,
                             const uint16_t pad_ht,
                             const int32_t activation_min,
                             const int32_t activation_max,
                             const uint16_t channels)
{
    int32_t base_y = -pad_ht;
    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {
        int32_t base_x = -pad_wd;
        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {
            for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) {
                int32_t result = 0;
                int32_t filter_cnt = 0;
                /* Make sure filter does not cross the input box */
                int32_t filter_y_start = max(0, -base_y);
                int32_t filter_x_start = max(0, -base_x);
                int32_t filter_y_end = min(filter_ht, input_ht - base_y);
                int32_t filter_x_end = min(filter_wd, input_wd - base_x);

                for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) {
                    for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) {
                        int32_t in_x_idx = base_x + filter_x;
                        int32_t in_y_idx = base_y + filter_y;
                        int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx;
                        result += input[input_index];
                        filter_cnt++;
                    }
                }

                /* Rounded average */
                result = result > 0 ? (result + filter_cnt / 2) / filter_cnt
                                    : (result - filter_cnt / 2) / filter_cnt;

                /* Activation function */
                result = max(result, activation_min);
                result = min(result, activation_max);

                int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx;
                output[output_index] = (int8_t) result;
            }
        }
    }
}


================================================
FILE: src/pooling/esp_nn_avg_pool_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/* NOTE(review): header names stripped during extraction — verify. */
#include
#include

/**
 * Average pooling for s8 using ESP32-P4 PIE SIMD.
 *
 * Uses QACC per-lane accumulation: multiply 16 input channels by a
 * vector of 1s, accumulate per-lane across filter window.
 * Extract 16 × int32 sums via esp.st.qacc.{l,h}.{l,h}.128.ip.
 * Then divide, clamp, and store.
 */
void esp_nn_avg_pool_s8_esp32p4(const int8_t *input,
                                const uint16_t input_wd,
                                const uint16_t input_ht,
                                int8_t *output,
                                const uint16_t output_wd,
                                const uint16_t output_ht,
                                const uint16_t stride_wd,
                                const uint16_t stride_ht,
                                const uint16_t filter_wd,
                                const uint16_t filter_ht,
                                const uint16_t pad_wd,
                                const uint16_t pad_ht,
                                const int32_t activation_min,
                                const int32_t activation_max,
                                const uint16_t channels)
{
    /* Enable PIE */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    /* Broadcast 1 into q7 for "multiply by 1" accumulation trick */
    const int8_t one_val = 1;
    asm volatile (
        "mv x30, %0 \n\t"
        "esp.vldbc.8.ip q7, x30, 0 \n\t"
        :: "r"(&one_val) : "x30"
    );

    const int32_t ch_16 = channels >> 4;  /* number of full 16-channel blocks */
    int32_t base_y = -pad_ht;
    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {
        int32_t base_x = -pad_wd;
        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {
            /* Clip the filter window against the input box */
            int32_t filter_y_start = max(0, -base_y);
            int32_t filter_x_start = max(0, -base_x);
            int32_t filter_y_end = min(filter_ht, input_ht - base_y);
            int32_t filter_x_end = min(filter_wd, input_wd - base_x);
            int32_t filter_cnt = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
            int32_t half_cnt = filter_cnt >> 1;  /* rounding term for the average */
            int8_t *out_ptr = output + (out_y * output_wd + out_x) * channels;

            /* Process 16 channels at a time using QACC per-lane accumulation */
            int32_t ch_offset = 0;
            for (int32_t ch_blk = 0; ch_blk < ch_16; ch_blk++, ch_offset += 16) {
                /* Clear per-lane accumulators */
                asm volatile ("esp.zero.qacc \n\t");

                /* Accumulate via QACC with stride-based fx loop */
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    int32_t in_y = base_y + fy;
                    const int8_t *row_ptr = input + (in_y * input_wd + base_x + filter_x_start) * channels + ch_offset;
                    int32_t fx_count = filter_x_end - filter_x_start;
                    asm volatile (
                        "mv x30, %[ptr] \n\t"
                        "mv s7, %[cnt] \n\t"
                        "1: \n\t"
                        "esp.vld.128.ip q0, x30, 0 \n\t"
                        "esp.vmulas.s8.qacc q0, q7 \n\t"
                        "add x30, x30, %[stride] \n\t"
                        "addi s7, s7, -1 \n\t"
                        "bnez s7, 1b \n\t"
                        :
                        : [ptr] "r"(row_ptr), [cnt] "r"(fx_count), [stride] "r"((int32_t)channels)
                        : "x30", "s7"
                    );
                }

                /* Extract 16 per-lane int32 sums from QACC:
                 * qacc has 4 quadrants, each 128 bits = 4 × int32 */
                int32_t sums[16] __attribute__((aligned(16)));
                asm volatile (
                    "mv x30, %0 \n\t"
                    "esp.st.qacc.l.l.128.ip x30, 16 \n\t" /* lanes 0-3 */
                    "esp.st.qacc.l.h.128.ip x30, 16 \n\t" /* lanes 4-7 */
                    "esp.st.qacc.h.l.128.ip x30, 16 \n\t" /* lanes 8-11 */
                    "esp.st.qacc.h.h.128.ip x30, 0 \n\t" /* lanes 12-15 */
                    :: "r"(sums) : "x30", "memory"
                );

                /* Rounded division and activation clamp */
                for (int k = 0; k < 16; k++) {
                    int32_t s = sums[k];
                    int32_t result = s > 0 ? (s + half_cnt) / filter_cnt
                                           : (s - half_cnt) / filter_cnt;
                    result = max(result, activation_min);
                    result = min(result, activation_max);
                    out_ptr[ch_offset + k] = (int8_t) result;
                }
            }

            /* Handle remaining channels scalar */
            for (int32_t ch_idx = ch_offset; ch_idx < channels; ch_idx++) {
                int32_t result = 0;
                int32_t count = 0;
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    for (int32_t fx = filter_x_start; fx < filter_x_end; fx++) {
                        int32_t in_y = base_y + fy;
                        int32_t in_x = base_x + fx;
                        result += input[(in_y * input_wd + in_x) * channels + ch_idx];
                        count++;
                    }
                }
                result = result > 0 ? (result + count / 2) / count
                                    : (result - count / 2) / count;
                result = max(result, activation_min);
                result = min(result, activation_max);
                out_ptr[ch_idx] = (int8_t) result;
            }
        }
    }
}


================================================
FILE: src/pooling/esp_nn_avg_pool_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2021-2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

	.text
	.align	4
	.literal_position

# Program Unit: esp_nn_avg_pool_s8_esp32s3_asm
	.type	esp_nn_avg_pool_s8_esp32s3_asm, @function
	.align	4
	.global	esp_nn_avg_pool_s8_esp32s3_asm

// no of channels must be multiple of 4.
// a2: input
// a3: input_wd
// a4: input_ht
// a5: output
// a6: output_wd
// a7: output_ht
// on stack: stride_wd
// on stack: stride_ht
// on stack: filter_wd
// on stack: filter_ht
// on stack: pad_wd
// on stack: pad_ht
// on stack: activation_min
// on stack: activation_max
// on stack: channels

esp_nn_avg_pool_s8_esp32s3_asm: # 0x4
	# activation_min = 0
	# activation_max = 4
	# gra_spill_temp_0 = 8
	# gra_spill_temp_1 = 12
	# gra_spill_temp_2 = 16
	# gra_spill_temp_3 = 20
	# gra_spill_temp_4 = 24
	# gra_spill_temp_5 = 28
	# gra_spill_temp_6 = 32
	# gra_spill_temp_7 = 36
	# gra_spill_temp_8 = 40
	# gra_spill_temp_9 = 44
	# gra_spill_temp_10 = 48
	# gra_spill_temp_11 = 52
	# gra_spill_temp_12 = 56
	# gra_spill_temp_13 = 60
	# gra_spill_temp_14 = 64
	# gra_spill_temp_15 = 68
	# gra_spill_temp_16 = 72
	# gra_spill_temp_17 = 76
	# gra_spill_temp_18 = 80
	# gra_spill_temp_19 = 84
	# gra_spill_temp_20 = 88
	# gra_spill_temp_21 = 92
	# gra_spill_temp_22 = 96
	# gra_spill_temp_23 = 100
	# gra_spill_temp_24 = 104
	# gra_spill_temp_25 = 108
	# gra_spill_temp_26 = 112
	# gra_spill_temp_27 = 116
	# gra_spill_temp_28 = 120
	# gra_spill_temp_29 = 124
	# gra_spill_temp_30 = 128
	# gra_spill_temp_31 = 132
	# gra_spill_temp_32 = 136
	# gra_spill_temp_33 = 140
	# gra_spill_temp_34 = 144
	# gra_spill_temp_35 = 148
	# gra_spill_temp_36 = 152
	# gra_spill_temp_37 = 156
	# gra_spill_temp_38 = 160
	# gra_spill_temp_39 = 164
	# gra_spill_temp_40 = 168
	# gra_spill_temp_41 = 172
	# gra_spill_temp_43 = 180

	entry	a1,240 #
	mov.n	a11,a3 # [0]
	mov.n	a12,a2 # [1]
	s32i	a5,a1,136 # [4] gra_spill_temp_30
	s32i	a6,a1,128 # [3] gra_spill_temp_32
	l16ui	a5,a1,272 # [5] id:663 channels+0x0
	s32i	a7,a1,72 # [6] gra_spill_temp_16
	l32i	a9,a1,264 # [1] id:664 activation_min+0x0
	l32i	a10,a1,268 # [2] id:666 activation_max+0x0
	s32i.n	a9,a1,0 # [4] activation_min
	s32i.n	a10,a1,4 # [3] activation_max
	addi.n	a8,a1,4 # [0] activation_max
	// broadcast activation bounds into q7/q6 as 32-bit lanes for vmin/vmax
	ee.vldbc.32	q7,a1 # [5] id:668 activation_min
	ee.vldbc.32	q6,a8 # [6] id:669 activation_max
	ee.zero.q	q4 # [0]
	extui	a10,a5,0,3 # [7]
	beqz.n	a10,.LBB3_esp_nn_avg_pool_s8_esp32s3_asm # [8], if (channels % 8 == 0)
	extui	a13,a5,0,2 # [0]
	beqz.n	a13,.LBB52_esp_nn_avg_pool_s8_esp32s3_asm # [1], if (channels % 4 == 0)

// exit
.Lt_0_44546: # 0x1e9
	retw.n # [0]

.LBB3_esp_nn_avg_pool_s8_esp32s3_asm: # 0x1eb // if (channels % 8 == 0)
	l16ui	a7,a1,256 # [1] id:671 pad_wd+0x0
	l16ui	a10,a1,260 # [5] id:670 pad_ht+0x0
	l32i	a15,a1,72 # [12] gra_spill_temp_16
	movi.n	a14,0 # [13]
	movi.n	a8,0 # [14]
	neg	a10,a10 # [15]
	s32i	a10,a1,56 # [16] gra_spill_temp_12
	s32i	a8,a1,44 # [17] gra_spill_temp_9
	s32i.n	a14,a1,20 # [18] gra_spill_temp_3
	sub	a9,a4,a10 # [19]
	s32i	a9,a1,40 # [20] gra_spill_temp_8
	mul16u	a15,a15,a5 # [21]
	neg	a13,a7 # [22]
	s32i	a13,a1,104 # [23] gra_spill_temp_24
	s32i.n	a15,a1,16 # [24] gra_spill_temp_2
	sub	a13,a3,a13 # [25]
	s32i.n	a13,a1,12 # [26] gra_spill_temp_1
	j	.Lt_0_28162 # [27]

.Lt_0_28418: # 0x24e
# Part of loop body line 44, head labeled .Lt_0_28162
	l32i	a15,a1,260 # [0] pad_ht
	l32i	a14,a1,56 # [1] gra_spill_temp_12
	l32i.n	a9,a1,16 # [2] gra_spill_temp_2
	l32i	a13,a1,244 # [3] stride_ht
	l32i	a10,a1,40 # [4] gra_spill_temp_8
	l32i	a8,a1,44 # [5] gra_spill_temp_9
	sub	a10,a10,a13 # [6]
	add.n	a8,a8,a9 # [7]
	add.n	a14,a14,a13 # [8]
	sub	a15,a15,a13 # [9]
	// NOTE: the incoming pad_ht stack slot is reused as a mutable counter here
	s32i	a15,a1,260 # [10] pad_ht
	s32i	a14,a1,56 # [11] gra_spill_temp_12
	s32i	a8,a1,44 # [12] gra_spill_temp_9
	s32i	a10,a1,40 # [13] gra_spill_temp_8
	l32i.n	a8,a1,20 # [14] gra_spill_temp_3
	l32i	a9,a1,72 # [15] gra_spill_temp_16
	addi.n	a8,a8,1 # [16]
	s32i.n	a8,a1,20 # [17] gra_spill_temp_3
	beq	a8,a9,.Lt_0_44546 # [18]

.Lt_0_28162: # 0x281
	l32i	a10,a1,128 # [0] gra_spill_temp_32
	beqz.n	a10,.Lt_0_28418 # [2]

.LBB7_esp_nn_avg_pool_s8_esp32s3_asm: # 0x286
# Part of loop body line 44, head labeled .Lt_0_28162
	s32i	a7,a1,112 # [0] gra_spill_temp_26
	movi.n	a10,0 # [1]
	l32i	a9,a1,260 # [2] pad_ht
	l32i.n	a6,a1,12 # [3] gra_spill_temp_1
	l32i	a8,a1,44 # [4] gra_spill_temp_9
	movi.n	a13,0 # [5]
	l32i	a15,a1,104 # [6] gra_spill_temp_24
	s32i	a15,a1,116 # [7] gra_spill_temp_27
	s32i	a13,a1,48 # [8] gra_spill_temp_10
	s32i	a8,a1,124 # [9] gra_spill_temp_29
	s32i	a6,a1,120 # [10] gra_spill_temp_28
	l32i	a8,a1,40 # [11] gra_spill_temp_8
	l32i	a6,a1,252 # [12] filter_ht
	movi.n	a13,0 # [13]
	max	a9,a9,a10 # [14]
	s32i	a9,a1,160 # [15] gra_spill_temp_38
	s32i	a13,a1,92 # [16] gra_spill_temp_21
	min	a6,a6,a8 # [17]
	bnez.n	a5,.LBB10_esp_nn_avg_pool_s8_esp32s3_asm # [18]

.Lt_0_29186: # 0x2ba
	l32i	a8,a1,116 # [0] gra_spill_temp_27
	l32i	a15,a1,120 # [1] gra_spill_temp_28
	l32i	a9,a1,48 # [2] gra_spill_temp_10
	l32i	a14,a1,240 # [3] stride_wd
	l32i	a10,a1,124 # [4] gra_spill_temp_29
	l32i	a13,a1,112 # [5] gra_spill_temp_26
	add.n	a10,a10,a5 # [6]
	s32i	a10,a1,124 # [7] gra_spill_temp_29
	sub	a13,a13,a14 # [8]
	add.n	a9,a9,a14 # [9]
	sub	a15,a15,a14 # [10]
	add.n	a8,a8,a14 # [11]
	s32i	a8,a1,116 # [12] gra_spill_temp_27
	s32i	a15,a1,120 # [13] gra_spill_temp_28
	s32i	a9,a1,48 # [14] gra_spill_temp_10
	s32i	a13,a1,112 # [15] gra_spill_temp_26
	l32i	a9,a1,92 # [16] gra_spill_temp_21
	l32i	a10,a1,128 # [17] gra_spill_temp_32
	addi.n	a9,a9,1 # [18]
	s32i	a9,a1,92 # [19] gra_spill_temp_21
	beq	a9,a10,.Lt_0_28418 # [20]

.Lt_0_28930: # 0x2f5
# Part of loop body line 46, head labeled .Lt_0_29186
	beqz.n	a5,.Lt_0_29186 # [0]

.LBB10_esp_nn_avg_pool_s8_esp32s3_asm: # 0x2f7
# Part of loop body line 44, head labeled .Lt_0_28162
	l32i	a14,a1,120 # [0] gra_spill_temp_28
	l32i	a13,a1,248 # [1] filter_wd
	l32i	a9,a1,136 # [2] gra_spill_temp_30
	l32i	a8,a1,124 # [3] gra_spill_temp_29
	movi.n	a15,0 # [4]
	s32i	a15,a1,24 # [5] gra_spill_temp_60
	add.n	a10,a8,a5 # [6]
	movi.n	a15,0 # [7]
	add.n	a8,a8,a9 # [8]
	min	a13,a13,a14 # [9]
	add.n	a10,a9,a10 # [10]
	s32i	a10,a1,180 # [11] gra_spill_temp_43
	s32i	a13,a1,76 # [12] gra_spill_temp_17
	l32i	a14,a1,112 # [13] gra_spill_temp_26
	s32i	a8,a1,148 # [14] gra_spill_temp_45
	max	a14,a14,a15 # [15]
	l32i	a15,a1,116 # [16] gra_spill_temp_27
	s32i	a14,a1,152 # [17] gra_spill_temp_63
	add.n	a8,a15,a14 # [18]
	s32i	a8,a1,36 # [19] gra_spill_temp_7
	add.n	a15,a15,a13 # [20]
	s32i	a15,a1,204 # [21] gra_spill_temp_39
	sub	a13,a13,a14 # [22]
	// NOTE(review): offset 280 is outside the 240-byte frame created by
	// `entry a1,240` and past the last stack argument (channels at 272) —
	// verify this scratch slot does not clobber the caller's stack.
	s32i	a13,a1,280 # [23] gra_spill_temp_58
	j	.Lt_0_29698 # [24]

.LBB13_esp_nn_avg_pool_s8_esp32s3_asm: # 0x33b
# Part of loop body line 16, head labeled .Lt_0_29698
	l32i	a10,a1,56 # [0] gra_spill_temp_12
	l32i	a14,a1,204 # [1] gra_spill_temp_39
	add.n	a10,a10,a15 # [2]
	mull	a10,a11,a10 # [3]
	movi.n	a15,0 # [4]
	add.n	a14,a10,a14 # [5]

.Lt_0_30466: # 0x34a
# Loop body line 61, nesting depth: 4, estimated iterations: 252
	l32i	a9,a1,76 # [0] gra_spill_temp_17
	l32i	a8,a1,152 # [1] gra_spill_temp_63
	add.n	a14,a14,a11 # [2]
	bge	a8,a9,.Lt_0_30722 # [3]

.LBB16_esp_nn_avg_pool_s8_esp32s3_asm: # 0x355
# Part of loop body line 61, head labeled .Lt_0_30466
	l32i	a3,a1,36 # [0] gra_spill_temp_7
	l32i	a2,a1,24 # [1] gra_spill_temp_4
	add.n	a3,a3,a10 # [2]
	mull	a3,a3,a5 # [3]
	movi.n	a8,0 # [4]
	add.n	a2,a2,a3 # [5]
	l32i	a3,a1,280 # [6] gra_spill_temp_58
	add.n	a2,a12,a2 # [7]
	loopgtz	a3,.LBB140_esp_nn_avg_pool_s8_esp32s3_asm # [8]

	// widen 8 s8 lanes to s32 (two sign-extension zips) and accumulate
	ee.vld.l.64.xp	q0,a2,a5 # [0*II+1] id:677
	ee.vcmp.lt.s8	q1,q0,q4 # [0*II+3]
	ee.vzip.8	q0,q1 # [0*II+4]
	ee.vcmp.lt.s16	q1,q0,q4 # [0*II+5]
	ee.vzip.16	q0,q1 # [0*II+6]
	ee.vadds.s32	q2,q2,q1 # [0*II+7]
	ee.vadds.s32	q3,q3,q0 # [0*II+8]
.LBB140_esp_nn_avg_pool_s8_esp32s3_asm: # 0x385
# Part of loop body line 61, head labeled .Lt_0_30466
	l32i	a2,a1,48 # [0] gra_spill_temp_10
	sub	a9,a7,a2 # [2]
	sub	a2,a2,a7 # [3]
	max	a9,a9,a8 # [4]
	l32i	a8,a1,248 # [5] filter_wd
	sub	a2,a11,a2 # [6]
	min	a8,a8,a2 # [7]
	sub	a8,a8,a9 # [8]
	add.n	a15,a15,a8 # [9]

.Lt_0_30722: # 0x39f
# Part of loop body line 61, head labeled .Lt_0_30466
	add.n	a10,a10,a11 # [0]
	addi.n	a13,a13,1 # [1]
	bne	a6,a13,.Lt_0_30466 # [2]

.Lt_0_29954: # 0x3a6
	srai	a2,a15,1 # [3]

// move data to general purpose registers and average
	ee.movi.32.a	q3,a9,0 # [0]
	ee.movi.32.a	q3,a4,1 # [0]
	blti	a9,1,.Lt_0_32258 # [4]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_32002 # [2]
.Lt_0_32258: # 0x45e
	sub	a9,a9,a2 # [0]
.Lt_0_32002: # 0x3b9
	blti	a4,1,.Lt_0_32770 # [1]
	add.n	a4,a2,a4 # [0]
	j	.Lt_0_32514 # [2]
.Lt_0_32770:
	sub	a4,a4,a2 # [0]
.Lt_0_32514: # 0x3c4
	quos	a9,a9,a15 # [1]
	quos	a4,a4,a15 # [1]
	ee.movi.32.q	q3,a9,0 # [0]
	ee.movi.32.q	q3,a4,1 # [1]
	ee.movi.32.a	q3,a9,2 # [2]
	ee.movi.32.a	q3,a14,3 # [0]
	blti	a9,1,.Lt_0_33282 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_33026 # [2]
.Lt_0_33282: # 0x470
	sub	a9,a9,a2 # [0]
.Lt_0_33026: # 0x3d5
	blti	a14,1,.Lt_0_33794 # [1]
	add.n	a14,a2,a14 # [0]
	j	.Lt_0_33538 # [2]
.Lt_0_33794: # 0x479
	sub	a14,a14,a2 # [0]
.Lt_0_33538: # 0x3e0
	quos	a9,a9,a15 # [1]
	quos	a14,a14,a15 # [1]
	ee.movi.32.q	q3,a9,2 # [0]
	ee.movi.32.q	q3,a14,3 # [1]
	ee.movi.32.a	q2,a9,0 # [0]
	ee.movi.32.a	q2,a4,1 # [0]
	blti	a9,1,.Lt_0_34306 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_34050 # [2]
.Lt_0_34306: # 0x482
	sub	a9,a9,a2 # [0]
.Lt_0_34050: # 0x3f1
	blti	a4,1,.Lt_0_34818 # [1]
	add.n	a4,a2,a4 # [0]
	j	.Lt_0_34562 # [2]
.Lt_0_34818: # 0x48b
	sub	a4,a4,a2 # [0]
.Lt_0_34562: # 0x3fc
	quos	a9,a9,a15 # [1]
	quos	a4,a4,a15 # [1]
	ee.movi.32.q	q2,a9,0 # [0]
	ee.movi.32.q	q2,a4,1 # [1]
	ee.movi.32.a	q2,a9,2 # [2]
	ee.movi.32.a	q2,a14,3 # [0]
	blti	a9,1,.Lt_0_35330 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_35074 # [2]
.Lt_0_35330: # 0x494
	sub	a9,a9,a2 # [0]
.Lt_0_35074: # 0x40d
	blti	a14,1,.Lt_0_35842 # [1]
	add.n	a14,a2,a14 # [0]
	j	.Lt_0_35586 # [2]
.Lt_0_35842: # 0x49d
	sub	a14,a14,a2 # [0]
.Lt_0_35586: # 0x418
	quos	a9,a9,a15 # [1]
	quos	a14,a14,a15 # [1]
	ee.movi.32.q	q2,a9,2 # [0]
	ee.movi.32.q	q2,a14,3 # [1]
	l32i	a9,a1,180 # [0] gra_spill_temp_43
	l32i	a14,a1,24 # [1] gra_spill_temp_4
	l32i	a13,a1,148 # [2] gra_spill_temp_45
	// clamp to activation bounds, then narrow s32 lanes back to s8
	ee.vmin.s32	q1,q3,q6 # [4]
	ee.vmax.s32	q1,q1,q7 # [5]
	ee.vmin.s32	q5,q2,q6 # [8]
	addi.n	a14,a14,8 # [9]
	s32i	a14,a1,24 # [10] gra_spill_temp_4
	ee.vmax.s32	q5,q5,q7 # [11]
	addi.n	a8,a13,8 # [12]
	s32i	a8,a1,148 # [13] gra_spill_temp_45
	ee.vunzip.16	q1,q5 # [14]
	ee.vunzip.8	q1,q5 # [15]
	ee.vst.l.64.ip	q1,a13,0 # [16] id:678
	bge	a8,a9,.Lt_0_29186 # [17]

.Lt_0_29698: # 0x44b
# Loop body line 16, nesting depth: 3, estimated iterations: 252
	mv.qr	q3,q4 # [0]
	l32i	a15,a1,160 # [1] gra_spill_temp_38
	mv.qr	q2,q4 # [2]
	mov.n	a13,a15 # [3]
	blt	a15,a6,.LBB13_esp_nn_avg_pool_s8_esp32s3_asm # [4]

.Lt_0_51458: # 0x459
# Part of loop body line 16, head labeled .Lt_0_29698
	movi.n	a15,0 # [0]
	j	.Lt_0_29954 # [1]

.LBB52_esp_nn_avg_pool_s8_esp32s3_asm: # 0x4a6 // if (channels % 4 == 0)
	l16ui	a7,a1,256 # [1] id:671 pad_wd+0x0
	l16ui	a13,a1,260 # [5] id:670 pad_ht+0x0
	s32i	a13,a1,64 # [8] gra_spill_temp_4
	l32i	a8,a1,72 # [12] gra_spill_temp_16
	movi.n	a15,0 # [13]
	movi.n	a9,0 # [14]
	neg	a13,a13 # [15]
	s32i	a13,a1,192 # [16] gra_spill_temp_36
	s32i	a9,a1,32 # [17] gra_spill_temp_6
	s32i.n	a15,a1,8 # [18] gra_spill_temp_0
	sub	a10,a4,a13 # [19]
	s32i	a10,a1,28 # [20] gra_spill_temp_5
	mul16u	a8,a8,a5 # [21]
	neg	a14,a7 # [22]
	s32i	a14,a1,104 # [23] gra_spill_temp_24
	s32i.n	a8,a1,16 # [24] gra_spill_temp_2
	sub	a14,a3,a14 # [25]
	s32i.n	a14,a1,12 # [26] gra_spill_temp_1
	j	.Lt_0_37890 # [27]

.Lt_0_38146: # 0x50b
# Part of loop body line 161, head labeled .Lt_0_37890
	l32i	a15,a1,64 # [0] gra_spill_temp_4
	l32i	a14,a1,192 # [1] gra_spill_temp_36
	l32i.n	a9,a1,16 # [2] gra_spill_temp_2
	l32i	a13,a1,244 # [3] stride_ht
	l32i	a10,a1,28 # [4] gra_spill_temp_5
	l32i	a8,a1,32 # [5] gra_spill_temp_6
	sub	a10,a10,a13 # [6]
	add.n	a8,a8,a9 # [7]
	add.n	a14,a14,a13 # [8]
	sub	a15,a15,a13 # [9]
	s32i	a15,a1,64 # [10] gra_spill_temp_4
	s32i	a14,a1,192 # [11] gra_spill_temp_36
	s32i	a8,a1,32 # [12] gra_spill_temp_6
	s32i	a10,a1,28 # [13] gra_spill_temp_5
	l32i.n	a8,a1,8 # [14] gra_spill_temp_0
	l32i	a9,a1,72 # [15] gra_spill_temp_16
	addi.n	a8,a8,1 # [16]
	s32i.n	a8,a1,8 # [17] gra_spill_temp_0
	sub	a8,a8,a9 # [18]
	beqz	a8,.Lt_0_44546 # [19]

.Lt_0_37890: # 0x541
# Loop body line 161, nesting depth: 1, estimated iterations: 252
	l32i	a10,a1,128 # [0] gra_spill_temp_32
	beqz.n	a10,.Lt_0_38146 # [2]

# Part of loop body line 161, head labeled .Lt_0_37890
	s32i	a7,a1,96 # [0] gra_spill_temp_22
	movi.n	a10,0 # [1]
	l32i	a9,a1,64 # [2] gra_spill_temp_4
	l32i.n	a6,a1,12 # [3] gra_spill_temp_1
	l32i	a8,a1,32 # [4] gra_spill_temp_6
	movi.n	a13,0 # [5]
	l32i	a15,a1,104 # [6] gra_spill_temp_24
	s32i	a15,a1,100 # [7] gra_spill_temp_23
	s32i	a13,a1,148 # [8] gra_spill_temp_35
	s32i	a8,a1,108 # [9] gra_spill_temp_25
	s32i	a6,a1,144 # [10] gra_spill_temp_24
	l32i	a8,a1,28 # [11] gra_spill_temp_5
	l32i	a6,a1,252 # [12] filter_ht
	max	a9,a9,a10 # [14]
	s32i	a9,a1,168 # [15] gra_spill_temp_40
	s32i	a13,a1,88 # [16] gra_spill_temp_20
	min	a6,a6,a8 # [17]
	bnez.n	a5,.LBB59_esp_nn_avg_pool_s8_esp32s3_asm # [18]

.Lt_0_38914: # 0x57a
# Loop body line 163
	l32i	a8,a1,100 # [0] gra_spill_temp_23
	l32i	a15,a1,144 # [1] gra_spill_temp_24
	l32i	a9,a1,148 # [2] gra_spill_temp_35
	l32i	a14,a1,240 # [3] stride_wd
	l32i	a10,a1,108 # [4] gra_spill_temp_25
	l32i	a13,a1,96 # [5] gra_spill_temp_22
	add.n	a10,a10,a5 # [6]
	s32i	a10,a1,108 # [7] gra_spill_temp_25
	sub	a13,a13,a14 # [8]
	add.n	a9,a9,a14 # [9]
	sub	a15,a15,a14 # [10]
	add.n	a8,a8,a14 # [11]
	s32i	a8,a1,100 # [12] gra_spill_temp_23
	s32i	a15,a1,144 # [13] gra_spill_temp_24
	s32i	a9,a1,148 # [14] gra_spill_temp_35
	s32i	a13,a1,96 # [15] gra_spill_temp_22
	l32i	a9,a1,88 # [16] gra_spill_temp_20
	l32i	a10,a1,128 # [17] gra_spill_temp_32
	addi.n	a9,a9,1 # [18]
	s32i	a9,a1,88 # [19] gra_spill_temp_20
	beq	a9,a10,.Lt_0_38146 # [20]
	beqz.n	a5,.Lt_0_38914 # [0]

.LBB59_esp_nn_avg_pool_s8_esp32s3_asm: # 0x5b7
# Part of loop body line 161, head labeled .Lt_0_37890
	l32i	a14,a1,144 # [0] gra_spill_temp_24
	l32i	a13,a1,248 # [1] filter_wd
	l32i	a9,a1,136 # [2] gra_spill_temp_30
	l32i	a8,a1,108 # [3] gra_spill_temp_25
	movi.n	a15,0 # [4]
	s32i	a15,a1,216 # [5] gra_spill_temp_52
	add.n	a10,a8,a5 # [6]
	add.n	a8,a8,a9 # [8]
	min	a13,a13,a14 # [9]
	add.n	a10,a9,a10 # [10]
	s32i	a10,a1,172 # [11] gra_spill_temp_41
	s32i	a13,a1,132 # [12] gra_spill_temp_31
	l32i	a14,a1,96 # [13] gra_spill_temp_22
	s32i	a8,a1,164 # [14] gra_spill_temp_39
	max	a14,a14,a15 # [15]
	l32i	a15,a1,100 # [16] gra_spill_temp_23
	s32i	a14,a1,208 # [17] gra_spill_temp_50
	add.n	a8,a15,a14 # [18]
	s32i	a8,a1,60 # [19] gra_spill_temp_13
	add.n	a15,a15,a13 # [20]
	s32i	a15,a1,196 # [21] gra_spill_temp_37
	sub	a13,a13,a14 # [22]
	s32i	a13,a1,52 # [23] gra_spill_temp_11
	j	.Lt_0_39426 # [24]

.LBB62_esp_nn_avg_pool_s8_esp32s3_asm: # 0x5fb
# Part of loop body line 173, head labeled .Lt_0_39426
	l32i	a10,a1,192 # [0] gra_spill_temp_36
	l32i	a14,a1,196 # [1] gra_spill_temp_37
	add.n	a10,a10,a15 # [2]
	mull	a10,a11,a10 # [3]
	movi.n	a15,0 # [4]
	add.n	a14,a10,a14 # [5]

.Lt_0_40194: # 0x60a
# Loop body line 178, nesting depth: 4, estimated iterations: 252
	l32i	a9,a1,132 # [0] gra_spill_temp_31
	l32i	a8,a1,208 # [1] gra_spill_temp_50
	add.n	a14,a14,a11 # [2]
	bge	a8,a9,.Lt_0_40450 # [3]

.LBB65_esp_nn_avg_pool_s8_esp32s3_asm: # 0x615
# Part of loop body line 178, head labeled .Lt_0_40194
	l32i	a3,a1,60 # [0] gra_spill_temp_13
	l32i	a2,a1,216 # [1] gra_spill_temp_52
	add.n	a3,a3,a10 # [2]
	mull	a3,a3,a5 # [3]
	l32i	a4,a1,52 # [4] gra_spill_temp_11
	add.n	a2,a2,a3 # [5]
	add.n	a2,a12,a2 # [6]
	loopgtz	a4,.LBB155_esp_nn_avg_pool_s8_esp32s3_asm # [7]

	ee.vldbc.32.xp	q0,a2,a5 # [0*II+0] id:684
	ee.vcmp.lt.s8	q1,q0,q4 # [0*II+2]
	ee.vzip.8	q0,q1 # [0*II+3]
	ee.vcmp.lt.s16	q1,q0,q4 # [0*II+4]
	ee.vzip.16	q0,q1 # [0*II+5]
	ee.vadds.s32	q2,q2,q0 # [0*II+6]
.LBB155_esp_nn_avg_pool_s8_esp32s3_asm: # 0x63e
# Part of loop body line 178, head labeled .Lt_0_40194
	l32i	a2,a1,148 # [0] gra_spill_temp_35
	movi.n	a8,0 # [1]
	sub	a9,a7,a2 # [2]
	sub	a2,a2,a7 # [3]
	max	a9,a9,a8 # [4]
	l32i	a8,a1,248 # [5] filter_wd
	sub	a2,a11,a2 # [6]
	min	a8,a8,a2 # [7]
	sub	a8,a8,a9 # [8]
	add.n	a15,a15,a8 # [9]

.Lt_0_40450: # 0x65a
# Part of loop body line 178, head labeled .Lt_0_40194
	add.n	a10,a10,a11 # [0]
	addi.n	a13,a13,1 # [1]
	bne	a6,a13,.Lt_0_40194 # [2]

.Lt_0_39682: # 0x661
# Part of loop body line 173, head labeled .Lt_0_39426
	srai	a2,a15,1 # [5]

// move to gp registers and average
	ee.movi.32.a	q2,a9,0 # [0]
	ee.movi.32.a	q2,a4,1 # [0]
	blti	a9,1,.Lt_0_41986 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_41730 # [2]
.Lt_0_41986: # 0x482
	sub	a9,a9,a2 # [0]
.Lt_0_41730: # 0x3f1
	blti	a4,1,.Lt_0_42498 # [1]
	add.n	a4,a2,a4 # [0]
	j	.Lt_0_42242 # [2]
.Lt_0_42498: # 0x48b
	sub	a4,a4,a2 # [0]
.Lt_0_42242: # 0x3fc
	quos	a9,a9,a15 # [1]
	quos	a4,a4,a15 # [1]
	ee.movi.32.q	q2,a9,0 # [0]
	ee.movi.32.q	q2,a4,1 # [1]
	ee.movi.32.a	q2,a9,2 # [2]
	ee.movi.32.a	q2,a14,3 # [0]
	blti	a9,1,.Lt_0_43010 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_42754 # [2]
.Lt_0_43010: # 0x494
	sub	a9,a9,a2 # [0]
.Lt_0_42754: # 0x40d
	blti	a14,1,.Lt_0_43522 # [1]
	add.n	a14,a2,a14 # [0]
	j	.Lt_0_43266 # [2]
.Lt_0_43522: # 0x49d
	sub	a14,a14,a2 # [0]
.Lt_0_43266: # 0x418
	quos	a9,a9,a15 # [1]
	quos	a14,a14,a15 # [1]
	ee.movi.32.q	q2,a9,2 # [0]
	ee.movi.32.q	q2,a14,3 # [1]
	l32i	a9,a1,172 # [0] gra_spill_temp_41
	l32i	a8,a1,164 # [1] gra_spill_temp_39
	l32i	a14,a1,216 # [2] gra_spill_temp_52
	addi.n	a14,a14,4 # [5]
	ee.vmin.s32	q2,q2,q6 # [6]
	s32i	a14,a1,216 # [7] gra_spill_temp_52
	ee.vmax.s32	q2,q2,q7 # [8]
	ee.vunzip.16	q2,q1 # [9]
	ee.vunzip.8	q2,q1 # [10]
	// store 4 bytes via the qacc_scratch slot at a1,0
	ee.vst.l.64.ip	q2,a1,0 # [11] id:691
	l32i.n	a13,a1,0 # [12] id:692
	s32i.n	a13,a8,0 # [13] id:693
	addi.n	a8,a8,4 # [14]
	s32i	a8,a1,164 # [15] gra_spill_temp_39
	bge	a8,a9,.Lt_0_38914 # [16]

.Lt_0_39426: # 0x6cb
	l32i	a15,a1,168 # [0] gra_spill_temp_40
	mv.qr	q2,q4 # [1]
	mov.n	a13,a15 # [2]
	blt	a15,a6,.LBB62_esp_nn_avg_pool_s8_esp32s3_asm # [3]

.Lt_0_52738: # 0x6d6
	movi.n	a15,0 # [0]
	j	.Lt_0_39682 # [1]

	.size	esp_nn_avg_pool_s8_esp32s3_asm, . - esp_nn_avg_pool_s8_esp32s3_asm


================================================
FILE: src/pooling/esp_nn_avg_pool_s8_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-S3 optimized avg pool wrapper.
 * Routes to existing assembly for channels%4==0,
 * provides a C fallback path for other cases.
*/ #include #include #include /* Existing S3 assembly (handles depth%4==0) */ extern void esp_nn_avg_pool_s8_esp32s3_asm(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); void esp_nn_avg_pool_s8_esp32s3(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels) { /* Use existing assembly for channels % 4 == 0 */ if (channels % 4 == 0) { esp_nn_avg_pool_s8_esp32s3_asm(input, input_wd, input_ht, output, output_wd, output_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); return; } /* C path with int16 accumulation for non-aligned channels */ int16_t acc_buf[channels]; int32_t base_y = -pad_ht; for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) { int32_t base_x = -pad_wd; for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) { int32_t fy_start = max(0, -base_y); int32_t fx_start = max(0, -base_x); int32_t fy_end = min(filter_ht, input_ht - base_y); int32_t fx_end = min(filter_wd, input_wd - base_x); int32_t filter_cnt = (fy_end - fy_start) * (fx_end - fx_start); memset(acc_buf, 0, channels * sizeof(int16_t)); for (int32_t fy = fy_start; fy < fy_end; fy++) { for (int32_t fx = fx_start; fx < fx_end; fx++) { int32_t in_idx = ((base_y + fy) * input_wd + (base_x + fx)) * channels; for (int c = 0; c < channels; c++) { acc_buf[c] += 
(int16_t)input[in_idx + c]; } } } int32_t half_cnt = filter_cnt / 2; int32_t out_idx = (out_y * output_wd + out_x) * channels; for (int c = 0; c < channels; c++) { int32_t result = acc_buf[c]; result = result > 0 ? (result + half_cnt) / filter_cnt : (result - half_cnt) / filter_cnt; result = max(result, activation_min); result = min(result, activation_max); output[out_idx + c] = (int8_t)result; } } } } ================================================ FILE: src/pooling/esp_nn_max_pool_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include <stdint.h>

/**
 * @brief   Max pooling (s8), portable C reference.
 *
 * Layout is HWC: for each output cell the filter window is clipped to the
 * input box, the running maximum is taken per channel, then the result is
 * clamped to [activation_min, activation_max].
 */
void esp_nn_max_pool_s8_ansi(const int8_t *input,
                             const uint16_t input_wd,
                             const uint16_t input_ht,
                             int8_t *output,
                             const uint16_t output_wd,
                             const uint16_t output_ht,
                             const uint16_t stride_wd,
                             const uint16_t stride_ht,
                             const uint16_t filter_wd,
                             const uint16_t filter_ht,
                             const uint16_t pad_wd,
                             const uint16_t pad_ht,
                             const int32_t activation_min,
                             const int32_t activation_max,
                             const uint16_t channels)
{
    for (int32_t out_y = 0; out_y < output_ht; out_y++) {
        const int32_t base_y = out_y * stride_ht - pad_ht;
        for (int32_t out_x = 0; out_x < output_wd; out_x++) {
            const int32_t base_x = out_x * stride_wd - pad_wd;

            /* Clip the filter window so it never crosses the input box. */
            const int32_t fy_lo = (base_y < 0) ? -base_y : 0;
            const int32_t fx_lo = (base_x < 0) ? -base_x : 0;
            const int32_t fy_hi = (filter_ht < input_ht - base_y) ? filter_ht : (input_ht - base_y);
            const int32_t fx_hi = (filter_wd < input_wd - base_x) ? filter_wd : (input_wd - base_x);

            int8_t *dst = output + (out_y * output_wd + out_x) * channels;
            for (int32_t ch = 0; ch < channels; ch++) {
                int8_t best = INT8_MIN;
                for (int32_t fy = fy_lo; fy < fy_hi; fy++) {
                    /* walk one input row of the window; channel stride is `channels` */
                    const int8_t *src = input +
                        ((base_y + fy) * input_wd + (base_x + fx_lo)) * channels + ch;
                    for (int32_t fx = fx_lo; fx < fx_hi; fx++, src += channels) {
                        if (*src > best) {
                            best = *src;
                        }
                    }
                }
                /* Activation clamp (low bound first, mirroring the reference). */
                if ((int32_t) best < activation_min) {
                    best = (int8_t) activation_min;
                }
                if ((int32_t) best > activation_max) {
                    best = (int8_t) activation_max;
                }
                dst[ch] = best;
            }
        }
    }
}
/*
 * Max pooling for s8 using the ESP32-P4 PIE SIMD unit.
 *
 * The channel dimension is vectorized: 16 channels per iteration via
 * esp.vmax.s8 (running maximum across the filter window); any remaining
 * channels (channels % 16) are handled by a scalar tail loop.
 *
 * Q-register usage: q0 = running max, q1 = loaded samples,
 * q4/q5 = broadcast activation_min/max, q6 = broadcast INT8_MIN seed.
 */
void esp_nn_max_pool_s8_esp32p4(const int8_t *input,
                                const uint16_t input_wd,
                                const uint16_t input_ht,
                                int8_t *output,
                                const uint16_t output_wd,
                                const uint16_t output_ht,
                                const uint16_t stride_wd,
                                const uint16_t stride_ht,
                                const uint16_t filter_wd,
                                const uint16_t filter_ht,
                                const uint16_t pad_wd,
                                const uint16_t pad_ht,
                                const int32_t activation_min,
                                const int32_t activation_max,
                                const uint16_t channels)
{
    /* Enable PIE (processor instruction extension unit) */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    /* Broadcast activation_min and activation_max into vectors */
    int8_t act_min_val = (int8_t) activation_min;
    int8_t act_max_val = (int8_t) activation_max;
    int8_t int8_min_val = INT8_MIN;
    asm volatile (
        "mv x30, %0 \n\t"
        "esp.vldbc.8.ip q4, x30, 0 \n\t"  /* q4 = broadcast(activation_min) */
        "mv x30, %1 \n\t"
        "esp.vldbc.8.ip q5, x30, 0 \n\t"  /* q5 = broadcast(activation_max) */
        "mv x30, %2 \n\t"
        "esp.vldbc.8.ip q6, x30, 0 \n\t"  /* q6 = broadcast(INT8_MIN) for init */
        :: "r"(&act_min_val), "r"(&act_max_val), "r"(&int8_min_val)
        : "x30"
    );

    const int32_t ch_16 = channels >> 4;   /* number of full 16-ch blocks */

    int32_t base_y = -pad_ht;
    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {
        int32_t base_x = -pad_wd;
        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {
            /* Clip the filter window to the input box */
            int32_t filter_y_start = max(0, -base_y);
            int32_t filter_x_start = max(0, -base_x);
            int32_t filter_y_end = min(filter_ht, input_ht - base_y);
            int32_t filter_x_end = min(filter_wd, input_wd - base_x);

            int8_t *out_ptr = output + (out_y * output_wd + out_x) * channels;

            /* Process channels in blocks of 16 */
            int32_t ch_offset = 0;
            for (int32_t ch_blk = 0; ch_blk < ch_16; ch_blk++, ch_offset += 16) {
                /* Initialize running max to INT8_MIN (copy q6 -> q0;
                 * vmax of a register with itself is a move) */
                asm volatile ("esp.vmax.s8 q0, q6, q6 \n\t");

                /* Accumulate max across the filter window; consecutive fx
                 * samples for the same channel block are `channels` bytes
                 * apart (HWC layout). */
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    int32_t in_y = base_y + fy;
                    const int8_t *row_ptr = input + (in_y * input_wd + base_x + filter_x_start) * channels + ch_offset;
                    int32_t fx_count = filter_x_end - filter_x_start;
                    /* NOTE(review): the asm loop is do-while shaped (load
                     * before the bnez test), so fx_count == 0 would over-read
                     * one vector — confirm the clipped window is never empty
                     * here. Also assumes esp.vld.128.ip tolerates the
                     * (potentially unaligned) row_ptr — TODO confirm vs ISA. */
                    asm volatile (
                        "mv x30, %[ptr] \n\t"
                        "mv s7, %[cnt] \n\t"
                        "1: \n\t"
                        "esp.vld.128.ip q1, x30, 0 \n\t"
                        "esp.vmax.s8 q0, q0, q1 \n\t"
                        "add x30, x30, %[stride] \n\t"
                        "addi s7, s7, -1 \n\t"
                        "bnez s7, 1b \n\t"
                        :
                        : [ptr] "r"(row_ptr), [cnt] "r"(fx_count), [stride] "r"((int32_t)channels)
                        : "x30", "s7"
                    );
                }

                /* Apply activation clamp and store 16 channels */
                {
                    int8_t *store_ptr = out_ptr + ch_offset;
                    asm volatile (
                        "esp.vmax.s8 q0, q0, q4 \n\t"   /* max(result, act_min) */
                        "esp.vmin.s8 q0, q0, q5 \n\t"   /* min(result, act_max) */
                        "mv x30, %0 \n\t"
                        "esp.vst.128.ip q0, x30, 0 \n\t" /* store 16 channels */
                        :
                        : "r"(store_ptr)
                        : "x30", "memory"
                    );
                }
            }

            /* Scalar tail for the remaining (channels % 16) channels */
            for (int32_t ch_idx = ch_offset; ch_idx < channels; ch_idx++) {
                int8_t result = INT8_MIN;
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    for (int32_t fx = filter_x_start; fx < filter_x_end; fx++) {
                        int32_t in_y = base_y + fy;
                        int32_t in_x = base_x + fx;
                        int32_t input_index = (in_y * input_wd + in_x) * channels + ch_idx;
                        result = max(input[input_index], result);
                    }
                }
                /* NOTE(review): clamping through (int8_t) casts matches the
                 * SIMD path only for activation bounds within [-128, 127]. */
                result = max(result, (int8_t) activation_min);
                result = min(result, (int8_t) activation_max);
                out_ptr[ch_idx] = result;
            }
        }
    }
}
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .align 4 .literal_position # Program Unit: esp_nn_max_pool_s8_esp32s3 .type esp_nn_max_pool_s8_esp32s3, @function .align 4 .global esp_nn_max_pool_s8_esp32s3 // no of channels must be multiple of 4 esp_nn_max_pool_s8_esp32s3: # 0x4 # int8_min = 0 # gra_spill_temp_0 = 4 # gra_spill_temp_1 = 8 # gra_spill_temp_2 = 12 # gra_spill_temp_3 = 16 # gra_spill_temp_4 = 20 # gra_spill_temp_5 = 24 # gra_spill_temp_6 = 28 # gra_spill_temp_7 = 32 # gra_spill_temp_8 = 36 # gra_spill_temp_9 = 40 # gra_spill_temp_10 = 44 # gra_spill_temp_11 = 48 # gra_spill_temp_12 = 52 # gra_spill_temp_13 = 56 # gra_spill_temp_14 = 60 # gra_spill_temp_15 = 64 # gra_spill_temp_16 = 68 # gra_spill_temp_17 = 72 # gra_spill_temp_18 = 76 # gra_spill_temp_19 = 80 # gra_spill_temp_20 = 84 # gra_spill_temp_21 = 88 # gra_spill_temp_22 = 92 # gra_spill_temp_23 = 96 // a2: input // a3: input_wd // a4: input_ht // a5: output // a6: output_wd // a7: output_ht // on stack: stride_wd = 120 // on stack: stride_ht = 124 // on stack: filter_wd = 128 // on stack: filter_ht = 132 // on stack: pad_wd = 136 // on stack: pad_ht = 140 // on stack: activation_min // on stack: activation_max // on stack: channels entry a1,120 # mov.n a12,a2 # [0] s32i a6,a1,4 # [2] gra_spill_temp_0 s32i a7,a1,68 # [3] gra_spill_temp_16 mov.n a11,a3 # [4] s32i a5,a1,96 # [5] gra_spill_temp_23 l16ui a5,a1,152 # [6] id:465 channels+0x0 movi a3,-128 # [7] s32i.n a3,a1,0 # [1] int8_min addi.n a9,a1,148 # [0] activation_max addi.n a15,a1,144 # [1] activation_min ee.vldbc.8 q3,a1 # [7] id:473 int8_min+0x0 ee.vldbc.8 q5,a15 # [8] 
id:470 activation_min+0x0 ee.vldbc.8 q4,a9 # [9] id:471 activation_max+0x0 extui a8,a5,0,3 # [8] beqz.n a8,.LBB3_esp_nn_max_pool_s8_esp32s3 # [9] // if (channels % 8 == 0) extui a14,a5,0,2 # [0] beqz.n a14,.LBB25_esp_nn_max_pool_s8_esp32s3 # [1] // if (channels % 4 == 0) retw.n # [0] // exit .LBB3_esp_nn_max_pool_s8_esp32s3: # 0x1c5 // if (channels % 8 == 0) l16ui a15,a1,136 # [1] id:475 pad_wd+0x0 l16ui a14,a1,140 # [4] id:474 pad_ht+0x0 movi.n a8,0 # [13] movi.n a10,0 # [15] s32i a14,a1,44 # [7] gra_spill_temp_10 neg a15,a15 # [12] mul16u a9,a6,a5 # [14] neg a14,a14 # [16] s32i a14,a1,92 # [17] gra_spill_temp_22 s32i a10,a1,52 # [18] gra_spill_temp_12 s32i a9,a1,60 # [19] gra_spill_temp_14 s32i.n a8,a1,36 # [16] gra_spill_temp_8 s32i a15,a1,56 # [21] gra_spill_temp_13 sub a13,a4,a14 # [22] s32i a13,a1,48 # [23] gra_spill_temp_11 sub a15,a11,a15 # [24] s32i.n a15,a1,40 # [25] gra_spill_temp_9 .Lt_0_21506: # 0x229 l32i a8,a1,4 # [0] gra_spill_temp_0 beqz.n a8,.Lt_0_21762 # [2] movi.n a10,0 # [0] l32i a9,a1,44 # [1] gra_spill_temp_10 l32i.n a15,a1,40 # [2] gra_spill_temp_9 l32i a8,a1,52 # [3] gra_spill_temp_12 l32i.n a13,a1,136 # [4] ,pad_wd l32i a14,a1,56 # [5] gra_spill_temp_13 s32i a14,a1,80 # [6] gra_spill_temp_19 s32i a13,a1,76 # [7] gra_spill_temp_18 s32i a8,a1,88 # [8] gra_spill_temp_21 s32i a15,a1,84 # [9] gra_spill_temp_20 l32i a8,a1,48 # [10] gra_spill_temp_11 max a9,a9,a10 # [11] l32i a15,a1,132 # [12] filter_ht s32i a9,a1,8 # [13] gra_spill_temp_1 movi.n a9,0 # [14] min a15,a15,a8 # [15] s32i a9,a1,64 # [16] gra_spill_temp_15 .Lt_0_22274: # 0x25d beqz.n a5,.Lt_0_22530 # [0] .LBB10_esp_nn_max_pool_s8_esp32s3: # 0x25f # Part of loop body line 46, head labeled .Lt_0_22274 l32i a6,a1,76 # [0] gra_spill_temp_18 l32i a13,a1,96 # [1] gra_spill_temp_23 l32i a8,a1,84 # [2] gra_spill_temp_20 l32i a7,a1,128 # [3] filter_wd l32i a10,a1,88 # [4] gra_spill_temp_21 movi.n a9,0 # [5] s32i a9,a1,20 # [6] gra_spill_temp_4 add.n a14,a10,a5 # [7] min a7,a7,a8 # [8] add.n 
a10,a10,a13 # [9] add.n a14,a13,a14 # [10] s32i a14,a1,12 # [11] gra_spill_temp_2 s32i a10,a1,16 # [12] gra_spill_temp_3 movi.n a8,0 # [13] l32i a10,a1,80 # [14] gra_spill_temp_19 max a6,a6,a8 # [15] sub a9,a7,a6 # [16] s32i a9,a1,28 # [17] gra_spill_temp_6 add.n a13,a10,a6 # [18] s32i a13,a1,24 # [19] gra_spill_temp_5 add.n a10,a10,a7 # [16] s32i a10,a1,72 # [21] gra_spill_temp_17 .Lt_0_23042: # 0x29a l32i a8,a1,8 # [0] gra_spill_temp_1 mv.qr q1,q3 # [1] mov.n a13,a8 # [2] bge a8,a15,.Lt_0_23298 # [3] .LBB13_esp_nn_max_pool_s8_esp32s3: # 0x2a5 # Part of loop body line 40, head labeled .Lt_0_23042 l32i a10,a1,92 # [0] gra_spill_temp_22 l32i a14,a1,72 # [1] gra_spill_temp_17 add.n a10,a10,a8 # [2] mull a10,a11,a10 # [3] add.n a14,a10,a14 # [5] .Lt_0_23810: # 0x2b2 add.n a14,a14,a11 # [0] addi.n a13,a13,1 # [1] bge a6,a7,.Lt_0_24066 # [2] .LBB16_esp_nn_max_pool_s8_esp32s3: # 0x2b9 l32i a3,a1,24 # [0] gra_spill_temp_5 l32i a2,a1,20 # [1] gra_spill_temp_4 add.n a3,a3,a10 # [2] mull a3,a3,a5 # [3] add.n a2,a2,a3 # [5] l32i a3,a1,28 # [6] gra_spill_temp_6 add.n a2,a12,a2 # [7] loopgtz a3,.LBB93_esp_nn_max_pool_s8_esp32s3 # [8] ee.vld.l.64.ip q0,a2,0 # [0*II+1] id:481 add.n a2,a2,a5 # [0*II+2] ee.vmax.s8 q1,q1,q0 # [0*II+3] .LBB93_esp_nn_max_pool_s8_esp32s3: # 0x2d8 .Lt_0_24066: # 0x2d8 add.n a10,a10,a11 # [0] bne a15,a13,.Lt_0_23810 # [1] .Lt_0_23298: # 0x2dd l32i a9,a1,12 # [0] gra_spill_temp_2 l32i a13,a1,20 # [1] gra_spill_temp_4 l32i a8,a1,16 # [2] gra_spill_temp_3 ee.vmin.s8 q2,q1,q4 # [3] ee.vmax.s8 q2,q2,q5 # [4] mov.n a10,a8 # [5] addi.n a13,a13,8 # [6] s32i a13,a1,20 # [7] gra_spill_temp_4 ee.vst.l.64.ip q2,a10,0 # [8] id:482 addi.n a8,a8,8 # [9] s32i a8,a1,16 # [10] gra_spill_temp_3 blt a8,a9,.Lt_0_23042 # [11] .Lt_0_22530: # 0x2fe l32i a13,a1,84 # [0] gra_spill_temp_20 l32i a14,a1,80 # [1] gra_spill_temp_19 l32i a10,a1,120 # [2] stride_wd l32i a8,a1,88 # [3] gra_spill_temp_21 l32i a9,a1,76 # [4] gra_spill_temp_18 add.n a8,a8,a5 # [5] s32i a8,a1,88 # [6] 
gra_spill_temp_21 sub a9,a9,a10 # [7] add.n a14,a14,a10 # [8] sub a13,a13,a10 # [9] s32i a13,a1,84 # [10] gra_spill_temp_20 s32i a14,a1,80 # [11] gra_spill_temp_19 s32i a9,a1,76 # [12] gra_spill_temp_18 l32i a14,a1,64 # [13] gra_spill_temp_15 l32i a8,a1,4 # [14] gra_spill_temp_0 addi.n a14,a14,1 # [15] s32i a14,a1,64 # [16] gra_spill_temp_15 sub a14,a14,a8 # [17] bnez a14,.Lt_0_22274 # [18] .Lt_0_21762: # 0x334 # Part of loop body line 20, head labeled .Lt_0_21506 l32i a8,a1,44 # [0] gra_spill_temp_10 l32i a15,a1,92 # [1] gra_spill_temp_22 l32i a10,a1,60 # [2] gra_spill_temp_14 l32i a14,a1,124 # [3] stride_ht l32i a13,a1,48 # [4] gra_spill_temp_11 l32i a9,a1,52 # [5] gra_spill_temp_12 sub a13,a13,a14 # [6] add.n a9,a9,a10 # [7] add.n a15,a15,a14 # [8] sub a8,a8,a14 # [9] s32i a8,a1,44 # [10] gra_spill_temp_10 s32i a15,a1,92 # [11] gra_spill_temp_22 s32i a9,a1,52 # [12] gra_spill_temp_12 s32i a13,a1,48 # [13] gra_spill_temp_11 l32i.n a9,a1,36 # [14] gra_spill_temp_8 l32i a10,a1,68 # [15] gra_spill_temp_16 addi.n a9,a9,1 # [16] s32i.n a9,a1,36 # [17] gra_spill_temp_8 sub a9,a9,a10 # [18] bnez a9,.Lt_0_21506 # [19] retw.n # [0] // exit .LBB25_esp_nn_max_pool_s8_esp32s3: # 0x36d // if (channels % 4 == 0) l16ui a10,a1,136 # [1] id:475 pad_wd+0x0 l16ui a9,a1,140 # [4] id:474 pad_ht+0x0 movi.n a13,0 # [13] movi.n a15,0 # [15] neg a10,a10 # [12] s32i a9,a1,44 # [7] gra_spill_temp_10 mul16u a14,a6,a5 # [14] neg a9,a9 # [16] s32i a9,a1,92 # [17] gra_spill_temp_22 s32i a15,a1,52 # [18] gra_spill_temp_12 s32i a14,a1,60 # [19] gra_spill_temp_14 s32i.n a13,a1,36 # [16] gra_spill_temp_8 s32i a10,a1,56 # [21] gra_spill_temp_13 sub a8,a4,a9 # [22] s32i a8,a1,48 # [23] gra_spill_temp_11 sub a10,a11,a10 # [24] s32i.n a10,a1,40 # [25] gra_spill_temp_9 .Lt_0_27138: # 0x3d5 l32i a13,a1,4 # [0] gra_spill_temp_0 beqz.n a13,.Lt_0_27394 # [2] .LBB29_esp_nn_max_pool_s8_esp32s3: # 0x3da # Part of loop body line 107, head labeled .Lt_0_27138 movi.n a10,0 # [0] l32i a9,a1,44 # [1] 
gra_spill_temp_10 l32i.n a15,a1,40 # [2] gra_spill_temp_9 l32i a8,a1,52 # [3] gra_spill_temp_12 l32i a14,a1,56 # [4] gra_spill_temp_13 l32i.n a13,a1,136 # [5] pad_wd s32i a13,a1,76 # [6] gra_spill_temp_18 s32i a14,a1,80 # [7] gra_spill_temp_19 s32i a8,a1,88 # [8] gra_spill_temp_21 s32i a15,a1,84 # [9] gra_spill_temp_20 l32i a8,a1,48 # [10] gra_spill_temp_11 l32i a15,a1,132 # [11] filter_ht movi.n a14,0 # [12] max a9,a9,a10 # [13] s32i a9,a1,8 # [14] gra_spill_temp_1 s32i a14,a1,64 # [15] gra_spill_temp_15 min a15,a15,a8 # [16] .Lt_0_27906: # 0x409 # Loop body line 109, nesting depth: 2, estimated iterations: 56 beqz.n a5,.Lt_0_28162 # [0] .LBB32_esp_nn_max_pool_s8_esp32s3: # 0x40b # Part of loop body line 109, head labeled .Lt_0_27906 l32i a6,a1,76 # [0] gra_spill_temp_18 l32i a13,a1,96 # [1] gra_spill_temp_23 l32i a8,a1,84 # [2] gra_spill_temp_20 l32i a7,a1,128 # [3] filter_wd l32i a10,a1,88 # [4] gra_spill_temp_21 movi.n a9,0 # [5] s32i a9,a1,32 # [6] gra_spill_temp_7 add.n a14,a10,a5 # [7] min a7,a7,a8 # [8] add.n a10,a10,a13 # [9] add.n a14,a13,a14 # [10] s32i a14,a1,12 # [11] gra_spill_temp_2 s32i a10,a1,16 # [12] gra_spill_temp_3 movi.n a8,0 # [13] l32i a10,a1,80 # [14] gra_spill_temp_19 max a6,a6,a8 # [15] sub a9,a7,a6 # [16] s32i a9,a1,28 # [17] gra_spill_temp_6 add.n a13,a10,a6 # [18] s32i a13,a1,24 # [19] gra_spill_temp_5 add.n a10,a10,a7 # [16] s32i a10,a1,72 # [21] gra_spill_temp_17 .Lt_0_28674: # 0x446 # Loop body line 8, nesting depth: 3, estimated iterations: 56 l32i a8,a1,8 # [0] gra_spill_temp_1 mv.qr q1,q3 # [1] mov.n a13,a8 # [2] bge a8,a15,.Lt_0_28930 # [3] .LBB35_esp_nn_max_pool_s8_esp32s3: # 0x451 # Part of loop body line 8, head labeled .Lt_0_28674 l32i a10,a1,92 # [0] gra_spill_temp_22 l32i a14,a1,72 # [1] gra_spill_temp_17 add.n a10,a10,a8 # [2] mull a10,a11,a10 # [3] add.n a14,a10,a14 # [5] .Lt_0_29442: # 0x45e add.n a14,a14,a11 # [0] addi.n a13,a13,1 # [1] bge a6,a7,.Lt_0_29698 # [2] .LBB38_esp_nn_max_pool_s8_esp32s3: # 0x465 l32i 
a3,a1,24 # [0] gra_spill_temp_5 l32i a2,a1,32 # [1] gra_spill_temp_7 add.n a3,a3,a10 # [2] mull a3,a3,a5 # [3] l32i a4,a1,28 # [4] gra_spill_temp_6 add.n a2,a2,a3 # [5] add.n a2,a12,a2 # [6] loopgtz a4,.LBB108_esp_nn_max_pool_s8_esp32s3 # [7] ee.vldbc.32 q0,a2 # [0*II+0] id:489 add.n a2,a2,a5 # [0*II+1] ee.vmax.s8 q1,q1,q0 # [0*II+2] .LBB108_esp_nn_max_pool_s8_esp32s3: # 0x482 .Lt_0_29698: # 0x482 add.n a10,a10,a11 # [0] bne a15,a13,.Lt_0_29442 # [1] .Lt_0_28930: # 0x487 # Part of loop body line 8, head labeled .Lt_0_28674 l32i a9,a1,12 # [0] gra_spill_temp_2 l32i a8,a1,16 # [1] gra_spill_temp_3 l32i a10,a1,32 # [3] gra_spill_temp_7 ee.vmin.s8 q5,q1,q4 # [4] ee.vmax.s8 q5,q5,q5 # [5] addi.n a10,a10,4 # [6] ee.movi.32.a q5,a13,0 s32i a10,a1,32 # [9] gra_spill_temp_7 s32i.n a13,a8,0 # [10] id:492 addi.n a8,a8,4 # [11] s32i a8,a1,16 # [12] gra_spill_temp_3 blt a8,a9,.Lt_0_28674 # [13] .Lt_0_28162: # 0x4ad # Part of loop body line 109, head labeled .Lt_0_27906 l32i a13,a1,84 # [0] gra_spill_temp_20 l32i a14,a1,80 # [1] gra_spill_temp_19 l32i a10,a1,120 # [2] stride_wd l32i a8,a1,88 # [3] gra_spill_temp_21 l32i a9,a1,76 # [4] gra_spill_temp_18 add.n a8,a8,a5 # [5] s32i a8,a1,88 # [6] gra_spill_temp_21 sub a9,a9,a10 # [7] add.n a14,a14,a10 # [8] sub a13,a13,a10 # [9] s32i a13,a1,84 # [10] gra_spill_temp_20 s32i a14,a1,80 # [11] gra_spill_temp_19 s32i a9,a1,76 # [12] gra_spill_temp_18 l32i a14,a1,64 # [13] gra_spill_temp_15 l32i a8,a1,4 # [14] gra_spill_temp_0 addi.n a14,a14,1 # [15] s32i a14,a1,64 # [16] gra_spill_temp_15 sub a14,a14,a8 # [17] bnez a14,.Lt_0_27906 # [18] .Lt_0_27394: # 0x4e3 # Part of loop body line 107, head labeled .Lt_0_27138 l32i a8,a1,44 # [0] gra_spill_temp_10 l32i a15,a1,92 # [1] gra_spill_temp_22 l32i a10,a1,60 # [2] gra_spill_temp_14 l32i a14,a1,124 # [3] stride_ht l32i a13,a1,48 # [4] gra_spill_temp_11 l32i a9,a1,52 # [5] gra_spill_temp_12 sub a13,a13,a14 # [6] add.n a9,a9,a10 # [7] add.n a15,a15,a14 # [8] sub a8,a8,a14 # [9] s32i a8,a1,44 # 
[10] gra_spill_temp_10 s32i a15,a1,92 # [11] gra_spill_temp_22 s32i a9,a1,52 # [12] gra_spill_temp_12 s32i a13,a1,48 # [13] gra_spill_temp_11 l32i.n a9,a1,36 # [14] gra_spill_temp_8 l32i a10,a1,68 # [15] gra_spill_temp_16 addi.n a9,a9,1 # [16] s32i.n a9,a1,36 # [17] gra_spill_temp_8 sub a9,a9,a10 # [18] bnez a9,.Lt_0_27138 # [19] retw.n # [0] // exit .size esp_nn_max_pool_s8_esp32s3, . - esp_nn_max_pool_s8_esp32s3 ================================================ FILE: src/softmax/esp_nn_softmax_ansi.c ================================================ // Copyright 2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "softmax_common.h" int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height) { (void) width; (void) height; return 0; } void esp_nn_set_softmax_scratch_buf_ansi(void *buffer) { (void) buffer; return; } void esp_nn_softmax_s8_ansi(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data) { // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as // -32 before multiplying by input mult, and therefore as large as // -16 afterwards. Note that exp(-8) is definitely not insignificant to // accumulation, but exp(-16) definitely is. 
#define ACCUM_BITS 12 #define DIFF_BITS 5 const int32_t mask = (1 << shift); int32_t col = 0; const int8_t *in_ptr = input_data; int8_t *out_ptr = output_data; for (int row_idx = 0; row_idx < height; row_idx++) { int8_t max_in_row = in_ptr[0]; for (col = 1; col < width; col++) { max_in_row = max(max_in_row, in_ptr[col]); } int32_t input_diff = 0; int32_t sum_of_exps = 0; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); } } const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; out_ptr[col] = (int8_t) esp_nn_saturate8(result); } else { out_ptr[col] = -128; } } in_ptr += width; out_ptr += width; } } ================================================ FILE: src/softmax/esp_nn_softmax_opt.c ================================================ // Copyright 2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "softmax_common.h" #include static int32_t *scratch_buf = NULL; /** * @brief Get scratch buffer size needed by softmax function * * @param width * @param height * @return size in bytes * * @note buffer must be 4 byte aligned */ int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height) { (void) height; return width * 4; } /** * @brief Set scratch buffer to be used by softmax function * * @param buffer this can be NULL if one needs to unset it * must be aligned to 4 bytes */ void esp_nn_set_softmax_scratch_buf_opt(void *buffer) { scratch_buf = (int32_t *) buffer; } void esp_nn_softmax_s8_opt(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data) { if (scratch_buf == NULL) { printf("%s error! scratch buffer not set\n", __FUNCTION__); return; } // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as // -32 before multiplying by input mult, and therefore as large as // -16 afterwards. Note that exp(-8) is definitely not insignificant to // accumulation, but exp(-16) definitely is. 
#define ACCUM_BITS 12 #define DIFF_BITS 5 const int32_t mask = (1 << shift); int32_t col = 0; const int8_t *in_ptr = input_data; int8_t *out_ptr = output_data; for (int row_idx = 0; row_idx < height; row_idx++) { int8_t max_in_row = in_ptr[0]; for (col = 1; col < width; col++) { max_in_row = max(max_in_row, in_ptr[col]); } int32_t input_diff = 0; int32_t sum_of_exps = 0; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); scratch_buf[col] = exp_raw; // store to avoid duplicate calculation later sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); } } const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { int32_t exp_raw = scratch_buf[col]; const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; out_ptr[col] = (int8_t) esp_nn_saturate8(result); } else { out_ptr[col] = -128; } } in_ptr += width; out_ptr += width; } } ================================================ FILE: src/softmax/esp_nn_softmax_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include "softmax_common.h" #include #include static int32_t *p4_scratch_buf = NULL; int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height) { (void) height; return width * 4; } void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer) { /* Enable PIE */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 
0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );
    p4_scratch_buf = (int32_t *) buffer;
}

/*
 * Softmax for s8 optimized for ESP32-P4.
 * Phase 1 (find-max) uses PIE esp.vmax.s8 on 16 elements at a time.
 * Phases 2-3 (exp + normalize) reuse exp values cached in the scratch
 * buffer, like the generic "opt" variant.
 */
void esp_nn_softmax_s8_esp32p4(const int8_t *input_data,
                               const int32_t height,
                               const int32_t width,
                               const int32_t mult,
                               const int32_t shift,
                               const int32_t diff_min,
                               int8_t *output_data)
{
    /* Scratch must have been installed via the set_scratch_buf call. */
    if (p4_scratch_buf == NULL) {
        printf("%s error! scratch buffer not set\n", __FUNCTION__);
        return;
    }

#define ACCUM_BITS 12
#define DIFF_BITS 5
    const int32_t mask = (1 << shift);
    int32_t col = 0;
    const int8_t *in_ptr = input_data;
    int8_t *out_ptr = output_data;

    for (int row_idx = 0; row_idx < height; row_idx++) {
        /* Phase 1: find max in row using PIE vectorization.
         * Auto-incrementing loads avoid a redundant mv per iteration. */
        int8_t max_in_row;
        if (width >= 16) {
            int32_t vec_count = (width >> 4);       /* 16-element groups */
            int32_t vec_processed = vec_count << 4; /* elements covered by SIMD */
            int32_t max_scalar;
            /* NOTE(review): first group is pre-loaded, so vec_count >= 1 is
             * guaranteed by the width >= 16 guard above. */
            asm volatile (
                "mv x30, %[ptr] \n\t"
                "esp.vld.128.ip q0, x30, 16 \n\t"  /* load first 16, advance */
                "addi %[cnt], %[cnt], -1 \n\t"     /* one group already loaded */
                "beqz %[cnt], 2f \n\t"
                "1: \n\t"
                "esp.vld.128.ip q1, x30, 16 \n\t"  /* load next 16, advance */
                "esp.vmax.s8 q0, q0, q1 \n\t"      /* running max */
                "addi %[cnt], %[cnt], -1 \n\t"
                "bnez %[cnt], 1b \n\t"
                "2: \n\t"
                "esp.max.s8.a q0, %[max] \n\t"     /* horizontal reduce */
                : [cnt] "+r"(vec_count), [max] "=r"(max_scalar)
                : [ptr] "r"(in_ptr)
                : "x30"
            );
            max_in_row = (int8_t) max_scalar;
            /* Scalar check of the remaining (< 16) elements */
            for (int32_t i = vec_processed; i < width; i++) {
                if (in_ptr[i] > max_in_row) max_in_row = in_ptr[i];
            }
        } else {
            /* Short rows: plain scalar scan */
            max_in_row = in_ptr[0];
            for (col = 1; col < width; col++) {
                max_in_row = max(max_in_row, in_ptr[col]);
            }
        }

        /* Phase 2: compute exp values (cached in scratch) and their sum */
        int32_t input_diff = 0;
        int32_t sum_of_exps = 0;
        for (col = 0; col < width; col++) {
            input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);
                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);
                p4_scratch_buf[col] = exp_raw;
                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);
            }
        }

        /* Phase 3: normalize using the reciprocal of the sum */
        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);
        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));
        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8;
        for (col = 0; col < width; col++) {
            input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                int32_t exp_raw = p4_scratch_buf[col];
                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);
                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;
                out_ptr[col] = (int8_t) esp_nn_saturate8(result);
            } else {
                out_ptr[col] = -128;   /* below cutoff: probability rounds to 0 */
            }
        }

        in_ptr += width;
        out_ptr += width;
    }
}
*/

#include #include "softmax_common.h"

/* Scratch area (one int32 per column) caching exp() values between passes. */
static int32_t *scratch_buf_s3 = NULL;

/* Scratch requirement: one int32 per column. */
int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height)
{
    (void) height;
    return width * 4;
}

/* Install (or clear, with NULL) the scratch buffer; must be 4-byte aligned. */
void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer)
{
    scratch_buf_s3 = (int32_t *) buffer;
}

/* Find max of int8 array — SIMD for len >= 32, scalar for smaller.
 * SIMD path: ee.vmax.s8 keeps a 16-lane running max in q0, which is then
 * spilled to tmp_buf and reduced horizontally in C. */
static inline int8_t find_max_s8(const int8_t *data, int32_t len)
{
    int8_t m = -128;
    int32_t idx = 0;
#if defined(__XTENSA__)
    if (len >= 32) {
        /* Use ee.vmax.s8 for 16 elements/cycle — only for len >= 32
         * to avoid potential alignment issues with small buffers.
         * NOTE(review): ee.vld.128.ip on `data` assumes unaligned loads
         * are acceptable — TODO confirm against the S3 TRM. */
        int8_t tmp_buf[16] __attribute__((aligned(16)));
        const int8_t *ptr = data;
        int8_t *buf_ptr = tmp_buf;
        int32_t simd_len = len & ~15;   /* round down to multiple of 16 */
        asm volatile (
            "ee.vld.128.ip q0, %[ptr], 16 \n\t"   /* q0 = running max */
            "movi.n %[idx], 16 \n\t"
            "j 2f \n\t"
            "1: \n\t"
            "ee.vld.128.ip q1, %[ptr], 16 \n\t"
            "ee.vmax.s8 q0, q0, q1 \n\t"
            "addi %[idx], %[idx], 16 \n\t"
            "2: \n\t"
            "blt %[idx], %[slen], 1b \n\t"
            /* Store vector max to tmp_buf for horizontal reduction */
            "ee.vst.128.ip q0, %[buf], 16 \n\t"
            : [idx] "+r"(idx), [ptr] "+r"(ptr), [buf] "+r"(buf_ptr)
            : [slen] "r"(simd_len)
            : "memory"
        );
        /* Horizontal reduction of the 16 per-lane maxima */
        for (int i = 0; i < 16; i++) {
            if (tmp_buf[i] > m) m = tmp_buf[i];
        }
        idx = simd_len;
    }
#endif
    /* Scalar for remainder or small arrays */
    for (; idx < len; idx++) {
        if (data[idx] > m) m = data[idx];
    }
    return m;
}

/* Quantized (s8) softmax for ESP32-S3: SIMD find-max, cached exp values,
 * and a 4x-unrolled normalization pass. */
void esp_nn_softmax_s8_esp32s3(const int8_t *input_data,
                               const int32_t height,
                               const int32_t width,
                               const int32_t mult,
                               const int32_t shift,
                               const int32_t diff_min,
                               int8_t *output_data)
{
    if (scratch_buf_s3 == NULL) {
        /* Fall through to opt version if scratch not set */
        /* NOTE(review): despite the comment above, this returns WITHOUT
         * writing any output and without calling the opt version — callers
         * get an untouched output buffer. Confirm whether the dispatcher
         * guarantees the scratch buffer is always set, or make this
         * actually delegate/report like esp_nn_softmax_s8_opt does. */
        return;
    }
#define ACCUM_BITS 12
    const int32_t mask = (1 << shift);
    const int8_t *in_ptr = input_data;
    int8_t *out_ptr = output_data;

    for (int row_idx = 0; row_idx < height; row_idx++) {
        /* Phase 1: Find max */
        int8_t max_in_row = find_max_s8(in_ptr, width);

        /* Phase 2: Compute exp and accumulate sum (exps cached in scratch) */
        int32_t sum_of_exps = 0;
        for (int col = 0; col < width; col++) {
            int32_t input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);
                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);
                scratch_buf_s3[col] = exp_raw;
                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);
            }
        }

        /* Phase 3: Compute normalization scale */
        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);
        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));
        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - 8;

        /* Phase 4: Normalize and output — unrolled 4x for reduced loop overhead */
        int col = 0;
        for (; col + 3 < width; col += 4) {
            for (int k = 0; k < 4; k++) {
                int32_t input_diff = in_ptr[col + k] - max_in_row;
                if (input_diff >= diff_min) {
                    int32_t exp_raw = scratch_buf_s3[col + k];
                    const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);
                    const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;
                    out_ptr[col + k] = (int8_t) esp_nn_saturate8(result);
                } else {
                    out_ptr[col + k] = -128;
                }
            }
        }
        /* Remainder */
        for (; col < width; col++) {
            int32_t input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                int32_t exp_raw = scratch_buf_s3[col];
                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);
                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;
                out_ptr[col] = (int8_t) esp_nn_saturate8(result);
            } else {
                out_ptr[col] = -128;
            }
        }

        in_ptr += width;
        out_ptr += width;
    }
#undef ACCUM_BITS
}

================================================ FILE: src/softmax/softmax_common.h ================================================

// Copyright 2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance
with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include
#include

/* Branch-free selection helpers: the MASK macros evaluate to all-ones (~0)
 * or 0, and SELECT_USING_MASK picks (a) when mask is all-ones, else (b). */
#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
#define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0
#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b))

/* Saturating rounding doubling high multiply / rounding right shift,
 * implemented elsewhere in the library. */
#define SAT_HIGH_MUL(x, y) esp_nn_sat_round_doubling_high_mul((x), (y))
#define DIV_POW2(x,y) esp_nn_div_by_power_of_two((x), (y))

/* Saturating multiply by 2^exp: the result clamps to INT32_MAX/INT32_MIN
 * when `val` would overflow out of the 32-bit range after the shift. */
__NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp)
{
    const int32_t thresh = ((1 << (31 - exp)) - 1);
    int32_t result = val << exp;
    result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), INT32_MAX, result);
    result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), INT32_MIN, result);
    return result;
}

/**
 * @brief Calculate `1 / (1 + x)` for x in [0, 1]
 *
 * @param val input value to calculate `1/(1+x)` for
 * @return `int32_t` result
 * @note Newton-Raphson division
 *
 * https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division
 * Refer to that page for the logic behind the 48/17 and 32/17 constants.
 * Pseudocode: https://en.wikipedia.org/wiki/Division_algorithm#Pseudocode
 */
__NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)
{
    /* half_denominator = (1 + x) / 2, rounded away from zero */
    const int64_t sum = (int64_t) val + INT32_MAX;
    const int32_t half_denominator = (int32_t) ((sum + (sum >= 0 ? 1 : -1)) / 2L);
    /* Initial estimate: x0 = 48/17 - (32/17) * half_denominator */
    int32_t constant_48_over_17 = 1515870810;
    int32_t constant_neg_32_over_17 = -1010580540;
    int32_t x = constant_48_over_17 + SAT_HIGH_MUL(half_denominator, constant_neg_32_over_17);
    /* Three Newton-Raphson refinement steps: x += x * (2 - d*x) */
    const int32_t fixed_2_one = (1 << 29);
    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);
    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);
    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);
    /* Undo the half-denominator scaling (multiply by 2). */
    return mul_power_of_2(x, 1);
}

#define ONE_OVER_ONE_X(x) esp_nn_one_over_one_plus_x_for_x_in_0_1((x))

/**
 * @brief Return exp(x) for x < 0.
 *
 */
__NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val)
{
    /* Split val into a fractional part in [-1/4, 0) and an integer-multiple
     * remainder that is folded back in via the table of constants below. */
    int32_t shift = 24;
    const int32_t one_quarter = (1 << shift);
    int32_t mask = one_quarter - 1;
    const int32_t val_mod_minus_quarter = (val & mask) - one_quarter;
    const int32_t remainder = val_mod_minus_quarter - val;

    // calculate exponent for x in [-1/4, 0) in `result`
    /* 4th-order Taylor-style polynomial in saturating fixed point. */
    const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
    const int32_t x2 = SAT_HIGH_MUL(x, x);
    const int32_t x3 = SAT_HIGH_MUL(x2, x);
    const int32_t x4 = SAT_HIGH_MUL(x2, x2);
    const int32_t one_over_3 = 715827883;
    /* NOTE(review): despite the name, 1895147668 ~= 0.8825 * 2^31, which
     * looks like exp(-1/8) rather than 1/8 — verify against upstream. */
    const int32_t one_over_8 = 1895147668;
    const int32_t x4_over_4 = DIV_POW2(x4, 2);
    const int32_t x4_over_4_plus_x3_over_6_plus_x2_over_2 = DIV_POW2(SAT_HIGH_MUL(x4_over_4 + x3, one_over_3) + x2, 1);
    int32_t result = one_over_8 + SAT_HIGH_MUL(one_over_8, x + x4_over_4_plus_x3_over_6_plus_x2_over_2);

    /* Multiply in exp(-2^k) factors selected by the remainder bits;
     * `shift` is post-incremented so each invocation tests the next bit. */
#define SELECT_IF_NON_ZERO(x) { \
    mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
    result = SELECT_USING_MASK(mask, SAT_HIGH_MUL(result, x), result); \
}
    SELECT_IF_NON_ZERO(1672461947)
    SELECT_IF_NON_ZERO(1302514674)
    SELECT_IF_NON_ZERO(790015084)
    SELECT_IF_NON_ZERO(290630308)
    SELECT_IF_NON_ZERO(39332535)
    SELECT_IF_NON_ZERO(720401)
    SELECT_IF_NON_ZERO(242)
#undef SELECT_IF_NON_ZERO

    /* exp(0) == 1, represented as INT32_MAX in this fixed-point format. */
    mask = MASK_IF_ZERO(val);
    return SELECT_USING_MASK(mask, INT32_MAX, result);
}
================================================ FILE: test_app/CMakeLists.txt ================================================ # The following lines of boilerplate have to be in your project's # CMakeLists in this exact order for cmake to work correctly cmake_minimum_required(VERSION 3.5) set(EXTRA_COMPONENT_DIRS "../" "../tests/") set(IDF_EXCLUDE_COMPONENTS test test_app) include($ENV{IDF_PATH}/tools/cmake/project.cmake) project(test_app) ================================================ FILE: test_app/Makefile ================================================ # # This is a project Makefile. It is assumed the directory this Makefile resides in is a # project subdirectory. # PROJECT_NAME := test_app # This line has to be included into the make file # to include components that are located somewhere # but not in "component" directory EXTRA_COMPONENT_DIRS := $(realpath ../) EXCLUDE_COMPONENTS := test include $(IDF_PATH)/make/project.mk ================================================ FILE: test_app/main/CMakeLists.txt ================================================ set(COMPONENT_SRCS "main.c") set(COMPONENT_ADD_INCLUDEDIRS "") set(COMPONENT_PRIV_REQUIRES tests esp_timer) register_component() ================================================ FILE: test_app/main/component.mk ================================================ # # Main component makefile. # # This Makefile can be left empty. By default, it will take the sources in the # src/ directory, compile them and link them into lib(subdirectory_name).a # in the build directory. This behaviour is entirely configurable, # please read the ESP-IDF documents if you need to do this. 
#

================================================ FILE: test_app/main/main.c ================================================

/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include

/* On IDF >= 5.0 the cycle-counter API was renamed; alias the old name. */
#if __has_include("esp_idf_version.h")
#include
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(5, 0, 0)
#define esp_cpu_get_ccount esp_cpu_get_cycle_count
#endif
#endif

static const char *TAG = "test_app";

/* Cycle counters captured around the ANSI-C and optimized kernel runs;
 * filled by the profile_* callbacks invoked from the test functions. */
static uint32_t start_c, start_opt, total_c, total_opt;

/* Start timing the ANSI-C reference implementation. */
void profile_c_start()
{
    /* initiate profiling */
    start_c = esp_cpu_get_ccount();
}

/* Stop timing the ANSI-C reference; returns elapsed CPU cycles. */
uint32_t profile_c_end()
{
    /* record profile number */
    total_c = esp_cpu_get_ccount() - start_c;
    return total_c;
}

/* Start timing the optimized implementation. */
void profile_opt_start()
{
    /* initiate profiling */
    start_opt = esp_cpu_get_ccount();
}

/* Stop timing the optimized implementation; returns elapsed CPU cycles. */
uint32_t profile_opt_end()
{
    /* record profile number */
    total_opt = esp_cpu_get_ccount() - start_opt;
    return total_opt;
}

/* Print cycle counts and ansi/opt speedup for the most recent test run.
 * Speedup is 0.00x when either counter was never recorded. */
static void print_profile(const char *kernel)
{
    float speedup = (total_c > 0 && total_opt > 0) ? (float)total_c / (float)total_opt : 0.0f;
    printf("PROFILE: %s, ansi=%"PRIu32", opt=%"PRIu32", speedup=%.2fx\n",
           kernel, total_c, total_opt, speedup);
}

/* Entry point: runs every s8 kernel test and prints its profile line. */
void app_main()
{
    /* s8 tests */
    ESP_LOGI(TAG, "Running s8 tests...");
    esp_nn_add_elementwise_s8_test();
    print_profile("add_s8");
    esp_nn_mul_elementwise_s8_test();
    print_profile("mul_s8");
    esp_nn_mul_broadcast_channel_s8_test();
    print_profile("mul_broadcast_ch_s8");
    esp_nn_depthwise_conv_s8_test();
    print_profile("depthwise_conv_s8");
    esp_nn_conv_s8_test();
    print_profile("conv_s8");
    esp_nn_relu6_s8_test();
    print_profile("relu6_s8");
    esp_nn_avg_pool_s8_test();
    print_profile("avg_pool_s8");
    esp_nn_max_pool_s8_test();
    print_profile("max_pool_s8");
    esp_nn_fully_connected_s8_test();
    print_profile("fc_s8");
    esp_nn_fully_connected_per_ch_s8_test();
    print_profile("fc_per_ch_s8");
    esp_nn_softmax_s8_test();
    print_profile("softmax_s8");
    esp_nn_hard_swish_s8_test();
    print_profile("hard_swish_s8");
    esp_nn_mean_nhwc_s8_test();
    print_profile("mean_nhwc_s8");
    ESP_LOGI(TAG, "s8 tests done!\n");

    /* u8 tests */
    //ESP_LOGI(TAG, "Running u8 tests...");
    //esp_nn_add_elementwise_u8_test();
    //esp_nn_depthwise_conv_u8_test();
    //esp_nn_conv_u8_test();
    //esp_nn_avg_pool_u8_test();
    //esp_nn_max_pool_u8_test();
    //esp_nn_fully_connected_u8_test();
    //ESP_LOGI(TAG, "u8 tests done!\n");
}

================================================ FILE: test_app/sdkconfig.defaults ================================================

#
# esp-nn
#
CONFIG_NN_OPTIMIZED=y

================================================ FILE: test_app/sdkconfig.defaults.esp32p4 ================================================

# Enables high speed SPIRAM and other options
CONFIG_IDF_EXPERIMENTAL_FEATURES=y

#
# ESP System Settings
#
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ=360
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_360=y

#
# ESP PSRAM
#
CONFIG_SPIRAM=y
CONFIG_SPIRAM_BOOT_INIT=y
CONFIG_SPIRAM_MODE_HEX=y
CONFIG_SPIRAM_SPEED_200M=y
CONFIG_SPIRAM_SPEED=200
CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY=y
CONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=n CONFIG_SPIRAM_USE_CAPS_ALLOC=y CONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP=y # # GDB Stub # CONFIG_ESP_GDBSTUB_ENABLED=y CONFIG_ESP_SYSTEM_PANIC_GDBSTUB=y # # Heap memory debugging # # CONFIG_HEAP_POISONING_DISABLED is not set CONFIG_HEAP_POISONING_LIGHT=y ================================================ FILE: test_app/sdkconfig.defaults.esp32s3 ================================================ # Default configurations for ESP32-S3 CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y # CONFIG_ESP32S3_SPIRAM_SUPPORT is not set CONFIG_ESP32S3_DATA_CACHE_64KB=y CONFIG_ESP32S3_DATA_CACHE_8WAYS=y CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y ================================================ FILE: tests/CMakeLists.txt ================================================ set(COMPONENT_ADD_INCLUDEDIRS ./include/) set(COMPONENT_SRCS "src/basic_math_test.c" "src/convolution_test.c" "src/fully_connected_test.c" "src/pooling_test.c" "src/relu_test.c" "src/softmax_test.c" "src/hard_swish_test.c" "src/mean_test.c") set(COMPONENT_REQUIRES ) set(COMPONENT_PRIV_REQUIRES esp-nn) register_component() target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function) ================================================ FILE: tests/README.md ================================================ # Tests for esp_nn library - Include these in your test framework and run the framework. 
- For IDF test please refer `test_app`

================================================ FILE: tests/component.mk ================================================

#FIXME
COMPONENT_ADD_INCLUDEDIRS := include/
COMPONENT_SRCDIRS := src/

================================================ FILE: tests/include/test_functions.h ================================================

/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/* int8_t ops tests */
void esp_nn_add_elementwise_s8_test();
void esp_nn_mul_elementwise_s8_test();
void esp_nn_mul_broadcast_channel_s8_test();
void esp_nn_depthwise_conv_s8_test();
void esp_nn_conv_s8_test();
void esp_nn_avg_pool_s8_test();
void esp_nn_max_pool_s8_test();
void esp_nn_fully_connected_s8_test();
void esp_nn_fully_connected_per_ch_s8_test();
void esp_nn_relu6_s8_test();
void esp_nn_softmax_s8_test();
void esp_nn_hard_swish_s8_test();
void esp_nn_mean_nhwc_s8_test();

/* uint8_t ops tests */
void esp_nn_add_elementwise_u8_test();
void esp_nn_depthwise_conv_u8_test();
void esp_nn_conv_u8_test();
void esp_nn_avg_pool_u8_test();
void esp_nn_max_pool_u8_test();
void esp_nn_fully_connected_u8_test();

/* instructions test functions */
void compare_instructions_test();
void arith_instructions_test();
void min_max_instructions_test();
void bitwise_instructions_test();
void load_store_instructions_test();

================================================ FILE: tests/include/test_utils.h ================================================

/*
 * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include
#include
#include
#include

/* mult value range */
#define MULT_MAX INT32_MAX
#define MULT_MIN 0

/* shift value range */
#define SHIFT_MIN -31
#define SHIFT_MAX 30

/**
 * @brief callback function to run before C function
 */
void profile_c_start();

/**
 * @brief callback function to run after C function
 *
 * @return uint32_t cycles consumed running C function
 */
uint32_t profile_c_end();

/**
 * @brief callback function to run before optimized function
 */
void profile_opt_start();

/**
 * @brief callback function to run after optimized function
 *
 * @return uint32_t cycles consumed running optimized function
 */
uint32_t profile_opt_end();

#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"

/* Element-wise comparison of two arrays; GNU statement expression that
 * yields true when all `size` elements match. */
#define CHECK_EQUAL(ARRAY1, ARRAY2, size) ({ \
    bool res = true; \
    for (int _i = 0; _i < size; _i++) { \
        if (ARRAY1[_i] != ARRAY2[_i]) { \
            res = false; \
            break; \
        } \
    } \
    res; \
})

/* Print a width x height grid of int values, tab separated. */
#define PRINT_ARRAY_INT(ARRAY, width, height) ({ \
    int *_array = (int *) ARRAY; \
    for (int _j = 0; _j < height; _j++) { \
        for (int _i = 0; _i < width; _i++) { \
            printf("%d\t", _array[width * _j + _i]); \
        } \
        printf("\n"); \
    } \
    printf("\n"); \
})

/* Print a width x height grid of bytes in hex. */
#define PRINT_ARRAY_HEX(ARRAY, width, height) ({ \
    uint8_t *_array = (uint8_t *) ARRAY; \
    for (int _j = 0; _j < height; _j++) { \
        for (int _i = 0; _i < width; _i++) { \
            printf("%02x\t", _array[width * _j + _i]); \
        } \
        printf("\n"); \
    } \
    printf("\n"); \
})

/* Print a width x height grid of signed 8-bit values. */
#define PRINT_ARRAY_INT8(ARRAY, width, height) ({ \
    int8_t *_array = (int8_t *) ARRAY; \
    for (int _j = 0; _j < height; _j++) { \
        for (int _i = 0; _i < width; _i++) { \
            printf("%4d ", _array[width * _j + _i]); \
        } \
        printf("\n"); \
    } \
    printf("\n"); \
})

/* IDF_HEAP_CAPS is defined only when the project enables SPIRAM together
 * with a heap-caps allocation policy. */
#if CONFIG_IDF_CMAKE
#if ((CONFIG_SPIRAM || CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32S3_SPIRAM_SUPPORT) && \
    (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC))
#define IDF_HEAP_CAPS 1
#endif
#endif

#if IDF_HEAP_CAPS
#include "esp_heap_caps.h"
/* Try SPIRAM first, fall back to internal RAM */
static inline void *esp_nn_test_alloc(size_t size)
{
    void *ptr = heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
    if (!ptr) {
        ptr = heap_caps_malloc(size, MALLOC_CAP_8BIT);
    }
    return ptr;
}
#define ESP_NN_TEST_ALLOC(SIZE) esp_nn_test_alloc(SIZE) #else #include #define ESP_NN_TEST_ALLOC(SIZE) malloc(SIZE) #endif ================================================ FILE: tests/src/basic_math_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include #include "test_utils.h" const int8_t test_add_in1[] = { 13, 26, -26, 26, -13, 13, -13, 13, -13, 13, -13, 13, -26, -51, -26, -51, -26, -39, -26, -39, -39, -39, -26, -51, -13, -13, -13, -13, -26, -13, -13, -13, -13, -13, 0, -26, 0, -13, 0, -26, -13, -26, -26, -26, -26, -26, -26, -26, -13, -13, 0, -26, -13, -26, -26, -26, 0, 0, -26, -13, 13, 0, 26, 0, 13, 0, 13, 0, 0, 0, 13, 0, 13, 26, -26, 13, -26, 13, -13, 13, -13, 13, -13, 13, -26, -26, -13, -26, -26, -26, -26, -26, -39, -26, -26, -26, -13, 0, -13, 0, -26, 0, -13, 0, -13, 0, -13, -13, 0, 0, 0, -13, -13, -13, -26, -13, -26, -13, -13, -13, -13, 0, 0, -13, -13, -13, -13, -13, 0, 0, -13, 0, 13, 13, 13, 0, 0, 0, 13, 13, 0, 0, 13, 13, 0, 26, 0, 13, 0, 13, 0, 13, 0, 13, 0, 13, 0, 13, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 13, 13, 0, 13, 0, 13, 0, 13, 0, 13, 0, 13, 13, 13, 0, 13, 0, 13, 0, 13, 0, 13, 13, 13, 13, 13, 0, 13, 0, 13, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 0, 13, 13, 13, 13, 13, 13, 13, }; const int8_t test_add_in2[] = { -128, -128, -103, -128, -77, -128, -52, -128, -26, -128, -1, -128, -128, -103, -103, -103, -77, -103, -52, -103, -26, -103, -1, -103, -128, -77, -103, -77, -77, -77, -52, -77, -26, -77, -1, -77, -128, -52, -103, -52, -77, -52, -52, -52, -26, -52, -1, -52, -128, -26, -103, -26, -77, -26, -52, -26, -26, -26, -1, -26, -128, -1, -103, -1, -77, -1, -52, -1, -26, -1, -1, -1, -128, -128, -103, -128, -77, -128, -52, -128, -26, -128, -1, -128, -128, -103, -103, -103, -77, -103, -52, -103, -26, -103, -1, -103, -128, -77, -103, -77, -77, -77, -52, -77, -26, -77, -1, -77, -128, 
-52, -103, -52, -77, -52, -52, -52, -26, -52, -1, -52, -128, -26, -103, -26, -77, -26, -52, -26, -26, -26, -1, -26, -128, -1, -103, -1, -77, -1, -52, -1, -26, -1, -1, -1, -128, -128, -103, -128, -77, -128, -52, -128, -26, -128, -1, -128, -128, -103, -103, -103, -77, -103, -52, -103, -26, -103, -1, -103, -128, -77, -103, -77, -77, -77, -52, -77, -26, -77, -1, -77, -128, -52, -103, -52, -77, -52, -52, -52, -26, -52, -1, -52, -128, -26, -103, -26, -77, -26, -52, -26, -26, -26, -1, -26, -128, -1, -103, -1, -77, -1, -52, -1, -26, -1, -1, -1, }; void esp_nn_add_elementwise_s8_test() { /* prepare data */ int size = 1600 + 8 + 7; /* odd len to test leftover */ int8_t *input1; int8_t *input2; int8_t *out_data_c; int8_t *out_data_opt; int8_t *input1_orig = NULL; int8_t *input2_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; int32_t input1_offset = 34; int32_t input2_offset = 35; int32_t output_offset = 36; int32_t input1_shift = -8; // right_shift amt always <= 0 int32_t input2_shift = -8; // right_shift amt always <= 0 int32_t output_shift = -9; // right_shift amt always <= 0 int32_t left_shift = 15; // always +ve int32_t input1_mult = INT32_MAX; int32_t input2_mult = INT32_MAX; int32_t output_mult = INT32_MAX; int32_t activation_min = -128; int32_t activation_max = 127; for (int itr = 0; itr < 10; itr++) { switch (itr) { case 0: // all zeros input1_offset = 0; input2_offset = 0; output_offset = 0; input1_mult = 0; input2_mult = 0; output_mult = 0; input1_shift = 0; input2_shift = 0; output_shift = 0; left_shift = 0; break; case 1: // hit min input1_offset = -127; input2_offset = -127; output_offset = -128; input1_mult = MULT_MIN; input2_mult = MULT_MIN; output_mult = MULT_MIN; input1_shift = 0; input2_shift = 0; output_shift = 0; left_shift = 0; break; case 2: // hit max input1_offset = 128; input2_offset = 128; output_offset = -127; input1_mult = MULT_MAX; input2_mult = MULT_MAX; output_mult = MULT_MAX; input1_shift = SHIFT_MIN; input2_shift = 
SHIFT_MIN; output_shift = SHIFT_MIN; left_shift = 30 - 8; // since input is 8 bits break; case 3: // hit extreme max input1_offset = 128; input2_offset = 128; output_offset = -127; input1_mult = MULT_MAX; input2_mult = MULT_MAX; output_mult = MULT_MAX; input1_shift = 0; input2_shift = 0; output_shift = 0; left_shift = 30 - 8; // -8 since input is 8 bit break; case 4: // from yolo model input1_offset = 64; input2_offset = 128; output_offset = -128; input1_mult = 1705397815; input2_mult = 1073741824; output_mult = 1756091225; input1_shift = -3; input2_shift = 0; output_shift = -19; left_shift = 20; size = 216; break; default: // practical random input input1_offset = rand() % 256 - 127; // range [-127, 128] input2_offset = rand() % 256 - 127; // range [-127, 128] output_offset = rand() % 256 - 128; // range [-128, 127] input1_mult = MULT_MAX / 2 + rand() % INT16_MAX; input2_mult = MULT_MAX / 2 + rand() % INT16_MAX; output_mult = MULT_MAX / 2 + rand() % INT16_MAX; input1_shift = -8 + rand() % 4; input2_shift = -8 + rand() % 4; output_shift = -8 + rand() % 4; left_shift = rand() % 15; } input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); goto elementwise_add_test_cleanup; } input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15); input2 = (int8_t *) (((uint32_t) input2_orig + 15) & ~15); if (itr == 4) { input2 = input2_orig; // unaligned input } out_data_c = (int8_t *) (((uint32_t)out_c_orig + 15) & ~15); out_data_opt = (int8_t *) (((uint32_t)out_opt_orig + 15) & ~15); if (itr == 4) { memcpy(input1, test_add_in1, size); memcpy(input2, test_add_in2, size); } else { for (int i = 0; i < size; ++i) { input1[i] = rand() % 256 - 
128; input2[i] = rand() % 256 - 128; } } if (itr == 0) { /* enable profiler */ profile_c_start(); } /* C function */ esp_nn_add_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, input1_mult, input2_mult, input1_shift, input2_shift, left_shift, out_data_c, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { profile_c_end(); profile_opt_start(); } /* Optimized function */ esp_nn_add_elementwise_s8(input1, input2, input1_offset, input2_offset, input1_mult, input2_mult, input1_shift, input2_shift, left_shift, out_data_opt, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { /* disable profiler */ profile_opt_end(); } bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); printf("Output: \n"); PRINT_ARRAY_INT8(out_data_opt, size, 1); printf("Expected: \n"); PRINT_ARRAY_INT8(out_data_c, size, 1); printf("Input1:\n"); PRINT_ARRAY_INT8(input1, size, 1); printf("Input2:\n"); PRINT_ARRAY_INT8(input2, size, 1); printf("in1_shift %"PRIi32", in2_shift %"PRIi32", left_shift %"PRIi32", out_shift %"PRIi32"\n", input1_shift, input2_shift, left_shift, output_shift); printf("in1_mult %"PRIi32", in2_mult %"PRIi32", out_mult %"PRIi32"\n", input1_mult, input2_mult, output_mult); goto elementwise_add_test_cleanup; } printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); elementwise_add_test_cleanup: if (input1_orig) { free(input1_orig); } if (input2_orig) { free(input2_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } } void esp_nn_mul_elementwise_s8_test() { /* prepare data */ int size = 1600 + 8 + 7; /* odd len to test leftover */ int8_t *input1; int8_t *input2; int8_t *out_data_c; int8_t *out_data_opt; int32_t input1_offset = 34; int32_t input2_offset = 35; int32_t output_offset = 36; int32_t output_shift = -7; int32_t 
output_mult = MULT_MAX; // max out_mult int32_t activation_min = -128; int32_t activation_max = 127; int8_t *input1_orig = NULL; int8_t *input2_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; for (int itr = 0; itr < 10; itr++) { switch (itr) { case 0: // all zeros input1_offset = 0; input2_offset = 0; output_offset = 0; output_mult = 0; output_shift = 0; break; case 1: // hit min input1_offset = -127; input2_offset = -127; output_offset = -128; output_mult = MULT_MIN; output_shift = 0; break; case 2: // hit max input1_offset = 128; input2_offset = 128; output_offset = -127; output_mult = MULT_MAX; output_shift = SHIFT_MIN; break; case 3: // hit extreme max input1_offset = 128; input2_offset = 128; output_offset = -127; output_mult = MULT_MAX; output_shift = 0; break; default: // practical random input input1_offset = rand() % 256 - 127; // range [-127, 128] input2_offset = rand() % 256 - 127; // range [-127, 128] output_offset = rand() % 256 - 128; // range [-128, 127] output_mult = MULT_MAX / 2 + rand() % INT16_MAX; output_shift = -8 + rand() % 4; size = 4 + rand() % 64; } input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); goto elementwise_mult_test_cleanup; } input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15); input2 = (int8_t *) (((uint32_t) input2_orig + 15) & ~15); if (itr == 4 || itr == 5) { input2 = input2_orig; // unaligned input } out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input1[i] = rand() % 256 - 128; input2[i] = rand() % 256 - 128; } if (itr == 0) { /* enable 
profiler */ profile_c_start(); } /* C function */ esp_nn_mul_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, out_data_c, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { profile_c_end(); profile_opt_start(); } /* Optimized function */ esp_nn_mul_elementwise_s8(input1, input2, input1_offset, input2_offset, out_data_opt, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { /* disable profiler */ profile_opt_end(); } bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); printf("Output: \n"); PRINT_ARRAY_HEX(out_data_opt, size, 1); printf("Expected: \n"); PRINT_ARRAY_HEX(out_data_c, size, 1); printf("Input1:\n"); PRINT_ARRAY_HEX(input1, size, 1); printf("Input2:\n"); PRINT_ARRAY_HEX(input2, size, 1); goto elementwise_mult_test_cleanup; } printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); elementwise_mult_test_cleanup: if (input1_orig) { free(input1_orig); } if (input2_orig) { free(input2_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } } void esp_nn_mul_broadcast_channel_s8_test() { int total_spatial = 49; /* 7x7 feature map */ int channels = 64; int8_t *input1; int8_t *input2_per_ch; int8_t *out_data_c; int8_t *out_data_opt; int8_t *input1_orig = NULL; int8_t *input2_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; int32_t input1_offset = 34; int32_t input2_offset = 35; int32_t output_offset = 36; int32_t output_shift = -7; int32_t output_mult = MULT_MAX; int32_t activation_min = -128; int32_t activation_max = 127; for (int itr = 0; itr < 10; itr++) { switch (itr) { case 0: // all zeros input1_offset = 0; input2_offset = 0; output_offset = 0; output_mult = 0; output_shift = 0; total_spatial = 49; channels = 64; break; case 1: // hit min input1_offset = -127; input2_offset = 
-127; output_offset = -128; output_mult = MULT_MIN; output_shift = 0; break; case 2: // hit max input1_offset = 128; input2_offset = 128; output_offset = -127; output_mult = MULT_MAX; output_shift = SHIFT_MIN; break; case 3: // small channels (leftover only, no SIMD) input1_offset = 64; input2_offset = 32; output_offset = -10; output_mult = MULT_MAX / 2; output_shift = -5; total_spatial = 16; channels = 5; break; case 4: // unaligned channels (SIMD + leftover) total_spatial = 14; channels = 19; break; case 5: // typical SE-block: 7x7 spatial, 96 channels input1_offset = 128; input2_offset = 128; output_offset = -128; output_mult = 1705397815; output_shift = -3; total_spatial = 49; channels = 96; break; default: // random input1_offset = rand() % 256 - 127; input2_offset = rand() % 256 - 127; output_offset = rand() % 256 - 128; output_mult = MULT_MAX / 2 + rand() % INT16_MAX; output_shift = -8 + rand() % 4; total_spatial = 4 + rand() % 64; channels = 8 + rand() % 128; } int size = total_spatial * channels; input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(channels + 16); out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); goto broadcast_mul_test_cleanup; } input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15); input2_per_ch = (int8_t *) (((uint32_t) input2_orig + 15) & ~15); out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); if (itr == 4) { input1 = input1_orig; // unaligned input } for (int i = 0; i < size; ++i) { input1[i] = rand() % 256 - 128; } for (int i = 0; i < channels; ++i) { input2_per_ch[i] = rand() % 256 - 128; } if (itr == 0) { profile_c_start(); } /* C reference */ 
esp_nn_mul_broadcast_channel_s8_ansi(input1, input2_per_ch, input1_offset, input2_offset, out_data_c, output_offset, output_mult, output_shift, activation_min, activation_max, total_spatial, channels); if (itr == 0) { profile_c_end(); profile_opt_start(); } /* Optimized function */ esp_nn_mul_broadcast_channel_s8(input1, input2_per_ch, input1_offset, input2_offset, out_data_opt, output_offset, output_mult, output_shift, activation_min, activation_max, total_spatial, channels); if (itr == 0) { profile_opt_end(); } bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); printf("spatial=%d channels=%d size=%d\n", total_spatial, channels, size); for (int idx = 0; idx < size; idx++) { if (out_data_c[idx] != out_data_opt[idx]) { printf("first mismatch at idx=%d (row=%d ch=%d): got %02x exp %02x\n", idx, idx / channels, idx % channels, (uint8_t)out_data_opt[idx], (uint8_t)out_data_c[idx]); // print 8 more mismatches int cnt = 0; for (int j = idx + 1; j < size && cnt < 8; j++) { if (out_data_c[j] != out_data_opt[j]) { printf(" mismatch at idx=%d (row=%d ch=%d): got %02x exp %02x\n", j, j / channels, j % channels, (uint8_t)out_data_opt[j], (uint8_t)out_data_c[j]); cnt++; } } break; } } goto broadcast_mul_test_cleanup; } printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); broadcast_mul_test_cleanup: if (input1_orig) { free(input1_orig); } if (input2_orig) { free(input2_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } } ================================================ FILE: tests/src/convolution_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" void esp_nn_depthwise_conv_s8_test() { uint32_t total_c = 0, 
total_opt = 0; int8_t *input = NULL, *filter_data = NULL; int8_t *out_data_c = NULL, *out_data_opt = NULL; int32_t *bias = NULL; int32_t input_offset = 5; /* some number in [-128, 127] */ int32_t out_offset = 7; int32_t activation_min = -125; int32_t activation_max = 120; void *scratch_buf = NULL; /* independent variables */ int input_wd, input_ht, channels; uint16_t filter_ht, filter_wd, ch_mult, out_wd, out_ht; uint16_t pad_wd, pad_ht, stride_wd, stride_ht; printf("\n######## Running %s ##########\n", __FUNCTION__); // run for 17 iterations for (int itr = 0; itr < 17; itr++) { /* prepare data */ switch (itr) { case 0: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) input_wd = 18; input_ht = 18; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 1: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (1,1) input_wd = 10; input_ht = 10; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 2: // (ch_mult 1, (channels % 8) = 0), filter (3,3), pad (1,1) input_wd = 10; input_ht = 10; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 24; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 3: // other filter sizes (ch_mult 1, (channels % 8) = 0) input_wd = 10; input_ht = 10; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 24; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 4: // other filter sizes (ch_mult 8 = 0) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 8; channels = 4; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 5: // other filter sizes (ch_mult 8 = 0) input_wd = 12; input_ht = 12; filter_ht = 5; filter_wd = 5; ch_mult = 8; channels = 4; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 6: // other filter sizes (ch_mult 4 = 0) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 4; channels = 
4; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 7: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 8: // same as case 7, with large parameters (reduced for non-PSRAM boards) input_wd = 28; input_ht = 28; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 64; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 9: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 15: // ch=8, 3x3, pad=1 (person_detection model layer, ch<12 path) input_wd = 48; input_ht = 48; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 8; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 16: // ch=8, 3x3, pad=0, stride=2 (another ch<12 variant) input_wd = 12; input_ht = 12; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 8; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; default: input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; stride_wd = rand() % 2 + 1; stride_ht = stride_wd; pad_wd = stride_wd == 1 ? 
0 : rand() % 2;
                pad_ht = pad_wd;
                break;
        }

        /* Derive output dims from the test parameters:
         * with padding the strided input size is kept ("same"-style),
         * without padding it shrinks by the filter extent ("valid"). */
        if (pad_wd) {
            out_wd = (input_wd + stride_wd - 1) / stride_wd;
        } else {
            out_wd = (input_wd + stride_wd - filter_wd) / stride_wd;
        }
        if (pad_ht) {
            out_ht = (input_ht + stride_ht - 1) / stride_ht;
        } else {
            out_ht = (input_ht + stride_ht - filter_ht) / stride_ht;
        }

        int in_size = input_wd * input_ht * channels;
        int out_size = out_wd * out_ht * channels * ch_mult;
        /* +4 so the filter can be handed to the kernels at a 4-byte offset
         * below (filter_data + 4), exercising unaligned-filter handling */
        int filter_size = filter_wd * filter_ht * channels * ch_mult + 4;
        /* +1 entry: bias is passed as (bias + 1) to exercise unalignment */
        int bias_size = channels * ch_mult + 1;
        /* VLAs sized by the per-iteration test parameters */
        int32_t out_shift[channels * ch_mult];
        int32_t out_mult[channels * ch_mult];

        /* Over-allocate by 16 so working pointers can be aligned up below */
        int8_t *input_orig = ESP_NN_TEST_ALLOC(in_size + 16);
        int8_t *out_c_orig = ESP_NN_TEST_ALLOC(out_size + 16);
        int8_t *out_opt_orig = ESP_NN_TEST_ALLOC(out_size + 16);
        filter_data = ESP_NN_TEST_ALLOC(filter_size);
        bias = ESP_NN_TEST_ALLOC(bias_size * 4);
        if (bias == NULL || input_orig == NULL || filter_data == NULL || out_c_orig == NULL || out_opt_orig == NULL) {
            printf(ANSI_COLOR_RED"[%d] allocations failed\n"ANSI_COLOR_RESET, itr);
            goto dc_s8_cleanup;
        }

        /* Round pointers up to a 16-byte boundary (the allocations above
         * reserved the extra slack); presumably matches the alignment the
         * optimized kernels expect — the *_orig pointers are kept for free() */
        input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);
        out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);
        out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);

        /* Generate input data (non-negative range 0..127 here) */
        for (int i = 0; i < in_size; ++i) {
            input[i] = rand() % 128;
        }

        /* Generate filter data, full int8 range */
        for (int i = 0; i < filter_size; ++i) {
            filter_data[i] = rand() % 256 - 128;
        }

        /* Generate bias and per-channel quantization data */
        for (int i = 0; i < channels * ch_mult; ++i) {
            bias[i + 1] = rand() % INT16_MAX; // 0th index left for unalignment
            out_shift[i] = -8 + rand() % 3;
            out_mult[i] = 0x7eb0e200 + rand() % 50;
        }

        /* NOTE(review): the trailing positional `1` initializes the struct
         * member that follows .channels — TODO confirm against data_dims_t */
        data_dims_t input_dims = {.width = input_wd, .height = input_ht, .channels = channels, 1};
        data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = channels * ch_mult, 1};
        data_dims_t filter_dims = {.width = filter_wd,
.height = filter_ht, 0, 0}; dw_conv_params_t conv_params = {.in_offset = input_offset, .out_offset = out_offset, .ch_mult = ch_mult, .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht}, .dilation = {0, 0}, .activation = {activation_min, activation_max}}; quant_data_t quant_data = {.shift = out_shift, .mult = out_mult}; int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size(&input_dims, &filter_dims, &output_dims, &conv_params); if (scratch_buf_size > 0) { scratch_buf = ESP_NN_TEST_ALLOC(scratch_buf_size + 16); if (scratch_buf == NULL) { printf(ANSI_COLOR_RED"[%d] scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, itr, scratch_buf_size); goto dc_s8_cleanup; } int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); esp_nn_set_depthwise_conv_scratch_buf(scratch_buf + align_sz); } /* enable profiler */ profile_c_start(); /* C function */ esp_nn_depthwise_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data + 4, bias + 1, &output_dims, out_data_c, &conv_params, &quant_data); total_c = profile_c_end(); profile_opt_start(); /* Optimized function */ esp_nn_depthwise_conv_s8(&input_dims, input, &filter_dims, filter_data + 4, bias + 1, &output_dims, out_data_opt, &conv_params, &quant_data); /* disable profiler */ total_opt = profile_opt_end(); bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"[%3d] failed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d), filter: (%d, %d,%3d), ch_mult %d]\n"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, filter_wd, filter_ht, channels, ch_mult); #if 0 printf("Output: \n"); PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht); printf("Expected: \n"); PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht); printf("Input:\n"); PRINT_ARRAY_HEX(input, in_size / input_ht, input_ht); printf("Filter data:\n"); PRINT_ARRAY_HEX(filter_data + 4, (filter_size - 4) / filter_ht, filter_ht); printf("bias data:\n"); PRINT_ARRAY_INT(bias + 1, ch_mult * 
channels, 1); #endif goto dc_s8_cleanup; } printf(ANSI_COLOR_GREEN"[%3d] passed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d), filter: (%d, %d,%3d), ch_mult %d]"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, filter_wd, filter_ht, channels, ch_mult); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); dc_s8_cleanup: if (input_orig) { free(input_orig); } if (filter_data) { free(filter_data); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } if (bias) { free(bias); } if (scratch_buf) { free(scratch_buf); } } } void esp_nn_conv_s8_test() { uint32_t total_c = 0, total_opt = 0; int32_t input_offset = 5; /* some number in [-128, 127] */ int32_t activation_min = -125; int32_t activation_max = 122; int32_t out_offset = 3; void *scratch_buf = NULL; int8_t *input_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; int8_t *filter_data = NULL; int32_t *bias = NULL; int32_t *out_shift = NULL; int32_t *out_mult = NULL; /* independent variable */ int in_wd, in_ht, in_channels, out_channels; uint16_t filter_ht, filter_wd, out_wd, out_ht; uint16_t pad_wd, pad_ht, stride_wd, stride_ht; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int itr = 0; itr < 18; itr++) { /* Reset quant params to defaults each iteration */ input_offset = 5; out_offset = 3; activation_min = -125; activation_max = 122; switch (itr) { case 0: // ch % 8 == 0 && filter (1,1), padding (0,0) in_wd = 10; in_ht = 10; in_channels = 64; out_channels = 64; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 1: // ch % 4 == 0 && (in_wd * in_ht) % 16 == 0 in_wd = 4; in_ht = 4; in_channels = 20; out_channels = 8; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 2: // ch, filter (3x3x3) in_wd = 10; in_ht = 10; in_channels = 3; out_channels = 64; filter_ht = 3; filter_wd = 3; pad_wd = 0; pad_ht = 0; stride_wd = 1; 
stride_ht = 1; break; case 3: // remaining pad (0, 0) in_wd = 10; in_ht = 10; in_channels = 3; out_channels = 64; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 4: // unopt case in_wd = 10; in_ht = 10; in_channels = 12; out_channels = 64; filter_ht = 3; filter_wd = 3; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 5: // ch % 8 == 0 & stride (2,2) in_wd = 16; in_ht = 16; in_channels = 16; out_channels = 16; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 6: // ch % 8 == 0 && filter (1,1), padding (0,0) in_wd = 2; in_ht = 2; in_channels = 8; out_channels = 8; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 7: // ch == 3, pad (0, 0) in_wd = 112; in_ht = 112; in_channels = 3; out_channels = 16; filter_ht = 6; filter_wd = 6; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 8: // ch == 5, remaining pad (0, 0) in_wd = 8; in_ht = 8; in_channels = 5; out_channels = 16; filter_ht = 6; filter_wd = 6; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 9: // in_wd = 3; in_ht = 3; in_channels = 32; out_channels = 1; filter_ht = 3; filter_wd = 3; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 10: // needs right and bottom padding in_wd = 4; in_ht = 8; in_channels = 1; out_channels = 3; filter_ht = 3; filter_wd = 3; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 11: // needs right and bottom padding in_wd = 4; in_ht = 8; in_channels = 3; out_channels = 4; filter_ht = 3; filter_wd = 3; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 15: // 1x1 conv, large spatial, YOLO-like quant params in_wd = 48; in_ht = 48; in_channels = 32; out_channels = 32; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; // Override quant params to match YOLO Op[8] input_offset = 127; out_offset = 39; break; case 16: // 1x1, 
YOLO exact data: 48x48x32->32 with real filter/bias/quant in_wd = 48; in_ht = 48; in_channels = 32; out_channels = 32; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; input_offset = 127; out_offset = 39; activation_min = -128; activation_max = 127; break; case 17: // 1x1 conv with DELIBERATELY UNALIGNED filter + small out_shift // Tests both alignment (filter+5) AND transpose correctness (shift=-6 won't mask 8x error) in_wd = 24; in_ht = 24; in_channels = 32; out_channels = 32; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; input_offset = 110; /* typical YOLO value that exposed the bug */ out_offset = 39; activation_min = -128; activation_max = 127; break; default: // ch % 8 == 0 in_wd = 8; in_ht = 8; in_channels = 16; out_channels = 16; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; } int8_t *filter_data_orig_save = NULL; /* for case 17 unaligned filter restore */ /* prepare data */ if (pad_wd) { out_wd = (in_wd + stride_wd - 1) / stride_wd; } else { out_wd = (in_wd + stride_wd - filter_wd) / stride_wd; } if (pad_ht) { out_ht = (in_ht + stride_ht - 1) / stride_ht; } else { out_ht = (in_ht + stride_ht - filter_ht) / stride_ht; } int in_size = in_wd * in_ht * in_channels; int filter_size = filter_wd * filter_ht * in_channels * out_channels + 2; int out_size = out_wd * out_ht * out_channels; input_orig = ESP_NN_TEST_ALLOC(in_size + 16); out_c_orig = ESP_NN_TEST_ALLOC(out_size + 16); out_opt_orig = ESP_NN_TEST_ALLOC(out_size + 16); filter_data = ESP_NN_TEST_ALLOC(filter_size + 16); bias = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels); out_shift = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels); out_mult = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels); if (input_orig == NULL || filter_data == NULL || out_c_orig == NULL || out_opt_orig == NULL || bias == NULL || out_shift == NULL || out_mult == NULL) { 
printf(ANSI_COLOR_RED"[%3d] alloc failed (in=%d filter=%d out=%d)\n"ANSI_COLOR_RESET, itr, in_size, filter_size, out_size); goto conv_s8_cleanup; } int8_t *input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); int8_t *out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); int8_t *out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); /* Generate input data between -128 -> +127 */ for (int i = 0; i < in_size; ++i) { input[i] = rand() % 255 - 128; } /* Generate filter data between -128 -> +127 */ for (int i = 0; i < filter_size; ++i) { filter_data[i] = rand() % 256 - 128; } /* Case 17: deliberately misalign filter by 5 bytes to test alignment handling. * This reproduces the bug where ee.vld.l.64.ip ignores lower address bits. */ filter_data_orig_save = filter_data; if (itr == 17) { filter_data = filter_data + 5; /* misalign by 5 bytes (like YOLO's 0x3c05fe55) */ } /* Generate bias data */ for (int i = 0; i < out_channels; ++i) { bias[i] = (int32_t)rand() % UINT16_MAX + UINT8_MAX; } /* Shift and multiplier */ for (int i = 0; i < out_channels; ++i) { out_shift[i] = -10 + rand() % 2; out_mult[i] = 0x7f67f4f8 + rand() % 50; } /* Case 17: use small out_shift to expose transpose cross-position contamination. * out_shift=-6 (÷64) won't mask an 8x error like -10 (÷1024) would. 
*/ if (itr == 17) { for (int i = 0; i < out_channels; ++i) { out_shift[i] = -6; } } /* Case 16: override ALL data with exact YOLO Op[8] values */ if (itr == 16) { static const int8_t yolo_filter[] = { 6,127,57,21,23,8,5,109,2,15,-1,-99,14,7,-67,-59,-12,40,-90,16,-1,-3,25,7,17,-16,14,24,-53,-2,-110,-10, -6,-5,5,5,55,3,2,-6,-4,-17,0,17,-10,7,-3,-13,56,-3,-13,-83,-1,-4,-49,6,-127,1,5,1,8,-10,7,-2, 3,-1,-2,0,-29,-1,-5,-14,-2,-22,-1,-1,-9,1,-12,-18,-127,-1,-14,71,-1,0,-3,-2,-5,3,0,-4,0,-21,-1,-1, -13,-9,-20,-77,-2,-77,-20,59,127,-7,120,-51,-9,-47,50,45,11,8,17,8,112,-20,2,-50,-12,-34,-88,-14,-59,8,-29,2, -4,11,-32,-32,3,-4,5,-113,-11,2,-18,-13,-2,-7,127,8,8,7,2,6,-16,-3,1,-15,1,-5,-20,-2,13,1,24,3, -8,-7,1,-1,54,1,1,-1,7,-6,5,3,-4,-5,-2,11,-68,-3,-10,27,5,4,-61,4,-127,3,0,2,1,1,2,3, 0,-2,0,2,11,2,-3,3,1,-61,1,-5,127,2,-2,-5,8,0,27,9,2,-2,4,1,2,-1,2,-2,0,101,0,0, 1,2,-51,-1,6,-6,2,10,-7,-2,2,-19,2,-3,-115,127,12,12,6,1,0,-6,2,-22,5,-4,-18,3,1,-2,51,-2, -20,-21,-60,123,-4,127,-17,25,-80,-7,-95,-45,-7,11,49,51,14,0,-4,8,-73,-52,-5,-47,-11,-33,119,-21,-31,4,21,1, -1,4,-1,1,-2,1,-1,1,1,71,1,-3,-127,1,2,4,-1,3,46,1,2,-2,-7,-1,2,-2,1,0,-1,85,2,0, -2,-127,81,81,-5,71,18,22,80,14,83,58,14,-2,-14,36,6,29,-106,7,71,-46,-27,88,-19,-66,79,-13,77,7,66,-18, 46,17,-9,-24,3,17,-22,-4,-9,-1,-36,15,-1,-49,11,-3,5,-29,2,2,0,64,-1,-19,4,12,-4,32,9,-5,-9,-127, -30,6,105,-67,-16,-61,-45,110,-56,-15,-50,-54,-18,-37,14,36,19,1,21,22,-66,-13,2,127,-4,-52,-60,-22,92,-6,45,-7, -14,12,18,13,5,10,12,29,6,-10,2,-29,-3,-28,-3,-15,-2,39,127,14,3,-43,13,6,1,-103,9,11,4,-10,-5,-27, -88,-35,-15,7,4,-6,64,-2,-48,-3,-18,-8,-3,-71,-8,0,7,63,12,3,-7,74,0,16,1,-67,-10,-78,-9,-7,5,127, -3,7,2,4,15,2,0,9,2,127,2,-16,74,2,3,-5,9,1,29,13,2,-5,-16,1,8,-4,3,2,-2,-122,-2,-1, 8,-15,-2,3,11,-1,0,57,1,-7,2,19,-4,-2,-127,54,0,17,48,0,0,2,-2,-4,5,5,4,-5,-8,-7,-20,5, 11,39,-91,-65,11,-67,3,4,-56,-4,-66,-3,-3,5,4,8,-3,6,28,-8,-51,11,15,-106,10,23,-73,9,-127,-3,-78,8, 
8,-3,1,0,-127,0,-7,-5,-2,-17,-2,0,5,-3,-7,-5,-3,11,-10,-3,-2,-2,3,1,70,-3,2,-7,2,-17,0,-1, 5,-127,13,6,2,3,12,25,0,-3,-2,-45,0,-6,4,-6,9,-11,-19,6,0,4,-14,9,2,25,3,0,3,5,-5,3, -9,36,18,-4,-4,2,-19,-101,0,10,-9,-127,-4,-5,-37,82,-1,-20,6,-13,-2,-1,4,4,1,-11,3,-10,-27,5,-45,-3, -6,-13,6,-3,4,-1,-5,5,2,-4,-2,-5,-1,-16,-5,-1,0,-1,0,-1,-15,-127,-3,-2,0,23,-3,0,-1,2,0,16, 3,45,17,24,8,27,-5,42,25,-9,21,-47,-14,18,27,23,-4,-15,127,19,24,14,19,21,-2,39,23,9,14,-7,15,15, -127,-30,2,-10,3,-5,71,11,-16,0,-15,-18,3,-3,14,6,-1,73,-3,-1,-12,27,-2,-2,0,8,-7,-108,9,3,-6,8, 63,-47,0,-37,11,-20,-48,6,-19,-1,-18,13,-3,76,-18,15,3,-48,16,2,-4,-34,-6,3,7,-127,-7,58,1,-3,-23,108, 102,-1,0,3,11,1,-127,-7,-4,0,-2,-8,-13,-6,-6,-22,5,115,18,7,-1,-6,-4,3,-5,10,-1,-88,0,4,-1,7, 127,-5,12,-6,10,-13,-89,16,-20,1,-24,-12,-6,4,-4,-15,-3,-110,3,-6,-17,89,-10,9,13,-80,-18,105,-3,-4,3,-85, 2,7,2,-1,-25,-2,-5,-5,0,-25,-1,-6,-5,0,-14,-24,74,0,-13,-127,-1,0,-1,-1,-4,0,-1,-4,-2,-19,-8,1, -84,-1,-6,-2,-19,-4,105,-3,-2,8,-2,-32,2,-3,2,-21,1,-127,-10,5,-3,8,0,2,-5,-8,-4,85,1,10,4,2, 19,-15,0,1,95,2,-15,-2,0,-56,-3,-4,-24,-5,-2,3,-16,-6,-37,-6,-3,1,127,-1,-119,4,2,-13,0,-41,3,3, -10,-29,-13,-4,5,-9,-7,71,-6,7,-3,113,-2,0,51,-127,-10,-11,26,-3,-4,-1,0,-23,1,5,-7,-9,-20,8,-2,7, -66,-1,-1,-10,3,-31,43,9,-18,-9,-2,-22,-2,75,22,-1,5,39,-14,4,-5,-62,-2,3,-1,69,-19,-61,-17,-2,-8,-127 }; static const int8_t yolo_input[] = { -127,-65,-96,-127,-124,-100,-122,-127,-93,-122,-127,-127,-114,-91,-126,-105, -127,-127,-128,-118,-102,-127,-127,-93,-127,-126,-127,-103,-127,-124,-127,-127, -126,-63,-128,-128,-127,-127,-122,-118,-127,-126,-128,-114,-112,-122,-120,-122, -114,-127,-127,-114,-126,-118,-127,-127,-127,-124,-128,-100,-128,-124,-127,-107, -126,-63,-128,-128,-128,-126,-120,-118,-124,-126,-128,-112,-112,-122,-120,-122, -114,-127,-127,-114,-128,-120,-127,-124,-127,-124,-127,-98,-128,-124,-127,-105, -127,-62,-127,-127,-127,-128,-118,-114,-128,-126,-126,-112,-112,-124,-122,-124, 
-114,-127,-127,-114,-127,-120,-127,-128,-127,-122,-128,-100,-128,-124,-128,-105, -126,-63,-128,-127,-127,-128,-120,-116,-128,-124,-128,-112,-114,-122,-120,-124, -114,-127,-127,-112,-126,-118,-127,-127,-127,-124,-127,-98,-128,-124,-128,-105, -127,-63,-128,-128,-127,-128,-120,-114,-127,-124,-120,-112,-114,-122,-122,-124, -114,-127,-127,-114,-127,-120,-127,-127,-127,-124,-127,-98,-124,-124,-128,-107, -128,-67,-127,-126,-127,-127,-118,-112,-127,-124,-122,-111,-114,-128,-118,-127, -114,-127,-127,-114,-128,-118,-127,-127,-127,-122,-127,-102,-127,-124,-128,-102, -126,-69,-128,-128,-127,-127,-120,-112,-127,-124,-118,-111,-114,-124,-124,-126, -112,-127,-127,-116,-128,-120,-127,-127,-127,-124,-126,-105,-128,-124,-122,-107 }; static const int32_t yolo_bias[] = { 2420,1649,1302,1816,-446,1562,685,32,2503,-74,3143,463,1507,1883,-932,525, 1205,162,540,1680,1846,388,338,274,-433,502,817,1021,812,1371,-30,1525 }; static const int32_t yolo_shifts[] = { -8,-7,-6,-8,-6,-7,-8,-7,-8,-7,-9,-6,-8,-8,-7,-8,-8,-8,-7,-7,-7,-6,-8,-7,-7,-8,-8,-7,-8,-8,-8,-7 }; static const int32_t yolo_mults[] = { 0x52a119c9,0x53a7fce0,0x4430a104,0x5afd73fd,0x4a9394b6,0x5e2b6940,0x7c02c5c9,0x509cb64d, 0x5941a055,0x5d50f6be,0x60b9e0ad,0x41e9ef39,0x67d9347b,0x6b36dcc7,0x5406c784,0x70ae9dd9, 0x6a183a7f,0x78f48e0e,0x53e7df22,0x63cc6072,0x448b1623,0x4cd5d08c,0x6175e8be,0x5cd03362, 0x4de1312d,0x6c5bd16d,0x6e89094f,0x64a1947e,0x78e1060e,0x63d8179b,0x791c8d51,0x532420c2 }; memcpy(input, yolo_input, sizeof(yolo_input)); memcpy(filter_data, yolo_filter, sizeof(yolo_filter)); memcpy(bias, yolo_bias, sizeof(yolo_bias)); memcpy(out_shift, yolo_shifts, sizeof(yolo_shifts)); memcpy(out_mult, yolo_mults, sizeof(yolo_mults)); } data_dims_t input_dims = {.width = in_wd, .height = in_ht, .channels = in_channels, 1}; data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = out_channels, 1}; data_dims_t filter_dims = {.width = filter_wd, .height = filter_ht, .channels = in_channels, 1}; conv_params_t conv_params 
= {.in_offset = input_offset, .out_offset = out_offset, .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht}, .dilation = {0, 0}, .activation = {activation_min, activation_max}}; quant_data_t quant_data = {.shift = out_shift, .mult = out_mult}; int scratch_buf_size = esp_nn_get_conv_scratch_size(&input_dims, &filter_dims, &output_dims, &conv_params); if (scratch_buf_size > 0) { scratch_buf = ESP_NN_TEST_ALLOC(scratch_buf_size + 16); if (scratch_buf == NULL) { printf(ANSI_COLOR_RED"[%3d] scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, itr, scratch_buf_size); goto conv_s8_cleanup; } int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); esp_nn_set_conv_scratch_buf(scratch_buf + align_sz); } /* enable profiler */ profile_c_start(); /* C function */ esp_nn_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data, bias, &output_dims, out_data_c, &conv_params, &quant_data); total_c = profile_c_end(); profile_opt_start(); /* Optimized function */ esp_nn_conv_s8(&input_dims, input, &filter_dims, filter_data, bias, &output_dims, out_data_opt, &conv_params, &quant_data); /* disable profiler */ total_opt = profile_opt_end(); bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"[%3d] failed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d,%3d), filter: (%d, %d,%3d)]\n"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, out_channels, filter_wd, filter_ht, in_channels); goto conv_s8_cleanup; } printf(ANSI_COLOR_GREEN"[%3d] passed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d,%3d), filter: (%d, %d,%3d)]"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, out_channels, filter_wd, filter_ht, in_channels); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); conv_s8_cleanup: /* Restore original filter pointer (may have been offset for alignment test) */ filter_data = filter_data_orig_save; if (input_orig) { free(input_orig); input_orig = NULL; } if 
(filter_data) { free(filter_data); filter_data = NULL; } if (out_c_orig) { free(out_c_orig); out_c_orig = NULL; } if (out_opt_orig) { free(out_opt_orig); out_opt_orig = NULL; } if (bias) { free(bias); bias = NULL; } if (out_shift) { free(out_shift); out_shift = NULL; } if (out_mult) { free(out_mult); out_mult = NULL; } if (scratch_buf) { free(scratch_buf); scratch_buf = NULL; } } } ================================================ FILE: tests/src/fully_connected_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include "test_utils.h" void esp_nn_fully_connected_s8_test() { uint32_t total_c = 0, total_opt = 0; /* prepare data */ uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */ const int32_t max_out_ch = 16; const int32_t max_row_len = 271; uint16_t out_channels = 3; /* Use heap-allocated aligned buffers (matches TFLite real-world usage) */ int8_t *input_orig = malloc(max_row_len + 16); int8_t *filter_orig = malloc(max_row_len * max_out_ch + 16); int8_t *out_c_orig = malloc(max_out_ch + 16); int8_t *out_opt_orig = malloc(max_out_ch + 16); if (!input_orig || !filter_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); goto fc_s8_cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *filter_data = (int8_t *)(((uint32_t)filter_orig + 15) & ~15); int8_t *output_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *output_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); int32_t activation_min = -128; int32_t activation_max = 127; int32_t input_offset = 0; int32_t filter_offset = 0; int32_t out_shift = -10; int32_t out_offset = 5; int32_t out_mult = 0x59e492c4; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int itr = 0; itr < 15; itr++) { out_mult = 
INT32_MAX / row_len + rand() % INT16_MAX; /* fresh multiplier each iteration */
        /* Per-iteration test shape / quantization selection */
        switch (itr) {
            case 0:
                out_shift = -10;
                break;
            case 1:
                out_shift = SHIFT_MIN;
                break;
            case 2:
                out_shift = SHIFT_MAX;
                break;
            case 3:
                out_shift = 0;
                break;
            case 4:
                row_len = 1;
                out_channels = 16;
                out_shift = -10 + rand() % 5;
                break;
            case 5:
                row_len = 16;
                out_channels = 8;
                out_shift = -10 + rand() % 5;
                break;
            case 6:
                row_len = 8;
                out_channels = 8;
                out_shift = -10 + rand() % 5;
                break;
            case 7:
                row_len = 8;
                out_channels = 15;
                out_shift = -10 + rand() % 5;
                break;
            case 8:
                row_len = 8;
                out_channels = 1;
                out_shift = -10 + rand() % 5;
                break;
            default:
                row_len = rand() % 7 + 1;
                out_channels = 8;
                out_shift = -10 + rand() % 5;
                break;
        }
        /* NOTE(review): this override makes iteration 0 run with SHIFT_MAX,
         * shadowing the -10 assigned in case 0 above and duplicating case 2 —
         * confirm whether it is intentional or a leftover */
        if (itr == 0) {
            out_shift = SHIFT_MAX;
        }

        /* Generate input and filter data (full int8 range) */
        for (int i = 0; i < row_len; ++i) {
            input[i] = rand() % 256 - 128;
        }
        for (int i = 0; i < row_len * out_channels; ++i) {
            filter_data[i] = rand() % 256 - 128;
        }

        /* enable profiler */
        profile_c_start();
        /* C reference implementation */
        esp_nn_fully_connected_s8_ansi(input, input_offset, row_len, filter_data, filter_offset, NULL, output_c, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max);
        total_c = profile_c_end();
        profile_opt_start();
        /* Optimized function (must match the reference bit-exactly) */
        esp_nn_fully_connected_s8(input, input_offset, row_len, filter_data, filter_offset, NULL, output_opt, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max);
        /* disable profiler */
        total_opt = profile_opt_end();

        bool ret = CHECK_EQUAL(output_c, output_opt, out_channels);
        if (ret == false) {
            printf(ANSI_COLOR_RED"[%3d] failed\n"ANSI_COLOR_RESET, itr);
#if 0
            /* verbose dump of the failing iteration, disabled by default */
            printf("Output: \n");
            PRINT_ARRAY_HEX(output_opt, out_channels, 1);
            printf("Expected: \n");
            PRINT_ARRAY_HEX(output_c, out_channels, 1);
            printf("Input:\n");
            PRINT_ARRAY_HEX(input, row_len, 1);
            printf("Filter data:\n");
            PRINT_ARRAY_HEX(filter_data, row_len, out_channels);
            printf("Out shift: %d\n", out_shift);
            printf("Out mult: %x\n", out_mult);
#endif
            goto fc_s8_cleanup;
        }
printf(ANSI_COLOR_GREEN"[%3d] passed [row_len %"PRIu16", out_ch %"PRIu16"]"ANSI_COLOR_RESET, itr, row_len, out_channels); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); } fc_s8_cleanup: if (input_orig) { free(input_orig); } if (filter_orig) { free(filter_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } void esp_nn_fully_connected_per_ch_s8_test() { uint32_t total_c = 0, total_opt = 0; /* prepare data */ uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */ const int32_t max_out_ch = 16; const int32_t max_row_len = 271; uint16_t out_channels = 3; /* Use heap-allocated aligned buffers (matches TFLite real-world usage) */ int8_t *input_orig = malloc(max_row_len + 16); int8_t *filter_orig = malloc(max_row_len * max_out_ch + 16); int8_t *out_c_orig = malloc(max_out_ch + 16); int8_t *out_opt_orig = malloc(max_out_ch + 16); if (!input_orig || !filter_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); goto fc_per_ch_s8_buffers_cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *filter_data = (int8_t *)(((uint32_t)filter_orig + 15) & ~15); int8_t *output_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *output_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); int32_t activation_min = -128; int32_t activation_max = 127; int32_t input_offset = 0; int32_t filter_offset = 0; int32_t out_offset = 7; int32_t* out_mult = NULL; int32_t* out_shift = NULL; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int itr = 0; itr < 15; itr++) { int32_t out_shift_val = 0; switch (itr) { case 0: out_shift_val = -10; break; case 1: out_shift_val = SHIFT_MIN; break; case 2: out_shift_val = SHIFT_MAX; break; case 3: out_shift_val = 0; break; case 4: row_len = 1; out_channels = 16; break; case 5: row_len = 16; out_channels = 8; break; case 6: row_len = 8; out_channels = 8; break; case 7: 
row_len = 8; out_channels = 15; break; case 8: row_len = 8; out_channels = 1; break; default: row_len = rand() % 7 + 1; out_channels = 8; break; } out_mult = ESP_NN_TEST_ALLOC(out_channels * sizeof(int32_t)); out_shift = ESP_NN_TEST_ALLOC(out_channels * sizeof(int32_t)); if (out_shift == NULL || out_mult == NULL) { printf(ANSI_COLOR_RED"out_shift/out_mult allocations failed\n"ANSI_COLOR_RESET); goto fully_connected_per_ch_cleanup; } for (int i = 0; i < out_channels; i++) { out_mult[i] = INT32_MAX / row_len + rand() % INT16_MAX; if (i < 4) { out_shift[i] = out_shift_val; } else { out_shift[i] = -10 + rand() % 5; } } /* Generate input and filter data */ for (int i = 0; i < row_len; ++i) { input[i] = rand() % 256 - 128; } for (int i = 0; i < row_len * out_channels; ++i) { filter_data[i] = rand() % 256 - 128; } /* enable profiler */ profile_c_start(); /* C function */ esp_nn_fully_connected_per_ch_s8_ansi(input, input_offset, row_len, filter_data, filter_offset, NULL, output_c, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); total_c = profile_c_end(); profile_opt_start(); /* Optimized function */ esp_nn_fully_connected_per_ch_s8(input, input_offset, row_len, filter_data, filter_offset, NULL, output_opt, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); /* disable profiler */ total_opt = profile_opt_end(); bool ret = CHECK_EQUAL(output_c, output_opt, out_channels); if (ret == false) { printf(ANSI_COLOR_RED"[%3d] failed\n"ANSI_COLOR_RESET, itr); goto fully_connected_per_ch_cleanup; } printf(ANSI_COLOR_GREEN"[%3d] passed [row_len %"PRIu16", out_ch %"PRIu16"]"ANSI_COLOR_RESET, itr, row_len, out_channels); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); fully_connected_per_ch_cleanup: if (out_shift) { free(out_shift); } if (out_mult) { free(out_mult); } } fc_per_ch_s8_buffers_cleanup: if (input_orig) { free(input_orig); } if (filter_orig) { free(filter_orig); } if (out_c_orig) { 
free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } ================================================ FILE: tests/src/hard_swish_test.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include "test_utils.h" void esp_nn_hard_swish_s8_test() { /* Test with representative MobileNetV3 parameters */ const int test_sizes[] = {1, 8, 16, 32, 100, 1024, 12544}; const int num_tests = sizeof(test_sizes) / sizeof(test_sizes[0]); /* Typical quantization params from MobileNetV3 layers */ const int16_t input_zp = -128; const int16_t output_mult_fxp = 19661; /* typical value */ const int16_t reluish_mult_fxp = 22938; /* typical value */ const int16_t output_zp = -128; /* Test all three branches: exp > 0, exp < 0, exp == 0 */ int32_t reluish_exps[] = {2, -1, 0}; int32_t output_exps[] = {-1, -2, -1}; printf("\n######## Running %s ##########\n", __FUNCTION__); /* Set up scratch buffer for LUT-based optimization */ int32_t scratch_size = esp_nn_get_hard_swish_scratch_size(); void *scratch_buf = NULL; if (scratch_size > 0) { scratch_buf = malloc(scratch_size); if (scratch_buf) { esp_nn_set_hard_swish_scratch_buf(scratch_buf); } } for (int t = 0; t < num_tests; t++) { int size = test_sizes[t]; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(size + 16); int8_t *out_opt_orig = malloc(size + 16); if (!input_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"hard_swish [%d] alloc failed\n"ANSI_COLOR_RESET, t); goto cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *out_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *out_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); for (int i = 0; i < size; i++) { input[i] = rand() % 256 - 128; } for (int exp_idx = 0; exp_idx < 3; exp_idx++) { /* ANSI C reference */ profile_c_start(); 
esp_nn_hard_swish_s8_ansi(input, out_c, size, input_zp, output_mult_fxp, reluish_mult_fxp, reluish_exps[exp_idx], output_exps[exp_idx], output_zp); profile_c_end(); /* Optimized */ profile_opt_start(); esp_nn_hard_swish_s8(input, out_opt, size, input_zp, output_mult_fxp, reluish_mult_fxp, reluish_exps[exp_idx], output_exps[exp_idx], output_zp); profile_opt_end(); bool ret = CHECK_EQUAL(out_c, out_opt, size); if (!ret) { printf(ANSI_COLOR_RED"hard_swish [size=%d, exp=%d] failed\n"ANSI_COLOR_RESET, size, (int)reluish_exps[exp_idx]); goto cleanup; } } printf(ANSI_COLOR_GREEN"hard_swish [%2d] passed [size %d]\n"ANSI_COLOR_RESET, t, size); cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } if (scratch_buf) free(scratch_buf); } ================================================ FILE: tests/src/mean_test.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" void esp_nn_mean_nhwc_s8_test() { /* Test dimensions matching MobileNetV3 SE blocks */ struct { int height, width, channels; } test_cases[] = { {7, 7, 16}, /* small SE block */ {7, 7, 72}, /* medium SE block */ {14, 14, 40}, /* larger spatial */ {14, 14, 120}, /* larger channels */ {28, 28, 24}, /* early layer SE */ {1, 1, 576}, /* degenerate 1x1 */ {3, 3, 96}, /* small spatial */ }; const int num_tests = sizeof(test_cases) / sizeof(test_cases[0]); const int32_t input_zp = -128; const int32_t output_zp = -128; const int32_t multiplier = 1073741824; /* typical */ const int32_t shift = -1; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int t = 0; t < num_tests; t++) { int h = test_cases[t].height; int w = test_cases[t].width; int c = test_cases[t].channels; int input_size = h * w * c; int8_t *input_orig = malloc(input_size + 16); int8_t 
*out_c_orig = malloc(c + 16); int8_t *out_opt_orig = malloc(c + 16); if (!input_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"mean [%d] alloc failed\n"ANSI_COLOR_RESET, t); goto cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *out_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *out_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); for (int i = 0; i < input_size; i++) { input[i] = rand() % 256 - 128; } /* ANSI C reference */ profile_c_start(); esp_nn_mean_nhwc_s8_ansi(input, out_c, h, w, c, input_zp, output_zp, multiplier, shift); profile_c_end(); /* Optimized */ profile_opt_start(); esp_nn_mean_nhwc_s8(input, out_opt, h, w, c, input_zp, output_zp, multiplier, shift); profile_opt_end(); bool ret = CHECK_EQUAL(out_c, out_opt, c); if (!ret) { printf(ANSI_COLOR_RED"mean [%d] failed [%dx%dx%d]\n"ANSI_COLOR_RESET, t, h, w, c); goto cleanup; } printf(ANSI_COLOR_GREEN"mean [%2d] passed [%dx%dx%d]\n"ANSI_COLOR_RESET, t, h, w, c); cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } } ================================================ FILE: tests/src/pooling_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" static void run_avg_pool_test(uint16_t input_wd, uint16_t input_ht, uint16_t channels, uint16_t filter_wd, uint16_t filter_ht, uint16_t stride_wd, uint16_t stride_ht, uint16_t pad_wd, uint16_t pad_ht, int iter) { const int32_t activation_min = -128; const int32_t activation_max = 127; const uint16_t out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1; const uint16_t out_ht = (input_ht + 2 * pad_ht - filter_ht) / stride_ht + 1; const int size = input_wd * input_ht * channels; const int out_size = out_wd * out_ht * channels; int8_t 
*input = NULL, *output_c = NULL, *output_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(out_size + 16); int8_t *out_opt_orig = malloc(out_size + 16); if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"avg_pool [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto avg_pool_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); output_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); output_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 256 - 128; } profile_c_start(); esp_nn_avg_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); profile_c_end(); profile_opt_start(); esp_nn_avg_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); profile_opt_end(); bool ret = CHECK_EQUAL(output_c, output_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"avg_pool [%d] failed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); goto avg_pool_cleanup; } printf(ANSI_COLOR_GREEN"avg_pool [%2d] passed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); avg_pool_cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } void esp_nn_avg_pool_s8_test() { int iter = 0; /* Original test case */ run_avg_pool_test(16, 16, 16, 3, 3, 1, 1, 1, 1, iter++); /* Varying channel counts */ run_avg_pool_test(16, 16, 4, 3, 3, 1, 1, 1, 1, iter++); run_avg_pool_test(16, 16, 8, 3, 3, 1, 1, 1, 1, iter++); run_avg_pool_test(16, 16, 32, 3, 3, 1, 1, 1, 1, 
iter++); run_avg_pool_test(16, 16, 64, 3, 3, 1, 1, 1, 1, iter++); /* Note: non-multiple-of-4 channels not supported by S3 optimized path */ /* Different filter sizes */ run_avg_pool_test(16, 16, 16, 1, 1, 1, 1, 0, 0, iter++); run_avg_pool_test(16, 16, 16, 2, 2, 1, 1, 0, 0, iter++); run_avg_pool_test(16, 16, 16, 5, 5, 1, 1, 2, 2, iter++); /* Stride > 1 */ run_avg_pool_test(16, 16, 16, 3, 3, 2, 2, 1, 1, iter++); run_avg_pool_test(24, 24, 32, 3, 3, 2, 2, 1, 1, iter++); /* Person detection final pooling: 6x6x128, filter 6x6 */ run_avg_pool_test(6, 6, 128, 6, 6, 1, 1, 0, 0, iter++); /* No padding */ run_avg_pool_test(16, 16, 16, 3, 3, 1, 1, 0, 0, iter++); } static void run_max_pool_test(uint16_t input_wd, uint16_t input_ht, uint16_t channels, uint16_t filter_wd, uint16_t filter_ht, uint16_t stride_wd, uint16_t stride_ht, uint16_t pad_wd, uint16_t pad_ht, int iter) { const int32_t activation_min = -128; const int32_t activation_max = 127; const uint16_t out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1; const uint16_t out_ht = (input_ht + 2 * pad_ht - filter_ht) / stride_ht + 1; const int size = input_wd * input_ht * channels; const int out_size = out_wd * out_ht * channels; int8_t *input = NULL, *output_c = NULL, *output_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(out_size + 16); int8_t *out_opt_orig = malloc(out_size + 16); if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"max_pool [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto max_pool_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); output_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); output_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 256 - 128; } profile_c_start(); esp_nn_max_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, 
activation_max, channels); profile_c_end(); profile_opt_start(); esp_nn_max_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); profile_opt_end(); bool ret = CHECK_EQUAL(output_c, output_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"max_pool [%d] failed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); goto max_pool_cleanup; } printf(ANSI_COLOR_GREEN"max_pool [%2d] passed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); max_pool_cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } void esp_nn_max_pool_s8_test() { int iter = 0; /* Original test case */ run_max_pool_test(16, 16, 16, 3, 3, 1, 1, 1, 1, iter++); /* Varying channel counts */ run_max_pool_test(16, 16, 4, 3, 3, 1, 1, 1, 1, iter++); run_max_pool_test(16, 16, 8, 3, 3, 1, 1, 1, 1, iter++); run_max_pool_test(16, 16, 32, 3, 3, 1, 1, 1, 1, iter++); run_max_pool_test(16, 16, 64, 3, 3, 1, 1, 1, 1, iter++); /* Note: non-multiple-of-4 channels not supported by S3 optimized path */ /* Different filter sizes */ run_max_pool_test(16, 16, 16, 1, 1, 1, 1, 0, 0, iter++); run_max_pool_test(16, 16, 16, 2, 2, 1, 1, 0, 0, iter++); run_max_pool_test(16, 16, 16, 5, 5, 1, 1, 2, 2, iter++); /* Stride > 1 */ run_max_pool_test(16, 16, 16, 3, 3, 2, 2, 1, 1, iter++); run_max_pool_test(24, 24, 32, 3, 3, 2, 2, 1, 1, iter++); /* Person detection final pooling-like: 6x6x128 */ run_max_pool_test(6, 6, 128, 6, 6, 1, 1, 0, 0, iter++); /* No padding */ run_max_pool_test(16, 16, 16, 3, 3, 1, 1, 0, 0, iter++); } ================================================ FILE: tests/src/relu_test.c ================================================ /* * 
SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include "test_utils.h" static void run_relu6_test(int size, int iter) { int8_t *input = NULL, *inout_ansi = NULL, *inout_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *inout_c_orig = malloc(size + 16); int8_t *inout_opt_orig = malloc(size + 16); if (input_orig == NULL || inout_c_orig == NULL || inout_opt_orig == NULL) { printf(ANSI_COLOR_RED"relu6 [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto relu6_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); inout_ansi = (int8_t *) (((uint32_t) inout_c_orig + 15) & ~15); inout_opt = (int8_t *) (((uint32_t) inout_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 255 - 128; inout_ansi[i] = input[i]; inout_opt[i] = input[i]; } profile_c_start(); esp_nn_relu6_s8_ansi(inout_ansi, size); profile_c_end(); profile_opt_start(); esp_nn_relu6_s8(inout_opt, size); profile_opt_end(); bool ret = CHECK_EQUAL(inout_ansi, inout_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"relu6 [%d] failed [size %d]\n"ANSI_COLOR_RESET, iter, size); goto relu6_cleanup; } printf(ANSI_COLOR_GREEN"relu6 [%2d] passed [size %d]\n"ANSI_COLOR_RESET, iter, size); relu6_cleanup: if (input_orig) free(input_orig); if (inout_c_orig) free(inout_c_orig); if (inout_opt_orig) free(inout_opt_orig); } void esp_nn_relu6_s8_test() { int iter = 0; /* Original test case: odd size with leftover */ run_relu6_test(1600 + 8 + 7, iter++); /* Very small sizes (< 8 elements, below SIMD width) */ run_relu6_test(1, iter++); run_relu6_test(3, iter++); run_relu6_test(7, iter++); /* Between 8 and 16 (partial SIMD) */ run_relu6_test(8, iter++); run_relu6_test(12, iter++); run_relu6_test(15, iter++); /* Exact multiple of 16 (full SIMD, no leftover) */ run_relu6_test(16, iter++); run_relu6_test(32, iter++); run_relu6_test(256, iter++); /* Non-aligned sizes 
*/ run_relu6_test(17, iter++); run_relu6_test(33, iter++); run_relu6_test(100, iter++); } ================================================ FILE: tests/src/softmax_test.c ================================================ /* * SPDX-FileCopyrightText: 2022-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" static void run_softmax_test(int32_t height, int32_t width, int32_t mult, int32_t shift, int32_t diff_min, int iter) { void *scratch_buf = NULL, *scratch_buf_orig = NULL; const int size = width * height; int8_t *input = NULL, *out_ansi = NULL, *out_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(size + 16); int8_t *out_opt_orig = malloc(size + 16); if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"softmax [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto softmax_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); out_ansi = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); out_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 255 - 128; } profile_c_start(); esp_nn_softmax_s8_ansi(input, height, width, mult, shift, diff_min, out_ansi); profile_c_end(); int32_t scratch_buf_size = esp_nn_get_softmax_scratch_size(width, height); if (scratch_buf_size) { scratch_buf_orig = malloc(scratch_buf_size * 4 + 16); if (scratch_buf_orig == NULL) { printf(ANSI_COLOR_RED"softmax [%d] scratch alloc failed size %"PRIi32"\n"ANSI_COLOR_RESET, iter, scratch_buf_size); goto softmax_cleanup; } scratch_buf = (void *)(((uint32_t) scratch_buf_orig + 15) & ~15); esp_nn_set_softmax_scratch_buf(scratch_buf); } profile_opt_start(); esp_nn_softmax_s8(input, height, width, mult, shift, diff_min, out_opt); profile_opt_end(); bool ret = CHECK_EQUAL(out_ansi, out_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"softmax [%d] 
failed [h %"PRIi32", w %"PRIi32", mult %"PRIi32", shift %"PRIi32", diff_min %"PRIi32"]\n"ANSI_COLOR_RESET, iter, height, width, mult, shift, diff_min); printf("Output: \n"); PRINT_ARRAY_HEX(out_opt, width, height); printf("Expected: \n"); PRINT_ARRAY_HEX(out_ansi, width, height); goto softmax_cleanup; } printf(ANSI_COLOR_GREEN"softmax [%2d] passed [h %"PRIi32", w %"PRIi32", mult %"PRIi32", shift %"PRIi32"]\n"ANSI_COLOR_RESET, iter, height, width, mult, shift); softmax_cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); if (scratch_buf_orig) free(scratch_buf_orig); } void esp_nn_softmax_s8_test() { int iter = 0; /* Original test case */ run_softmax_test(8, 32, INT32_MAX / 2, 7, -128, iter++); /* Small output classes (person_detection: 2, micro_speech: 4) */ run_softmax_test(1, 2, INT32_MAX / 2, 7, -128, iter++); run_softmax_test(1, 4, INT32_MAX / 2, 7, -128, iter++); /* Single element (degenerate) */ run_softmax_test(1, 1, INT32_MAX / 2, 7, -128, iter++); /* Medium width */ run_softmax_test(1, 10, INT32_MAX / 2, 7, -128, iter++); run_softmax_test(4, 10, INT32_MAX / 2, 7, -128, iter++); /* Large width (ImageNet-class) */ run_softmax_test(1, 1000, INT32_MAX / 2, 7, -128, iter++); /* Large height */ run_softmax_test(64, 32, INT32_MAX / 2, 7, -128, iter++); /* Varying diff_min */ run_softmax_test(8, 32, INT32_MAX / 2, 7, -64, iter++); run_softmax_test(8, 32, INT32_MAX / 2, 7, -32, iter++); run_softmax_test(8, 32, INT32_MAX / 2, 7, 0, iter++); /* Varying multiplier and shift */ run_softmax_test(8, 32, INT32_MAX / 4, 5, -128, iter++); run_softmax_test(8, 32, INT32_MAX, 10, -128, iter++); /* Odd width (non-aligned) */ run_softmax_test(8, 17, INT32_MAX / 2, 7, -128, iter++); run_softmax_test(8, 3, INT32_MAX / 2, 7, -128, iter++); }