Showing preview only (841K chars total). Download the full file or copy to clipboard to get everything.
Repository: espressif/esp-nn
Branch: master
Commit: d45b843ca5f8
Files: 99
Total size: 801.9 KB
Directory structure:
gitextract__zjpraf8/
├── .github/
│ └── workflows/
│ └── upload_component.yml
├── .gitignore
├── .gitlab-ci.yml
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Kconfig.projbuild
├── LICENSE
├── README.md
├── idf_component.yml
├── include/
│ ├── esp_nn.h
│ ├── esp_nn_ansi_c.h
│ ├── esp_nn_ansi_headers.h
│ ├── esp_nn_defs.h
│ ├── esp_nn_esp32p4.h
│ ├── esp_nn_esp32s3.h
│ └── esp_nn_generic_opt.h
├── src/
│ ├── activation_functions/
│ │ ├── esp_nn_hard_swish_ansi.c
│ │ ├── esp_nn_hard_swish_s8_esp32p4.c
│ │ ├── esp_nn_hard_swish_s8_esp32s3.c
│ │ ├── esp_nn_relu_ansi.c
│ │ ├── esp_nn_relu_s8_esp32p4.c
│ │ └── esp_nn_relu_s8_esp32s3.S
│ ├── basic_math/
│ │ ├── esp_nn_add_ansi.c
│ │ ├── esp_nn_add_s8_esp32p4.c
│ │ ├── esp_nn_add_s8_esp32s3.S
│ │ ├── esp_nn_mul_ansi.c
│ │ ├── esp_nn_mul_broadcast_s8_esp32s3.S
│ │ ├── esp_nn_mul_s8_esp32p4.c
│ │ └── esp_nn_mul_s8_esp32s3.S
│ ├── common/
│ │ ├── common_functions.h
│ │ ├── esp_nn_common_functions_esp32s3.S
│ │ ├── esp_nn_dot_s8_esp32s3.S
│ │ ├── esp_nn_mean_ansi.c
│ │ ├── esp_nn_mean_s8_esp32p4.c
│ │ ├── esp_nn_mean_s8_esp32s3.c
│ │ ├── esp_nn_multiply_by_quantized_mult_esp32p4.S
│ │ ├── esp_nn_multiply_by_quantized_mult_esp32s3.S
│ │ └── esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S
│ ├── convolution/
│ │ ├── esp_nn_conv_ansi.c
│ │ ├── esp_nn_conv_esp32p4.c
│ │ ├── esp_nn_conv_esp32s3.c
│ │ ├── esp_nn_conv_opt.c
│ │ ├── esp_nn_conv_s16_mult4_1x1_esp32s3.S
│ │ ├── esp_nn_conv_s16_mult8_esp32s3.S
│ │ ├── esp_nn_conv_s8_1x1_esp32s3.c
│ │ ├── esp_nn_conv_s8_3x3_opt_esp32s3.c
│ │ ├── esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S
│ │ ├── esp_nn_conv_s8_mult8_1x1_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_ansi.c
│ │ ├── esp_nn_depthwise_conv_esp32p4.c
│ │ ├── esp_nn_depthwise_conv_opt.c
│ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult1_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult4_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult8_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s8_esp32s3.c
│ │ └── esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S
│ ├── fully_connected/
│ │ ├── esp_nn_fc_s8_mac16_esp32s3.S
│ │ ├── esp_nn_fully_connected_ansi.c
│ │ ├── esp_nn_fully_connected_esp32s3.c
│ │ ├── esp_nn_fully_connected_per_ch_s8_esp32s3.S
│ │ ├── esp_nn_fully_connected_s8_esp32p4.c
│ │ └── esp_nn_fully_connected_s8_esp32s3.S
│ ├── logistic/
│ │ └── esp_nn_logistic_ansi.c
│ ├── pooling/
│ │ ├── esp_nn_avg_pool_ansi.c
│ │ ├── esp_nn_avg_pool_s8_esp32p4.c
│ │ ├── esp_nn_avg_pool_s8_esp32s3.S
│ │ ├── esp_nn_avg_pool_s8_esp32s3.c
│ │ ├── esp_nn_max_pool_ansi.c
│ │ ├── esp_nn_max_pool_s8_esp32p4.c
│ │ └── esp_nn_max_pool_s8_esp32s3.S
│ └── softmax/
│ ├── esp_nn_softmax_ansi.c
│ ├── esp_nn_softmax_opt.c
│ ├── esp_nn_softmax_s8_esp32p4.c
│ ├── esp_nn_softmax_s8_esp32s3.c
│ └── softmax_common.h
├── test_app/
│ ├── CMakeLists.txt
│ ├── Makefile
│ ├── main/
│ │ ├── CMakeLists.txt
│ │ ├── component.mk
│ │ └── main.c
│ ├── sdkconfig.defaults
│ ├── sdkconfig.defaults.esp32p4
│ └── sdkconfig.defaults.esp32s3
└── tests/
├── CMakeLists.txt
├── README.md
├── component.mk
├── include/
│ ├── test_functions.h
│ └── test_utils.h
└── src/
├── basic_math_test.c
├── convolution_test.c
├── fully_connected_test.c
├── hard_swish_test.c
├── mean_test.c
├── pooling_test.c
├── relu_test.c
└── softmax_test.c
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/upload_component.yml
================================================
name: Push esp-nn to IDF Component Registry
on:
push:
branches:
- master
jobs:
upload_components:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: Upload esp-nn to IDF Component Registry
uses: espressif/upload-components-ci-action@v1
with:
namespace: "espressif"
name: "esp-nn"
api_token: ${{ secrets.IDF_COMPONENT_API_TOKEN }}
================================================
FILE: .gitignore
================================================
.config
*.o
*.i
*.s
*.orig
*.pyc
# gtags
GTAGS
GRTAGS
GPATH
# emacs
.dir-locals.el
# emacs temp file suffixes
*~
.#*
\#*#
# eclipse setting
.settings
# MacOS directory files
.DS_Store
# Example project files
examples/**/sdkconfig
examples/**/sdkconfig.old
examples/**/build
# Test app files
test_app/build
test_app/sdkconfig
test_app/sdkconfig.old
# Doc build artifacts
docs/_build/
docs/doxygen-warning-log.txt
docs/sphinx-warning-log.txt
docs/sphinx-warning-log-sanitized.txt
docs/xml/
docs/xml_in/
docs/man/
docs/doxygen_sqlite3.db
TEST_LOGS
# gcov coverage reports
*.gcda
*.gcno
coverage.info
coverage_report/
# VS Code Settings
.vscode/
================================================
FILE: .gitlab-ci.yml
================================================
stages:
- build
# Avoid running duplicate pipeline
workflow:
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'
variables:
GIT_STRATEGY: fetch
GIT_SUBMODULE_STRATEGY: recursive
before_script:
- mkdir -p ~/.ssh
- chmod 700 ~/.ssh
- echo -n $GITLAB_KEY_TMP > ~/.ssh/id_rsa_base64
- base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa
- chmod 600 ~/.ssh/id_rsa
- echo -e "Host gitlab.espressif.cn\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
- |
if [ -n "$IDF_COMPONENT_MGR_VER" ]; then
pip install idf-component-manager==$IDF_COMPONENT_MGR_VER
fi
.test_build: &test_build
# Build examples
- for TARGET in $EXAMPLE_TARGETS; do
- idf.py set-target $TARGET build
- done
.build_template:
stage: build
image: espressif/idf:latest
tags:
- build
variables:
PEDANTIC_FLAGS: "-Werror -Wno-error=cpp -Werror=unused-variable -Werror=unused-but-set-variable -Werror=unused-function"
EXTRA_CFLAGS: "${PEDANTIC_FLAGS}"
EXTRA_CXXFLAGS: "${PEDANTIC_FLAGS}"
rules:
- if: '$CI_PIPELINE_SOURCE == "schedule"'
when: never
- when: always
script:
- cd ${CI_PROJECT_DIR}/test_app
# build examples
- *test_build
- cd ${CI_PROJECT_DIR}
build_idf_v5.5:
extends: .build_template
image: espressif/idf:release-v5.5
variables:
EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 esp32p4
build_idf_v5.2:
extends: .build_template
image: espressif/idf:release-v5.2
variables:
EXAMPLE_TARGETS: esp32 esp32s3 esp32c3
build_idf_v5.0:
extends: .build_template
image: espressif/idf:release-v5.0
variables:
EXAMPLE_TARGETS: esp32 esp32s3 esp32c3
build_idf_v4.4:
extends: .build_template
image: espressif/idf:release-v4.4
variables:
EXAMPLE_TARGETS: esp32 esp32s3 esp32c3
IDF_COMPONENT_MGR_VER: "1.2.0"
build_idf_v4.3:
extends: .build_template
image: espressif/idf:release-v4.3
variables:
EXAMPLE_TARGETS: esp32
build_idf_v4.2:
extends: .build_template
image: espressif/idf:release-v4.2
variables:
EXAMPLE_TARGETS: esp32
================================================
FILE: CMakeLists.txt
================================================
# ESP-NN component build script (ESP-IDF).
# The portable ANSI-C kernels are always compiled; target-specific optimised
# sources are appended on top for ESP32-S3 and ESP32-P4.
cmake_minimum_required(VERSION 3.5)

# Portable C implementations (reference "_ansi" and generic "_opt" variants),
# compiled for every target.
set(c_srcs
"src/activation_functions/esp_nn_relu_ansi.c"
"src/activation_functions/esp_nn_hard_swish_ansi.c"
"src/common/esp_nn_mean_ansi.c"
"src/basic_math/esp_nn_add_ansi.c"
"src/basic_math/esp_nn_mul_ansi.c"
"src/convolution/esp_nn_conv_ansi.c"
"src/convolution/esp_nn_conv_opt.c"
"src/convolution/esp_nn_depthwise_conv_ansi.c"
"src/convolution/esp_nn_depthwise_conv_opt.c"
"src/fully_connected/esp_nn_fully_connected_ansi.c"
"src/softmax/esp_nn_softmax_ansi.c"
"src/softmax/esp_nn_softmax_opt.c"
"src/logistic/esp_nn_logistic_ansi.c"
"src/pooling/esp_nn_avg_pool_ansi.c"
"src/pooling/esp_nn_max_pool_ansi.c")

# ESP32-S3 only: Xtensa assembly kernels (.S) plus the C files that wrap and
# dispatch to them.
if(CONFIG_IDF_TARGET_ESP32S3)
set(s3_srcs
"src/common/esp_nn_common_functions_esp32s3.S"
"src/common/esp_nn_dot_s8_esp32s3.S"
"src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S"
"src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S"
"src/activation_functions/esp_nn_relu_s8_esp32s3.S"
"src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c"
"src/common/esp_nn_mean_s8_esp32s3.c"
"src/basic_math/esp_nn_add_s8_esp32s3.S"
"src/basic_math/esp_nn_mul_s8_esp32s3.S"
"src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S"
"src/convolution/esp_nn_conv_esp32s3.c"
"src/convolution/esp_nn_conv_s8_1x1_esp32s3.c"
"src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c"
"src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c"
"src/convolution/esp_nn_conv_s16_mult8_esp32s3.S"
"src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S"
"src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S"
"src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S"
"src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S"
"src/fully_connected/esp_nn_fully_connected_esp32s3.c"
"src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S"
"src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S"
"src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S"
"src/pooling/esp_nn_max_pool_s8_esp32s3.S"
"src/pooling/esp_nn_avg_pool_s8_esp32s3.c"
"src/pooling/esp_nn_avg_pool_s8_esp32s3.S"
"src/softmax/esp_nn_softmax_s8_esp32s3.c")
endif()

# ESP32-P4 only: optimised C kernels plus one assembly helper for the
# requantize step.
if(CONFIG_IDF_TARGET_ESP32P4)
set(p4_srcs
"src/common/esp_nn_mean_s8_esp32p4.c"
"src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S"
"src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c"
"src/activation_functions/esp_nn_relu_s8_esp32p4.c"
"src/basic_math/esp_nn_add_s8_esp32p4.c"
"src/basic_math/esp_nn_mul_s8_esp32p4.c"
"src/convolution/esp_nn_conv_esp32p4.c"
"src/convolution/esp_nn_depthwise_conv_esp32p4.c"
"src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c"
"src/pooling/esp_nn_avg_pool_s8_esp32p4.c"
"src/pooling/esp_nn_max_pool_s8_esp32p4.c"
"src/softmax/esp_nn_softmax_s8_esp32p4.c")
endif()

# s3_srcs / p4_srcs expand to nothing unless the matching target is selected
# above, so only the relevant sources are registered.
idf_component_register(SRCS "${c_srcs}"
"${s3_srcs}"
"${p4_srcs}"
INCLUDE_DIRS "include" "src/common")

if(CONFIG_IDF_TARGET_ESP32S3)
# -mlongcalls is the Xtensa far-call option. NOTE(review): -fno-unroll-loops
# is presumably kept so the compiler does not disturb the hand-tuned loops in
# the wrapper C files — confirm before removing.
target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function)
else()
target_compile_options(${COMPONENT_LIB} PRIVATE -O2 -Wno-unused-function)
endif()

# Propagate the Kconfig fast-requantize option (CONFIG_NN_SKIP_NUDGE, see
# Kconfig.projbuild) to the kernels as a compile-time definition.
if(CONFIG_NN_SKIP_NUDGE)
target_compile_definitions(${COMPONENT_LIB} PRIVATE SKIP_NUDGE)
endif()
================================================
FILE: CONTRIBUTING.md
================================================
# Contributing
Contributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welcome!
This document covers various topics related to contributions to the ESP-NN projects. Please read it if you plan to submit a PR!
## CLA
We require accepting the contributor's license agreement for all pull requests. When opening a pull request the first time you will be prompted to sign the CLA by the [CLA Assistant](https://cla-assistant.io/) service.
## Large-scale Changes
If you'd like to propose a change to the existing APIs or a large-scale refactoring of the implementation, we recommend opening an issue first to discuss this.
## Updating the Benchmarks Table
The benchmarks table in [README.md](README.md) contains benchmarks for ESP32-S3. The benchmarks are collected by running the app in [test_app](test_app/) directory. Please update this table if you have changed the implementations of some of the functions or added the new ones.
## Releasing a new version
Maintainers should follow the steps below to release a new version of ESP-NN component. Assuming the new version is `vX.Y.Z`:
1. Ensure you are on the latest `master` branch:
```bash
git checkout master
git pull --ff-only origin master
```
1. Create the new tag:
```bash
git tag -s -a -m "vX.Y.Z" vX.Y.Z
```
1. Push the tag and the branch to the internal repository:
```bash
git push origin vX.Y.Z
```
1. CI will automatically push the tag to Github and will upload the new version to the IDF Component Registry.
1. Go to https://github.com/espressif/esp-nn/releases and create a release from the tag vX.Y.Z.
1. Write the release notes and publish the release.
================================================
FILE: Kconfig.projbuild
================================================
menu "ESP-NN"
choice NN_OPTIMIZATIONS
bool "Optimization for nn functions"
default NN_OPTIMIZED
help
Use ANSI-C versions for verification and debug purposes.
Optimisations are automatically picked up for a chipset.
For ESP32-S3, assembly optimisations are selected.
For other platforms (viz., ESP32, ESP32-C3), generic optimisations are used.
config NN_ANSI_C
bool "ANSI C"
help
ANSI C versions for verification and debug purposes.
config NN_OPTIMIZED
bool "Optimized versions"
help
Optimisations are automatically picked up for a chipset.
For ESP32-S3, assembly optimisations are selected.
For other platforms (viz., ESP32, ESP32-C3), generic optimisations are used.
endchoice
config NN_OPTIMIZATIONS
int
default 0 if NN_ANSI_C
default 1 if NN_OPTIMIZED
config NN_SKIP_NUDGE
bool "Use fast (non-bit-exact) requantization"
depends on NN_OPTIMIZED
default n
help
When enabled, kernels use a faster requantize path that may differ
from the TFLite reference by +/-1 LSB at half-shift boundaries.
On ESP32-S3, this also skips the nudge addition in the assembly
requantize for ~20% speedup.
Leave disabled for bit-exact behavior (recommended for tests and
for matching reference outputs).
endmenu
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: README.md
================================================
# ESP-NN
The library contains optimised NN (Neural Network) functions for various Espressif chips.
* Supported platforms:
* TensorFlow Lite Micro (TFLite Micro). Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples)
* Supported ESP chips include:
* ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3)
* ESP32-P4 (Optimised using PIE/QACC SIMD instructions)
* ESP32 (Generic optimisations)
* ESP32-C3 (Generic optimisations)
## Performance
### Kernelwise performance for s8 versions:
* Kernelwise performance on ESP32-P4 chip
* Numbers are ticks taken for kernel to execute
* Chip config: 360MHz, SPI-RAM: HEX 200MHz, L2-Cache: 128KB
| Function | ANSI C | Optimized | Opt Ratio | Data info | Memory |
| ----------------| --------|---------|---------|-------------|-----------|
| elementwise_add | 190786 | 88451 | 2.16 | size = 1615 | External |
| elementwise_mul | 76585 | 47601 | 1.60 | size = 1615 | External |
| convolution | 4005512 | 572459 | 7.00 | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External |
| convolution | 249700 | 71104 | 3.51 | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External |
| convolution | 816975 | 533318 | 1.53 | input(10,10), filter(64x3x3x3), pad(0,0), stride(1,1) | External |
| depthwise conv | 962834 | 482389 | 2.00 | input (16, 16), pad(0,0), stride(1,1) filter: 1x3x3x16 | External |
| depthwise conv | 1365066 | 703989 | 1.94 | input (12, 12), pad(1,1), stride(1,1) filter: 8x5x5x4 | External |
| max pool | 482184 | 24178 | 19.94 | input(16,16), filter (1x3x3x16) | Internal |
| avg pool | 303210 | 84401 | 3.59 | input(16,16), filter (1x3x3x16) | Internal |
| fully connected | 7650 | 915 | 8.36 | len: 271, ch = 3 | Internal |
| prelu (relu6) | 1195 | 154 | 7.76 | size: 1615 | Internal |
| softmax | 14260 | 8587 | 1.66 | width: 256 | Internal |
| hard_swish | 703970 | 516582 | 1.36 | size: 12544 | External |
| mean | 10113 | 4686 | 2.16 | 7x7x16 | Internal |
* Kernelwise performance on ESP32-S3 chip
* Numbers are ticks taken for kernel to execute
* Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB
| Function | ANSI C | Optimized | Opt Ratio | Data info | Memory |
| ----------------| ---------|-----------|-----------|-------------|-----------|
| elementwise_add | 281337 | 74440 | 3.78 | size = 1615 | External |
| elementwise_mul | 122703 | 35002 | 3.51 | size = 1615 | External |
| convolution | 4712500 | 331008 | 14.24 | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External |
| convolution | 312754 | 39022 | 8.01 | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External |
| convolution | 2193289 | 394842 | 5.55 | input(8,8), filter(64x3x3x3), pad(0,0), stride(1,1) | External |
| depthwise conv | 1159831 | 184176 | 6.30 | input(18,18), pad(0,0), stride(1,1), filter: 1x3x3x16 | External |
| depthwise conv | 1671363 | 372435 | 4.49 | input(12,12), pad(1,1), stride(1,1), filter: 8x5x5x4 | External |
| max pool | 376294 | 48069 | 7.83 | input(16,16), filter(1x3x3x16) | Internal |
| avg pool | 427293 | 118052 | 3.62 | input(16,16), filter(1x3x3x16) | Internal |
| fully connected | 8443 | 1078 | 7.83 | len: 271, ch = 3 | Internal |
| softmax | 15209 | 11107 | 1.37 | h: 8, w: 32 | Internal |
| prelu (relu6) | 1125 | 98 | 11.48 | size: 1615 | Internal |
### Model-level performance:
* **Person Detection** (Visual Wake Words, INT8 quantized — from [esp-tflite-micro](https://github.com/espressif/esp-tflite-micro))
* Numbers are time (ms) for `invoke()` call, using internal memory
| Chip | CPU Freq | without ESP-NN | with ESP-NN |
| -------- | -------- | -------------- | ----------- |
| ESP32-P4 | 360MHz | 1395ms | 73ms |
| ESP32-S3 | 240MHz | 2300ms | 54ms |
| ESP32 | 240MHz | 4084ms | 380ms |
| ESP32-C3 | 160MHz | 3355ms | 426ms |
* **MobileNetV3 Small** (INT8 quantized, 224x224x3, 1000 classes)
| Chip | CPU Freq | without ESP-NN | with ESP-NN |
| -------- | -------- | -------------- | ----------- |
| ESP32-S3 | 240MHz | 26000ms | 1434ms |
| ESP32-P4 | 360MHz | 11600ms | 1050ms |
> **Note**:
- The above is time taken for execution of the `invoke()` call
- SPIRAM used for TensorArena.
- Person detection on ESP32-S3 with internal RAM: 47ms
- ESP32-P4 optimisation is work in progress
- `Without ESP-NN` case is when `esp-nn` is completely disabled by removing below flag from [CMakeLists.txt](CMakeLists.txt):
```cmake
# enable ESP-NN optimizations by Espressif
target_compile_options(${COMPONENT_LIB} PRIVATE -DESP_NN)
```
## Configuration
* To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS`
* There are two options presented:
* Optimized versions
* ANSI C
* Default selection is for `Optimized versions`. For ESP32-S3 and ESP32-P4, assembly versions are automatically selected, whereas for other chips (viz., ESP32, ESP32-C3), generic optimisations are selected.
* For debugging purposes, you may want to select `ANSI C` reference versions.
## Contributing
If you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on the Github.
For general questions related to this library, please use the esp32.com forum.
Please check [CONTRIBUTING.md](CONTRIBUTING.md) for further information if you'd like to contribute to ESP-NN.
## Copyrights and License
All original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE.
================================================
FILE: idf_component.yml
================================================
version: "1.2.3"
description: Optimized NN (Neural Network) functions for Espressif chips
url: https://github.com/espressif/esp-nn
repository: https://github.com/espressif/esp-nn.git
issues: https://github.com/espressif/esp-nn/issues
dependencies:
idf:
version: ">=4.2"
files:
exclude:
- test_app
- tests
================================================
FILE: include/esp_nn.h
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

/* When the optimised kernels are enabled in Kconfig (CONFIG_NN_OPTIMIZED),
 * translate the build-time IDF target macros (CONFIG_IDF_TARGET_*) into the
 * ARCH_* macros used further below to select the implementation header. */
#if defined(CONFIG_NN_OPTIMIZED)
// select apt optimisations
#ifdef CONFIG_IDF_TARGET_ESP32P4
#define ARCH_ESP32_P4 1
#endif
#ifdef CONFIG_IDF_TARGET_ESP32S3
#define ARCH_ESP32_S3 1
#endif
#ifdef CONFIG_IDF_TARGET_ESP32
#define ARCH_ESP32 1
#endif
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* reference kernels included by default */
#include "esp_nn_ansi_headers.h"
/* Pick the implementation header: ESP32-P4 and ESP32-S3 get their dedicated
 * optimised kernels; any other target with optimisations enabled gets the
 * generic optimised versions; otherwise the ANSI-C alias mapping is used. */
#if defined(CONFIG_NN_OPTIMIZED)
#if defined(ARCH_ESP32_P4)
#include "esp_nn_esp32p4.h"
#elif defined(ARCH_ESP32_S3)
#include "esp_nn_esp32s3.h"
#else // for other platforms use generic optimisations
#include "esp_nn_generic_opt.h"
#endif // #if defined(ARCH_ESP32_S3)
#else
#include "esp_nn_ansi_c.h"
#endif
#ifdef __cplusplus
}
#endif
================================================
FILE: include/esp_nn_ansi_c.h
================================================
/*
* SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @file Header definitions to include for ANSI C versions.
* These are just typedefs to pick up ANSI versions.
*/
#pragma once

#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"

/* Each public esp_nn_* entry point is aliased to its *_ansi reference
 * implementation. This header is included (via esp_nn.h) when the ANSI-C
 * configuration is selected, for verification and debugging. */

/* Basic math */
#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi

/* Convolution (incl. scratch-buffer query/set helpers) */
#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi
#define esp_nn_conv_s8 esp_nn_conv_s8_ansi
#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi
#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi

/* Activation functions. The ANSI hard-swish needs no scratch buffer, so the
 * size query expands to 0 and the set-buffer call expands to nothing. */
#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi
#define esp_nn_get_hard_swish_scratch_size() 0
#define esp_nn_set_hard_swish_scratch_buf(buf)

/* Reduction and pooling */
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi
#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi

/* Fully connected */
#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi

/* Softmax (incl. scratch-buffer helpers) */
#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi
#define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi

/* Logistic */
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi
================================================
FILE: include/esp_nn_ansi_headers.h
================================================
/*
* SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
/**
* @file Header definitions to include for esp_nn reference functions
*/
#include "esp_nn_defs.h"
/************************** Basic math functions ****************************/
/**
* @brief elementwise addition
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*
* shift values are expected to be <= 0
*/
void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
const int8_t *input2_data,
const int32_t input1_offset,
const int32_t input2_offset,
const int32_t input1_mult,
const int32_t input2_mult,
const int32_t input1_shift,
const int32_t input2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t size);
/**
* @brief elementwise multiplication
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*
* output shift is expected to be <= 0
*/
void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
const int8_t *input2_data,
const int32_t input1_offset,
const int32_t input2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t size);
/**
* @brief broadcast MUL for [H,W,C] * [1,1,C] pattern (SE-block)
*
* @note input2_per_ch has `channels` elements, broadcast to all spatial positions.
* Uses fast requantization (constant nudge).
*/
void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,
const int8_t *input2_per_ch,
const int32_t input1_offset,
const int32_t input2_offset,
int8_t *output,
const int32_t output_offset,
const int32_t output_mult,
const int32_t output_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t total_spatial,
const int32_t channels);
/************************** Convolution functions *****************************/
/**
* @brief depthwise convolution per channel
*
* @note inputs type: int8_t, output: int8_t
* Version used in tflite is per channel.
 * This version follows the same footprint.
* Meaning, it has per out_channel shift and multiplier for
* requantization
*
* optimization notes: Though input_offset is int32 type,
* offset values are contained in 8 bits [-128, 127]
*/
void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *out_data,
const dw_conv_params_t *conv_params,
const quant_data_t *quant_data);
/**
* @brief 2d-convolution channelwise
*
* @note operation: result += (input + offset) * filter
*
* inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_conv_s8_ansi(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *out_data,
const conv_params_t *conv_params,
const quant_data_t *quant_data);
int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_ansi(const void *buf);
int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf);
/************************** Activation functions *****************************/
/**
* @brief relu6
*
* @note inout: int8_t
*/
void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size);
/**
* @brief hard_swish activation: y = x * relu6(x + 3) / 6
*
* @note Quantized int8 fixed-point implementation
*/
void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output,
const int32_t size,
const int16_t input_zero_point,
const int16_t output_mult_fxp,
const int16_t reluish_mult_fxp,
const int32_t reluish_mult_exp,
const int32_t output_mult_exp,
const int16_t output_zero_point);
/**
* @brief mean reduction over spatial dims (H,W) for NHWC int8 tensor
*
* @note Specialized for 4D [N,H,W,C] → [N,1,1,C] reduction.
* Used by Squeeze-and-Excite in MobileNetV3.
*/
void esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output,
const int32_t height, const int32_t width,
const int32_t channels,
const int32_t input_zero_point,
const int32_t output_zero_point,
const int32_t multiplier,
const int32_t shift);
/************************** Pooling functions *****************************/
/**
* @brief max_pool
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_max_pool_s8_ansi(const int8_t *input,
const uint16_t input_wd,
const uint16_t input_ht,
int8_t *output,
const uint16_t output_wd,
const uint16_t output_ht,
const uint16_t stride_wd,
const uint16_t stride_ht,
const uint16_t filter_wd,
const uint16_t filter_ht,
const uint16_t pad_wd,
const uint16_t pad_ht,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t channels);
/**
* @brief avg_pool
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_avg_pool_s8_ansi(const int8_t *input,
const uint16_t input_wd,
const uint16_t input_ht,
int8_t *output,
const uint16_t output_wd,
const uint16_t output_ht,
const uint16_t stride_wd,
const uint16_t stride_ht,
const uint16_t filter_wd,
const uint16_t filter_ht,
const uint16_t pad_wd,
const uint16_t pad_ht,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t channels);
/************************** Fully connected functions ***********************/
/**
* @brief fully connected
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_fully_connected_s8_ansi(const int8_t *input_data,
const int32_t input_offset,
const uint16_t row_len,
const int8_t *filter_data,
const int32_t filter_offset,
const int32_t *bias,
int8_t *out_data,
const uint16_t out_channels,
const int32_t out_offset,
const int32_t out_shift,
const int32_t out_mult,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief fully connected
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
* out_mult, out_shift: int32_t* containing per-channel data
*/
void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data,
const int32_t input_offset,
const uint16_t row_len,
const int8_t *filter_data,
const int32_t filter_offset,
const int32_t *bias,
int8_t *out_data,
const uint16_t out_channels,
const int32_t out_offset,
const int32_t* out_shift,
const int32_t* out_mult,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief Get scratch buffer size needed by softmax function
*
* @param width
* @param height
* @return size in bytes
*
* @note buffer must be 4 byte aligned
*/
int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height);
/* ANSI C function to be hooked up when optimised version needed */
int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height);
/**
* @brief Set scratch buffer to be used by softmax function
*
* @param buffer this can be NULL if one needs to unset it
* must be aligned to 4 bytes
*/
void esp_nn_set_softmax_scratch_buf_ansi(void *buffer);
/**
* @brief reference softmax function
*
* @note inputs type: int8_t, output: int8_t
*/
void esp_nn_softmax_s8_ansi(const int8_t *input_data,
const int32_t height,
const int32_t width,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int8_t *output_data);
//////////////////////////// Generic optimisations /////////////////////////////
/************************** Convolution functions *****************************/
/**
* @brief 2d-convolution channelwise optimized version
*
* @note operation: result += (input + offset) * filter
*
* inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_conv_s8_opt(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *out_data,
const conv_params_t *conv_params,
const quant_data_t *quant_data);
/**
* @brief depthwise convolution per channel optimized version
*
* @note inputs type: int8_t, output: int8_t
* Version used in tflite is per channel.
 * This version follows the same footprint.
* Meaning, it has per out_channel shift and multiplier for
* requantization
*
* optimization notes: Though input_offset is int32 type,
* offset values are contained in 8 bits [-128, 127]
*/
void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *out_data,
const dw_conv_params_t *conv_params,
const quant_data_t *quant_data);
int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_opt(const void *buf);
int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf);
/* ANSI C function to be hooked up when optimised version needed */
void esp_nn_set_softmax_scratch_buf_opt(void *buffer);
/**
* @brief optimised version of softmax function
*
* @note the function uses extra buffer (4 * width bytes)
* hence, scratch buffers must be set before calling this.
*/
void esp_nn_softmax_s8_opt(const int8_t *input_data,
const int32_t height,
const int32_t width,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int8_t *output_data);
/**
* @brief Get scratch buffer size for int8 logistic (sigmoid).
* @return 256 (size of LUT in bytes)
*/
int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void);
/**
* @brief Prepare LUT for int8 logistic (sigmoid).
* Call once during model preparation after scratch is allocated.
*
* @param scratch_buf Scratch buffer (256 bytes, from get_scratch_size)
* @param input_zero_point Input quantization zero point
* @param input_scale Input quantization scale (float)
*
* @note Output quantization is fixed: scale=1/256, zero_point=-128.
*/
void esp_nn_logistic_s8_prepare_ansi(int8_t *scratch_buf,
int32_t input_zero_point,
float input_scale);
/**
* @brief Apply int8 logistic (sigmoid) using precomputed LUT.
*
* @param input Input int8 data
* @param output Output int8 data
* @param size Number of elements
* @param scratch_buf 256-byte LUT from esp_nn_logistic_s8_prepare()
*/
void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output,
int32_t size, const int8_t *scratch_buf);
================================================
FILE: include/esp_nn_defs.h
================================================
// Copyright 2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
/**
 * @brief structure to club data dims
 * this structure can be used for input, output and filter
 */
typedef struct data_dims {
    int32_t width;      // tensor width (W)
    int32_t height;     // tensor height (H)
    int32_t channels;   // tensor channels (C)
    int32_t extra;      // can be used as batch or any other param
} data_dims_t;
/**
 * @brief 2d data structure (width, height)
 * used for stride, padding and dilation parameters
 */
typedef struct data_2d {
    int32_t width;      // horizontal component
    int32_t height;     // vertical component
} data_2d_t;
/**
 * @brief min/max activation
 * output values are clamped to the range [min, max]
 */
typedef struct act_params {
    int32_t min;        // lower clamp bound for the activation
    int32_t max;        // upper clamp bound for the activation
} act_params_t;
/**
 * @brief per channel quant data
 *
 * @note number of shift and mult elements are equal to output channels
 */
typedef struct quant_data {
    int32_t *shift;     // per output-channel requantization shift values
    int32_t *mult;      // per output-channel requantization multipliers
} quant_data_t;
/**
 * @brief params specific to convolution 2d
 *
 */
typedef struct conv_params {
    int32_t in_offset;          // offset added to each input value; fits in 8 bits [-128, 127]
    int32_t out_offset;         // output quantization offset
    data_2d_t stride;           // stride (width, height)
    data_2d_t padding;          // padding (width, height)
    data_2d_t dilation;         // dilation factors (width, height)
    act_params_t activation;    // output clamp range (min, max)
} conv_params_t;
/**
 * @brief params specific to depthwise convolution 2d
 *
 */
typedef struct dw_conv_params {
    int32_t in_offset;          // offset added to each input value; fits in 8 bits [-128, 127]
    int32_t out_offset;         // output quantization offset
    int32_t ch_mult;            // channel multiplier. (in_ch * ch_mult = out_ch)
    data_2d_t stride;           // stride (width, height)
    data_2d_t padding;          // padding (width, height)
    data_2d_t dilation;         // dilation factors (width, height)
    act_params_t activation;    // output clamp range (min, max)
} dw_conv_params_t;
================================================
FILE: include/esp_nn_esp32p4.h
================================================
/*
* SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @file Header definitions to include for esp_nn optimized functions for
* the ESP32-P4 platform
*/
#pragma once
#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"
/**
* @brief 2d - convolution channelwise
*
* @note operation: result += (input + offset) * filter
*
* inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *output_data,
const conv_params_t *conv_params,
const quant_data_t *quant_data);
int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_esp32p4(const void *buf);
/********************** function defines ***************************/
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi
void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
const int8_t *input2_data,
const int32_t input1_offset,
const int32_t input2_offset,
const int32_t input1_mult,
const int32_t input2_mult,
const int32_t input1_shift,
const int32_t input2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t size);
#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32p4
void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,
const int8_t *input2_data,
const int32_t input1_offset,
const int32_t input2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t size);
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32p4
void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *out_data,
const dw_conv_params_t *conv_params,
const quant_data_t *quant_data);
int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf);
#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32p4
#define esp_nn_conv_s8 esp_nn_conv_s8_esp32p4
#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32p4
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32p4
#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32p4
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32p4
/* Functions not yet optimized for P4 - use ANSI fallback */
void esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output,
const int32_t size,
const int16_t input_zero_point,
const int16_t output_mult_fxp,
const int16_t reluish_mult_fxp,
const int32_t reluish_mult_exp,
const int32_t output_mult_exp,
const int16_t output_zero_point);
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32p4
#define esp_nn_get_hard_swish_scratch_size() 0
#define esp_nn_set_hard_swish_scratch_buf(buf)
void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output,
const int32_t height, const int32_t width,
const int32_t channels,
const int32_t input_zero_point,
const int32_t output_zero_point,
const int32_t multiplier,
const int32_t shift);
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32p4
void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size);
#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32p4
void esp_nn_avg_pool_s8_esp32p4(const int8_t *input,
const uint16_t input_wd,
const uint16_t input_ht,
int8_t *output,
const uint16_t output_wd,
const uint16_t output_ht,
const uint16_t stride_wd,
const uint16_t stride_ht,
const uint16_t filter_wd,
const uint16_t filter_ht,
const uint16_t pad_wd,
const uint16_t pad_ht,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t channels);
#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32p4
void esp_nn_max_pool_s8_esp32p4(const int8_t *input,
const uint16_t input_wd,
const uint16_t input_ht,
int8_t *output,
const uint16_t output_wd,
const uint16_t output_ht,
const uint16_t stride_wd,
const uint16_t stride_ht,
const uint16_t filter_wd,
const uint16_t filter_ht,
const uint16_t pad_wd,
const uint16_t pad_ht,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t channels);
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32p4
void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,
const int32_t input_offset,
const uint16_t row_len,
const int8_t *filter_data,
const int32_t filter_offset,
const int32_t *bias,
int8_t *out_data,
const uint16_t out_channels,
const int32_t out_offset,
const int32_t out_shift,
const int32_t out_mult,
const int32_t activation_min,
const int32_t activation_max);
void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,
const int32_t input_offset,
const uint16_t row_len,
const int8_t *filter_data,
const int32_t filter_offset,
const int32_t *bias,
int8_t *out_data,
const uint16_t out_channels,
const int32_t out_offset,
const int32_t *out_shift,
const int32_t *out_mult,
const int32_t activation_min,
const int32_t activation_max);
#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32p4
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32p4
int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height);
void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer);
void esp_nn_softmax_s8_esp32p4(const int8_t *input_data,
const int32_t height,
const int32_t width,
const int32_t mult,
const int32_t shift,
const int32_t diff_min,
int8_t *output_data);
#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32p4
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32p4
#define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32p4
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi
================================================
FILE: include/esp_nn_esp32s3.h
================================================
/*
* SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @file Header definitions to include for esp_nn optimized functions for
* the ESP32-S3 platform
*/
#pragma once
#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"
/************************** Basic math functions *****************************/
/**
* @brief elementwise addition
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*
* shift values are expected to be <= 0
*/
void esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data,
const int8_t *input2_data,
const int32_t input1_offset,
const int32_t input2_offset,
const int32_t input1_mult,
const int32_t input2_mult,
const int32_t input1_shift,
const int32_t input2_shift,
const int32_t left_shift,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t size);
/**
* @brief elementwise multiplication
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*
* output shift is expected to be <= 0
*/
void esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data,
const int8_t *input2_data,
const int32_t input1_offset,
const int32_t input2_offset,
int8_t *output,
const int32_t out_offset,
const int32_t out_mult,
const int32_t out_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t size);
/************************** Convolution functions *****************************/
/**
* @brief depthwise convolution per channel
*
* @note inputs type: int8_t, output: int8_t
* Version used in tflite is per channel.
 * This version follows the same footprint.
* Meaning, it has per out_channel shift and multiplier for
* requantization
*
* optimization notes: Though input_offset is int32 type,
* offset values are contained in 8 bits [-128, 127]
*/
void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *output_data,
const dw_conv_params_t *conv_params,
const quant_data_t *quant_data);
/**
* @brief 2d - convolution channelwise
*
* @note operation: result += (input + offset) * filter
*
* inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims,
const int8_t *input_data,
const data_dims_t *filter_dims,
const int8_t *filter_data,
const int32_t *bias,
const data_dims_t *output_dims,
int8_t *output_data,
const conv_params_t *conv_params,
const quant_data_t *quant_data);
int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_esp32s3(const void *buf);
int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
const data_dims_t *filter_dims,
const data_dims_t *output_dims,
const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf);
/************************** Pooling functions *****************************/
/**
* @brief max_pool
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_max_pool_s8_esp32s3(const int8_t *input,
const uint16_t input_wd,
const uint16_t input_ht,
int8_t *output,
const uint16_t output_wd,
const uint16_t output_ht,
const uint16_t stride_wd,
const uint16_t stride_ht,
const uint16_t filter_wd,
const uint16_t filter_ht,
const uint16_t pad_wd,
const uint16_t pad_ht,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t channels);
/**
* @brief avg_pool
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*/
void esp_nn_avg_pool_s8_esp32s3(const int8_t *input,
const uint16_t input_wd,
const uint16_t input_ht,
int8_t *output,
const uint16_t output_wd,
const uint16_t output_ht,
const uint16_t stride_wd,
const uint16_t stride_ht,
const uint16_t filter_wd,
const uint16_t filter_ht,
const uint16_t pad_wd,
const uint16_t pad_ht,
const int32_t activation_min,
const int32_t activation_max,
const uint16_t channels);
/************************** Fully connected functions *****************************/
/**
* @brief fully connected
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
*
* Current version works only on aligned input.
* row_len and channels should both be multiple of 8.
*/
void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,
const int32_t input_offset,
const uint16_t row_len,
const int8_t *filter_data,
const int32_t filter_offset,
const int32_t *bias,
int8_t *out_data,
const uint16_t out_channels,
const int32_t out_offset,
const int32_t out_shift,
const int32_t out_mult,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief fully connected - per channel
*
* @note inputs type: int8_t, output: int8_t
* input offsets: although int32_t, they are contained in 8 bits [-128, 127]
* out_mult, out_shift: int32_t* containing per-channel data
*
* Current version works only on aligned input.
* row_len and channels should both be multiple of 8.
*/
void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data,
const int32_t input_offset,
const uint16_t row_len,
const int8_t *filter_data,
const int32_t filter_offset,
const int32_t *bias,
int8_t *out_data,
const uint16_t out_channels,
const int32_t out_offset,
const int32_t* out_shift,
const int32_t* out_mult,
const int32_t activation_min,
const int32_t activation_max);
/**
* @brief relu6
*
* @note inout: int8_t
*/
void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size);
/********************** function defines ***************************/
#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3
void esp_nn_mul_broadcast_channel_s8_esp32s3(const int8_t *input1,
const int8_t *input2_per_ch,
const int32_t input1_offset,
const int32_t input2_offset,
int8_t *output,
const int32_t output_offset,
const int32_t output_mult,
const int32_t output_shift,
const int32_t activation_min,
const int32_t activation_max,
const int32_t total_spatial,
const int32_t channels);
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_esp32s3
#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3
#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3
#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3
#define esp_nn_conv_s8 esp_nn_conv_s8_esp32s3
#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3
int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void);
void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf);
void esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output,
const int32_t size,
const int16_t input_zero_point,
const int16_t output_mult_fxp,
const int16_t reluish_mult_fxp,
const int32_t reluish_mult_exp,
const int32_t output_mult_exp,
const int16_t output_zero_point);
#define esp_nn_get_hard_swish_scratch_size esp_nn_get_hard_swish_scratch_size_esp32s3
#define esp_nn_set_hard_swish_scratch_buf esp_nn_set_hard_swish_scratch_buf_esp32s3
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32s3
void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output,
const int32_t height, const int32_t width,
const int32_t channels,
const int32_t input_zero_point,
const int32_t output_zero_point,
const int32_t multiplier,
const int32_t shift);
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32s3
#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3
#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32s3
int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height);
void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer);
void esp_nn_softmax_s8_esp32s3(const int8_t *input_data, const int32_t height,
const int32_t width, const int32_t mult,
const int32_t shift, const int32_t diff_min,
int8_t *output_data);
#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32s3
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32s3
#define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32s3
/* Logistic (sigmoid) — LUT-based, same impl for all targets */
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi
================================================
FILE: include/esp_nn_generic_opt.h
================================================
/*
* SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/**
* @file Header definitions to include for esp_nn generic optimisations
 * For functions which do not have optimisations, the _ansi versions are picked.
*/
#pragma once
#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"
#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi
#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_opt
#define esp_nn_conv_s8 esp_nn_conv_s8_opt
#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_opt
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_opt
#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_opt
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_opt
#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi
#define esp_nn_get_hard_swish_scratch_size() 0
#define esp_nn_set_hard_swish_scratch_buf(buf)
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi
#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi
#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi
#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt
#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi
================================================
FILE: src/activation_functions/esp_nn_hard_swish_ansi.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* HardSwish activation function: y = x * relu6(x + 3) / 6
* Quantized int8 implementation using fixed-point arithmetic.
*/
#include <stdint.h>
#include <common_functions.h>
/*
 * Left-shift a 16-bit value, saturating the result to the int16 range.
 */
static inline int16_t sat_left_shift_s16(int16_t val, int shift)
{
    const int32_t wide = (int32_t)val << shift;
    if (wide >= INT16_MAX) {
        return INT16_MAX;
    }
    return (wide <= INT16_MIN) ? INT16_MIN : (int16_t)wide;
}
/*
 * Saturating rounding doubling high multiply in Q15:
 * result = round((a * b) / 2^15), i.e. (a * b + (1 << 14)) >> 15,
 * with the single overflowing input pair saturated.
 */
static inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b)
{
    /* Only (-32768 * -32768) overflows the Q15 result; pin it to max. */
    if (a == -32768 && b == -32768) {
        return 32767;
    }
    const int32_t prod = (int32_t)a * (int32_t)b;
    const int32_t nudged = prod + (1 << 14);
    return (int16_t)(nudged >> 15);
}
/*
 * Saturating doubling high multiply in Q15, WITHOUT rounding:
 * result = (a * b) / 2^15 with truncation toward zero.
 * The division (rather than `>> 15`) is deliberate: it matches TFLite's
 * SaturatingDoublingHighMul, which truncates toward zero for negatives.
 */
static inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b)
{
    /* Only (-32768 * -32768) overflows; saturate that single case. */
    if (a == -32768 && b == -32768) {
        return 32767;
    }
    const int32_t prod = (int32_t)a * (int32_t)b;
    return (int16_t)(prod / (1 << 15));
}
/*
 * Rounding divide by power of two for int16 (gemmlowp-style):
 * rounds to nearest, with the threshold biased for negative inputs.
 */
static inline int16_t rounding_div_pot_s16(int16_t val, int exponent)
{
    const int32_t mask = (1 << exponent) - 1;
    const int32_t remainder = val & mask;
    int32_t threshold = mask >> 1;
    if (val < 0) {
        threshold += 1;
    }
    int16_t quotient = (int16_t)(val >> exponent);
    if (remainder > threshold) {
        quotient += 1;
    }
    return quotient;
}
/**
 * HardSwish activation on a quantized int8 buffer:
 *     y = x * relu6(x + 3) / 6
 * computed in 16-bit fixed point (reference / bit-exact implementation,
 * also used by the target-specific variants to build LUTs).
 *
 * @param input             source buffer, `size` int8 values
 * @param output            destination buffer, `size` int8 values
 * @param size              number of elements
 * @param input_zero_point  zero point subtracted from each input value
 * @param output_mult_fxp   Q15 multiplier taking the input onto the output scale
 * @param reluish_mult_fxp  Q15 multiplier for the relu6-like gate
 * @param reluish_mult_exp  power-of-two exponent for the gate rescale
 *                          (positive => saturating left shift, negative =>
 *                          rounding right shift)
 * @param output_mult_exp   power-of-two exponent for the final rescale;
 *                          used as a right shift of -output_mult_exp, so it
 *                          is expected to be <= 0 — NOTE(review): confirm
 *                          with callers
 * @param output_zero_point zero point added to the result before clamping
 */
void esp_nn_hard_swish_s8_ansi(const int8_t *input,
                               int8_t *output,
                               const int32_t size,
                               const int16_t input_zero_point,
                               const int16_t output_mult_fxp,
                               const int16_t reluish_mult_fxp,
                               const int32_t reluish_mult_exp,
                               const int32_t output_mult_exp,
                               const int16_t output_zero_point)
{
    for (int i = 0; i < size; i++) {
        /* Centre the input, then promote to a high-resolution 16-bit value.
         * |in_val| stays small enough (int8 data minus an int8-range zero
         * point) that * 128 fits int16 — NOTE(review): assumes an
         * int8-range zero point; confirm with callers. */
        const int16_t in_val = input[i] - input_zero_point;
        const int16_t in_hires = in_val * 128; /* << 7 */

        /* Scale input to output scale */
        const int16_t in_on_out_scale = sat_round_dbl_high_mul_s16(in_hires, output_mult_fxp);

        /* Compute reluish value: maps input from [-3,3] to [-1,1].
         * For a positive exponent the shift is split as (exp - 1) before
         * the multiply and 1 after it, saturating at each step — the split
         * keeps intermediate saturation behavior consistent (mirrors the
         * TFLite reference ordering). */
        int16_t reluish = in_hires;
        if (reluish_mult_exp > 0) {
            reluish = sat_left_shift_s16(reluish, reluish_mult_exp - 1);
        }
        reluish = sat_round_dbl_high_mul_s16(reluish, reluish_mult_fxp);
        if (reluish_mult_exp > 0) {
            reluish = sat_left_shift_s16(reluish, 1);
        }
        if (reluish_mult_exp < 0) {
            reluish = rounding_div_pot_s16(reluish, -reluish_mult_exp);
        }
        /* Convert from [-1,1] to [0,1] */
        reluish = (reluish + (1 << 15)) >> 1;

        /* Multiply: output = reluish * input_on_output_scale
         * (non-rounding Q15 multiply, matching the reference). */
        const int16_t pre_out = sat_dbl_high_mul_s16(reluish, in_on_out_scale);

        /* Final shift, zero-point offset, and clamp to the int8 range */
        int16_t out_val = rounding_div_pot_s16(pre_out, -output_mult_exp);
        out_val += output_zero_point;
        if (out_val > 127) out_val = 127;
        if (out_val < -128) out_val = -128;
        output[i] = (int8_t)out_val;
    }
}
================================================
FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* ESP32-P4 optimized HardSwish with:
* 1. Branch hoisting (borrowed from S3): dispatch on reluish_mult_exp ONCE
* 2. 2x loop unrolling for better ILP on RISC-V pipeline
* 3. All int16 arithmetic - no 64-bit multiply bottleneck
*/
#include <stdint.h>
/* Saturating rounding doubling high multiply in Q15:
 * round((a * b) / 2^15), with the single overflowing pair saturated. */
static inline __attribute__((always_inline))
int16_t sat_rnd_dbl_hi_mul(int16_t a, int16_t b) {
    if (__builtin_expect(a == -32768 && b == -32768, 0)) {
        return 32767;   /* the only input pair that overflows Q15 */
    }
    const int32_t nudged = (int32_t)a * (int32_t)b + (1 << 14);
    return (int16_t)(nudged >> 15);
}
/*
 * Saturating (non-rounding) doubling high multiply in Q15:
 * result = (a * b) / 2^15, truncated toward zero, with the single
 * overflowing pair (-32768, -32768) saturated to 32767.
 *
 * BUGFIX: this must use division, not `>> 15`. An arithmetic right shift
 * rounds toward negative infinity, so for negative products that are not
 * multiples of 2^15 it returned a value one smaller than the bit-exact
 * reference (sat_dbl_high_mul_s16 in esp_nn_hard_swish_ansi.c, and
 * TFLite's SaturatingDoublingHighMul), which truncate toward zero.
 * E.g. a*b == -1: `>> 15` gives -1, division gives 0.
 */
static inline __attribute__((always_inline))
int16_t sat_dbl_hi_mul(int16_t a, int16_t b) {
    if (__builtin_expect(a == b && a == -32768, 0)) return 32767;
    return (int16_t)(((int32_t)a * (int32_t)b) / (1 << 15));
}
/* Clamp a 32-bit intermediate (e.g. a pre-shifted product) into the
 * int16 range. The caller performs the shift; this only saturates. */
static inline __attribute__((always_inline))
int16_t sat_left_shift_s16(int32_t val) {
    int32_t clamped = val;
    if (clamped > INT16_MAX) {
        clamped = INT16_MAX;
    } else if (clamped < INT16_MIN) {
        clamped = INT16_MIN;
    }
    return (int16_t)clamped;
}
/* Rounding divide by power of two for int16 (gemmlowp-style rounding,
 * threshold biased by one for negative inputs). */
static inline __attribute__((always_inline))
int16_t rounding_div_pot_s16(int16_t val, int exp) {
    const int32_t mask = (1 << exp) - 1;
    const int32_t threshold = (mask >> 1) + ((val < 0) ? 1 : 0);
    const int32_t round_up = ((val & mask) > threshold) ? 1 : 0;
    return (int16_t)((val >> exp) + round_up);
}
/* Final stage shared by all paths: gate the rescaled input with the
 * reluish value, apply the output shift and zero point, clamp to int8. */
static inline __attribute__((always_inline))
int8_t hard_swish_output(int16_t reluish, int16_t in_on_out_scale,
                         int neg_out_exp, int16_t output_zero_point) {
    const int16_t gated = sat_dbl_hi_mul(reluish, in_on_out_scale);
    const int16_t shifted = rounding_div_pot_s16(gated, neg_out_exp);
    int32_t out = (int32_t)shifted + output_zero_point;
    out = (out > 127) ? 127 : out;
    out = (out < -128) ? -128 : out;
    return (int8_t)out;
}
/**
 * HardSwish (y = x * relu6(x + 3) / 6) over a quantized s8 buffer,
 * ESP32-P4 variant.
 *
 * The sign of `reluish_mult_exp` selects one of three specialised
 * 2x-unrolled loops, so the per-element exponent branches of the reference
 * implementation are hoisted out of the hot path; a scalar tail loop
 * (which keeps the per-element branches) handles any remaining element.
 *
 * Parameter semantics match esp_nn_hard_swish_s8_ansi: Q15 fixed-point
 * multipliers (`*_mult_fxp`) plus power-of-two exponents (`*_mult_exp`).
 */
void esp_nn_hard_swish_s8_esp32p4(const int8_t *input,
                                  int8_t *output,
                                  const int32_t size,
                                  const int16_t input_zero_point,
                                  const int16_t output_mult_fxp,
                                  const int16_t reluish_mult_fxp,
                                  const int32_t reluish_mult_exp,
                                  const int32_t output_mult_exp,
                                  const int16_t output_zero_point)
{
    /* output_mult_exp is used as a right shift of its negation, so it is
     * expected to be <= 0 — NOTE(review): confirm with callers. */
    const int neg_out_exp = -output_mult_exp;
    int i = 0;

    /* Branch on reluish_mult_exp ONCE - 3 specialized loops */
    if (reluish_mult_exp > 0) {
        /* Positive exponent: saturating shift by (exp - 1) before the Q15
         * multiply and by 1 after it (same split as the ANSI reference). */
        const int ls1 = reluish_mult_exp - 1;
        for (; i <= size - 2; i += 2) {
            int16_t iv0 = input[i] - input_zero_point;
            int16_t iv1 = input[i+1] - input_zero_point;
            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;   /* promote, << 7 */
            /* input rescaled onto the output scale */
            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);
            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);
            int16_t rv0 = sat_left_shift_s16((int32_t)hi0 << ls1);
            int16_t rv1 = sat_left_shift_s16((int32_t)hi1 << ls1);
            rv0 = sat_rnd_dbl_hi_mul(rv0, reluish_mult_fxp);
            rv1 = sat_rnd_dbl_hi_mul(rv1, reluish_mult_fxp);
            rv0 = sat_left_shift_s16((int32_t)rv0 * 2);
            rv1 = sat_left_shift_s16((int32_t)rv1 * 2);
            /* map the gate from [-1,1] to [0,1] */
            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);
            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);
            output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);
            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);
        }
    } else if (reluish_mult_exp < 0) {
        /* Negative exponent: rounding right shift after the Q15 multiply. */
        const int neg_relu_exp = -reluish_mult_exp;
        for (; i <= size - 2; i += 2) {
            int16_t iv0 = input[i] - input_zero_point;
            int16_t iv1 = input[i+1] - input_zero_point;
            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;
            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);
            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);
            int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp);
            int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp);
            rv0 = rounding_div_pot_s16(rv0, neg_relu_exp);
            rv1 = rounding_div_pot_s16(rv1, neg_relu_exp);
            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);
            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);
            output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);
            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);
        }
    } else {
        /* Zero exponent: Q15 multiply only, no extra shift. */
        for (; i <= size - 2; i += 2) {
            int16_t iv0 = input[i] - input_zero_point;
            int16_t iv1 = input[i+1] - input_zero_point;
            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;
            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);
            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);
            int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp);
            int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp);
            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);
            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);
            output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);
            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);
        }
    }

    /* Scalar remainder: generic path with the per-element exponent checks
     * (also covers the whole input when size < 2). */
    for (; i < size; i++) {
        int16_t iv = input[i] - input_zero_point;
        int16_t hi = iv * 128;
        int16_t on_out = sat_rnd_dbl_hi_mul(hi, output_mult_fxp);
        int16_t rv = hi;
        if (reluish_mult_exp > 0)
            rv = sat_left_shift_s16((int32_t)rv << (reluish_mult_exp - 1));
        rv = sat_rnd_dbl_hi_mul(rv, reluish_mult_fxp);
        if (reluish_mult_exp > 0)
            rv = sat_left_shift_s16((int32_t)rv * 2);
        if (reluish_mult_exp < 0)
            rv = rounding_div_pot_s16(rv, -reluish_mult_exp);
        rv = (int16_t)(((int32_t)rv + 32768) >> 1);
        output[i] = hard_swish_output(rv, on_out, neg_out_exp, output_zero_point);
    }
}
================================================
FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* ESP32-S3 optimized HardSwish using 256-byte lookup table.
*
* Key insight: HardSwish maps int8 -> int8 with fixed quantization parameters
* per layer. Only 256 possible input values exist. We precompute the full
* mapping once using the ANSI reference (bit-exact), then the inner loop
* is a single byte load per element.
*
* Scratch buffer: 256 bytes (set via esp_nn_set_hard_swish_scratch_buf).
*/
#include <stdint.h>
#include <stddef.h>
/* Use ANSI C reference to build LUT — guarantees bit-exact match */
extern void esp_nn_hard_swish_s8_ansi(const int8_t *input,
int8_t *output,
const int32_t size,
const int16_t input_zero_point,
const int16_t output_mult_fxp,
const int16_t reluish_mult_fxp,
const int32_t reluish_mult_exp,
const int32_t output_mult_exp,
const int16_t output_zero_point);
static int8_t *hard_swish_scratch = NULL;
/* Scratch requirement for the LUT-based hard-swish: two 256-entry byte
 * tables (the identity input table plus the precomputed output LUT). */
int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void)
{
    enum { lut_input_bytes = 256, lut_output_bytes = 256 };
    return lut_input_bytes + lut_output_bytes;
}
/* Register the scratch buffer used to hold the hard-swish LUT (must be at
 * least esp_nn_get_hard_swish_scratch_size_esp32s3() bytes). Passing NULL
 * makes esp_nn_hard_swish_s8_esp32s3 fall back to the ANSI path. */
void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf)
{
    hard_swish_scratch = (int8_t *)buf;
}
/**
 * LUT-based HardSwish for ESP32-S3.
 *
 * Builds a 256-entry table with the ANSI reference (so results are
 * bit-exact), then applies it with one byte load per element.
 *
 * IMPROVEMENT: building the LUT always costs 256 reference evaluations,
 * so for tensors with fewer than 256 elements the direct ANSI path is
 * strictly cheaper — and bit-exact, since the LUT itself is produced by
 * the same routine. The original unconditionally rebuilt the LUT.
 *
 * Parameter semantics match esp_nn_hard_swish_s8_ansi.
 */
void esp_nn_hard_swish_s8_esp32s3(const int8_t *input,
                                  int8_t *output,
                                  const int32_t size,
                                  const int16_t input_zero_point,
                                  const int16_t output_mult_fxp,
                                  const int16_t reluish_mult_fxp,
                                  const int32_t reluish_mult_exp,
                                  const int32_t output_mult_exp,
                                  const int16_t output_zero_point)
{
    /* No scratch buffer => nowhere to hold the LUT; small size => LUT
     * construction costs more than it saves. Either way, defer to ANSI. */
    if (!hard_swish_scratch || size < 256) {
        esp_nn_hard_swish_s8_ansi(input, output, size,
                                  input_zero_point, output_mult_fxp,
                                  reluish_mult_fxp, reluish_mult_exp,
                                  output_mult_exp, output_zero_point);
        return;
    }

    /* Build 256-byte LUT using ANSI reference (bit-exact).
     * lut[(uint8_t)x] = hardswish(x) for the given quant params. */
    int8_t *lut_input = hard_swish_scratch;
    int8_t *lut = hard_swish_scratch + 256;
    for (int i = 0; i < 256; i++) {
        lut_input[i] = (int8_t)i;
    }
    esp_nn_hard_swish_s8_ansi(lut_input, lut, 256,
                              input_zero_point, output_mult_fxp,
                              reluish_mult_fxp, reluish_mult_exp,
                              output_mult_exp, output_zero_point);

    /* Apply LUT — one byte load per element */
    for (int i = 0; i < size; i++) {
        output[i] = lut[(uint8_t)input[i]];
    }
}
================================================
FILE: src/activation_functions/esp_nn_relu_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <stdlib.h>
#include <common_functions.h>
/* In-place ReLU6: clamp every element of `data` to the range [0, 6]. */
void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size)
{
    for (int32_t idx = 0; idx < size; idx++) {
        int32_t val = data[idx];
        if (val < 0) {
            val = 0;
        } else if (val > 6) {
            val = 6;
        }
        data[idx] = (int8_t)val;
    }
}
================================================
FILE: src/activation_functions/esp_nn_relu_s8_esp32p4.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdint.h>
/**
 * In-place ReLU6 for s8 data using ESP32-P4 PIE SIMD.
 * Clamps each element to [0, 6].
 * Processes 16 elements per iteration via 128-bit vector ops; any
 * remaining 0..15 elements are clamped by a scalar tail loop.
 */
void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size)
{
    /* Enable PIE co-processor before issuing `esp.*` vector instructions.
     * NOTE(review): the CSR 0x7f2 / esp.movx.w.cfg sequence is assumed to
     * be the standard PIE enable; confirm against the PIE reference. */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    int i = 0;
    if (size >= 16) {
        /* Broadcast 0 into q2 and 6 into q3 (lower/upper clamp bounds) */
        const int8_t zero_val = 0;
        const int8_t six_val = 6;
        asm volatile (
            "esp.vldbc.8.ip q2, %0, 0 \n\t"
            "esp.vldbc.8.ip q3, %1, 0 \n\t"
            :: "r"(&zero_val), "r"(&six_val)
        );

        int count = size >> 4;   /* number of full 16-byte chunks */
        int stride = 16;
        /* Load/clamp/store loop: the load uses no auto-increment so the
         * store to the same address can advance the pointer instead. */
        asm volatile (
            "mv x30, %[ptr] \n\t"
            "mv x31, %[cnt] \n\t"
            "1: \n\t"
            "esp.vld.128.ip q0, x30, 0 \n\t"       /* load 16 bytes, no auto-increment */
            "esp.vmax.s8 q0, q0, q2 \n\t"          /* max(val, 0) */
            "esp.vmin.s8 q0, q0, q3 \n\t"          /* min(val, 6) */
            "esp.vst.128.xp q0, x30, %[stride] \n\t" /* store and advance ptr by 16 */
            "addi x31, x31, -1 \n\t"
            "bnez x31, 1b \n\t"
            :
            : [ptr] "r"(data), [cnt] "r"(count), [stride] "r"(stride)
            : "x30", "x31", "memory"
        );
        i = count << 4;   /* elements consumed by the vector loop */
    }

    /* Handle remaining elements scalar */
    for (; i < size; i++) {
        int32_t val = data[i];
        if (val < 0) val = 0;
        if (val > 6) val = 6;
        data[i] = (int8_t) val;
    }
}
================================================
FILE: src/activation_functions/esp_nn_relu_s8_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
	.text
	.align	4
	.literal_position

# In-place ReLU6: clamp every int8 element of `data` to [0, 6].
#   a2: data pointer, a3: size (element count)
# Three phases: a 16-wide vector loop, an 8-wide vector loop for the
# remaining multiple of 8, then a scalar tail for the last 0..7 elements.
# Program Unit: esp_nn_relu6_s8_esp32s3
	.type	esp_nn_relu6_s8_esp32s3, @function
	.align	4
	.global	esp_nn_relu6_s8_esp32s3

esp_nn_relu6_s8_esp32s3:
	entry	a1,48 #
	mov.n	a9,a2 # [0], data
	mov.n	a7,a3 # [1], size
	// broadcast the constant 6 into q1 (upper clamp for ee.vmin.s8)
	movi.n	a4,6 # [4]
	s8i	a4,a1,0 # [5] six
	addi	a10,a3,-7 # [2] threshold for the 8-wide loop
	ee.vldbc.8	q1,a1 # [6] id:72 six+0x0
	blti	a3,16,.Lt_0_5634 # [7] fewer than 16: skip the 16-wide loop
	// process multiple of 16; q2 = 0 is the lower clamp for ee.vmax.s8
	srai	a8,a3,4 # [0]
	ee.zero.q	q2 # [1]
	loopgtz	a8,.LBB37_esp_nn_relu6_s8_esp32s3 # [3]
	ee.vld.128.ip	q0,a2,0 # [0*II+0] id:73
	ee.vmax.s8	q0,q0,q2 # [0*II+2]
	ee.vmin.s8	q0,q0,q1 # [0*II+3]
	ee.vst.128.ip	q0,a2,16 # [0*II+4] id:74
.LBB37_esp_nn_relu6_s8_esp32s3: # 0x34
	slli	a8,a8,4 # [0] a8 = elements handled so far
	// remaining multiple of 8 data
	bge	a8,a10,.Lt_0_3586 # [1]
.Lt_0_3842: # 0x3a
	sub	a6,a7,a8 # [0]
	srai	a6,a6,3 # [1] number of 8-byte chunks left
	loopgtz	a6,.LBB52_esp_nn_relu6_s8_esp32s3 # [2]
	ee.vld.l.64.ip	q0,a2,0 # [0*II+0] id:75
	ee.vmax.s8	q0,q0,q2 # [0*II+2]
	ee.vmin.s8	q0,q0,q1 # [0*II+3]
	ee.vst.l.64.ip	q0,a2,8 # [0*II+4] id:76
.LBB52_esp_nn_relu6_s8_esp32s3: # 0x4f
	addx8	a8,a6,a8 # [0] a8 += 8 * chunk count
.Lt_0_3586: # 0x52
	// process leftover (0..7 elements), scalar, with sign extension
	bge	a8,a7,.Lt_0_6402 # [0]
.Lt_0_4866: # 0x55
	movi.n	a5,0 # [0]
	sub	a3,a7,a8 # [1] leftover count
	add.n	a2,a8,a9 # [2] restore element pointer
	l8ui	a6,a2,0 # [3] id:78
	addi.n	a3,a3,-1 # [4]
	sext	a6,a6,7
	max	a6,a5,a6 # [6]
	min	a6,a4,a6 # [7]
	s8i	a6,a2,0 # [8] id:79
	loopgtz	a3,.LBB67_esp_nn_relu6_s8_esp32s3 # [9]
	l8ui	a3,a2,1 # [0*II+0] id:78
	addi.n	a2,a2,1 # [1*II+1]
	sext	a3,a3,7
	max	a3,a5,a3 # [0*II+3]
	min	a3,a4,a3 # [0*II+4]
	s8i	a3,a2,0 # [0*II+5] id:79
.LBB67_esp_nn_relu6_s8_esp32s3: # 0x81
.Lt_0_6402: # 0x83
	retw.n # [0]
.Lt_0_5634: # 0x85
	// size < 16: fall into the 8-wide loop if at least 8 remain
	blti	a10,1,.Lt_0_5890 # [0]
	movi.n	a8,0 # [0]
	ee.zero.q	q2 # [1]
	j	.Lt_0_3842 # [2]
.Lt_0_5890: # 0x90
	// size < 8: scalar tail only (or nothing at all when size == 0)
	beqz.n	a3,.Lt_0_6402 # [0]
	movi.n	a8,0 # [0]
	j	.Lt_0_4866 # [1]

	.size	esp_nn_relu6_s8_esp32s3, . - esp_nn_relu6_s8_esp32s3
================================================
FILE: src/basic_math/esp_nn_add_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <common_functions.h>
/**
 * Elementwise addition of two quantized u8 tensors.
 * Each operand is offset, upscaled by `left_shift`, requantized with its
 * own multiplier/shift pair, summed, requantized onto the output scale,
 * offset, and clamped to [activation_min, activation_max].
 */
void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
                                    const uint8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    uint8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    for (int idx = 0; idx < size; idx++) {
        /* Bring both operands onto a common high-resolution scale. */
        const int32_t raw1 = (input1_data[idx] + input1_offset) << left_shift;
        const int32_t raw2 = (input2_data[idx] + input2_offset) << left_shift;
        const int32_t scaled1 = esp_nn_div_by_power_of_two(
                esp_nn_sat_round_doubling_high_mul(raw1, input1_mult), -input1_shift);
        const int32_t scaled2 = esp_nn_div_by_power_of_two(
                esp_nn_sat_round_doubling_high_mul(raw2, input2_mult), -input2_shift);

        /* Sum, requantize to the output scale, offset, clamp. */
        int32_t result = esp_nn_sat_round_doubling_high_mul(scaled1 + scaled2, out_mult);
        result = esp_nn_div_by_power_of_two(result, -out_shift) + out_offset;
        result = (result > activation_max) ? activation_max : result;
        result = (result < activation_min) ? activation_min : result;
        output[idx] = (uint8_t) result;
    }
}
/**
 * Elementwise addition of two quantized s8 tensors.
 * Mirrors esp_nn_add_elementwise_u8_ansi but for signed int8 data:
 * offset, upscale by `left_shift`, per-input requantize, sum, requantize
 * onto the output scale, offset, clamp to [activation_min, activation_max].
 */
void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    int8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    for (int idx = 0; idx < size; idx++) {
        /* Bring both operands onto a common high-resolution scale. */
        int32_t op1 = (input1_data[idx] + input1_offset) << left_shift;
        int32_t op2 = (input2_data[idx] + input2_offset) << left_shift;
        op1 = esp_nn_sat_round_doubling_high_mul(op1, input1_mult);
        op2 = esp_nn_sat_round_doubling_high_mul(op2, input2_mult);
        op1 = esp_nn_div_by_power_of_two(op1, -input1_shift);
        op2 = esp_nn_div_by_power_of_two(op2, -input2_shift);

        /* Sum, requantize to the output scale, offset, clamp. */
        int32_t sum = esp_nn_sat_round_doubling_high_mul(op1 + op2, out_mult);
        sum = esp_nn_div_by_power_of_two(sum, -out_shift);
        sum += out_offset;
        if (sum > activation_max) {
            sum = activation_max;
        }
        if (sum < activation_min) {
            sum = activation_min;
        }
        output[idx] = (int8_t) sum;
    }
}
================================================
FILE: src/basic_math/esp_nn_add_s8_esp32p4.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdint.h>
#include <common_functions.h>
/**
* Optimized elementwise add for s8 on ESP32-P4.
* Uses fast multiply-by-quantized-mult and 2x unrolling.
*/
/*
 * Core requantization step, inlined to avoid call overhead:
 * a saturating-style doubling high multiply (with +2^30 rounding nudge)
 * followed by an optional rounding right shift.
 * The plain C 64-bit multiply lets the compiler emit a mul/mulh pair
 * instead of a helper call.
 */
static inline __attribute__((always_inline))
int32_t add_requant(int32_t val, int32_t mult, int32_t neg_shift)
{
    /* high 32 bits of the doubled product, rounded */
    const int64_t nudged = (int64_t)val * (int64_t)mult + ((int64_t)1 << 30);
    int32_t out = (int32_t)(nudged >> 31);
    if (neg_shift > 0) {
        /* rounding divide by 2^neg_shift; bias reduced by one for
         * negative values to keep gemmlowp-compatible rounding */
        const int32_t bias = (1 << (neg_shift - 1)) - ((out < 0) ? 1 : 0);
        out = (out + bias) >> neg_shift;
    }
    return out;
}
/**
 * Elementwise s8 addition for ESP32-P4.
 *
 * Same arithmetic as esp_nn_add_elementwise_s8_ansi: each input is offset,
 * upscaled by `left_shift`, requantized with its own multiplier/shift,
 * summed, requantized onto the output scale, offset, then clamped to
 * [activation_min, activation_max].
 *
 * The main loop is unrolled 2x so the compiler can interleave the two
 * independent requant chains; a scalar loop handles the last odd element.
 * The shifts are negated once up front (they are expected to be <= 0 —
 * NOTE(review): confirm with callers) and used as right-shift amounts
 * inside add_requant.
 */
void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
                                       const int8_t *input2_data,
                                       const int32_t input1_offset,
                                       const int32_t input2_offset,
                                       const int32_t input1_mult,
                                       const int32_t input2_mult,
                                       const int32_t input1_shift,
                                       const int32_t input2_shift,
                                       const int32_t left_shift,
                                       int8_t *output,
                                       const int32_t out_offset,
                                       const int32_t out_mult,
                                       const int32_t out_shift,
                                       const int32_t activation_min,
                                       const int32_t activation_max,
                                       const int32_t size)
{
    const int32_t neg_in1_shift = -input1_shift;
    const int32_t neg_in2_shift = -input2_shift;
    const int32_t neg_out_shift = -out_shift;
    int i = 0;

    /* Process 2 at a time - C inline requant lets compiler optimize across calls */
    for (; i <= size - 2; i += 2) {
        int32_t a0 = (input1_data[i + 0] + input1_offset) << left_shift;
        int32_t b0 = (input2_data[i + 0] + input2_offset) << left_shift;
        a0 = add_requant(a0, input1_mult, neg_in1_shift);
        b0 = add_requant(b0, input2_mult, neg_in2_shift);
        int32_t out0 = add_requant(a0 + b0, out_mult, neg_out_shift) + out_offset;
        out0 = max(activation_min, min(out0, activation_max));

        int32_t a1 = (input1_data[i + 1] + input1_offset) << left_shift;
        int32_t b1 = (input2_data[i + 1] + input2_offset) << left_shift;
        a1 = add_requant(a1, input1_mult, neg_in1_shift);
        b1 = add_requant(b1, input2_mult, neg_in2_shift);
        int32_t out1 = add_requant(a1 + b1, out_mult, neg_out_shift) + out_offset;
        out1 = max(activation_min, min(out1, activation_max));

        output[i + 0] = (int8_t) out0;
        output[i + 1] = (int8_t) out1;
    }

    /* Scalar remainder (at most one element) */
    for (; i < size; i++) {
        int32_t tmp1 = (input1_data[i] + input1_offset) << left_shift;
        int32_t tmp2 = (input2_data[i] + input2_offset) << left_shift;
        tmp1 = add_requant(tmp1, input1_mult, neg_in1_shift);
        tmp2 = add_requant(tmp2, input2_mult, neg_in2_shift);
        int32_t out = add_requant(tmp1 + tmp2, out_mult, neg_out_shift) + out_offset;
        out = max(activation_min, min(out, activation_max));
        output[i] = (int8_t) out;
    }
}
================================================
FILE: src/basic_math/esp_nn_add_s8_esp32s3.S
================================================
// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.text
.align 4
.literal_position
.literal .nudge_val, 1073741824
# Program Unit: esp_nn_add_elementwise_s8_esp32s3
.type esp_nn_add_elementwise_s8_esp32s3, @function
.align 4
.global esp_nn_add_elementwise_s8_esp32s3
esp_nn_add_elementwise_s8_esp32s3: # 0x4
# temp_neg_out_shift = 0
# temp_neg_input2_shift = 4
# temp_neg_input1_shift = 8
# gra_spill_temp_2 = 12
# gra_spill_temp_3 = 16
# gra_spill_temp_4 = 20
# gra_spill_temp_5 = 24
# gra_spill_temp_6 = 28
# gra_spill_temp_7 = 32
# gra_spill_temp_8 = 36
# gra_spill_temp_9 = 40
# gra_spill_temp_10 = 44
# gra_spill_temp_11 = 48
# gra_spill_temp_12 = 52
# gra_spill_temp_13 = 56
// a2 : *input1_data
// a3 : *input2_data
// a4 : input1_offset
// a5 : input2_offset
// a6 : input1_mult
// a7 : input2_mult
// On stack:
// 80: input1_shift
// 84: input2_shift
// 88: left_shift
// 92: *output
// 96: out_offset
// 100: out_mult, loaded in `a8`
// 104: out_shift
// 108: activation_min
// 112: activation_max
// 116: size
entry a1,80 #
s32i.n a4,a1,48 # [10] gra_spill_temp_11, input1_offset
s32i.n a5,a1,52 # [0] gra_spill_temp_12, input2_offset
s32i.n a2,a1,32 # [5] gra_spill_temp_7, input1_data
s32i.n a3,a1,12 # [3] gra_spill_temp_2, input2_data
l32i a12,a1,116 # [11] id:720 size+0x0
mov.n a14,a2 # [6]
mov.n a10,a3 # [8]
blti a12,1,.exit # [1] // exit
l32i a3,a1,80 # [0] id:721 input1_shift+0x0
l32i a13,a1,84 # [1] id:722 input2_shift+0x0
l32i a2,a1,104 # [8] id:723 out_shift+0x0
l32i a8,a1,100 # [1] out_mult
neg a3,a3 # [12]
neg a13,a13 # [7]
neg a2,a2 # [11]
s32i.n a3,a1,8 # [12] temp_neg_input1_shift, -input1_shift
s32i.n a13,a1,4 # [7] temp_neg_input2_shift, -input2_shift
s32i.n a2,a1,0 # [16] temp_neg_out_shift, -out_shift
movi.n a5,1
addi a9,a3,-1
ssl a9
sll a15,a5
s32i.n a15,a1,16 # gra_spill_temp_3, 1 << (exponent - 1) for input1
addi a9,a13,-1
ssl a9
sll a15,a5
s32i.n a15,a1,20 # gra_spill_temp_4, 1 << (exponent - 1) for input2
addi a9,a2,-1
ssl a9
sll a15,a5
s32i.n a15,a1,24 # gra_spill_temp_5, 1 << (exponent - 1) for out
movi.n a2,0
blti a12,12,.process_leftover # [23]
// skip to leftover routine if inputs are unaligned
or a9,a14,a10
extui a9,a9,0,4
bnez a9,.process_leftover
l32i a9,a1,92 # [17] id:1279 output+0x0
l32i a13,a1,116 # [20]
srai a13,a13,3 # [21]
s32i.n a13,a1,56 # [22] gra_spill_temp_13
movi.n a13,8
s32i.n a13,a1,28 # gra_spill_temp_6, mult_of8 counter
ee.zero.q q6 # [8]
.vector_loop: // process 8 values in one go
l32i a15,a1,88 # [6] left_shift
ee.vld.l.64.ip q0,a14,8 # [9] id:729
s32i.n a9,a1,44 # [10] gra_spill_temp_10, out_ptr
s32i.n a14,a1,40 # [20] gra_spill_temp_9
wsr.sar a15 # [21] load left shift
addi.n a15,a1,48 # [14]
ee.vldbc.16 q7,a15 # [21] id:1277 input1_offset
ee.vcmp.lt.s8 q5,q0,q6 # [29]
ee.vzip.8 q0,q5 # [31], 20 bits
ee.vadds.s16 q0,q0,q7 # [34], add offset
ee.vcmp.lt.s16 q2,q0,q6 # [36]
ee.vzip.16 q0,q2 # [39], 32 bits
ee.vsl.32 q0,q0 # [41] left_shift
ee.vsl.32 q2,q2 # [42] left_shift
l32r a9,.nudge_val # [15], nudge
// mulhi32 for q0
ee.movi.32.a q0,a3,2 # [44]
ee.movi.32.a q0,a4,3 # [45]
ee.movi.32.a q0,a14,1 # [46]
ee.movi.32.a q0,a5,0 # [62]
mulsh a13,a6,a3 # [51]
mull a3,a6,a3 # [53]
mulsh a12,a6,a4 # [50]
mull a4,a6,a4 # [55]
mulsh a15,a6,a14 # [48]
mull a14,a6,a14 # [49]
ssai 31 # [47]
add a3,a3,a9
saltu a2,a3,a9
add.n a13,a13,a2
src a13,a13,a3
add a4,a4,a9
saltu a2,a4,a9
add.n a12,a12,a2
src a12,a12,a4
ee.movi.32.q q0,a13,2 # [62]
add a14,a14,a9
saltu a2,a14,a9
add.n a15,a15,a2
src a15,a15,a14
ee.movi.32.q q0,a12,3 # [62]
mulsh a13,a6,a5 # [51]
mull a5,a6,a5 # [53]
ee.movi.32.q q0,a15,1 # [62]
add a5,a5,a9
saltu a2,a5,a9
add.n a13,a13,a2
src a13,a13,a5
ee.movi.32.q q0,a13,0 # [62]
// mulhi32 for q2
ee.movi.32.a q2,a3,2 # [44]
ee.movi.32.a q2,a4,3 # [45]
ee.movi.32.a q2,a14,1 # [46]
ee.movi.32.a q2,a5,0 # [62]
mulsh a13,a6,a3 # [51]
mull a3,a6,a3 # [53]
mulsh a12,a6,a4 # [50]
mull a4,a6,a4 # [55]
mulsh a15,a6,a14 # [48]
mull a14,a6,a14 # [49]
ssai 31 # [47]
add a3,a3,a9
saltu a2,a3,a9
add.n a13,a13,a2
src a13,a13,a3
add a4,a4,a9
saltu a2,a4,a9
add.n a12,a12,a2
src a12,a12,a4
ee.movi.32.q q2,a13,2 # [62]
add a14,a14,a9
saltu a2,a14,a9
add.n a15,a15,a2
src a15,a15,a14
ee.movi.32.q q2,a12,3 # [62]
mulsh a13,a6,a5 # [51]
mull a5,a6,a5 # [53]
ee.movi.32.q q2,a15,1 # [62]
l32i a3,a1,8 # [12] temp_neg_input1_shift, -input1_shift
add a5,a5,a9
saltu a2,a5,a9
add.n a13,a13,a2
src a13,a13,a5
ee.movi.32.q q2,a13,0 # [62]
blti a3,1, .skip_div_by2_in0
addi.n a13,a1,16
ee.vcmp.lt.s32 q1,q0,q6
ee.vcmp.lt.s32 q3,q2,q6
ee.vldbc.32 q5,a13 // 1 << (exponent - 1)
wsr.sar a3 // load right_shift
ee.vadds.s32 q0,q0,q1 // subtract 1 `if (val < 0)`
ee.vadds.s32 q2,q2,q3 // subtract 1 `if (val < 0)`
ee.vadds.s32 q0,q0,q5
ee.vadds.s32 q2,q2,q5
ee.vsr.32 q0,q0
ee.vsr.32 q2,q2
.skip_div_by2_in0:
ee.vld.l.64.ip q1,a10,8 # [11] id:1290
addi.n a15,a1,52 # [12]
ee.vldbc.16 q7,a15 # [19] id:1278 input2_offset
l32i a15,a1,88 # [6] left_shift
s32i a10,a1,36 # [14] gra_spill_temp_8
ee.vcmp.lt.s8 q3,q1,q6 # [271]
wsr.sar a15 # [21], load shift for left shift
ee.vzip.8 q1,q3 # [274], 20 bits
ee.vadds.s16 q1,q1,q7 # [281]
ee.vcmp.lt.s16 q3,q1,q6 # [282]
ee.vzip.16 q1,q3 # [283], 32 bits
ee.vsl.32 q1,q1 # [284]
ee.vsl.32 q3,q3 # [285]
// mulhi32 for q1
ee.movi.32.a q1,a3,2 # [44]
ee.movi.32.a q1,a4,3 # [45]
ee.movi.32.a q1,a14,1 # [46]
ee.movi.32.a q1,a5,0 # [62]
mulsh a13,a7,a3 # [51]
mull a3,a7,a3 # [53]
mulsh a12,a7,a4 # [50]
mull a4,a7,a4 # [55]
mulsh a15,a7,a14 # [48]
mull a14,a7,a14 # [49]
ssai 31 # [47]
add a3,a3,a9
saltu a2,a3,a9
add.n a13,a13,a2
src a13,a13,a3
add a4,a4,a9
saltu a2,a4,a9
add.n a12,a12,a2
src a12,a12,a4
ee.movi.32.q q1,a13,2 # [62]
add a14,a14,a9
saltu a2,a14,a9
add.n a15,a15,a2
src a15,a15,a14
ee.movi.32.q q1,a12,3 # [62]
mulsh a13,a7,a5 # [51]
mull a5,a7,a5 # [53]
ee.movi.32.q q1,a15,1 # [62]
add a5,a5,a9
saltu a2,a5,a9
add.n a13,a13,a2
src a13,a13,a5
ee.movi.32.q q1,a13,0 # [62]
// mulhi32 for q3
ee.movi.32.a q3,a3,2 # [44]
ee.movi.32.a q3,a4,3 # [45]
ee.movi.32.a q3,a14,1 # [46]
ee.movi.32.a q3,a5,0 # [62]
mulsh a13,a7,a3 # [51]
mull a3,a7,a3 # [53]
mulsh a12,a7,a4 # [50]
mull a4,a7,a4 # [55]
mulsh a15,a7,a14 # [48]
mull a14,a7,a14 # [49]
ssai 31 # [47]
add a3,a3,a9
saltu a2,a3,a9
add.n a13,a13,a2
src a13,a13,a3
add a4,a4,a9
saltu a2,a4,a9
add.n a12,a12,a2
src a12,a12,a4
ee.movi.32.q q3,a13,2 # [62]
add a14,a14,a9
saltu a2,a14,a9
add.n a15,a15,a2
src a15,a15,a14
ee.movi.32.q q3,a12,3 # [62]
mulsh a13,a7,a5 # [51]
mull a5,a7,a5 # [53]
ee.movi.32.q q3,a15,1 # [62]
l32i a14,a1,4 # [7] temp_neg_input2_shift, -input2_shift
add a5,a5,a9
saltu a2,a5,a9
add.n a13,a13,a2
src a13,a13,a5
ee.movi.32.q q3,a13,0 # [62]
// multiplication results: q0-q2 & q1-q3
blti a14,1, .skip_div_by2_in1
addi.n a5,a1,20
ee.vcmp.lt.s32 q4,q1,q6
ee.vcmp.lt.s32 q5,q3,q6
ee.vldbc.32 q7,a5 // 1 << (exponent - 1)
wsr.sar a14 // load right_shift
ee.vadds.s32 q4,q4,q7 // subtract 1 `if (val < 0)`
ee.vadds.s32 q5,q5,q7 // subtract 1 `if (val < 0)`
ee.vadds.s32 q1,q1,q4
ee.vadds.s32 q3,q3,q5
ee.vsr.32 q1,q1
ee.vsr.32 q3,q3
.skip_div_by2_in1:
ee.vadds.s32 q0,q0,q1
ee.vadds.s32 q1,q2,q3
// mulhi32 for q0
ee.movi.32.a q0,a3,2 # [44]
ee.movi.32.a q0,a4,3 # [45]
ee.movi.32.a q0,a14,1 # [46]
ee.movi.32.a q0,a5,0 # [62]
mulsh a13,a8,a3 # [51]
mull a3,a8,a3 # [53]
mulsh a12,a8,a4 # [50]
mull a4,a8,a4 # [55]
mulsh a15,a8,a14 # [48]
mull a14,a8,a14 # [49]
ssai 31 # [47]
add a3,a3,a9
saltu a2,a3,a9
add.n a13,a13,a2
src a13,a13,a3
add a4,a4,a9
saltu a2,a4,a9
add.n a12,a12,a2
src a12,a12,a4
ee.movi.32.q q0,a13,2 # [62]
add a14,a14,a9
saltu a2,a14,a9
add.n a15,a15,a2
src a15,a15,a14
ee.movi.32.q q0,a12,3 # [62]
mulsh a13,a8,a5 # [51]
mull a5,a8,a5 # [53]
ee.movi.32.q q0,a15,1 # [62]
add a5,a5,a9
saltu a2,a5,a9
add.n a13,a13,a2
src a13,a13,a5
ee.movi.32.q q0,a13,0 # [62]
// mulhi32 for q1
ee.movi.32.a q1,a3,2 # [44]
ee.movi.32.a q1,a4,3 # [45]
ee.movi.32.a q1,a14,1 # [46]
ee.movi.32.a q1,a5,0 # [62]
mulsh a13,a8,a3 # [51]
mull a3,a8,a3 # [53]
mulsh a12,a8,a4 # [50]
mull a4,a8,a4 # [55]
mulsh a15,a8,a14 # [48]
mull a14,a8,a14 # [49]
ssai 31 # [47]
add a3,a3,a9
saltu a2,a3,a9
add.n a13,a13,a2
src a13,a13,a3
add a4,a4,a9
saltu a2,a4,a9
add.n a12,a12,a2
src a12,a12,a4
ee.movi.32.q q1,a13,2 # [62]
add a14,a14,a9
saltu a2,a14,a9
add.n a15,a15,a2
src a15,a15,a14
ee.movi.32.q q1,a12,3 # [62]
mulsh a13,a8,a5 # [51]
mull a5,a8,a5 # [53]
ee.movi.32.q q1,a15,1 # [62]
l32i a14,a1,0 # [738] temp_neg_out_shift, -out_shift
add a5,a5,a9
saltu a2,a5,a9
add.n a13,a13,a2
src a13,a13,a5
ee.movi.32.q q1,a13,0 # [62]
//q0-q1 has output
blti a14,1,.skip_div_by2_out
addi.n a5,a1,24
ee.vcmp.lt.s32 q2,q0,q6
ee.vcmp.lt.s32 q3,q1,q6
ee.vldbc.32 q5,a5 // 1 << (exponent - 1)
wsr.sar a14 // load right shift
ee.vadds.s32 q0,q0,q2 // subtract 1 `if (val < 0)`
ee.vadds.s32 q1,q1,q3 // subtract 1 `if (val < 0)`
ee.vadds.s32 q0,q0,q5
ee.vadds.s32 q1,q1,q5
ee.vsr.32 q0,q0
ee.vsr.32 q1,q1
.skip_div_by2_out:
// add offset and apply activation
addi a15,a1,96
ee.vldbc.32 q3,a15 # [809] id:802 out_offset
ee.vadds.s32 q0,q0,q3 # [811]
ee.vadds.s32 q1,q1,q3 # [812]
addi a13,a1,108
addi a14,a1,112
ee.vldbc.32 q3,a14 # [813] id:803 activation_max
ee.vmin.s32 q0,q0,q3 # [815]
ee.vmin.s32 q1,q1,q3 # [816]
ee.vldbc.32 q3,a13 # [817] id:804 activation_min
l32i a13,a1,4 # [818] temp_neg_input2_shift
ee.vmax.s32 q1,q1,q3 # [819]
ee.vmax.s32 q0,q0,q3 # [820]
//pack the data and store
l32i.n a9,a1,44 # [784] gra_spill_temp_10
ee.vunzip.16 q0,q1 # [821]
ee.vunzip.8 q0,q1 # [822]
l32i.n a13,a1,28 # gra_spill_temp_6, multiple of 12 index
ee.vst.l.64.ip q0,a9,8 # [823] id:805
l32i a15,a1,116 # [1], size
l32i.n a14,a1,40 # [20] gra_spill_temp_9
l32i.n a10,a1,36 # [14] gra_spill_temp_8
addi a13,a13,8
s32i.n a13,a1,28 # gra_spill_temp_6
bge a15,a13,.vector_loop
l32i.n a2,a1,56 # [0] gra_spill_temp_13
// check for leftover
l32i a10,a1,116 # [1]
slli a2,a2,3 # [2]
bge a2,a10,.exit # [3] // done, exit
.process_leftover:
l32i.n a3,a1,48 # [1] gra_spill_temp_11
l32i.n a12,a1,52 # [2] gra_spill_temp_12
l32i.n a10,a1,12 # [3] gra_spill_temp_2
l32i.n a14,a1,32 # [8] gra_spill_temp_7
add.n a10,a2,a10 # [5]
add.n a14,a2,a14 # [6]
l8ui a14,a14,0 # [7] id:809, input1
l8ui a10,a10,0 # [12] id:1370, input2
sext a14,a14,7 # [9]
sext a10,a10,7 # [10]
add.n a10,a10,a12 # [11] // add offset2
add.n a14,a14,a3 # [16] // add offset1
l32i a12,a1,88 # [13] left_shift
// sat_round_doubling_high_mul step for input1 and input2
ssl a12 # [15]
sll a10,a10 # [20]
sll a14,a14 # [17]
l32r a12,.nudge_val # [0], nudge
// a13,a3 are free, a12: nudge, a6:mult1
mulsh a13,a14,a6
mull a9,a14,a6
ssai 31
add a9,a9,a12
saltu a3,a9,a12
add.n a13,a13,a3
src a14,a13,a9 //result in a14
mulsh a13,a10,a7
mull a9,a10,a7
ssai 31
add a9,a9,a12
saltu a3,a9,a12
add.n a13,a13,a3
src a10,a13,a9 //result in a10
// divide_by_power_of2_step for input1 (a14), input2 (a10)
// free registers: a13, a12, a9, a3
l32i.n a12,a1,8 // -input1_shift
l32i.n a13,a1,4 // -input2_shift
blti a12,1,.skip_div_by2_in0_remain
l32i.n a3,a1,16 // 1 << (exponent - 1)
extui a9,a14,31,1
ssr a12 // load right_shift
sub a3,a3,a9 // 1 << (exponent - 1) - (val < 0)
add a14,a14,a3
sra a14,a14
.skip_div_by2_in0_remain:
blti a13,1,.skip_div_by2_in1_remain
l32i.n a3,a1,20 // 1 << (exponent - 1)
extui a9,a10,31,1
ssr a13 // load right_shift
sub a3,a3,a9 // 1 << (exponent - 1) - (val < 0)
add a10,a10,a3
sra a10,a10
.skip_div_by2_in1_remain:
// process output
l32r a12,.nudge_val # [0], nudge
l32i a13,a1,0 // -out_shift
add.n a10,a10,a14 # [45]
// multiply and pick high32
mulsh a3,a10,a8
mull a10,a10,a8
ssai 31 # [0]
add a10,a10,a12
saltu a9,a10,a12
add a12,a3,a9
src a12,a12,a10
// div by power of 2 for output
l32i a9,a1,96 # [31] out_offset
blti a13,1,.skip_div_by2_out_remain
l32i.n a3,a1,24 // 1 << (exponent - 1)
extui a14,a12,31,1
ssr a13 // load right_shift
sub a3,a3,a14 // 1 << (exponent - 1) - (val < 0)
add a12,a12,a3
sra a12,a12
.skip_div_by2_out_remain:
// add offset
add.n a9,a9,a12 # [33]
// apply activation
l32i a13,a1,112 # [34] activation_max
l32i a12,a1,108 # [35] activation_min
min a13,a13,a9 # [36]
l32i a9,a1,92 # [37] output
max a13,a13,a12 # [38]
add.n a9,a2,a9 # [39]
s8i a13,a9,0 # [40] id:1371
l32i a12,a1,116
addi.n a2,a2,1 # [41]
blt a2,a12,.process_leftover
.exit:
retw.n # [0]
.size esp_nn_add_elementwise_s8_esp32s3, . - esp_nn_add_elementwise_s8_esp32s3
================================================
FILE: src/basic_math/esp_nn_mul_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <stdint.h>
#include <common_functions.h>
/**
 * Element-wise multiply of two quantized s8 tensors (portable reference).
 *
 * For each index the inputs are moved to their zero-points, multiplied,
 * requantized with (out_mult, out_shift) using the bit-exact TFLite scheme,
 * shifted by out_offset and clamped to [activation_min, activation_max].
 *
 * @param input1_data / input2_data  s8 input tensors, `size` elements each
 * @param input1_offset / input2_offset  zero-point corrections added to inputs
 * @param output          s8 output tensor, `size` elements
 * @param out_offset      zero-point of the output
 * @param out_mult        fixed-point output multiplier
 * @param out_shift       output shift (positive = scale up)
 * @param activation_min / activation_max  inclusive output clamp range
 * @param size            number of elements to process
 */
void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    int8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    int32_t idx = 0;
    while (idx < size) {
        const int32_t a = (int32_t) input1_data[idx] + input1_offset;
        const int32_t b = (int32_t) input2_data[idx] + input2_offset;
        int32_t acc = esp_nn_multiply_by_quantized_mult(a * b, out_mult, out_shift);
        acc += out_offset;
        acc = min(acc, activation_max);
        acc = max(acc, activation_min);
        output[idx] = (int8_t) acc;
        idx += 1;
    }
}
/**
 * Broadcast multiply for the SE-block pattern: [H,W,C] * [1,1,C] (portable
 * reference). The per-channel vector `input2_per_ch` is re-applied to every
 * spatial position of `input1`.
 *
 * @param input1          s8 input, total_spatial * channels elements (HWC)
 * @param input2_per_ch   s8 per-channel multiplier vector, `channels` elements
 * @param input1_offset / input2_offset  zero-point corrections added to inputs
 * @param output          s8 output, total_spatial * channels elements
 * @param output_offset   zero-point of the output
 * @param output_mult / output_shift  fixed-point requantization parameters
 * @param activation_min / activation_max  inclusive output clamp range
 * @param total_spatial   number of spatial positions (H * W)
 * @param channels        number of channels
 */
void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,
                                          const int8_t *input2_per_ch,
                                          const int32_t input1_offset,
                                          const int32_t input2_offset,
                                          int8_t *output,
                                          const int32_t output_offset,
                                          const int32_t output_mult,
                                          const int32_t output_shift,
                                          const int32_t activation_min,
                                          const int32_t activation_max,
                                          const int32_t total_spatial,
                                          const int32_t channels)
{
    const int8_t *src = input1;
    int8_t *dst = output;
    for (int32_t s = 0; s < total_spatial; s++, src += channels, dst += channels) {
        for (int32_t c = 0; c < channels; c++) {
            const int32_t a = (int32_t) src[c] + input1_offset;
            const int32_t b = (int32_t) input2_per_ch[c] + input2_offset;
            int32_t acc = esp_nn_multiply_by_quantized_mult(a * b, output_mult, output_shift);
            acc += output_offset;
            /* clamp to activation range (max-then-min, as in the other kernels) */
            if (acc < activation_min) {
                acc = activation_min;
            }
            if (acc > activation_max) {
                acc = activation_max;
            }
            dst[c] = (int8_t) acc;
        }
    }
}
================================================
FILE: src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S
================================================
// Copyright 2026 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Optimized broadcast MUL for SE-block pattern: [H,W,C] * [1,1,C]
// Processes 8 channels at a time using S3 SIMD.
.text
.align 4
.literal_position
.literal .LC_nudge, 1073741824 // 1 << 30
.type esp_nn_mul_broadcast_channel_s8_esp32s3, @function
.align 4
.global esp_nn_mul_broadcast_channel_s8_esp32s3
// void esp_nn_mul_broadcast_channel_s8_esp32s3(
// const int8_t *input1, // a2
// const int8_t *input2_per_ch, // a3
// const int32_t input1_offset, // a4
// const int32_t input2_offset, // a5
// int8_t *output, // a6
// const int32_t output_offset, // a7
// const int32_t output_mult, // stack+120
// const int32_t output_shift, // stack+124
// const int32_t activation_min, // stack+128
// const int32_t activation_max, // stack+132
// const int32_t total_spatial, // stack+136
// const int32_t channels); // stack+140
// Stack frame layout (entry a1, 120):
// 0: to_add (for div by power of 2)
// 4: input2_per_ch (saved)
// 8: output base (saved)
// 12: channels
// 16: input1 base (saved)
// 20: right_shift
// 24: input1_offset (saved)
// 28: input2_offset (saved)
// 32: spatial counter
// 36: out_ptr (current)
// 40: out_offset (from a7)
// 44: input1_offset (for vldbc)
// 48: input2_offset (for vldbc)
esp_nn_mul_broadcast_channel_s8_esp32s3:
    // out[s][c] = clamp(requant((input1[s][c] + input1_offset) *
    //                           (input2_per_ch[c] + input2_offset)) + out_offset)
    // for s in [0, total_spatial), c in [0, channels).
    // Vector path processes 8 channels per iteration when input1 row, input2
    // vector and output row are all 16-byte aligned; otherwise (and for the
    // trailing channels) the scalar path below is used.
    entry a1, 120
    // Save args
    s32i.n  a3, a1, 4           // input2_per_ch base
    s32i.n  a6, a1, 8           // output base
    s32i.n  a2, a1, 16          // input1 base
    s32i.n  a4, a1, 24          // input1_offset
    s32i.n  a5, a1, 28          // input2_offset
    s32i    a7, a1, 40          // out_offset
    l32i    a8, a1, 136         // total_spatial
    l32i    a9, a1, 140         // channels
    s32i.n  a9, a1, 12          // save channels
    blti    a8, 1, .Lexit       // no spatial positions
    blti    a9, 1, .Lexit       // no channels
    // Prepare shift values
    l32i    a15, a1, 124        // output_shift
    movi.n  a11, 0
    max     a14, a15, a11       // left_shift = max(shift, 0)
    sub     a4, a14, a15        // right_shift = left_shift - shift
    s32i.n  a4, a1, 20          // save right_shift
    l32i    a13, a1, 120        // output_mult
    l32r    a4, .LC_nudge       // nudge = 1 << 30 (a4 keeps it for the whole function)
    // Store offsets for vldbc
    l32i    a8, a1, 136         // reload total_spatial
    s32i    a5, a1, 48          // input2_offset for vldbc
    l32i.n  a5, a1, 24          // input1_offset
    s32i    a5, a1, 44          // input1_offset for vldbc
    // Init spatial counter
    movi.n  a10, 0
    s32i    a10, a1, 32         // spatial counter = 0
    // Pointers: a2 = input1 (current), a3 = input2_per_ch (reloaded each row),
    // a6 = output (current)
.Lspatial_loop:
    l32i    a8, a1, 136         // total_spatial
    l32i    a10, a1, 32         // spatial counter
    bge     a10, a8, .Lexit
    // Reset input2 pointer for each spatial position
    l32i.n  a3, a1, 4           // input2_per_ch base
    // Channel counter
    l32i.n  a9, a1, 12          // channels
    movi.n  a11, 0              // channel index
    blti    a9, 8, .Lchannel_leftover
    // Check alignment for SIMD path (OR of the three pointers' low 4 bits)
    or      a8, a2, a3
    or      a8, a8, a6
    extui   a8, a8, 0, 4
    bnez    a8, .Lchannel_leftover
    // Setup SIMD constants
    ee.zero.q q1                // zero register
    addi    a8, a1, 44
    ee.vldbc.16 q0, a8          // input1_offset broadcast
    addi    a8, a1, 48
    ee.vldbc.16 q7, a8          // input2_offset broadcast
    st.qr   q0, a1, 64          // save for reload in loop
.Lchannel_simd_loop:
    addi    a8, a9, -7          // channels - 7
    blt     a11, a8, .Lchannel_simd_body
    j       .Lchannel_leftover
.Lchannel_simd_body:
    ld.qr   q4, a1, 64          // input1_offset
    ee.vld.l.64.ip q2, a2, 8    // load 8 input1 values
    movi.n  a7, 16
    ee.vld.h.64.ip q2, a3, 8    // load 8 input2 values (per-ch)
    wsr.sar a7
    ee.vcmp.lt.s8 q5, q2, q1    // sign extend
    ee.vzip.8 q2, q5            // interleave to 16-bit
    ee.vadds.s16 q5, q5, q7     // add input2_offset
    ee.vadds.s16 q4, q2, q4     // add input1_offset
    ee.vmul.s16 q3, q4, q5      // multiply (high part)
    ssai    0                   // sar = 0
    ee.vmul.s16 q2, q4, q5      // multiply (low part)
    // Requantize 8 results (same pattern as elementwise mul):
    // per lane, compute (x << left_shift) * mult as mulsh:mull, add the
    // 1<<30 nudge to the low word with carry (saltu) into the high word,
    // then `src` extracts bits [62:31].
    wsr.sar a14                 // left_shift
    ee.vzip.16 q2, q3
    ee.vsl.32 q6, q2            // left shift first 4
    ssai    31
    // Element 2 of q6
    ee.movi.32.a q6, a8, 2
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a5, a8, a4          // carry out of low-word + nudge
    add.n   a5, a5, a7
    src     a5, a5, a8
    // Element 3
    ee.movi.32.a q6, a8, 3
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a12, a8, a4
    add.n   a12, a12, a7
    src     a12, a12, a8
    ee.movi.32.q q2, a5, 2
    ee.movi.32.q q2, a12, 3
    // Element 1
    ee.movi.32.a q6, a8, 1
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a5, a8, a4
    add.n   a5, a5, a7
    src     a5, a5, a8
    // Element 0
    ee.movi.32.a q6, a8, 0
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a12, a8, a4
    add.n   a12, a12, a7
    src     a12, a12, a8
    ee.movi.32.q q2, a5, 1
    ee.movi.32.q q2, a12, 0
    // Second group of 4 (q3)
    wsr.sar a14                 // left_shift
    ee.vsl.32 q4, q3
    ssai    31
    ee.movi.32.a q4, a8, 2
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a5, a8, a4
    add.n   a5, a5, a7
    src     a5, a5, a8
    ee.movi.32.a q4, a8, 3
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a12, a8, a4
    add.n   a12, a12, a7
    src     a12, a12, a8
    ee.movi.32.q q0, a5, 2
    ee.movi.32.q q0, a12, 3
    ee.movi.32.a q4, a8, 1
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a5, a8, a4
    add.n   a5, a5, a7
    src     a5, a5, a8
    ee.movi.32.a q4, a8, 0
    mulsh   a7, a13, a8
    mull    a8, a13, a8
    add.n   a8, a4, a8
    saltu   a12, a8, a4
    add.n   a12, a12, a7
    src     a12, a12, a8
    ee.movi.32.q q0, a5, 1
    ee.movi.32.q q0, a12, 0
    // Divide by power of 2 (right_shift), rounding: val + (1<<(rs-1)) - (val<0)
    l32i.n  a5, a1, 20          // right_shift
    movi.n  a7, 1
    blti    a5, 1, .Lskip_div
    ee.vcmp.lt.s32 q5, q2, q1   // lane-wise -1 where value < 0
    ee.vcmp.lt.s32 q6, q0, q1
    addi.n  a8, a5, -1
    ssl     a8
    sll     a7, a7              // to_add = 1 << (right_shift - 1)
    s32i.n  a7, a1, 0
    ee.vldbc.32 q4, a1          // broadcast to_add
    wsr.sar a5
    ee.vadds.s32 q5, q4, q5     // to_add - 1 in negative lanes
    ee.vadds.s32 q5, q2, q5
    ee.vsr.32 q2, q5
    wsr.sar a5
    ee.vadds.s32 q5, q4, q6
    ee.vadds.s32 q5, q0, q5
    ee.vsr.32 q0, q5
.Lskip_div:
    // Add output offset, apply activation
    addi    a8, a1, 132
    ee.vldbc.32 q4, a8          // activation_max
    addi    a5, a1, 40
    ee.vldbc.32 q6, a5          // output_offset
    addi    a7, a1, 128
    ee.vadds.s32 q0, q0, q6     // add offset
    ee.vadds.s32 q2, q2, q6
    ee.vldbc.32 q6, a7          // activation_min
    ee.vmin.s32 q0, q0, q4
    ee.vmin.s32 q2, q2, q4
    ee.vmax.s32 q0, q0, q6
    ee.vmax.s32 q2, q2, q6
    // Pack 32-bit -> 8-bit and store
    ee.vunzip.16 q2, q0
    ee.vunzip.8 q2, q0
    ee.vst.l.64.ip q2, a6, 8
    addi    a11, a11, 8         // channel index += 8
    j       .Lchannel_simd_loop
.Lchannel_leftover:
    // Process remaining channels one by one
    l32i.n  a9, a1, 12          // channels
    bge     a11, a9, .Lspatial_next
    ssl     a14                 // left_shift
    l32i.n  a8, a1, 24          // input1_offset
    l8ui    a10, a2, 0          // *input1
    sext    a10, a10, 7
    add.n   a10, a10, a8        // + input1_offset
    l32i.n  a8, a1, 28          // input2_offset
    l8ui    a12, a3, 0          // *input2_per_ch
    sext    a12, a12, 7
    add.n   a12, a12, a8        // + input2_offset
    mull    a10, a10, a12       // multiply
    // Requantize: ((x << left_shift) * mult + (1<<30)) >> 31
    sll     a10, a10            // left shift
    l32i.n  a9, a1, 20          // right_shift
    mulsh   a8, a10, a13
    mull    a12, a10, a13
    ssai    31
    add.n   a12, a4, a12
    saltu   a10, a12, a4        // carry from nudge add
    add.n   a10, a10, a8
    src     a10, a10, a12       // result
    blti    a9, 1, .Lskip_div_scalar
    addi    a8, a9, -1
    ssl     a8
    movi    a7, 1
    sll     a7, a7              // to_add
    extui   a8, a10, 31, 1      // sign bit (1 if neg, 0 if pos)
    sub     a10, a10, a8        // val -= sign (fast rounding)
    add     a10, a10, a7
    ssr     a9
    sra     a10, a10
.Lskip_div_scalar:
    l32i    a8, a1, 40          // output_offset
    l32i    a7, a1, 128         // activation_min
    l32i    a12, a1, 132        // activation_max
    add.n   a10, a10, a8
    min     a10, a10, a12
    max     a10, a10, a7
    s8i     a10, a6, 0          // store
    addi    a2, a2, 1           // input1++
    addi    a3, a3, 1           // input2++
    addi    a6, a6, 1           // output++
    addi    a11, a11, 1         // channel index++
    j       .Lchannel_leftover
.Lspatial_next:
    l32i    a10, a1, 32         // spatial counter
    addi    a10, a10, 1
    s32i    a10, a1, 32
    j       .Lspatial_loop
.Lexit:
    retw.n
    .size esp_nn_mul_broadcast_channel_s8_esp32s3, . - esp_nn_mul_broadcast_channel_s8_esp32s3
================================================
FILE: src/basic_math/esp_nn_mul_s8_esp32p4.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#include <stdint.h>
#include <common_functions.h>
/**
* Elementwise multiply for s8 optimized for ESP32-P4.
* Uses inlined fast requantization with 4x unrolled loop.
* Interleaves independent computations to hide latency.
*/
/**
 * Elementwise multiply for s8 optimized for ESP32-P4.
 *
 * The main loop handles 4 elements per iteration with an inlined "fast"
 * requantization (sign-independent 1<<30 nudge, no INT32_MIN saturation),
 * exposing independent computations so the pipeline stays full.
 *
 * NOTE(review): the 4-wide body always uses the fast rounding, while the
 * tail below goes through esp_nn_requantize, which is bit-exact unless
 * SKIP_NUDGE/CONFIG_NN_SKIP_NUDGE is set — so tail elements may round
 * differently from the unrolled ones; confirm this is intended.
 */
void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,
                                       const int8_t *input2_data,
                                       const int32_t input1_offset,
                                       const int32_t input2_offset,
                                       int8_t *output,
                                       const int32_t out_offset,
                                       const int32_t out_mult,
                                       const int32_t out_shift,
                                       const int32_t activation_min,
                                       const int32_t activation_max,
                                       const int32_t size)
{
    const int32_t lshift = out_shift > 0 ? out_shift : 0;
    const int32_t rshift = lshift - out_shift;
    const int64_t round_bias = (int64_t) 1 << 30;

    int32_t idx = 0;
    for (; idx <= size - 4; idx += 4) {
        /* zero-point shift and widen, 4 independent lanes */
        int32_t acc[4];
        for (int k = 0; k < 4; k++) {
            const int32_t a = input1_data[idx + k] + input1_offset;
            const int32_t b = input2_data[idx + k] + input2_offset;
            const int32_t scaled = (a * b) << lshift;
            acc[k] = (int32_t) (((int64_t) scaled * out_mult + round_bias) >> 31);
        }
        /* rounding right shift: val + (1 << (rshift-1)) - (val < 0) */
        if (rshift > 0) {
            const int32_t rnd = 1 << (rshift - 1);
            for (int k = 0; k < 4; k++) {
                acc[k] = (acc[k] + rnd - (acc[k] < 0)) >> rshift;
            }
        }
        /* offset, clamp, narrow to s8 */
        for (int k = 0; k < 4; k++) {
            int32_t v = acc[k] + out_offset;
            v = min(v, activation_max);
            v = max(v, activation_min);
            output[idx + k] = (int8_t) v;
        }
    }
    /* leftover elements (size not a multiple of 4) */
    for (; idx < size; idx++) {
        const int32_t prod = (input1_data[idx] + input1_offset) *
                             (input2_data[idx] + input2_offset);
        int32_t v = esp_nn_requantize(prod, out_mult, out_shift);
        v = min(v + out_offset, activation_max);
        v = max(v, activation_min);
        output[idx] = (int8_t) v;
    }
}
================================================
FILE: src/basic_math/esp_nn_mul_s8_esp32s3.S
================================================
// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.text
.align 4
.literal_position
.literal .LC0_26_123, 1073741824 // `1 << 30`
# Program Unit: esp_nn_mul_elementwise_s8_esp32s3
.type esp_nn_mul_elementwise_s8_esp32s3, @function
.align 4
.global esp_nn_mul_elementwise_s8_esp32s3
esp_nn_mul_elementwise_s8_esp32s3: # 0x4
    // out[i] = clamp(requant((input1[i] + input1_offset) *
    //                        (input2[i] + input2_offset)) + out_offset)
    //
    // Fixes vs previous revision:
    //  * the 1<<30 requant nudge (.LC0_26_123) was only loaded into a4 on the
    //    SIMD setup path, so size<8 / unaligned inputs requantized with the
    //    right_shift value left in a4 — the literal is now reloaded at
    //    .process_leftover (harmless reload on the SIMD fall-through path);
    //  * fast divide-by-power-of-2 rounding now SUBTRACTS the sign bit,
    //    matching esp_nn_div_by_power_of_two_fast and the add/broadcast kernels
    //    (it previously added it);
    //  * l32i.n with immediates 128/132/136 exceed the narrow-form range
    //    (0..60) and are widened to l32i.
    //
    // Stack spill map (offsets from a1):
    # to_add = 0                  (1 << (right_shift - 1))
    # gra_spill_temp_0 = 4        (input2 base)
    # gra_spill_temp_1 = 8        (output base)
    # gra_spill_temp_2 = 12       (processed element count)
    # gra_spill_temp_3 = 16       (input1 base)
    # gra_spill_temp_4 = 20       (right_shift)
    # gra_spill_temp_5 = 24       (input1_offset)
    # gra_spill_temp_6 = 28       (input2_offset)
    # gra_spill_temp_7 = 32
    # gra_spill_temp_8 = 36       (current output pointer)
    # out_offset = 40, input1_offset = 44, input2_offset = 48 (vldbc copies)
    # gra_spill_temp_13 = 64      (broadcast input1_offset, 128-bit spill)
    // registers:
    // a2: const int8_t *input1_data
    // a3: const int8_t *input2_data
    // a4: const int32_t input1_offset
    // a5: const int32_t input2_offset
    // a6: int8_t *output
    // a7: const int32_t out_offset
    // on stack:
    // 120: const int32_t out_mult
    // 124: const int32_t out_shift
    // 128: const int32_t activation_min
    // 132: const int32_t activation_max
    // 136: const int32_t size
    entry a1,120 #
    s32i.n  a4,a1,24            # [0] gra_spill_temp_5, input1_offset
    s32i.n  a5,a1,28            # [1] gra_spill_temp_6, input2_offset
    s32i.n  a3,a1,4             # [5] gra_spill_temp_0, input2
    mov.n   a10,a3              # [6]
    l32i    a3,a1,136           # [18] id:361 size+0x0
    mov.n   a9,a6               # [2] // out_addr
    blti    a3,1,.exit          # [0] // exit
    s32i.n  a2,a1,16            # [9] gra_spill_temp_3, input1
    s32i    a7,a1,40            # [4] id:358 out_offset+0x0
    movi.n  a11,0               # [3]
    mov.n   a12,a2              # [10]
    s32i    a4,a1,44            # [13] id:356 input1_offset+0x0
    s32i    a5,a1,48            # [14] id:357 input2_offset+0x0
    movi.n  a2,1                # [15]
    l32i    a15,a1,124          # [3] id:362 out_shift+0x0
    l32i    a13,a1,120          # [4] id:363 out_mult+0x0
    s32i.n  a6,a1,8             # [1] gra_spill_temp_1, out_addr
    max     a14,a15,a11         # [11] left_shift
    sub     a4,a14,a15          # right_shift
    s32i.n  a4,a1,20            # [9] gra_spill_temp_4
    blti    a3,8,.process_leftover # [20]
    // skip to leftover routine if inputs are unaligned
    or      a6,a12,a10
    extui   a6,a6,0,4
    bnez    a6,.process_leftover
    // `size > 8`, s3 optimisation path...
    ee.zero.q q1                # [0]
    addi    a4,a1,44            # [7]
    addi    a8,a1,48            # [8]
    ee.vldbc.16 q0,a4           # [17] id:359 input1_offset
    ee.vldbc.16 q7,a8           # [16] id:360 input2_offset
    l32r    a4,.LC0_26_123      # [12] nudge = 1 << 30
    movi    a8, 8
    st.qr   q0,a1,64            # [19] gra_spill_temp_13
    s32i.n  a8,a1,12            # [6] gra_spill_temp_2
.Lt_0_7682: # 0x60
    s32i    a9,a1,36            # [1] gra_spill_temp_8, out_addr
    ld.qr   q4,a1,64            # [2] gra_spill_temp_13, input1_offset
    ee.vld.l.64.ip q2,a12,8     # [4] id:367, input1_ptr
    movi.n  a7,16               # [3]
    ee.vld.h.64.ip q2,a10,8     # [5] id:368, input2_ptr
    wsr.sar a7                  # [6]
    ee.vcmp.lt.s8 q5,q2,q1      # [7] sign bits for widening
    ee.vzip.8 q2,q5             # [8] widen to 16-bit lanes
    ee.vadds.s16 q5,q5,q7       # [9] input2_offset
    ee.vadds.s16 q4,q2,q4       # [10] input1_offset
    ee.vmul.s16 q3,q4,q5        # [11] product high halves (sar = 16)
    wsr.sar a11                 # [12] sar = 0
    ee.vmul.s16 q2,q4,q5        # [13] product low halves
    wsr.sar a14                 # [14] left_shift
    ee.vzip.16 q2,q3            # [15] rebuild 32-bit products
    ee.vsl.32 q6,q2             # [16] left_shift
    ssai 31                     # [17]
    // per lane: (x * mult + nudge) >> 31 via mulsh:mull + carry (saltu) + src
    ee.movi.32.a q6,a3,2        # [18]
    ee.movi.32.a q6,a8,3        # [26]
    mulsh   a6,a13,a3           # [19]
    mull    a3,a13,a3           # [20]
    mulsh   a7,a13,a8           # [27]
    add.n   a3,a4,a3            # [22]
    saltu   a2,a3,a4            # [23]
    add.n   a2,a2,a6            # [24]
    src     a2,a2,a3            # [25]
    mull    a6,a13,a8           # [28]
    add.n   a6,a4,a6            # [30]
    saltu   a9,a6,a4            # [31]
    add.n   a9,a9,a7            # [32]
    src     a9,a9,a6            # [33]
    ee.movi.32.q q2,a2,2        # [53]
    ee.movi.32.q q2,a9,3        # [54]
    ee.movi.32.a q6,a6,1        # [34]
    mulsh   a7,a13,a6           # [35]
    mull    a6,a13,a6           # [36]
    add.n   a6,a4,a6            # [38]
    saltu   a3,a6,a4            # [39]
    add.n   a3,a3,a7            # [16]
    src     a3,a3,a6            # [41]
    ee.movi.32.a q6,a2,0        # [42]
    mulsh   a8,a13,a2           # [43]
    mull    a7,a13,a2           # [4]
    add.n   a7,a4,a7            # [46]
    saltu   a6,a7,a4            # [47]
    add.n   a6,a6,a8            # [24]
    src     a6,a6,a7            # [49]
    ee.movi.32.q q2,a3,1        # [28]
    ee.movi.32.q q2,a6,0        # [50]
    // second group of 4 (q3)
    wsr.sar a14                 # [10]
    ee.vsl.32 q4,q3             # [11]
    ee.movi.32.a q4,a2,2        # [13]
    mulsh   a3,a13,a2           # [14]
    mull    a2,a13,a2           # [15]
    ssai 31                     # [12]
    add.n   a2,a4,a2            # [17]
    saltu   a5,a2,a4            # [18]
    add.n   a5,a5,a3            # [19]
    src     a5,a5,a2            # [20]
    ee.movi.32.a q4,a3,3        # [21]
    mulsh   a6,a13,a3           # [22]
    mull    a3,a13,a3           # [23]
    add.n   a3,a4,a3            # [25]
    saltu   a8,a3,a4            # [26]
    add.n   a8,a8,a6            # [27]
    src     a8,a8,a3            # [28]
    ee.movi.32.q q0,a5,2        # [24]
    ee.movi.32.q q0,a8,3        # [51]
    ee.movi.32.a q4,a7,1        # [29]
    mulsh   a6,a13,a7           # [30]
    mull    a3,a13,a7           # [31]
    add.n   a3,a4,a3            # [33]
    saltu   a2,a3,a4            # [34]
    add.n   a2,a2,a6            # [35]
    src     a2,a2,a3            # [36]
    ee.movi.32.a q4,a6,0        # [37]
    mulsh   a7,a13,a6           # [38]
    mull    a6,a13,a6           # [39]
    add.n   a6,a4,a6            # [41]
    saltu   a3,a6,a4            # [42]
    add.n   a3,a3,a7            # [43]
    src     a3,a3,a6            # [4]
    ee.movi.32.q q0,a2,1        # [47]
    ee.movi.32.q q0,a3,0        # [46]
    l32i.n  a5,a1,20            # [0] gra_spill_temp_4, right_shift
    movi.n  a7,1                # [51]
    blti    a5,1,.skip_div_by_pow_of_2
    // divide by power of 2: val + (1 << (rs-1)) - (val < 0), then >> rs
    ee.vcmp.lt.s32 q5,q2,q1     # [56]
    ee.vcmp.lt.s32 q6,q0,q1     # [28]
    addi.n  a8,a5,-1            # [1]
    ssl     a8                  # [2]
    sll     a7,a7               # [3]
    s32i.n  a7,a1,0             # [4] to_add
    ee.vldbc.32 q4,a1           # [5] id:376 to_add
    wsr.sar a5                  # [6]
    ee.vadds.s32 q5,q4,q5       # [7]
    ee.vadds.s32 q5,q2,q5       # [8]
    ee.vsr.32 q2,q5             # [9]
    wsr.sar a5                  # [5]
    ee.vadds.s32 q5,q4,q6       # [9]
    ee.vadds.s32 q5,q0,q5       # [11]
    ee.vsr.32 q0,q5             # [12]
.skip_div_by_pow_of_2:
    // add offset, apply activation
    addi    a8,a1,132           # [54]
    ee.vldbc.32 q4,a8           # [55] id:385 activation_max
    addi    a5,a1,40            # [8]
    ee.vldbc.32 q6,a5           # [10] id:384 out_offset
    addi    a7,a1,128           # [4]
    ee.vadds.s32 q0,q0,q6       # [13] // add out_offset
    ee.vadds.s32 q2,q2,q6       # [14] // add out_offset
    ee.vldbc.32 q6,a7           # [16] id:386 activation_min
    ee.vmin.s32 q0,q0,q4        # [17]
    ee.vmin.s32 q2,q2,q4        # [15]
    ee.vmax.s32 q0,q0,q6        # [18]
    ee.vmax.s32 q2,q2,q6        # [19]
    // pack and store
    ee.vunzip.16 q2,q0          # [20]
    ee.vunzip.8 q2,q0           # [21]
    l32i.n  a7,a1,12            // count
    l32i    a9,a1,36            # [55] gra_spill_temp_8
    l32i    a3,a1,136           # [1] , size (l32i: offset 136 exceeds narrow range)
    ee.vst.l.64.ip q2,a9,8      # [22] id:387
    addi    a7,a7,8
    s32i.n  a7,a1,12            // increment count
    bge     a3,a7,.Lt_0_7682
    addi    a11,a7,-8
    bge     a11,a3,.exit        # [3] // exit
.process_leftover:
    // a4 must hold the 1<<30 nudge here on ALL entry paths (small size and
    // unaligned inputs branch here before the SIMD setup loaded it).
    l32r    a4,.LC0_26_123      // nudge = 1 << 30
    sub     a8,a3,a11           # [1]
    loopgtz a8,.LBB33_esp_nn_mul_elementwise_s8_esp32s3 # [9]
    ssl     a14                 # [0] left_shift
    l32i.n  a8,a1,24            # [1] gra_spill_temp_5, input1_offset
    l32i.n  a10,a1,4            # [2] gra_spill_temp_0, input2
    l32i.n  a12,a1,16           # [3] gra_spill_temp_3, input1
    add.n   a10,a11,a10         # [4], input2
    add.n   a12,a11,a12         # [5], input1
    l8ui    a12,a12,0           # [6] id:390
    l8ui    a10,a10,0           # [7] id:391
    sext    a12,a12,7           # [8]
    add.n   a12,a12,a8          # [9]
    l32i.n  a8,a1,28            # [10] gra_spill_temp_6, input2_offset
    sext    a10,a10,7           # [11]
    add.n   a10,a10,a8          # [12]
    mull    a10,a12,a10         # [13] // multiplication result
    // multiply by quantised mult
    l32i.n  a9,a1,20            # [0] gra_spill_temp_4, load right_shift
    sll     a10,a10             # [15] // left shift
    mulsh   a3,a10,a13          # [1]
    mull    a8,a10,a13          # [6]
    ssai 31                     # [0]
    add.n   a6,a8,a4            # [8] low word + nudge
    saltu   a8,a6,a8            # [9] carry
    add.n   a8,a8,a3            # [10]
    src     a3,a8,a6            # [19] // result
    blti    a9, 1, .skip_div_by_pow_of_2_remains
    // divide by power of 2
    // calculate to_add = `1 << (exponent - 1)`
    addi    a6,a9,-1
    ssl     a6                  # [23]
    movi    a7,1
    sll     a7,a7               // to_add
    extui   a8,a3,31,1          # [24], sign
    sub     a3,a3,a8            // subtract sign (matches esp_nn_div_by_power_of_two_fast)
    add     a3,a3,a7            // add to_add
    ssr     a9                  # [20] load right_shift
    sra     a3,a3               // right shift
.skip_div_by_pow_of_2_remains:
    l32i.n  a6,a1,40            # [32], out_offset
    l32i    a8,a1,132           # [35], act_max (l32i: offset out of narrow range)
    l32i    a7,a1,128           # [36], act_min (l32i: offset out of narrow range)
    // add offset and apply activation
    add.n   a3,a3,a6            # [34], offset added
    min     a8,a8,a3            # [37]
    l32i.n  a3,a1,8             # [38] gra_spill_temp_1, load base out_addr
    max     a8,a8,a7            # [39]
    // store
    add.n   a3,a11,a3           # [16], add index from `a11`
    s8i     a8,a3,0             # [41] id:392 // store
    addi.n  a11,a11,1           # [42] // inc index
.LBB33_esp_nn_mul_elementwise_s8_esp32s3: # 0x2ed
.exit:
    retw.n                      # [0]
    .size esp_nn_mul_elementwise_s8_esp32s3, . - esp_nn_mul_elementwise_s8_esp32s3
================================================
FILE: src/common/common_functions.h
================================================
/*
* SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
#pragma once
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
/**
 * C99 `inline` alone does not guarantee inlining; pair it with the
 * always_inline attribute so these hot helpers are forced inline.
 */
#define __NN_FORCE_INLINE__ __attribute((always_inline)) static inline

/* min/max macros (statement-expression form: each argument is evaluated
 * exactly once, so side-effecting arguments are safe). */
#ifndef max
#define max(a, b) ({ \
    __typeof__ (a) _a = (a); \
    __typeof__ (b) _b = (b); \
    _a > _b ? _a : _b; \
})

#define min(a, b) ({ \
    __typeof__ (a) _a = (a); \
    __typeof__ (b) _b = (b); \
    _a < _b ? _a : _b; \
})
#endif
/**
 * Count leading zero bits of a 32-bit value.
 *
 * Returns 32 for an input of 0 on every path (the Xtensa NSAU instruction
 * and the portable fallback both do; the GCC builtin path now guards zero
 * explicitly because `__builtin_clz(0)` is undefined behavior).
 *
 * @param in  value to scan
 * @return number of leading zero bits, in [0, 32]
 */
__attribute((always_inline)) static inline int32_t esp_nn_clz32(uint32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    __asm__ volatile("nsau %0, %0" : "+r" (in));
    return in;
#elif defined(__GNUC__)
    /* __builtin_clz(0) is undefined; return 32 to match the other paths. */
    return (in == 0) ? 32 : __builtin_clz(in);
#else
    /* Portable binary-search fallback (returns 32 for in == 0). */
    int32_t count = 32;
    uint32_t x = in, y = in >> 16;
    if (y != 0) {
        count -= 16;
        x = y;
    }
    y = x >> 8;
    if (y != 0) {
        count -= 8;
        x = y;
    }
    y = x >> 4;
    if (y != 0) {
        count -= 4;
        x = y;
    }
    y = x >> 2;
    if (y != 0) {
        count -= 2;
        x = y;
    }
    y = x >> 1;
    if (y != 0) {
        return count - 2;
    }
    return count - x;
#endif
}
/**
 * Signed-saturate a 32-bit value to the s8 range [-128, 127], keeping the
 * result in an int32_t.
 */
__attribute((always_inline)) static inline int32_t esp_nn_saturate8(int32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    __asm__ volatile("clamps %0, %0, 7" : "+a"(in));
    return in;
#else
    if (in > INT8_MAX) {
        return INT8_MAX;
    }
    if (in < INT8_MIN) {
        return INT8_MIN;
    }
    return in;
#endif
}
/**
 * Extract bits [62:31] of a 64-bit value: floor division by 2^31 for
 * non-negative inputs, ceiling for negative ones (the 2^31 - 1 bias).
 * Despite the name, no saturation is applied here.
 */
__attribute((always_inline)) static inline int32_t esp_nn_pick_sat_high32_of64(int64_t val64)
{
    if (val64 < 0) {
        /* round the discarded low bits toward +infinity */
        val64 += (1ul << 31) - 1;
    }
    return (int32_t) (val64 >> 31);
}
/**
 * Saturating rounding doubling high multiply (TFLite reference semantics):
 * returns the rounded value of (in0 * in1) / 2^31, with the single overflow
 * case INT32_MIN * INT32_MIN saturated to INT32_MAX.
 */
__attribute((always_inline)) static inline int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1)
{
    /* Only INT32_MIN * INT32_MIN overflows the doubled high product. */
    if (in0 == INT32_MIN && in0 == in1) {
        return INT32_MAX;
    }
    /* Nudge toward nearest: +2^30 for same-sign products, 1 - 2^30 otherwise. */
    int64_t nudge_val = 1 << 30;
    if ((in0 < 0) != (in1 < 0)) {
        nudge_val = 1 - nudge_val;
    }
    int64_t mult = (int64_t) in0 * in1 + nudge_val;
    /* Pick bits [62:31], rounding negatives toward +infinity
     * (inlined esp_nn_pick_sat_high32_of64). */
    if (mult < 0) {
        mult += (1ul << 31) - 1;
    }
    return (int32_t) (mult >> 31);
}
/**
 * Fast rounding divide by 2^exponent (expects exponent >= 1).
 *
 * This is the quick version: it can be off for values within
 * `1 << (exponent - 1)` of INT32_MAX/INT32_MIN. We can afford that because
 * it runs at the very last filter stage and the final output is 8-bit, so
 * such inputs are extremely rare.
 */
__attribute((always_inline)) static inline int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent)
{
    int32_t bias = 1 << (exponent - 1);
    if (val < 0) {
        bias -= 1;   /* negatives round toward zero on ties */
    }
    return (val + bias) >> exponent;
}
/**
 * Bit-exact rounding divide by 2^exponent (round half away from zero),
 * matching the TFLite reference. exponent must be >= 0.
 */
__attribute((always_inline)) static inline int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent)
{
    const int32_t mask = (1 << exponent) - 1;
    int32_t quotient = val >> exponent;
    /* Round away from zero past the halfway point; for negative quotients
     * the halfway remainder itself stays put (hence the +1 on the threshold). */
    const int32_t half = (mask >> 1) + (quotient < 0 ? 1 : 0);
    if ((val & mask) > half) {
        quotient += 1;
    }
    return quotient;
}
/**
 * Bit-exact requantize: scale `x` by the fixed-point multiplier `mult` with
 * shift `shift` (positive shift scales up before the high-mul; non-positive
 * shift divides, with rounding, afterwards). TFLite reference semantics.
 */
__attribute((always_inline)) static inline int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift)
{
    int32_t left_shift = 0;
    int32_t right_shift = 0;
    if (shift > 0) {
        left_shift = shift;
    } else {
        right_shift = -shift;
    }
    const int32_t scaled = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult);
    return esp_nn_div_by_power_of_two(scaled, right_shift);
}
#if CONFIG_IDF_TARGET_ESP32P4
/** PIE enable macro - call once before using any esp.* instructions.
 *  Sets bit 0 of CSR 0x7f2 and writes mode 0b10 via esp.movx.w.cfg
 *  (clobbers x29). */
#define ESP_NN_PIE_ENABLE() do { \
    asm volatile ( \
        "csrsi 0x7f2, 0b01 \n\t" \
        "li x29, 0b10 \n\t" \
        "esp.movx.w.cfg x29 \n\t" \
        ::: "x29" \
    ); \
} while(0)

/** Extract 16 int32 per-lane results from QACC into array.
 *  `dst` must point to 64 writable bytes: four 128-bit stores, the first
 *  three post-incrementing x30 by 16 (clobbers x30). */
#define ESP_NN_QACC_EXTRACT_S32(dst) do { \
    asm volatile ( \
        "mv x30, %0 \n\t" \
        "esp.st.qacc.l.l.128.ip x30, 16 \n\t" \
        "esp.st.qacc.l.h.128.ip x30, 16 \n\t" \
        "esp.st.qacc.h.l.128.ip x30, 16 \n\t" \
        "esp.st.qacc.h.h.128.ip x30, 0 \n\t" \
        :: "r"(dst) \
        : "x30", "memory" \
    ); \
} while(0)
#endif /* CONFIG_IDF_TARGET_ESP32P4 - PIE_ENABLE and QACC_EXTRACT */
/**
 * 2-wide interleaved requant macro for ESP32-P4 RISC-V.
 * Interleaves mulh across two independent elements for pipeline fill.
 * Outputs r0, r1 as requantized int32 values (before offset/clamp).
 *
 * Inputs x0/x1 are 32-bit accumulators, m0/m1 fixed-point multipliers,
 * s0/s1 per-element shifts (positive = scale up). Uses the "fast" rounding:
 * a sign-independent 1<<30 nudge folded into the 64-bit product via a
 * carry out of the low word, then bits [62:31] are extracted; the optional
 * right shift rounds with `+ (1 << (rs-1)) - (val < 0)`.
 */
#if CONFIG_IDF_TARGET_ESP32P4
#define ESP_NN_REQUANT_2X(x0, x1, m0, m1, s0, s1, r0, r1) do { \
    int32_t _ls0 = (s0) > 0 ? (s0) : 0; \
    int32_t _ls1 = (s1) > 0 ? (s1) : 0; \
    int32_t _v0 = (x0) << _ls0; \
    int32_t _v1 = (x1) << _ls1; \
    int32_t _rs0 = _ls0 - (s0); \
    int32_t _rs1 = _ls1 - (s1); \
    int32_t _hi0, _lo0, _hi1, _lo1; \
    asm volatile ( \
        "mulh %[h0], %[v0], %[mm0] \n\t" \
        "mulh %[h1], %[v1], %[mm1] \n\t" \
        "mul %[l0], %[v0], %[mm0] \n\t" \
        "mul %[l1], %[v1], %[mm1] \n\t" \
        : [h0] "=&r"(_hi0), [h1] "=&r"(_hi1), \
          [l0] "=&r"(_lo0), [l1] "=&r"(_lo1) \
        : [v0] "r"(_v0), [v1] "r"(_v1), \
          [mm0] "r"((int32_t)(m0)), [mm1] "r"((int32_t)(m1)) \
    ); \
    /* Add nudge (1<<30) and extract bits [31:62] */ \
    uint32_t _n = 0x40000000u; \
    uint32_t _a0 = (uint32_t)_lo0 + _n; \
    _hi0 += (_a0 < (uint32_t)_lo0); \
    (r0) = (_hi0 << 1) | (_a0 >> 31); \
    uint32_t _a1 = (uint32_t)_lo1 + _n; \
    _hi1 += (_a1 < (uint32_t)_lo1); \
    (r1) = (_hi1 << 1) | (_a1 >> 31); \
    /* Right shift with rounding */ \
    if (_rs0) { (r0) = ((r0) + (1 << (_rs0 - 1)) - ((r0) < 0)) >> _rs0; } \
    if (_rs1) { (r1) = ((r1) + (1 << (_rs1 - 1)) - ((r1) < 0)) >> _rs1; } \
} while(0)
#endif
/**
 * Fast (non-bit-exact) requantize: like esp_nn_multiply_by_quantized_mult but
 * with a sign-independent 1<<30 nudge and no INT32_MIN saturation handling.
 */
__attribute((always_inline)) static inline int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift)
{
    const int32_t left_shift = shift > 0 ? shift : 0;
    const int32_t right_shift = left_shift - shift;
    /* widen, multiply, add the rounding nudge, take bits [62:31] */
    const int64_t product = (int64_t) (x << left_shift) * mult + ((int64_t) 1 << 30);
    int32_t result = (int32_t) (product >> 31);
    if (right_shift) {
        /* inlined esp_nn_div_by_power_of_two_fast */
        const int32_t bias = (1 << (right_shift - 1)) - (result < 0);
        result = (result + bias) >> right_shift;
    }
    return result;
}
/*
 * Unified requantize wrapper. Defining either SKIP_NUDGE (legacy) or
 * CONFIG_NN_SKIP_NUDGE (Kconfig-driven) selects the faster, non-bit-exact
 * path; otherwise the bit-exact TFLite-reference path is used.
 * Both map (x, mult, shift) to a requantized int32; the caller applies the
 * output offset and activation clamping afterwards.
 */
#if defined(SKIP_NUDGE) || defined(CONFIG_NN_SKIP_NUDGE)
#define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult_fast((x), (m), (s))
#else
#define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult((x), (m), (s))
#endif
/**
 * Copy an s8 HWC image into `dst` while surrounding it with a border of
 * `pad_wd` columns and `pad_ht` rows filled with `pad_val` on all four sides.
 * `dst` must hold (input_wd + 2*pad_wd) * (input_ht + 2*pad_ht) * channels
 * bytes.
 */
static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst,
                                             const uint16_t input_wd,
                                             const uint16_t input_ht,
                                             const uint16_t channels,
                                             const int32_t pad_val,
                                             const uint16_t pad_wd,
                                             const uint16_t pad_ht)
{
    const int32_t out_row_bytes = (input_wd + 2 * pad_wd) * channels;
    const int32_t in_row_bytes = input_wd * channels;

    /* Fill the whole destination with the pad value first... */
    memset(dst, pad_val, out_row_bytes * (input_ht + 2 * pad_ht));

    /* ...then overwrite the interior with the source rows. */
    dst += out_row_bytes * pad_ht + pad_wd * channels;
    for (int32_t row = 0; row < input_ht; row++) {
        memcpy(dst, src, in_row_bytes);
        src += in_row_bytes;
        dst += out_row_bytes;
    }
}
/**
 * Copy an s8 HWC image into `dst`, appending `pad_wd` columns of `pad_val`
 * at the end of every row and `pad_ht` rows of `pad_val` after the last row
 * (end-only padding; nothing is added before the data).
 * `dst` must hold (input_wd + pad_wd) * (input_ht + pad_ht) * channels bytes.
 */
static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst,
                                                 const uint16_t input_wd,
                                                 const uint16_t input_ht,
                                                 const uint16_t channels,
                                                 const int32_t pad_val,
                                                 const uint16_t pad_wd,
                                                 const uint16_t pad_ht)
{
    const int32_t in_row_bytes = input_wd * channels;
    const int32_t pad_bytes = pad_wd * channels;

    for (int32_t row = 0; row < input_ht; row++) {
        memcpy(dst, src, in_row_bytes);
        src += in_row_bytes;
        dst += in_row_bytes;
        if (pad_wd) {
            memset(dst, pad_val, pad_bytes);
            dst += pad_bytes;
        }
    }
    /* pad end `pad_ht` lines at end */
    if (pad_ht) {
        memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels);
    }
}
/**
 * @brief convert 8 bit input data to 16 bit, adding an offset
 *
 * @param src    int8_t source data
 * @param dst    int16_t dst data
 * @param size   length of data
 * @param offset offset to be added to src data. Range: [-128, 127]
 */
__attribute((always_inline)) static inline void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst,
                                                                             const int size, const int32_t offset)
{
    int i = 0;
    /* Process pairs, stopping before the last element when `size` is odd.
     * (The previous `i < size` bound stepped by 2 past the end for odd sizes,
     * reading/writing one element out of bounds and making the tail below
     * unreachable.) */
    for (; i < size - 1; i += 2) {
        dst[i + 0] = src[i + 0] + offset;
        dst[i + 1] = src[i + 1] + offset;
    }
    /* odd-size tail */
    if (i < size) {
        dst[i] = src[i] + offset;
    }
}
/**
 * @brief convert 8 bit input data to 16 bit
 *
 * @param src  int8_t source data
 * @param dst  int16_t dst data
 * @param size length of data
 *
 * Fix: the pair loop previously ran while `i < size`, so for an odd `size`
 * its last iteration read src[size] and wrote dst[size] — one element past
 * the end of both buffers — and left the odd-element tail unreachable.
 * The loop now stops while a full pair remains (`i + 1 < size`) and the
 * tail handles the final odd element.
 */
__NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size)
{
    int i = 0;
    /* Process pairs; `i + 1 < size` guarantees src[i + 1] is in bounds. */
    for (; i + 1 < size; i += 2) {
        dst[i + 0] = src[i + 0];
        dst[i + 1] = src[i + 1];
    }
    /* Odd trailing element. */
    if (i < size) {
        dst[i] = src[i];
    }
}
#if CONFIG_IDF_TARGET_ESP32S3
/**
* @brief s8 dot product — both pointers 16-byte aligned.
* Uses ACCX accumulator with fused MAC+load.
*
* @param a input data (16-byte aligned)
* @param b filter data (16-byte aligned)
* @param len number of elements (must be multiple of 16, >= 16)
* @return int32_t dot product result
*/
extern int32_t esp_nn_dot_s8_aligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len);
/**
* @brief s8 dot product — input aligned, filter may be unaligned.
* Uses USAR+QUP pattern for filter data.
*
* @param a input data (16-byte aligned)
* @param b filter data (may be unaligned)
* @param len_div16 number of 16-element chunks (>= 1)
* @return int32_t dot product result
*/
extern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16);
#endif
================================================
FILE: src/common/esp_nn_common_functions_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.text
# Program Unit: esp_nn_aligned_s8_to_s16_with_offset_esp32s3
#
# esp_nn_aligned_s8_to_s16_with_offset_esp32s3(src /*a2*/, dst /*a3*/,
#                                              size /*a4*/, offset /*a5*/)
#
# Widens int8 `src` to int16 `dst`, adding `offset` to every element.
# size >= 32 takes the vector path: 32 elements per loop iteration; the
# sign half is produced with EE.VCMP.LT.S8 against zero, interleaved with
# the data via EE.VZIP.8, and the broadcast offset (q4) added with
# EE.VADDS.S16. Smaller / leftover counts run a 2-per-iteration scalar
# loop plus an odd-element tail.
# NOTE(review): name says "aligned" — presumably `src`/`dst` must be
# 16-byte aligned for EE.VLD.128/EE.VST.128; confirm with callers.
.type esp_nn_aligned_s8_to_s16_with_offset_esp32s3, @function
.align 4
.global esp_nn_aligned_s8_to_s16_with_offset_esp32s3
esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x30d
entry a1,48 #
mov.n a10,a2 # // src
mov.n a9,a3 # // dst
mov.n a8,a4 # // size
s32i.n a5,a1,12 # [3] // offset (spilled so it can be broadcast / reloaded)
addi.n a2,a1,12 # [4]
blti a4,32,.Lt_2_6402 # [5] if (size < 32) goto unopt
# vector prologue: broadcast offset, preload and convert the first chunk
addi.n a6,a8,-1 # [0]
ee.zero.q q5 # [1]
ee.vldbc.16 q4,a2 # [2] id:136 offset
mov.n a3,a10 # [3]
mov.n a2,a9 # [4]
ee.vld.128.ip q0,a3,16 # [5] id:137
ee.vld.128.ip q1,a3,16 # [6] id:138
ee.vcmp.lt.s8 q2,q0,q5 # [7]
ee.vzip.8 q0,q2 # [8]
ee.vadds.s16 q0,q0,q4 # [9]
ee.vadds.s16.st.incp q0,a2,q0,q2,q4 # [10] id:139
blti a4,64,.Lt_2_7170 # [11]
# main vector loop: (size - 32) / 32 iterations, 32 elements each
addi a5,a4,-32 # [0]
srai a5,a5,5 # [1]
slli a4,a5,5 # [2]
loopgtz a5,.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 # [3]
ee.vst.128.ip q0,a2,16 # [0*II+0] id:140
ee.vcmp.lt.s8 q0,q1,q5 # [0*II+1]
ee.vzip.8 q1,q0 # [0*II+2]
ee.vadds.s16.ld.incp q2,a3,q3,q1,q4 # [0*II+3] id:141
ee.vadds.s16.st.incp q3,a2,q0,q0,q4 # [0*II+4] id:142
ee.vcmp.lt.s8 q3,q2,q5 # [0*II+5]
ee.vst.128.ip q0,a2,16 # [0*II+6] id:143
ee.vzip.8 q2,q3 # [0*II+7]
ee.vadds.s16.ld.incp q1,a3,q0,q2,q4 # [0*II+8] id:144
ee.vadds.s16.st.incp q0,a2,q0,q3,q4 # [0*II+9] id:145
.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x36d
addi a4,a4,32 # [0]
.Lt_2_3842: # 0x370  epilogue: flush the last converted chunk (q1)
ee.vst.128.ip q0,a2,16 # [0] id:146
ee.vcmp.lt.s8 q2,q1,q5 # [1]
ee.vzip.8 q1,q2 # [2]
ee.vadds.s16 q2,q2,q4 # [3]
ee.vadds.s16 q3,q1,q4 # [4]
ee.vst.128.ip q3,a2,16 # [5] id:147
ee.vst.128.ip q2,a2,16 # [6] id:148
bge a4,a6,.Lt_2_4866 # [7]
l32i.n a5,a1,12 # [0] id:135 offset+0x0
.Lt_2_5122: # 0x38a  scalar loop: two elements per zero-overhead iteration
mov.n a11,a4 # [0]
add.n a2,a4,a10 # [1]
# 576 dst[i + 0] = src[i + 0] + offset;
l8ui a7,a2,0 # [2] id:149
addx2 a6,a4,a9 # [3]
sext a7,a7,7 # [4]
add.n a7,a7,a5 # [5]
s16i a7,a6,0 # [6] id:150
# 577 dst[i + 1] = src[i + 1] + offset;
l8ui a3,a2,1 # [7] id:151
sub a7,a8,a4 # [8]
addi.n a2,a2,2 # [9]
srai a7,a7,1 # [10]
sext a3,a3,7 # [11]
add.n a3,a3,a5 # [12]
s16i a3,a6,2 # [13] id:152
addi.n a3,a7,-1 # [14]
loopgtz a3,.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 # [15]
l8ui a3,a2,0 # [0*II+0] id:149
addi.n a6,a6,4 # [1*II+1]
sext a3,a3,7 # [0*II+2]
add.n a3,a3,a5 # [0*II+3]
s16i a3,a6,0 # [0*II+4] id:150
l8ui a3,a2,1 # [0*II+5] id:151
addi.n a2,a2,2 # [0*II+6]
sext a3,a3,7 # [0*II+7]
add.n a3,a3,a5 # [0*II+8]
s16i a3,a6,2 # [0*II+9] id:152
.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x3ce
addx2 a4,a7,a11 # [0]
.Lt_2_4866: # 0x3d1  odd trailing element, if any
bge a4,a8,.Lt_2_7682 # [0]
# 580 dst[i] = src[i] + offset;
addx2 a11,a4,a9 # [0]
add.n a8,a4,a10 # [1]
l8ui a8,a8,0 # [2] id:153
l32i.n a12,a1,12 # [3] id:135 offset+0x0
sext a8,a8,7 # [4]
add.n a8,a8,a12 # [5]
s16i a8,a11,0 # [6] id:154
retw.n # [7]
.Lt_2_6402: # 0x3e8  size < 32: scalar fallback
blti a4,2,.Lt_2_6658 # [0]
movi.n a4,0 # [0]
j .Lt_2_5122 # [1]
.Lt_2_7682: # 0x3f0
retw.n # [0]
.Lt_2_6658: # 0x3f2  size < 2: handle a single element (or nothing)
blti a4,1,.Lt_2_7682 # [0]
l8ui a11,a10,0 # [0] id:153
sext a11,a11,7 # [2]
add.n a11,a11,a5 # [3]
s16i a11,a3,0 # [4] id:154
retw.n # [5]
.Lt_2_7170: # 0x402  32 <= size < 64: skip the main loop, flush one chunk
movi.n a4,32 # [0]
j .Lt_2_3842 # [1]
.size esp_nn_aligned_s8_to_s16_with_offset_esp32s3, . - esp_nn_aligned_s8_to_s16_with_offset_esp32s3
.literal_position
# Program Unit: esp_nn_s8_to_s16_esp32s3
#
# esp_nn_s8_to_s16_esp32s3(src /*a2*/, dst /*a3*/, size /*a4*/)
#
# Sign-extends int8 `src` into int16 `dst`.
# size >= 16 takes the vector path (16 elements per iteration); unaligned
# `src` is handled by priming SAR_BYTE from the low address bits and
# realigning with EE.SRC.Q.QUP. The sign half comes from EE.VCMP.LT.S8
# against zero, interleaved via EE.VZIP.8. Remaining elements run a
# 2-per-iteration scalar loop plus an odd-element tail.
.type esp_nn_s8_to_s16_esp32s3, @function
.align 4
.global esp_nn_s8_to_s16_esp32s3
esp_nn_s8_to_s16_esp32s3: # 0x40b
entry a1,32 #
mov.n a9,a2 // src
mov.n a8,a3 // dst
mov.n a7,a4 // size
blti a4,1,.Lt_3_4866 // size == 0
blti a4,16,.Lt_3_4610 // if (size < 16) jump to unopt path
// load align_len to sar_byte (low 4 bits of src address)
extui a2,a2,0,4 # [0]
wur.sar_byte a2 # [1]
mov.n a2,a9 # [2]
// preload two chunks for the QUP realignment pipeline
ee.vld.128.ip q0,a2,16
ee.vld.128.ip q1,a2,16
ee.zero.q q4
# 672
# 673 for (i = 16; i < size - 15; i += 16) {
blti a4,32,.Lt_3_5378 # [5]
addi a6,a4,-16 # [1]
srai a6,a6,4 # [2]
slli a4,a6,4 # [3]
loopgtz a6,.LBB35_esp_nn_s8_to_s16_esp32s3 # [4]
ee.src.q.qup q2,q0,q1 # [0*II+0] // realign: q2 = aligned chunk
ee.vcmp.lt.s8 q3,q2,q4 # [0*II+1] // sign
ee.vld.128.ip q1,a2,16 # [0*II+2] // for next iteration
ee.vzip.8 q2,q3 # [0*II+3]
ee.vst.128.ip q2,a3,16 # [0*II+4] id:93
ee.vst.128.ip q3,a3,16 # [0*II+5] id:94
.LBB35_esp_nn_s8_to_s16_esp32s3: # 0x449
addi a4,a4,16 # [0]
.Lt_3_2050: # 0x44c  epilogue: convert/store the last pending chunk
ee.src.q.qup q5,q0,q1 # [0]
ee.vcmp.lt.s8 q3,q5,q4 # [1]
ee.vzip.8 q5,q3 # [2]
ee.vst.128.ip q5,a3,16 # [3] id:96
ee.vst.128.ip q3,a3,16 # [4] id:97
# 687
# 688 skip_to_remains_s8_to_s16:
# 689 for (; i < size; i += 2) {
bge a4,a7,.Lt_3_4866 # [5]
.Lt_3_3330: # 0x45e  scalar loop: two elements per zero-overhead iteration
mov.n a11,a4 # [0]
add.n a2,a4,a9 # [1]
# 690 dst[i + 0] = src[i + 0];
l8ui a10,a2,0 # [2] id:98
addx2 a5,a4,a8 # [3]
sext a10,a10,7 # [4]
s16i a10,a5,0 # [5] id:99
# 691 dst[i + 1] = src[i + 1];
l8ui a3,a2,1 # [6] id:100
sub a10,a7,a4 # [7]
addi.n a2,a2,2 # [8]
addi.n a10,a10,1 # [9]
srai a10,a10,1 # [10]
sext a3,a3,7 # [11]
s16i a3,a5,2 # [12] id:101
addi.n a3,a10,-1 # [13]
loopgtz a3,.LBB50_esp_nn_s8_to_s16_esp32s3 # [14]
l8ui a3,a2,0 # [0*II+0] id:98
addi.n a5,a5,4 # [1*II+1]
sext a3,a3,7 # [0*II+2]
s16i a3,a5,0 # [0*II+3] id:99
l8ui a3,a2,1 # [0*II+4] id:100
addi.n a2,a2,2 # [0*II+5]
sext a3,a3,7 # [0*II+6]
s16i a3,a5,2 # [0*II+7] id:101
.LBB50_esp_nn_s8_to_s16_esp32s3: # 0x49c
addx2 a4,a10,a11 # [0]
# 692 }
# 693 if(i < size) {
bge a4,a7,.Lt_3_4866 # [1]
# 694 dst[i] = src[i];
add.n a11,a4,a9 # [0]
l8ui a11,a11,0 # [1] id:102
addx2 a12,a4,a8 # [2]
sext a11,a11,7 # [3]
s16i a11,a12,0 # [4] id:103
retw.n # [5]
.Lt_3_4610: # 0x4b2  size < 16: start the scalar loop at i = 0
movi.n a4,0 # [0]
j .Lt_3_3330 # [1]
.Lt_3_4866: # 0x4ba
retw.n # [0]
.Lt_3_5378: # 0x4bc  16 <= size < 32: single vector chunk, go flush it
movi.n a4,16 # [1]
j .Lt_3_2050 # [2]
.size esp_nn_s8_to_s16_esp32s3, . - esp_nn_s8_to_s16_esp32s3
================================================
FILE: src/common/esp_nn_dot_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//
//
// Reusable s8 dot product kernels for ESP32-S3.
// Used by conv im2col, FC, and any kernel that reduces to a dot product.
//
// esp_nn_dot_s8_aligned_esp32s3:
// Both input and filter 16-byte aligned. Uses ee.vld.128.ip + fused MAC.
//
// esp_nn_dot_s8_unaligned_esp32s3:
// Input aligned, filter may be unaligned. Uses USAR+QUP for filter.
//
.text
.align 4
// ============================================================
// esp_nn_dot_s8_aligned_esp32s3
// Both pointers must be 16-byte aligned.
// a2: input_data (aligned)
// a3: filter_data (aligned)
// a4: len (must be multiple of 16, >= 16)
// Returns: int32_t dot product in a2
// ============================================================
.type esp_nn_dot_s8_aligned_esp32s3, @function
.align 4
.global esp_nn_dot_s8_aligned_esp32s3
esp_nn_dot_s8_aligned_esp32s3:
entry a1, 32
ee.zero.accx
beqz a4, .Lalign_done // len == 0: return 0
// Compute loop count and remainder
srli a5, a4, 4 // a5 = len / 16
beqz a5, .Lalign_done // len < 16: nothing to accumulate (len must be mult of 16)
// Prime: load first pair
ee.vld.128.ip q0, a2, 16
ee.vld.128.ip q1, a3, 16
addi a5, a5, -1
beqz a5, .Lalign_last // exactly one chunk: skip the loop
// Main loop: fused MAC + load.
// EE.VMULAS.S8.ACCX.LD.IP accumulates q0*q1 into ACCX and loads the
// next input chunk into q0 in the same instruction; the following
// plain load fetches the matching filter chunk.
loopgtz a5, .Lalign_loop_end
ee.vmulas.s8.accx.ld.ip q0, a2, 16, q0, q1
ee.vld.128.ip q1, a3, 16
.Lalign_loop_end:
.Lalign_last:
// Final MAC for the last primed pair
ee.vmulas.s8.accx q0, q1
.Lalign_done:
// Read lower 32 bits of ACCX (sufficient for int8 dot products)
// NOTE(review): the two NOPs presumably cover the ACCX update latency
// before RUR — confirm the required gap against the ESP32-S3 TRM.
nop
nop
rur.accx_0 a2
retw.n
.size esp_nn_dot_s8_aligned_esp32s3, . - esp_nn_dot_s8_aligned_esp32s3
// ============================================================
// esp_nn_dot_s8_unaligned_esp32s3
// Input must be 16-byte aligned. Filter can be unaligned.
// Uses USAR+QUP pattern for filter loads.
// a2: input_data (aligned)
// a3: filter_data (may be unaligned)
// a4: len_div16 (>= 1)
// Returns: int32_t dot product in a2
// ============================================================
.type esp_nn_dot_s8_unaligned_esp32s3, @function
.align 4
.global esp_nn_dot_s8_unaligned_esp32s3
esp_nn_dot_s8_unaligned_esp32s3:
entry a1, 32
ee.zero.accx
beqz a4, .Lunalign_done // len_div16 == 0: return 0
// Prime: first unaligned filter load (sets SAR_BYTE)
ee.ld.128.usar.ip q0, a3, 16
// Check if we can do 2x unrolled (need >= 2 iterations)
srai a5, a4, 1 // a5 = len_div16 / 2
beqz a5, .Lunalign_single
// Load first input + filter pair for unrolled loop
ee.vld.128.ip q1, a2, 16
ee.ld.128.usar.ip q2, a3, 16
// 2x unrolled main loop.
// Loop invariant at entry: q0/q2 hold the two most recently loaded raw
// filter chunks and q1 holds the current input chunk; EE.SRC.Q.QUP
// shifts the q0:q2 pair by SAR_BYTE to produce the aligned filter data.
// NOTE(review): the loop prefetches one input chunk and two filter
// chunks beyond the data actually consumed — callers must tolerate
// these trailing over-reads (or size buffers accordingly); confirm.
loopgtz a5, .Lunalign_loop2_end
ee.src.q.qup q4, q0, q2 // align filter[i]
ee.vld.128.ip q3, a2, 16 // input[i+1]
ee.vmulas.s8.accx q4, q1 // MAC filter[i] * input[i]
ee.ld.128.usar.ip q0, a3, 16 // filter chunk[i+2]
ee.src.q.qup q5, q2, q0 // align filter[i+1]
ee.vld.128.ip q1, a2, 16 // input[i+2] (primed for next)
ee.vmulas.s8.accx q5, q3 // MAC filter[i+1] * input[i+1]
ee.ld.128.usar.ip q2, a3, 16 // filter chunk[i+3]
.Lunalign_loop2_end:
// Check if there's a remaining single iteration (odd len_div16)
bbci a4, 0, .Lunalign_done_mac
// Odd remainder: the unrolled loop already prefetched the two raw filter
// chunks for this tail into q0/q2 and its input into q1 — just align the
// filter and do the final MAC.
ee.src.q.qup q4, q0, q2
ee.vmulas.s8.accx q4, q1
j .Lunalign_done_mac
.Lunalign_single:
// Reached when len_div16 == 1 (single chunk only)
ee.vld.128.ip q1, a2, 16
ee.ld.128.usar.ip q2, a3, 16
ee.src.q.qup q4, q0, q2
ee.vmulas.s8.accx q4, q1
.Lunalign_done_mac:
.Lunalign_done:
// 2-cycle gap before ACCX read
movi.n a3, 0
nop
ee.srs.accx a2, a3, 0
retw.n
.size esp_nn_dot_s8_unaligned_esp32s3, . - esp_nn_dot_s8_unaligned_esp32s3
================================================
FILE: src/common/esp_nn_mean_ansi.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* Quantized mean reduction over spatial dimensions (axes 1,2).
* Specialized for 4D tensors [N, H, W, C] → [N, 1, 1, C].
* This is the common case in Squeeze-and-Excite blocks.
*/
#include <stdint.h>
#include <common_functions.h>
/*
 * Reference spatial mean over axes 1,2 of an NHWC int8 tensor.
 * For every channel c:
 *   out[c] = clamp(requant(sum_{h,w} in[h][w][c] - H*W*zp_in) + zp_out)
 * clamped to the int8 range [-128, 127].
 */
void esp_nn_mean_nhwc_s8_ansi(const int8_t *input,
                              int8_t *output,
                              const int32_t height,
                              const int32_t width,
                              const int32_t channels,
                              const int32_t input_zero_point,
                              const int32_t output_zero_point,
                              const int32_t multiplier,
                              const int32_t shift)
{
    const int32_t spatial = height * width;
    const int32_t zp_total = spatial * input_zero_point;

    for (int32_t c = 0; c < channels; c++) {
        /* Walk this channel's column with a channel-strided pointer. */
        const int8_t *in_ptr = input + c;
        int32_t acc = 0;
        for (int32_t pos = 0; pos < spatial; pos++) {
            acc += *in_ptr;
            in_ptr += channels;
        }

        /* Remove the accumulated input zero point, requantize, clamp. */
        int32_t out_val = esp_nn_multiply_by_quantized_mult(acc - zp_total,
                                                            multiplier, shift);
        out_val += output_zero_point;
        out_val = max(out_val, -128);
        out_val = min(out_val, 127);
        output[c] = (int8_t) out_val;
    }
}
================================================
FILE: src/common/esp_nn_mean_s8_esp32p4.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* ESP32-P4 optimized spatial mean reduction using QACC per-lane accumulation.
* Processes 16 channels in parallel via esp.vmulas.s8.qacc (same pattern as avg_pool).
*/
#include <stdint.h>
#include <common_functions.h>
/**
 * Spatial mean over H,W of an NHWC int8 tensor, one int8 result per
 * channel: out[c] = clamp(requant(sum - H*W*zp_in) + zp_out).
 *
 * 16-channel blocks are accumulated per lane in the PIE QACC register by
 * multiply-accumulating each spatial position's 16 values against a
 * broadcast vector of ones (q7); remaining channels use a scalar loop.
 *
 * NOTE(review): esp.vld.128.ip is issued on `input + ch` with no explicit
 * alignment guarantee — confirm the ESP32-P4 PIE load tolerates unaligned
 * addresses, or that callers guarantee alignment.
 */
void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input,
                                 int8_t *output,
                                 const int32_t height,
                                 const int32_t width,
                                 const int32_t channels,
                                 const int32_t input_zero_point,
                                 const int32_t output_zero_point,
                                 const int32_t multiplier,
                                 const int32_t shift)
{
    const int32_t num_elements = height * width;   /* spatial positions per channel */
    const int32_t ch_16 = channels >> 4;           /* number of 16-channel blocks */
    const int8_t one_val = 1;                      /* multiplicand for sum-via-MAC */
    if (ch_16 > 0) {
        /* Enable PIE and broadcast 1 into q7 */
        asm volatile (
            "csrsi 0x7f2, 0b01 \n\t"
            "li x29, 0b10 \n\t"
            "esp.movx.w.cfg x29 \n\t"
            ::: "x29"
        );
        asm volatile (
            "mv x30, %0 \n\t"
            "esp.vldbc.8.ip q7, x30, 0 \n\t"
            :: "r"(&one_val) : "x30"
        );
    }
    /* Process all channels - QACC for 16-channel blocks, scalar for remainder */
    int ch = 0;
    for (int ch_blk = 0; ch_blk < ch_16; ch_blk++, ch += 16) {
        /* Single asm block: broadcast ones, zero QACC, accumulate all spatial
         * positions. Keeping in one block prevents compiler from clobbering
         * q7 between the broadcast and the MAC loop. */
        const int8_t *base_ptr = input + ch;
        asm volatile (
            /* Broadcast 1 into q7 */
            "mv x30, %[one] \n\t"
            "esp.vldbc.8.ip q7, x30, 0 \n\t"
            /* Zero QACC */
            "esp.zero.qacc \n\t"
            /* Accumulate loop: stride = channels between spatial positions */
            "mv x30, %[base] \n\t"
            "mv s7, %[cnt] \n\t"
            "1: \n\t"
            "esp.vld.128.ip q0, x30, 0 \n\t"
            "esp.vmulas.s8.qacc q0, q7 \n\t"
            "add x30, x30, %[stride] \n\t"
            "addi s7, s7, -1 \n\t"
            "bnez s7, 1b \n\t"
            :
            : [one] "r"(&one_val), [base] "r"(base_ptr),
              [cnt] "r"(num_elements), [stride] "r"((int32_t)channels)
            : "x30", "s7"
        );
        /* Pull the 16 per-lane accumulators out of QACC (macro presumably
         * provided by common_functions.h); then requantize each lane. */
        int32_t sums[16] __attribute__((aligned(16)));
        ESP_NN_QACC_EXTRACT_S32(sums);
        int32_t zp_correction = num_elements * input_zero_point;
        for (int k = 0; k < 16; k++) {
            int32_t result = sums[k] - zp_correction;
            result = esp_nn_multiply_by_quantized_mult(result, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[ch + k] = (int8_t)result;
        }
    }
    /* Remaining channels scalar */
    for (; ch < channels; ch++) {
        int32_t sum = 0;
        for (int hw = 0; hw < num_elements; hw++) {
            sum += input[hw * channels + ch];
        }
        sum -= num_elements * input_zero_point;
        int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
        result += output_zero_point;
        result = max(result, -128);
        result = min(result, 127);
        output[ch] = (int8_t)result;
    }
}
================================================
FILE: src/common/esp_nn_mean_s8_esp32s3.c
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* ESP32-S3 optimized mean reduction for NHWC int8 tensors.
* Uses int16 accumulation for small spatial sizes (H*W <= 256),
* int32 for larger. Accumulates all channels at once per spatial position.
*/
#include <stdint.h>
#include <string.h>
#include <common_functions.h>
/*
 * ESP32-S3 build of the NHWC int8 spatial mean.
 * Picks an accumulator strategy by size:
 *   - H*W <= 256 and C <= 512: int16 per-channel accumulators
 *     (worst case 256 * 127 = 32,512, which fits in int16),
 *   - otherwise, C <= 512: int32 per-channel accumulators,
 *   - C > 512: channel-at-a-time scalar fallback (bounds stack usage).
 * All three branches produce identical results.
 */
void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input,
                                 int8_t *output,
                                 const int32_t height,
                                 const int32_t width,
                                 const int32_t channels,
                                 const int32_t input_zero_point,
                                 const int32_t output_zero_point,
                                 const int32_t multiplier,
                                 const int32_t shift)
{
    const int32_t spatial = height * width;
    const int32_t zp_total = spatial * input_zero_point;

    if (spatial <= 256 && channels <= 512) {
        /* Narrow accumulators: one int16 per channel on the stack. */
        int16_t acc16[channels];
        memset(acc16, 0, channels * sizeof(int16_t));

        const int8_t *in_ptr = input;
        for (int32_t pos = 0; pos < spatial; pos++) {
            /* Contiguous channel sweep — amenable to auto-vectorization. */
            for (int32_t c = 0; c < channels; c++) {
                acc16[c] += (int16_t) in_ptr[c];
            }
            in_ptr += channels;
        }

        for (int32_t c = 0; c < channels; c++) {
            int32_t val = (int32_t) acc16[c] - zp_total;
            val = esp_nn_multiply_by_quantized_mult(val, multiplier, shift);
            val += output_zero_point;
            val = max(val, -128);
            val = min(val, 127);
            output[c] = (int8_t) val;
        }
    } else if (channels <= 512) {
        /* Wide accumulators for larger spatial extents. */
        int32_t acc32[channels];
        memset(acc32, 0, channels * sizeof(int32_t));

        const int8_t *in_ptr = input;
        for (int32_t pos = 0; pos < spatial; pos++) {
            for (int32_t c = 0; c < channels; c++) {
                acc32[c] += in_ptr[c];
            }
            in_ptr += channels;
        }

        for (int32_t c = 0; c < channels; c++) {
            int32_t val = acc32[c] - zp_total;
            val = esp_nn_multiply_by_quantized_mult(val, multiplier, shift);
            val += output_zero_point;
            val = max(val, -128);
            val = min(val, 127);
            output[c] = (int8_t) val;
        }
    } else {
        /* Huge channel counts: no per-channel array, one channel at a time. */
        for (int32_t c = 0; c < channels; c++) {
            int32_t sum = 0;
            for (int32_t pos = 0; pos < spatial; pos++) {
                sum += input[pos * channels + c];
            }
            int32_t val = esp_nn_multiply_by_quantized_mult(sum - zp_total,
                                                            multiplier, shift);
            val += output_zero_point;
            val = max(val, -128);
            val = min(val, 127);
            output[c] = (int8_t) val;
        }
    }
}
================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S
================================================
/*
* SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
*
* SPDX-License-Identifier: Apache-2.0
*/
/*
* Fast 2-wide requantization for ESP32-P4 (RISC-V).
* Interleaves mul/mulh across 2 elements for better pipeline utilization.
* Uses a0-a7 and t0-t6 only (no callee-saved registers needed).
*
* void esp_nn_requant_2x_esp32p4(
* int32_t x0, // a0
* int32_t x1, // a1
* int32_t mult0, // a2
* int32_t mult1, // a3
* int32_t shift0, // a4
* int32_t shift1, // a5
* int32_t *out // a6: pointer to store 2 results
* );
*/
.text
.align 4
.global esp_nn_requant_2x_esp32p4
.type esp_nn_requant_2x_esp32p4, @function
esp_nn_requant_2x_esp32p4:
    /* Compute left_shift and apply.
     * shift >= 0: pre-multiply left shift (ls = shift, rs = 0).
     * shift < 0: no pre-shift; the magnitude becomes a rounding right
     * shift after the high multiply (rs = -shift). */
    mv t0, a0 /* x0 */
    mv t1, a1 /* x1 */
    bgez a4, .Lls0_pos
    mv t6, zero /* ls0 = 0 */
    j .Lls0_done
.Lls0_pos:
    sll t0, t0, a4 /* x0 <<= shift0 (positive = left shift) */
    mv t6, a4 /* ls0 = shift0 */
.Lls0_done:
    sub a4, t6, a4 /* rs0 = ls0 - shift0 */
    bgez a5, .Lls1_pos
    mv t6, zero
    j .Lls1_done
.Lls1_pos:
    sll t1, t1, a5
    mv t6, a5
.Lls1_done:
    sub a5, t6, a5 /* rs1 = ls1 - shift1 */
    /* ---- Interleaved 64-bit multiply ---- */
    /* mulh first (both elements), then mul (both elements) */
    mulh t2, t0, a2 /* hi0 */
    mulh t3, t1, a3 /* hi1 */
    mul t0, t0, a2 /* lo0 */
    mul t1, t1, a3 /* lo1 */
    /* Add nudge and combine: result = ((hi:lo) + (1<<30)) >> 31.
     * The 64-bit shift is built from 32-bit halves: carry of lo+nudge is
     * propagated into hi (sltu), then (hi << 1) | (lo >> 31).
     * NOTE(review): the nudge is +2^30 regardless of sign — this matches
     * the library's fast requantize path, not the bit-exact TFLite
     * reference (which nudges negative products differently); confirm
     * this is intended for all callers. */
    li t4, 0x40000000 /* nudge = 1 << 30 */
    add t5, t0, t4 /* lo0 + nudge */
    sltu t6, t5, t0 /* carry0 */
    add t2, t2, t6 /* hi0 += carry0 */
    srli t5, t5, 31 /* (lo0+nudge) >> 31 */
    slli t0, t2, 1 /* hi0 << 1 */
    or t0, t0, t5 /* result0 */
    add t5, t1, t4 /* lo1 + nudge */
    sltu t6, t5, t1 /* carry1 */
    add t3, t3, t6 /* hi1 += carry1 */
    srli t5, t5, 31
    slli t1, t3, 1
    or t1, t1, t5 /* result1 */
    /* ---- Right shift with rounding ----
     * Adds (1 << (rs-1)) for non-negative values and one less for
     * negative (srai of the sign bit yields -1/0), then arithmetic
     * shift: rounds halves away from zero. */
    li t4, 1
    beqz a4, .Lskip_rs0
    addi t5, a4, -1
    sll t5, t4, t5 /* round0 = 1 << (rs0-1) */
    srai t6, t0, 31 /* -1 if negative, 0 otherwise */
    add t5, t5, t6 /* round0 += sign */
    add t0, t0, t5
    sra t0, t0, a4
.Lskip_rs0:
    beqz a5, .Lskip_rs1
    addi t5, a5, -1
    sll t5, t4, t5
    srai t6, t1, 31
    add t5, t5, t6
    add t1, t1, t5
    sra t1, t1, a5
.Lskip_rs1:
    /* Store results */
    sw t0, 0(a6)
    sw t1, 4(a6)
    ret
.size esp_nn_requant_2x_esp32p4, . - esp_nn_requant_2x_esp32p4
================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// the macro `use_nudge` enables adding rounding factor similar to tflite implementation
// this barely changes any accuracy
// keep this disabled for better performance
#ifndef SKIP_NUDGE
# set SKIP_NUDGE flag for ~20% faster (but not bit-exact) quantisation
.set use_nudge, 1
#endif
.text
.literal_position
.literal .nudge_val, 1073741824 # 1 << 30
.type esp_nn_multiply_by_quantized_mult_asm_esp32s3, @function
.align 4
.global esp_nn_multiply_by_quantized_mult_asm_esp32s3
esp_nn_multiply_by_quantized_mult_asm_esp32s3: # 0x4
# to_add = 4
entry a1,32
wsr.sar a3
ee.zero.q q2
bltz a3, .skip_left_shift
ee.vsl.32 q0,q0 # [13]
.skip_left_shift:
ssai 31 # [15]
# move data to general purpose registers
ee.movi.32.a q0,a12,0 # [17]
ee.movi.32.a q0,a13,1 # [16]
ee.movi.32.a q0,a14,2
gitextract__zjpraf8/
├── .github/
│ └── workflows/
│ └── upload_component.yml
├── .gitignore
├── .gitlab-ci.yml
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Kconfig.projbuild
├── LICENSE
├── README.md
├── idf_component.yml
├── include/
│ ├── esp_nn.h
│ ├── esp_nn_ansi_c.h
│ ├── esp_nn_ansi_headers.h
│ ├── esp_nn_defs.h
│ ├── esp_nn_esp32p4.h
│ ├── esp_nn_esp32s3.h
│ └── esp_nn_generic_opt.h
├── src/
│ ├── activation_functions/
│ │ ├── esp_nn_hard_swish_ansi.c
│ │ ├── esp_nn_hard_swish_s8_esp32p4.c
│ │ ├── esp_nn_hard_swish_s8_esp32s3.c
│ │ ├── esp_nn_relu_ansi.c
│ │ ├── esp_nn_relu_s8_esp32p4.c
│ │ └── esp_nn_relu_s8_esp32s3.S
│ ├── basic_math/
│ │ ├── esp_nn_add_ansi.c
│ │ ├── esp_nn_add_s8_esp32p4.c
│ │ ├── esp_nn_add_s8_esp32s3.S
│ │ ├── esp_nn_mul_ansi.c
│ │ ├── esp_nn_mul_broadcast_s8_esp32s3.S
│ │ ├── esp_nn_mul_s8_esp32p4.c
│ │ └── esp_nn_mul_s8_esp32s3.S
│ ├── common/
│ │ ├── common_functions.h
│ │ ├── esp_nn_common_functions_esp32s3.S
│ │ ├── esp_nn_dot_s8_esp32s3.S
│ │ ├── esp_nn_mean_ansi.c
│ │ ├── esp_nn_mean_s8_esp32p4.c
│ │ ├── esp_nn_mean_s8_esp32s3.c
│ │ ├── esp_nn_multiply_by_quantized_mult_esp32p4.S
│ │ ├── esp_nn_multiply_by_quantized_mult_esp32s3.S
│ │ └── esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S
│ ├── convolution/
│ │ ├── esp_nn_conv_ansi.c
│ │ ├── esp_nn_conv_esp32p4.c
│ │ ├── esp_nn_conv_esp32s3.c
│ │ ├── esp_nn_conv_opt.c
│ │ ├── esp_nn_conv_s16_mult4_1x1_esp32s3.S
│ │ ├── esp_nn_conv_s16_mult8_esp32s3.S
│ │ ├── esp_nn_conv_s8_1x1_esp32s3.c
│ │ ├── esp_nn_conv_s8_3x3_opt_esp32s3.c
│ │ ├── esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S
│ │ ├── esp_nn_conv_s8_mult8_1x1_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_ansi.c
│ │ ├── esp_nn_depthwise_conv_esp32p4.c
│ │ ├── esp_nn_depthwise_conv_opt.c
│ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult1_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult4_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s16_mult8_esp32s3.S
│ │ ├── esp_nn_depthwise_conv_s8_esp32s3.c
│ │ └── esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S
│ ├── fully_connected/
│ │ ├── esp_nn_fc_s8_mac16_esp32s3.S
│ │ ├── esp_nn_fully_connected_ansi.c
│ │ ├── esp_nn_fully_connected_esp32s3.c
│ │ ├── esp_nn_fully_connected_per_ch_s8_esp32s3.S
│ │ ├── esp_nn_fully_connected_s8_esp32p4.c
│ │ └── esp_nn_fully_connected_s8_esp32s3.S
│ ├── logistic/
│ │ └── esp_nn_logistic_ansi.c
│ ├── pooling/
│ │ ├── esp_nn_avg_pool_ansi.c
│ │ ├── esp_nn_avg_pool_s8_esp32p4.c
│ │ ├── esp_nn_avg_pool_s8_esp32s3.S
│ │ ├── esp_nn_avg_pool_s8_esp32s3.c
│ │ ├── esp_nn_max_pool_ansi.c
│ │ ├── esp_nn_max_pool_s8_esp32p4.c
│ │ └── esp_nn_max_pool_s8_esp32s3.S
│ └── softmax/
│ ├── esp_nn_softmax_ansi.c
│ ├── esp_nn_softmax_opt.c
│ ├── esp_nn_softmax_s8_esp32p4.c
│ ├── esp_nn_softmax_s8_esp32s3.c
│ └── softmax_common.h
├── test_app/
│ ├── CMakeLists.txt
│ ├── Makefile
│ ├── main/
│ │ ├── CMakeLists.txt
│ │ ├── component.mk
│ │ └── main.c
│ ├── sdkconfig.defaults
│ ├── sdkconfig.defaults.esp32p4
│ └── sdkconfig.defaults.esp32s3
└── tests/
├── CMakeLists.txt
├── README.md
├── component.mk
├── include/
│ ├── test_functions.h
│ └── test_utils.h
└── src/
├── basic_math_test.c
├── convolution_test.c
├── fully_connected_test.c
├── hard_swish_test.c
├── mean_test.c
├── pooling_test.c
├── relu_test.c
└── softmax_test.c
SYMBOL INDEX (143 symbols across 47 files)
FILE: include/esp_nn_defs.h
type data_dims_t (line 23) | typedef struct data_dims {
type data_2d_t (line 35) | typedef struct data_2d {
type act_params_t (line 43) | typedef struct act_params {
type quant_data_t (line 53) | typedef struct quant_data {
type conv_params_t (line 62) | typedef struct conv_params {
type dw_conv_params_t (line 75) | typedef struct dw_conv_params {
FILE: src/activation_functions/esp_nn_hard_swish_ansi.c
function sat_left_shift_s16 (line 18) | static inline int16_t sat_left_shift_s16(int16_t val, int shift)
function sat_round_dbl_high_mul_s16 (line 29) | static inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b)
function sat_dbl_high_mul_s16 (line 39) | static inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b)
function rounding_div_pot_s16 (line 48) | static inline int16_t rounding_div_pot_s16(int16_t val, int exponent)
function esp_nn_hard_swish_s8_ansi (line 56) | void esp_nn_hard_swish_s8_ansi(const int8_t *input,
FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c
function sat_rnd_dbl_hi_mul (line 16) | static inline __attribute__((always_inline))
function sat_dbl_hi_mul (line 22) | static inline __attribute__((always_inline))
function sat_left_shift_s16 (line 28) | static inline __attribute__((always_inline))
function rounding_div_pot_s16 (line 35) | static inline __attribute__((always_inline))
function hard_swish_output (line 44) | static inline __attribute__((always_inline))
function esp_nn_hard_swish_s8_esp32p4 (line 55) | void esp_nn_hard_swish_s8_esp32p4(const int8_t *input,
FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c
function esp_nn_get_hard_swish_scratch_size_esp32s3 (line 34) | int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void)
function esp_nn_set_hard_swish_scratch_buf_esp32s3 (line 39) | void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf)
function esp_nn_hard_swish_s8_esp32s3 (line 44) | void esp_nn_hard_swish_s8_esp32s3(const int8_t *input,
FILE: src/activation_functions/esp_nn_relu_ansi.c
function esp_nn_relu6_s8_ansi (line 20) | void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size)
FILE: src/activation_functions/esp_nn_relu_s8_esp32p4.c
function esp_nn_relu6_s8_esp32p4 (line 14) | void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size)
FILE: src/basic_math/esp_nn_add_ansi.c
function esp_nn_add_elementwise_u8_ansi (line 19) | void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
function esp_nn_add_elementwise_s8_ansi (line 59) | void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
FILE: src/basic_math/esp_nn_add_s8_esp32p4.c
function add_requant (line 18) | static inline __attribute__((always_inline))
function esp_nn_add_elementwise_s8_esp32p4 (line 32) | void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
FILE: src/basic_math/esp_nn_mul_ansi.c
function esp_nn_mul_elementwise_s8_ansi (line 19) | void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
function esp_nn_mul_broadcast_channel_s8_ansi (line 44) | void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,
FILE: src/basic_math/esp_nn_mul_s8_esp32p4.c
function esp_nn_mul_elementwise_s8_esp32p4 (line 15) | void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,
FILE: src/common/common_functions.h
function __NN_FORCE_INLINE__ (line 34) | __NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)
function __NN_FORCE_INLINE__ (line 74) | __NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in)
function __NN_FORCE_INLINE__ (line 84) | __NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64)
function __NN_FORCE_INLINE__ (line 91) | __NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t i...
function __NN_FORCE_INLINE__ (line 118) | __NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val,...
function __NN_FORCE_INLINE__ (line 124) | __NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int3...
function __NN_FORCE_INLINE__ (line 140) | __NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x,...
function __NN_FORCE_INLINE__ (line 211) | __NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32...
function esp_nn_aligned_s8_pad_with_value (line 239) | static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *...
function esp_nn_aligned_s8_pad_end_with_value (line 260) | static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8...
function __NN_FORCE_INLINE__ (line 291) | __NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src,...
function __NN_FORCE_INLINE__ (line 311) | __NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *ds...
FILE: src/common/esp_nn_mean_ansi.c
function esp_nn_mean_nhwc_s8_ansi (line 16) | void esp_nn_mean_nhwc_s8_ansi(const int8_t *input,
FILE: src/common/esp_nn_mean_s8_esp32p4.c
function esp_nn_mean_nhwc_s8_esp32p4 (line 15) | void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input,
FILE: src/common/esp_nn_mean_s8_esp32s3.c
function esp_nn_mean_nhwc_s8_esp32s3 (line 17) | void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input,
FILE: src/convolution/esp_nn_conv_ansi.c
function esp_nn_get_conv_scratch_size_ansi (line 19) | int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,
function esp_nn_set_conv_scratch_buf_ansi (line 27) | void esp_nn_set_conv_scratch_buf_ansi(const void *buf)
function esp_nn_conv_u8_ansi (line 37) | void esp_nn_conv_u8_ansi(const uint8_t *input_data,
function esp_nn_conv_s8_ansi (line 109) | void esp_nn_conv_s8_ansi(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_conv_esp32p4.c
function pie_dot_s8 (line 63) | static inline __attribute__((always_inline))
function conv_1x1_batch16 (line 143) | __attribute__((noinline))
function esp_nn_conv_s8_1x1 (line 217) | __attribute__ ((noinline))
function esp_nn_conv_s8_padded (line 384) | __attribute__ ((noinline))
function esp_nn_conv_s8_im2col (line 604) | __attribute__ ((noinline))
function esp_nn_conv_s8_tiled (line 706) | __attribute__ ((noinline))
function esp_nn_get_conv_scratch_size_esp32p4 (line 869) | int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims,
function esp_nn_set_conv_scratch_buf_esp32p4 (line 970) | void esp_nn_set_conv_scratch_buf_esp32p4(void *buf)
function esp_nn_conv_s8_esp32p4 (line 985) | void esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_conv_esp32s3.c
function esp_nn_conv_s8_im2col_s3 (line 176) | __attribute__ ((noinline))
function esp_nn_get_conv_scratch_size_esp32s3 (line 315) | int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
function esp_nn_set_conv_scratch_buf_esp32s3 (line 388) | void esp_nn_set_conv_scratch_buf_esp32s3(void *buf)
function esp_nn_conv_s8_esp32s3 (line 393) | void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_conv_opt.c
function esp_nn_get_conv_scratch_size_opt (line 20) | int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,
function esp_nn_set_conv_scratch_buf_opt (line 28) | void esp_nn_set_conv_scratch_buf_opt(const void *buf)
function esp_nn_conv_s8_1x1 (line 33) | __attribute__ ((noinline))
function esp_nn_conv_s8_opt (line 95) | void esp_nn_conv_s8_opt(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_conv_s8_1x1_esp32s3.c
function esp_nn_conv_s8_1x1_scratch_size (line 17) | int esp_nn_conv_s8_1x1_scratch_size(int out_channels)
function transpose_8x8_s16_c (line 28) | static inline void transpose_8x8_s16_c(const int8_t *input, int stride,
function transpose_8x8_s16_simd (line 46) | static inline void transpose_8x8_s16_simd(const int8_t *input, int stride,
function mac_8pos_8ch_simd (line 121) | static inline void mac_8pos_8ch_simd(const int16_t *data_buf, const int8...
function esp_nn_conv_s8_1x1 (line 155) | void esp_nn_conv_s8_1x1(const int8_t *input,
FILE: src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c
function esp_nn_conv_s8_3x3_can_use (line 34) | int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht,
function esp_nn_conv_s8_3x3_scratch_size (line 46) | int esp_nn_conv_s8_3x3_scratch_size(int in_channels, int out_channels)
function esp_nn_conv_s8_3x3_opt (line 57) | void esp_nn_conv_s8_3x3_opt(const int8_t *input,
FILE: src/convolution/esp_nn_depthwise_conv_ansi.c
function esp_nn_get_depthwise_conv_scratch_size_ansi (line 18) | int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input...
function esp_nn_set_depthwise_conv_scratch_buf_ansi (line 26) | void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf)
function esp_nn_depthwise_conv_s8_ansi (line 31) | void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_depthwise_conv_esp32p4.c
function esp_nn_get_depthwise_conv_scratch_size_esp32p4 (line 25) | int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *in...
function esp_nn_set_depthwise_conv_scratch_buf_esp32p4 (line 33) | void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf)
function depthwise_conv_s8_ch1_pie (line 42) | __attribute__ ((noinline))
function esp_nn_depthwise_conv_s8_esp32p4 (line 262) | void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_depthwise_conv_opt.c
function esp_nn_get_depthwise_conv_scratch_size_opt (line 18) | int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_...
function esp_nn_set_depthwise_conv_scratch_buf_opt (line 26) | void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf)
function esp_nn_depthwise_conv_s8_ch_mult_1 (line 32) | __attribute__ ((noinline))
function esp_nn_depthwise_conv_s8_opt (line 160) | void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,
FILE: src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c
function esp_nn_depthwise_conv_s8_unrolled (line 158) | static void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data,
function esp_nn_depthwise_conv_s8_ch_mult1 (line 288) | void esp_nn_depthwise_conv_s8_ch_mult1(const int8_t *input_data,
function esp_nn_get_depthwise_conv_scratch_size_esp32s3 (line 348) | int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *in...
function esp_nn_set_depthwise_conv_scratch_buf_esp32s3 (line 445) | void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf)
function esp_nn_depthwise_conv_s8_esp32s3 (line 471) | void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims,
FILE: src/fully_connected/esp_nn_fully_connected_ansi.c
function esp_nn_fully_connected_s8_ansi (line 19) | void esp_nn_fully_connected_s8_ansi(const int8_t *input_data,
function esp_nn_fully_connected_per_ch_s8_ansi (line 52) | void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data,
FILE: src/fully_connected/esp_nn_fully_connected_esp32s3.c
function esp_nn_fully_connected_s8_esp32s3 (line 52) | void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,
function esp_nn_fully_connected_per_ch_s8_esp32s3 (line 120) | void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data,
FILE: src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c
function fc_dot_s8_pie (line 24) | static inline __attribute__((always_inline))
function esp_nn_fully_connected_s8_esp32p4 (line 114) | void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,
function esp_nn_fully_connected_per_ch_s8_esp32p4 (line 163) | void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,
FILE: src/logistic/esp_nn_logistic_ansi.c
function esp_nn_get_logistic_s8_scratch_size_ansi (line 21) | int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void)
function esp_nn_logistic_s8_prepare_ansi (line 26) | void esp_nn_logistic_s8_prepare_ansi(int8_t *lut,
function esp_nn_logistic_s8_ansi (line 51) | void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output,
FILE: src/pooling/esp_nn_avg_pool_ansi.c
function esp_nn_avg_pool_s8_ansi (line 19) | void esp_nn_avg_pool_s8_ansi(const int8_t *input,
FILE: src/pooling/esp_nn_avg_pool_s8_esp32p4.c
function esp_nn_avg_pool_s8_esp32p4 (line 18) | void esp_nn_avg_pool_s8_esp32p4(const int8_t *input,
FILE: src/pooling/esp_nn_avg_pool_s8_esp32s3.c
function esp_nn_avg_pool_s8_esp32s3 (line 34) | void esp_nn_avg_pool_s8_esp32s3(const int8_t *input,
FILE: src/pooling/esp_nn_max_pool_ansi.c
function esp_nn_max_pool_s8_ansi (line 19) | void esp_nn_max_pool_s8_ansi(const int8_t *input,
FILE: src/pooling/esp_nn_max_pool_s8_esp32p4.c
function esp_nn_max_pool_s8_esp32p4 (line 16) | void esp_nn_max_pool_s8_esp32p4(const int8_t *input,
FILE: src/softmax/esp_nn_softmax_ansi.c
function esp_nn_get_softmax_scratch_size_ansi (line 17) | int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const ...
function esp_nn_set_softmax_scratch_buf_ansi (line 24) | void esp_nn_set_softmax_scratch_buf_ansi(void *buffer)
function esp_nn_softmax_s8_ansi (line 30) | void esp_nn_softmax_s8_ansi(const int8_t *input_data,
FILE: src/softmax/esp_nn_softmax_opt.c
function esp_nn_get_softmax_scratch_size_opt (line 29) | int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const i...
function esp_nn_set_softmax_scratch_buf_opt (line 41) | void esp_nn_set_softmax_scratch_buf_opt(void *buffer)
function esp_nn_softmax_s8_opt (line 46) | void esp_nn_softmax_s8_opt(const int8_t *input_data,
FILE: src/softmax/esp_nn_softmax_s8_esp32p4.c
function esp_nn_get_softmax_scratch_size_esp32p4 (line 13) | int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, con...
function esp_nn_set_softmax_scratch_buf_esp32p4 (line 19) | void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer)
function esp_nn_softmax_s8_esp32p4 (line 36) | void esp_nn_softmax_s8_esp32p4(const int8_t *input_data,
FILE: src/softmax/esp_nn_softmax_s8_esp32s3.c
function esp_nn_get_softmax_scratch_size_esp32s3 (line 16) | int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, con...
function esp_nn_set_softmax_scratch_buf_esp32s3 (line 22) | void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer)
function find_max_s8 (line 28) | static inline int8_t find_max_s8(const int8_t *data, int32_t len)
function esp_nn_softmax_s8_esp32s3 (line 74) | void esp_nn_softmax_s8_esp32s3(const int8_t *input_data,
FILE: src/softmax/softmax_common.h
function __NN_FORCE_INLINE__ (line 24) | __NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp)
function __NN_FORCE_INLINE__ (line 44) | __NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int3...
function __NN_FORCE_INLINE__ (line 66) | __NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val)
FILE: test_app/main/main.c
function profile_c_start (line 30) | void profile_c_start()
function profile_c_end (line 36) | uint32_t profile_c_end()
function profile_opt_start (line 43) | void profile_opt_start()
function profile_opt_end (line 49) | uint32_t profile_opt_end()
function print_profile (line 56) | static void print_profile(const char *kernel)
function app_main (line 63) | void app_main()
FILE: tests/src/basic_math_test.c
function esp_nn_add_elementwise_s8_test (line 52) | void esp_nn_add_elementwise_s8_test()
function esp_nn_mul_elementwise_s8_test (line 243) | void esp_nn_mul_elementwise_s8_test()
function esp_nn_mul_broadcast_channel_s8_test (line 381) | void esp_nn_mul_broadcast_channel_s8_test()
FILE: tests/src/convolution_test.c
function esp_nn_depthwise_conv_s8_test (line 17) | void esp_nn_depthwise_conv_s8_test()
function esp_nn_conv_s8_test (line 340) | void esp_nn_conv_s8_test()
FILE: tests/src/fully_connected_test.c
function esp_nn_fully_connected_s8_test (line 17) | void esp_nn_fully_connected_s8_test()
function esp_nn_fully_connected_per_ch_s8_test (line 160) | void esp_nn_fully_connected_per_ch_s8_test()
FILE: tests/src/hard_swish_test.c
function esp_nn_hard_swish_s8_test (line 16) | void esp_nn_hard_swish_s8_test()
FILE: tests/src/mean_test.c
function esp_nn_mean_nhwc_s8_test (line 17) | void esp_nn_mean_nhwc_s8_test()
FILE: tests/src/pooling_test.c
function run_avg_pool_test (line 17) | static void run_avg_pool_test(uint16_t input_wd, uint16_t input_ht, uint...
function esp_nn_avg_pool_s8_test (line 76) | void esp_nn_avg_pool_s8_test()
function run_max_pool_test (line 100) | static void run_max_pool_test(uint16_t input_wd, uint16_t input_ht, uint...
function esp_nn_max_pool_s8_test (line 159) | void esp_nn_max_pool_s8_test()
FILE: tests/src/relu_test.c
function run_relu6_test (line 16) | static void run_relu6_test(int size, int iter)
function esp_nn_relu6_s8_test (line 59) | void esp_nn_relu6_s8_test()
FILE: tests/src/softmax_test.c
function run_softmax_test (line 17) | static void run_softmax_test(int32_t height, int32_t width, int32_t mult,
function esp_nn_softmax_s8_test (line 80) | void esp_nn_softmax_s8_test()
Condensed preview — 99 files, each showing path, character count, and a content snippet. Download the .json file or copy to clipboard for the full structured content (841K chars).
[
{
"path": ".github/workflows/upload_component.yml",
"chars": 428,
"preview": "name: Push esp-nn to IDF Component Registry\n\non:\n push:\n branches:\n - master\n\njobs:\n upload_components:\n ru"
},
{
"path": ".gitignore",
"chars": 654,
"preview": ".config\n*.o\n*.i\n*.s\n*.orig\n*.pyc\n\n# gtags\nGTAGS\nGRTAGS\nGPATH\n\n# emacs\n.dir-locals.el\n\n# emacs temp file suffixes\n*~\n.#*\n"
},
{
"path": ".gitlab-ci.yml",
"chars": 2156,
"preview": "stages:\n - build\n\n# Avoid running duplicate pipeline\nworkflow:\n rules:\n - if: '$CI_PIPELINE_SOURCE == \"merge_reques"
},
{
"path": "CMakeLists.txt",
"chars": 4015,
"preview": "cmake_minimum_required(VERSION 3.5)\n\nset(c_srcs\n \"src/activation_functions/esp_nn_relu_ansi.c\"\n \"src/activation_fu"
},
{
"path": "CONTRIBUTING.md",
"chars": 1701,
"preview": "# Contributing\n\nContributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welc"
},
{
"path": "Kconfig.projbuild",
"chars": 1320,
"preview": "menu \"ESP-NN\"\n\nchoice NN_OPTIMIZATIONS\n bool \"Optimization for nn functions\"\n default NN_OPTIMIZED\n help\n Use"
},
{
"path": "LICENSE",
"chars": 11358,
"preview": "\n Apache License\n Version 2.0, January 2004\n "
},
{
"path": "README.md",
"chars": 6247,
"preview": "# ESP-NN\n\nThe library contains optimised NN (Neural Network) functions for various Espressif chips.\n\n* Supported platfor"
},
{
"path": "idf_component.yml",
"chars": 321,
"preview": "version: \"1.2.3\"\ndescription: Optimized NN (Neural Network) functions for Espressif chips\nurl: https://github.com/espres"
},
{
"path": "include/esp_nn.h",
"chars": 1345,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "include/esp_nn_ansi_c.h",
"chars": 1877,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "include/esp_nn_ansi_headers.h",
"chars": 17533,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "include/esp_nn_defs.h",
"chars": 1950,
"preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
},
{
"path": "include/esp_nn_esp32p4.h",
"chars": 11013,
"preview": "/*\n * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "include/esp_nn_esp32s3.h",
"chars": 14349,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "include/esp_nn_generic_opt.h",
"chars": 1903,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "src/activation_functions/esp_nn_hard_swish_ansi.c",
"chars": 3279,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c",
"chars": 6113,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c",
"chars": 3185,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/activation_functions/esp_nn_relu_ansi.c",
"chars": 891,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/activation_functions/esp_nn_relu_s8_esp32p4.c",
"chars": 1894,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/activation_functions/esp_nn_relu_s8_esp32s3.S",
"chars": 3677,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/basic_math/esp_nn_add_ansi.c",
"chars": 4214,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/basic_math/esp_nn_add_s8_esp32p4.c",
"chars": 3669,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/basic_math/esp_nn_add_s8_esp32s3.S",
"chars": 21647,
"preview": "// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/basic_math/esp_nn_mul_ansi.c",
"chars": 3168,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S",
"chars": 11279,
"preview": "// Copyright 2026 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
},
{
"path": "src/basic_math/esp_nn_mul_s8_esp32p4.c",
"chars": 3187,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/basic_math/esp_nn_mul_s8_esp32s3.S",
"chars": 13293,
"preview": "// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/common/common_functions.h",
"chars": 10801,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "src/common/esp_nn_common_functions_esp32s3.S",
"chars": 8793,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/common/esp_nn_dot_s8_esp32s3.S",
"chars": 4359,
"preview": "//\n// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n//\n"
},
{
"path": "src/common/esp_nn_mean_ansi.c",
"chars": 1591,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/common/esp_nn_mean_s8_esp32p4.c",
"chars": 3743,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/common/esp_nn_mean_s8_esp32s3.c",
"chars": 3328,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S",
"chars": 2980,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S",
"chars": 4069,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S",
"chars": 6841,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_conv_ansi.c",
"chars": 8851,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_conv_esp32p4.c",
"chars": 45228,
"preview": "/*\n * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "src/convolution/esp_nn_conv_esp32s3.c",
"chars": 27082,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "src/convolution/esp_nn_conv_opt.c",
"chars": 8540,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S",
"chars": 15381,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S",
"chars": 22050,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_conv_s8_1x1_esp32s3.c",
"chars": 11187,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c",
"chars": 6809,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S",
"chars": 7386,
"preview": "//\n// SPDX-FileCopyrightText: 2023-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
},
{
"path": "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S",
"chars": 21309,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_ansi.c",
"chars": 4881,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_esp32p4.c",
"chars": 13248,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_opt.c",
"chars": 14879,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S",
"chars": 18148,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S",
"chars": 16628,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S",
"chars": 14717,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S",
"chars": 18952,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S",
"chars": 21304,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S",
"chars": 19951,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c",
"chars": 48323,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S",
"chars": 23834,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S",
"chars": 2546,
"preview": "//\n// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n//\n"
},
{
"path": "src/fully_connected/esp_nn_fully_connected_ansi.c",
"chars": 3799,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/fully_connected/esp_nn_fully_connected_esp32s3.c",
"chars": 8025,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S",
"chars": 8441,
"preview": "//\n// SPDX-FileCopyrightText: 2025-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
},
{
"path": "src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c",
"chars": 8680,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S",
"chars": 8101,
"preview": "//\n// SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
},
{
"path": "src/logistic/esp_nn_logistic_ansi.c",
"chars": 1951,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/pooling/esp_nn_avg_pool_ansi.c",
"chars": 3258,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/pooling/esp_nn_avg_pool_s8_esp32p4.c",
"chars": 6113,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/pooling/esp_nn_avg_pool_s8_esp32s3.S",
"chars": 29214,
"preview": "//\n// SPDX-FileCopyrightText: 2021-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
},
{
"path": "src/pooling/esp_nn_avg_pool_s8_esp32s3.c",
"chars": 4084,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/pooling/esp_nn_max_pool_ansi.c",
"chars": 2975,
"preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/pooling/esp_nn_max_pool_s8_esp32p4.c",
"chars": 5822,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/pooling/esp_nn_max_pool_s8_esp32s3.S",
"chars": 19966,
"preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
},
{
"path": "src/softmax/esp_nn_softmax_ansi.c",
"chars": 3421,
"preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
},
{
"path": "src/softmax/esp_nn_softmax_opt.c",
"chars": 3909,
"preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
},
{
"path": "src/softmax/esp_nn_softmax_s8_esp32p4.c",
"chars": 4729,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "src/softmax/esp_nn_softmax_s8_esp32s3.c",
"chars": 5249,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
},
{
"path": "src/softmax/softmax_common.h",
"chars": 4287,
"preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
},
{
"path": "test_app/CMakeLists.txt",
"chars": 319,
"preview": "# The following lines of boilerplate have to be in your project's\n# CMakeLists in this exact order for cmake to work cor"
},
{
"path": "test_app/Makefile",
"chars": 384,
"preview": "#\n# This is a project Makefile. It is assumed the directory this Makefile resides in is a\n# project subdirectory.\n#\n\nPRO"
},
{
"path": "test_app/main/CMakeLists.txt",
"chars": 132,
"preview": "\nset(COMPONENT_SRCS \"main.c\")\nset(COMPONENT_ADD_INCLUDEDIRS \"\")\n\nset(COMPONENT_PRIV_REQUIRES tests esp_timer)\n\nregister_"
},
{
"path": "test_app/main/component.mk",
"chars": 316,
"preview": "#\n# Main component makefile.\n#\n# This Makefile can be left empty. By default, it will take the sources in the \n# src/ di"
},
{
"path": "test_app/main/main.c",
"chars": 2668,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "test_app/sdkconfig.defaults",
"chars": 36,
"preview": "\n#\n# esp-nn\n#\nCONFIG_NN_OPTIMIZED=y\n"
},
{
"path": "test_app/sdkconfig.defaults.esp32p4",
"chars": 659,
"preview": "# Enables high speed SPIRAM and other options\nCONFIG_IDF_EXPERIMENTAL_FEATURES=y\n\n#\n# ESP System Settings\n#\nCONFIG_ESP_D"
},
{
"path": "test_app/sdkconfig.defaults.esp32s3",
"chars": 225,
"preview": "# Default configurations for ESP32-S3\n\nCONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y\n# CONFIG_ESP32S3_SPIRAM_SUPPORT is not set\n"
},
{
"path": "tests/CMakeLists.txt",
"chars": 531,
"preview": "\nset(COMPONENT_ADD_INCLUDEDIRS ./include/)\nset(COMPONENT_SRCS \"src/basic_math_test.c\"\n \"src/convolutio"
},
{
"path": "tests/README.md",
"chars": 129,
"preview": "# Tests for esp_nn library\n\n- Include these in your test framework and run the framework.\n- For IDF test please refer `t"
},
{
"path": "tests/component.mk",
"chars": 74,
"preview": "#FIXME\n\nCOMPONENT_ADD_INCLUDEDIRS := include/\n\nCOMPONENT_SRCDIRS := src/\n"
},
{
"path": "tests/include/test_functions.h",
"chars": 1060,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/include/test_utils.h",
"chars": 3813,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/src/basic_math_test.c",
"chars": 21293,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/src/convolution_test.c",
"chars": 31742,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/src/fully_connected_test.c",
"chars": 9977,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/src/hard_swish_test.c",
"chars": 3297,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "tests/src/mean_test.c",
"chars": 2813,
"preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
},
{
"path": "tests/src/pooling_test.c",
"chars": 7725,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/src/relu_test.c",
"chars": 2403,
"preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
},
{
"path": "tests/src/softmax_test.c",
"chars": 4120,
"preview": "/*\n * SPDX-FileCopyrightText: 2022-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
}
]
About this extraction
This page contains the full source code of the espressif/esp-nn GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 99 files (801.9 KB), approximately 247.9k tokens, and a symbol index with 143 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — a free GitHub repo-to-text converter for AI. Built by Nikandr Surkov.