Full Code of espressif/esp-nn for AI

master d45b843ca5f8 cached
99 files
801.9 KB
247.9k tokens
143 symbols
1 requests
Download .txt
Showing preview only (841K chars total). Download the full file or copy to clipboard to get everything.
Repository: espressif/esp-nn
Branch: master
Commit: d45b843ca5f8
Files: 99
Total size: 801.9 KB

Directory structure:
gitextract__zjpraf8/

├── .github/
│   └── workflows/
│       └── upload_component.yml
├── .gitignore
├── .gitlab-ci.yml
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Kconfig.projbuild
├── LICENSE
├── README.md
├── idf_component.yml
├── include/
│   ├── esp_nn.h
│   ├── esp_nn_ansi_c.h
│   ├── esp_nn_ansi_headers.h
│   ├── esp_nn_defs.h
│   ├── esp_nn_esp32p4.h
│   ├── esp_nn_esp32s3.h
│   └── esp_nn_generic_opt.h
├── src/
│   ├── activation_functions/
│   │   ├── esp_nn_hard_swish_ansi.c
│   │   ├── esp_nn_hard_swish_s8_esp32p4.c
│   │   ├── esp_nn_hard_swish_s8_esp32s3.c
│   │   ├── esp_nn_relu_ansi.c
│   │   ├── esp_nn_relu_s8_esp32p4.c
│   │   └── esp_nn_relu_s8_esp32s3.S
│   ├── basic_math/
│   │   ├── esp_nn_add_ansi.c
│   │   ├── esp_nn_add_s8_esp32p4.c
│   │   ├── esp_nn_add_s8_esp32s3.S
│   │   ├── esp_nn_mul_ansi.c
│   │   ├── esp_nn_mul_broadcast_s8_esp32s3.S
│   │   ├── esp_nn_mul_s8_esp32p4.c
│   │   └── esp_nn_mul_s8_esp32s3.S
│   ├── common/
│   │   ├── common_functions.h
│   │   ├── esp_nn_common_functions_esp32s3.S
│   │   ├── esp_nn_dot_s8_esp32s3.S
│   │   ├── esp_nn_mean_ansi.c
│   │   ├── esp_nn_mean_s8_esp32p4.c
│   │   ├── esp_nn_mean_s8_esp32s3.c
│   │   ├── esp_nn_multiply_by_quantized_mult_esp32p4.S
│   │   ├── esp_nn_multiply_by_quantized_mult_esp32s3.S
│   │   └── esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S
│   ├── convolution/
│   │   ├── esp_nn_conv_ansi.c
│   │   ├── esp_nn_conv_esp32p4.c
│   │   ├── esp_nn_conv_esp32s3.c
│   │   ├── esp_nn_conv_opt.c
│   │   ├── esp_nn_conv_s16_mult4_1x1_esp32s3.S
│   │   ├── esp_nn_conv_s16_mult8_esp32s3.S
│   │   ├── esp_nn_conv_s8_1x1_esp32s3.c
│   │   ├── esp_nn_conv_s8_3x3_opt_esp32s3.c
│   │   ├── esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S
│   │   ├── esp_nn_conv_s8_mult8_1x1_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_ansi.c
│   │   ├── esp_nn_depthwise_conv_esp32p4.c
│   │   ├── esp_nn_depthwise_conv_opt.c
│   │   ├── esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult1_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult4_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult8_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s8_esp32s3.c
│   │   └── esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S
│   ├── fully_connected/
│   │   ├── esp_nn_fc_s8_mac16_esp32s3.S
│   │   ├── esp_nn_fully_connected_ansi.c
│   │   ├── esp_nn_fully_connected_esp32s3.c
│   │   ├── esp_nn_fully_connected_per_ch_s8_esp32s3.S
│   │   ├── esp_nn_fully_connected_s8_esp32p4.c
│   │   └── esp_nn_fully_connected_s8_esp32s3.S
│   ├── logistic/
│   │   └── esp_nn_logistic_ansi.c
│   ├── pooling/
│   │   ├── esp_nn_avg_pool_ansi.c
│   │   ├── esp_nn_avg_pool_s8_esp32p4.c
│   │   ├── esp_nn_avg_pool_s8_esp32s3.S
│   │   ├── esp_nn_avg_pool_s8_esp32s3.c
│   │   ├── esp_nn_max_pool_ansi.c
│   │   ├── esp_nn_max_pool_s8_esp32p4.c
│   │   └── esp_nn_max_pool_s8_esp32s3.S
│   └── softmax/
│       ├── esp_nn_softmax_ansi.c
│       ├── esp_nn_softmax_opt.c
│       ├── esp_nn_softmax_s8_esp32p4.c
│       ├── esp_nn_softmax_s8_esp32s3.c
│       └── softmax_common.h
├── test_app/
│   ├── CMakeLists.txt
│   ├── Makefile
│   ├── main/
│   │   ├── CMakeLists.txt
│   │   ├── component.mk
│   │   └── main.c
│   ├── sdkconfig.defaults
│   ├── sdkconfig.defaults.esp32p4
│   └── sdkconfig.defaults.esp32s3
└── tests/
    ├── CMakeLists.txt
    ├── README.md
    ├── component.mk
    ├── include/
    │   ├── test_functions.h
    │   └── test_utils.h
    └── src/
        ├── basic_math_test.c
        ├── convolution_test.c
        ├── fully_connected_test.c
        ├── hard_swish_test.c
        ├── mean_test.c
        ├── pooling_test.c
        ├── relu_test.c
        └── softmax_test.c

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/upload_component.yml
================================================
# GitHub Actions workflow: publish the esp-nn component to the
# Espressif IDF Component Registry on every push to the master branch.
name: Push esp-nn to IDF Component Registry

on:
  push:
    branches:
      - master

jobs:
  upload_components:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Upload esp-nn to IDF Component Registry
        uses: espressif/upload-components-ci-action@v1
        with:
          namespace: "espressif"
          name: "esp-nn"
          # Registry API token is provided via a repository secret.
          api_token: ${{ secrets.IDF_COMPONENT_API_TOKEN }}


================================================
FILE: .gitignore
================================================
.config
*.o
*.i
*.s
*.orig
*.pyc

# gtags
GTAGS
GRTAGS
GPATH

# emacs
.dir-locals.el

# emacs temp file suffixes
*~
.#*
\#*#

# eclipse setting
.settings

# MacOS directory files
.DS_Store

# Example project files
examples/**/sdkconfig
examples/**/sdkconfig.old
examples/**/build

# Test app files
test_app/build
test_app/sdkconfig
test_app/sdkconfig.old

# Doc build artifacts
docs/_build/
docs/doxygen-warning-log.txt
docs/sphinx-warning-log.txt
docs/sphinx-warning-log-sanitized.txt
docs/xml/
docs/xml_in/
docs/man/
docs/doxygen_sqlite3.db

TEST_LOGS


# gcov coverage reports
*.gcda
*.gcno
coverage.info
coverage_report/

# VS Code Settings
.vscode/


================================================
FILE: .gitlab-ci.yml
================================================
# GitLab CI: build the test_app against multiple ESP-IDF releases.
stages:
  - build

# Avoid running duplicate pipeline
workflow:
  rules:
    - if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'

variables:
  GIT_STRATEGY: fetch
  GIT_SUBMODULE_STRATEGY: recursive
# Install the SSH key used to reach the internal GitLab host, then
# optionally pin idf-component-manager (needed for older IDF images).
before_script:
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - echo -n $GITLAB_KEY_TMP > ~/.ssh/id_rsa_base64
    - base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa
    - chmod 600 ~/.ssh/id_rsa
    - echo -e "Host gitlab.espressif.cn\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
    - |
      if [ -n "$IDF_COMPONENT_MGR_VER" ]; then
        pip install idf-component-manager==$IDF_COMPONENT_MGR_VER
      fi

# Reusable script fragment: build once for every target in EXAMPLE_TARGETS.
.test_build: &test_build
    # Build examples
    - for TARGET in $EXAMPLE_TARGETS; do
    - idf.py set-target $TARGET build
    - done

# Base build job: compiles test_app with pedantic warnings-as-errors.
# Not run on scheduled pipelines (rules below); concrete jobs extend it
# with an IDF image and a target list.
.build_template:
  stage: build
  image: espressif/idf:latest
  tags:
    - build
  variables:
    PEDANTIC_FLAGS: "-Werror -Wno-error=cpp -Werror=unused-variable -Werror=unused-but-set-variable -Werror=unused-function"
    EXTRA_CFLAGS: "${PEDANTIC_FLAGS}"
    EXTRA_CXXFLAGS: "${PEDANTIC_FLAGS}"
  rules:
    - if: '$CI_PIPELINE_SOURCE == "schedule"'
      when: never
    - when: always
  script:
    - cd ${CI_PROJECT_DIR}/test_app
    # build examples
    - *test_build
    - cd ${CI_PROJECT_DIR}

# One job per supported IDF release; newer releases cover more targets
# (esp32p4 requires IDF v5.5).
build_idf_v5.5:
  extends: .build_template
  image: espressif/idf:release-v5.5
  variables:
    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 esp32p4

build_idf_v5.2:
  extends: .build_template
  image: espressif/idf:release-v5.2
  variables:
    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3

build_idf_v5.0:
  extends: .build_template
  image: espressif/idf:release-v5.0
  variables:
    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3

build_idf_v4.4:
  extends: .build_template
  image: espressif/idf:release-v4.4
  variables:
    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3
    # IDF v4.4 images need a pinned component-manager version.
    IDF_COMPONENT_MGR_VER: "1.2.0"

build_idf_v4.3:
  extends: .build_template
  image: espressif/idf:release-v4.3
  variables:
    EXAMPLE_TARGETS: esp32

build_idf_v4.2:
  extends: .build_template
  image: espressif/idf:release-v4.2
  variables:
    EXAMPLE_TARGETS: esp32


================================================
FILE: CMakeLists.txt
================================================
# ESP-NN component build script (ESP-IDF component).
# Registers the portable ANSI C kernels for every target, and adds the
# target-specific optimized sources (assembly + C) for ESP32-S3 / ESP32-P4.
cmake_minimum_required(VERSION 3.5)

# Portable ANSI C reference kernels — always compiled.
set(c_srcs
    "src/activation_functions/esp_nn_relu_ansi.c"
    "src/activation_functions/esp_nn_hard_swish_ansi.c"
    "src/common/esp_nn_mean_ansi.c"
    "src/basic_math/esp_nn_add_ansi.c"
    "src/basic_math/esp_nn_mul_ansi.c"
    "src/convolution/esp_nn_conv_ansi.c"
    "src/convolution/esp_nn_conv_opt.c"
    "src/convolution/esp_nn_depthwise_conv_ansi.c"
    "src/convolution/esp_nn_depthwise_conv_opt.c"
    "src/fully_connected/esp_nn_fully_connected_ansi.c"
    "src/softmax/esp_nn_softmax_ansi.c"
    "src/softmax/esp_nn_softmax_opt.c"
    "src/logistic/esp_nn_logistic_ansi.c"
    "src/pooling/esp_nn_avg_pool_ansi.c"
    "src/pooling/esp_nn_max_pool_ansi.c")

# ESP32-S3 optimized kernels (mostly hand-written assembly, .S files).
if(CONFIG_IDF_TARGET_ESP32S3)
    set(s3_srcs
        "src/common/esp_nn_common_functions_esp32s3.S"
        "src/common/esp_nn_dot_s8_esp32s3.S"
        "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S"
        "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S"
        "src/activation_functions/esp_nn_relu_s8_esp32s3.S"
        "src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c"
        "src/common/esp_nn_mean_s8_esp32s3.c"
        "src/basic_math/esp_nn_add_s8_esp32s3.S"
        "src/basic_math/esp_nn_mul_s8_esp32s3.S"
        "src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S"
        "src/convolution/esp_nn_conv_esp32s3.c"
        "src/convolution/esp_nn_conv_s8_1x1_esp32s3.c"
        "src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c"
        "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c"
        "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S"
        "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S"
        "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S"
        "src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S"
        "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S"
        "src/fully_connected/esp_nn_fully_connected_esp32s3.c"
        "src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S"
        "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S"
        "src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S"
        "src/pooling/esp_nn_max_pool_s8_esp32s3.S"
        "src/pooling/esp_nn_avg_pool_s8_esp32s3.c"
        "src/pooling/esp_nn_avg_pool_s8_esp32s3.S"
        "src/softmax/esp_nn_softmax_s8_esp32s3.c")
endif()

# ESP32-P4 optimized kernels (PIE/SIMD-based C implementations).
if(CONFIG_IDF_TARGET_ESP32P4)
    set(p4_srcs
        "src/common/esp_nn_mean_s8_esp32p4.c"
        "src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S"
        "src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c"
        "src/activation_functions/esp_nn_relu_s8_esp32p4.c"
        "src/basic_math/esp_nn_add_s8_esp32p4.c"
        "src/basic_math/esp_nn_mul_s8_esp32p4.c"
        "src/convolution/esp_nn_conv_esp32p4.c"
        "src/convolution/esp_nn_depthwise_conv_esp32p4.c"
        "src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c"
        "src/pooling/esp_nn_avg_pool_s8_esp32p4.c"
        "src/pooling/esp_nn_max_pool_s8_esp32p4.c"
        "src/softmax/esp_nn_softmax_s8_esp32p4.c")
endif()

# s3_srcs / p4_srcs are empty unless the matching target is configured.
idf_component_register(SRCS "${c_srcs}"
                            "${s3_srcs}"
                            "${p4_srcs}"
                       INCLUDE_DIRS "include" "src/common")

# -fno-unroll-loops and -mlongcalls are needed for the S3 assembly-heavy
# build; all targets get -O2.
if(CONFIG_IDF_TARGET_ESP32S3)
    target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function)
else()
    target_compile_options(${COMPONENT_LIB} PRIVATE  -O2 -Wno-unused-function)
endif()

# Propagate the Kconfig fast-requantize option to the compiler.
if(CONFIG_NN_SKIP_NUDGE)
    target_compile_definitions(${COMPONENT_LIB} PRIVATE SKIP_NUDGE)
endif()


================================================
FILE: CONTRIBUTING.md
================================================
# Contributing

Contributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welcome!

This document covers various topics related to contributions to the ESP-NN projects. Please read it if you plan to submit a PR!

## CLA

We require accepting the contributor's license agreement for all pull requests. When opening a pull request the first time you will be prompted to sign the CLA by the [CLA Assistant](https://cla-assistant.io/) service.

## Large-scale Changes

If you'd like to propose a change to the existing APIs or a large-scale refactoring of the implementation, we recommend opening an issue first to discuss this.

## Updating the Benchmarks Table

The benchmarks table in [README.md](README.md) contains benchmarks for ESP32-S3. The benchmarks are collected by running the app in [test_app](test_app/) directory. Please update this table if you have changed the implementations of some of the functions or added the new ones.

## Releasing a new version

Maintainers should follow the steps below to release a new version of ESP-NN component. Assuming the new version is `vX.Y.Z`:

1. Ensure you are on the latest `master` branch:
   ```bash
   git checkout master
   git pull --ff-only origin master
   ```
1. Create the new tag:
   ```bash
   git tag -s -a -m "vX.Y.Z" vX.Y.Z
   ```
1. Push the tag and the branch to the internal repository:
   ```bash
   git push origin vX.Y.Z
   ```
1. CI will automatically push the tag to GitHub and upload the new version to the IDF Component Registry.
1. Go to https://github.com/espressif/esp-nn/releases and create a release from the tag vX.Y.Z.
1. Write the release notes and publish the release.


================================================
FILE: Kconfig.projbuild
================================================
menu "ESP-NN"

choice NN_OPTIMIZATIONS
   bool "Optimization for nn functions"
   default NN_OPTIMIZED
   help
      Use ANSI-C versions for verification and debug purposes.
      Optimisations are automatically picked up for a chipset.
      For ESP32-S3, assembly optimisations are selected.
      For other platforms (viz., ESP32, ESP32-C3), generic optimisations are used.

config NN_ANSI_C
   bool "ANSI C"
   help
      ANSI C versions for verification and debug purposes.
config NN_OPTIMIZED
   bool "Optimized versions"
   help
      Optimisations are automatically picked up for a chipset.
      For ESP32-S3, assembly optimisations are selected.
      For other platforms (viz., ESP32, ESP32-C3), generic optimisations are used.
endchoice

config NN_OPTIMIZATIONS
   int
   default 0 if NN_ANSI_C
   default 1 if NN_OPTIMIZED

config NN_SKIP_NUDGE
   bool "Use fast (non-bit-exact) requantization"
   depends on NN_OPTIMIZED
   default n
   help
      When enabled, kernels use a faster requantize path that may differ
      from the TFLite reference by +/-1 LSB at half-shift boundaries.
      On ESP32-S3, this also skips the nudge addition in the assembly
      requantize for ~20% speedup.
      Leave disabled for bit-exact behavior (recommended for tests and
      for matching reference outputs).

endmenu


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
# ESP-NN

The library contains optimised NN (Neural Network) functions for various Espressif chips.

* Supported platforms:
   * TensorFlow Lite Micro (TFLite Micro). Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples)

* Supported ESP chips include:
   * ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3)
   * ESP32-P4 (Optimised using PIE/QACC SIMD instructions)
   * ESP32 (Generic optimisations)
   * ESP32-C3 (Generic optimisations)

## Performance

### Kernelwise performance for s8 versions:

  * Kernelwise performance on ESP32-P4 chip
    * Numbers are ticks taken for kernel to execute
    * Chip config: 360MHz, SPI-RAM: HEX 200MHz, L2-Cache: 128KB

    | Function        | ANSI C  | Optimized | Opt Ratio | Data info   | Memory    |
    | ----------------| --------|---------|---------|-------------|-----------|
    | elementwise_add | 190786  | 88451   | 2.16    | size = 1615 | External  |
    | elementwise_mul | 76585   | 47601   | 1.60    | size = 1615 | External  |
    | convolution     | 4005512 | 572459  | 7.00    | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External |
    | convolution     | 249700  | 71104   | 3.51    | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External |
    | convolution     | 816975  | 533318  | 1.53    | input(10,10), filter(64x3x3x3), pad(0,0), stride(1,1) | External |
    | depthwise conv  | 962834  | 482389  | 2.00    | input (16, 16), pad(0,0), stride(1,1) filter: 1x3x3x16 | External |
    | depthwise conv  | 1365066 | 703989  | 1.94    | input (12, 12), pad(1,1), stride(1,1)  filter: 8x5x5x4 | External |
    | max pool        | 482184  | 24178   | 19.94   | input(16,16), filter (1x3x3x16) | Internal |
    | avg pool        | 303210  | 84401   | 3.59    | input(16,16), filter (1x3x3x16) | Internal |
    | fully connected | 7650    | 915     | 8.36    | len: 271, ch = 3 | Internal |
    | prelu (relu6)   | 1195    | 154     | 7.76    | size, 1615  | Internal  |
    | softmax         | 14260   | 8587    | 1.66    | width: 256  | Internal  |
    | hard_swish      | 703970  | 516582  | 1.36    | size: 12544 | External  |
    | mean            | 10113   | 4686    | 2.16    | 7x7x16     | Internal  |


  * Kernelwise performance on ESP32-S3 chip
    * Numbers are ticks taken for kernel to execute
    * Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB

    | Function        | ANSI C   | Optimized | Opt Ratio | Data info   | Memory    |
    | ----------------| ---------|-----------|-----------|-------------|-----------|
    | elementwise_add | 281337   | 74440     | 3.78      | size = 1615 | External  |
    | elementwise_mul | 122703   | 35002     | 3.51      | size = 1615 | External  |
    | convolution     | 4712500  | 331008    | 14.24     | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External |
    | convolution     | 312754   | 39022     | 8.01      | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External |
    | convolution     | 2193289  | 394842    | 5.55      | input(8,8), filter(64x3x3x3), pad(0,0), stride(1,1) | External |
    | depthwise conv  | 1159831  | 184176    | 6.30      | input(18,18), pad(0,0), stride(1,1), filter: 1x3x3x16 | External |
    | depthwise conv  | 1671363  | 372435    | 4.49      | input(12,12), pad(1,1), stride(1,1), filter: 8x5x5x4 | External |
    | max pool        | 376294   | 48069     | 7.83      | input(16,16), filter(1x3x3x16) | Internal |
    | avg pool        | 427293   | 118052    | 3.62      | input(16,16), filter(1x3x3x16) | Internal |
    | fully connected | 8443     | 1078      | 7.83      | len: 271, ch = 3 | Internal |
    | softmax         | 15209    | 11107     | 1.37      | h: 8, w: 32 | Internal  |
    | prelu (relu6)   | 1125     | 98        | 11.48     | size: 1615  | Internal  |


### Model-level performance:

  * **Person Detection** (Visual Wake Words, INT8 quantized — from [esp-tflite-micro](https://github.com/espressif/esp-tflite-micro))
    * Numbers are time (ms) for `invoke()` call, using internal memory

    | Chip     | CPU Freq | without ESP-NN | with ESP-NN |
    | -------- | -------- | -------------- | ----------- |
    | ESP32-P4 | 360MHz   | 1395ms         | 73ms        |
    | ESP32-S3 | 240MHz   | 2300ms         | 54ms        |
    | ESP32    | 240MHz   | 4084ms         | 380ms       |
    | ESP32-C3 | 160MHz   | 3355ms         | 426ms       |

  * **MobileNetV3 Small** (INT8 quantized, 224x224x3, 1000 classes)

    | Chip     | CPU Freq | without ESP-NN | with ESP-NN |
    | -------- | -------- | -------------- | ----------- |
    | ESP32-S3 | 240MHz   | 26000ms        | 1434ms      |
    | ESP32-P4 | 360MHz   | 11600ms        | 1050ms      |

> **Note**:
  - The above is time taken for execution of the `invoke()` call
  - SPIRAM used for TensorArena.
  - Person detection on ESP32-S3 with internal RAM: 47ms
  - ESP32-P4 optimisation is work in progress
  - `Without ESP-NN` case is when `esp-nn` is completely disabled by removing below flag from [CMakeLists.txt](CMakeLists.txt):
    ```cmake
      # enable ESP-NN optimizations by Espressif
      target_compile_options(${COMPONENT_LIB} PRIVATE -DESP_NN)
    ```


## Configuration

  * To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS`
  * There are two options presented:
     * Optimized versions
     * ANSI C

  * Default selection is for `Optimized versions`. For ESP32-S3 and ESP32-P4, assembly versions are automatically selected, whereas for other chips (viz., ESP32, ESP32-C3), generic optimisations are selected.
  * For debugging purposes, you may want to select `ANSI C` reference versions.


## Contributing

If you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on GitHub.

For general questions related to this library, please use the esp32.com forum.

Please check [CONTRIBUTING.md](CONTRIBUTING.md) for further information if you'd like to contribute to ESP-NN.

## Copyrights and License

All original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE.


================================================
FILE: idf_component.yml
================================================
version: "1.2.3"
description: Optimized NN (Neural Network) functions for Espressif chips
url: https://github.com/espressif/esp-nn
repository: https://github.com/espressif/esp-nn.git
issues: https://github.com/espressif/esp-nn/issues
dependencies:
  idf:
    version: ">=4.2"
files:
  exclude:
    - test_app
    - tests


================================================
FILE: include/esp_nn.h
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

/* Map the IDF target (selected via Kconfig) onto an internal ARCH_* macro.
 * Only done when the optimised kernels are enabled; the ARCH_* macros drive
 * the header selection below. */
#if defined(CONFIG_NN_OPTIMIZED)
// select apt optimisations
#ifdef CONFIG_IDF_TARGET_ESP32P4
#define ARCH_ESP32_P4 1
#endif
#ifdef CONFIG_IDF_TARGET_ESP32S3
#define ARCH_ESP32_S3 1
#endif
#ifdef CONFIG_IDF_TARGET_ESP32
#define ARCH_ESP32 1
#endif
#endif

#ifdef __cplusplus
extern "C" {
#endif

/* reference kernels included by default */
#include "esp_nn_ansi_headers.h"

/* Pick the implementation that the generic esp_nn_* names resolve to:
 *   - ESP32-P4 / ESP32-S3: platform-specific optimised kernels
 *   - other targets with CONFIG_NN_OPTIMIZED set: generic C optimisations
 *   - CONFIG_NN_OPTIMIZED unset: plain ANSI C reference kernels */
#if defined(CONFIG_NN_OPTIMIZED)
#if defined(ARCH_ESP32_P4)
#include "esp_nn_esp32p4.h"
#elif defined(ARCH_ESP32_S3)
#include "esp_nn_esp32s3.h"
#else // for other platforms use generic optimisations
#include "esp_nn_generic_opt.h"
#endif // #if defined(ARCH_ESP32_S3)
#else
#include "esp_nn_ansi_c.h"
#endif

#ifdef __cplusplus
}
#endif


================================================
FILE: include/esp_nn_ansi_c.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @file        Header definitions to include for ANSI C versions.
 *              These are just typedefs to pick up ANSI versions.
 */

#pragma once

#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"

/* Alias every public esp_nn_* entry point to its _ansi reference
 * implementation. This header is selected when CONFIG_NN_OPTIMIZED
 * is not set (see esp_nn.h). */

/* basic math */
#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi

/* convolution */
#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi

#define esp_nn_conv_s8 esp_nn_conv_s8_ansi

#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi

#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi

/* activations; the ANSI hard_swish needs no scratch buffer, hence the no-ops */
#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi
#define esp_nn_get_hard_swish_scratch_size() 0
#define esp_nn_set_hard_swish_scratch_buf(buf)
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi

/* pooling */
#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi

/* fully connected */
#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi

/* softmax */
#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi
#define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi

/* logistic (sigmoid) */
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi


================================================
FILE: include/esp_nn_ansi_headers.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

/**
 * @file        Header definitions to include for esp_nn reference functions
 */

#include "esp_nn_defs.h"
/************************** Basic math functions ****************************/

/**
 * @brief       elementwise addition of two int8 quantized tensors
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *
 *              shift values are expected to be <= 0
 */
void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    int8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size);
/**
 * @brief       elementwise multiplication of two int8 quantized tensors
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *
 *              output shift is expected to be <= 0
 */
void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    int8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size);

/**
 * @brief       broadcast MUL for [H,W,C] * [1,1,C] pattern (SE-block)
 *
 * @note        input2_per_ch has `channels` elements, broadcast to all spatial
 *              positions (total_spatial = H * W).
 *              Uses fast requantization (constant nudge).
 */
void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,
                                      const int8_t *input2_per_ch,
                                      const int32_t input1_offset,
                                      const int32_t input2_offset,
                                      int8_t *output,
                                      const int32_t output_offset,
                                      const int32_t output_mult,
                                      const int32_t output_shift,
                                      const int32_t activation_min,
                                      const int32_t activation_max,
                                      const int32_t total_spatial,
                                      const int32_t channels);

/************************** Convolution functions *****************************/

/**
 * @brief       depthwise convolution per channel
 *
 * @note        inputs type: int8_t, output: int8_t
 *              Version used in tflite is per channel.
 *              This version follows the same approach.
 *              Meaning, it has per out_channel shift and multiplier for
 *              requantization
 *
 *              optimization notes: Though input_offset is int32 type,
 *              offset values are contained in 8 bits [-128, 127]
 */
void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,
                                   const int8_t *input_data,
                                   const data_dims_t *filter_dims,
                                   const int8_t *filter_data,
                                   const int32_t *bias,
                                   const data_dims_t *output_dims,
                                   int8_t *out_data,
                                   const dw_conv_params_t *conv_params,
                                   const quant_data_t *quant_data);

/**
 * @brief       2d-convolution channelwise
 *
 * @note        operation: result += (input + offset) * filter
 *
 *              inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_conv_s8_ansi(const data_dims_t *input_dims,
                         const int8_t *input_data,
                         const data_dims_t *filter_dims,
                         const int8_t *filter_data,
                         const int32_t *bias,
                         const data_dims_t *output_dims,
                         int8_t *out_data,
                         const conv_params_t *conv_params,
                         const quant_data_t *quant_data);

/* scratch-buffer helpers: query the size needed for a given conv
 * configuration, then hand over an allocated buffer before invoking
 * the kernel (ANSI versions may return 0 / ignore the buffer) */
int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,
                                      const data_dims_t *filter_dims,
                                      const data_dims_t *output_dims,
                                      const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_ansi(const void *buf);

int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims,
                                                const data_dims_t *filter_dims,
                                                const data_dims_t *output_dims,
                                                const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf);

/************************** Activation functions *****************************/

/**
 * @brief       relu6, applied in place over `size` elements
 *
 * @note        inout: int8_t
 */
void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size);

/**
 * @brief       hard_swish activation: y = x * relu6(x + 3) / 6
 *
 * @note        Quantized int8 fixed-point implementation.
 *              The *_fxp arguments are fixed-point multipliers and the
 *              *_exp arguments their corresponding exponents/shifts.
 */
void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output,
                                const int32_t size,
                                const int16_t input_zero_point,
                                const int16_t output_mult_fxp,
                                const int16_t reluish_mult_fxp,
                                const int32_t reluish_mult_exp,
                                const int32_t output_mult_exp,
                                const int16_t output_zero_point);

/**
 * @brief       mean reduction over spatial dims (H,W) for NHWC int8 tensor
 *
 * @note        Specialized for 4D [N,H,W,C] -> [N,1,1,C] reduction.
 *              Used by Squeeze-and-Excite in MobileNetV3.
 *              multiplier/shift requantize the averaged value into the
 *              output scale.
 */
void esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output,
                               const int32_t height, const int32_t width,
                               const int32_t channels,
                               const int32_t input_zero_point,
                               const int32_t output_zero_point,
                               const int32_t multiplier,
                               const int32_t shift);

/************************** Pooling functions *****************************/


/**
 * @brief       max_pool over an int8 [H,W,C] feature map
 *
 * @note        inputs type: int8_t, output: int8_t
 *              There are no quantization offset parameters here;
 *              activation_min/activation_max give the output saturation bounds.
 */
void esp_nn_max_pool_s8_ansi(const int8_t *input,
                             const uint16_t input_wd,
                             const uint16_t input_ht,
                             int8_t *output,
                             const uint16_t output_wd,
                             const uint16_t output_ht,
                             const uint16_t stride_wd,
                             const uint16_t stride_ht,
                             const uint16_t filter_wd,
                             const uint16_t filter_ht,
                             const uint16_t pad_wd,
                             const uint16_t pad_ht,
                             const int32_t activation_min,
                             const int32_t activation_max,
                             const uint16_t channels);

/**
 * @brief       avg_pool over an int8 [H,W,C] feature map
 *
 * @note        inputs type: int8_t, output: int8_t
 *              There are no quantization offset parameters here;
 *              activation_min/activation_max give the output saturation bounds.
 */
void esp_nn_avg_pool_s8_ansi(const int8_t *input,
                             const uint16_t input_wd,
                             const uint16_t input_ht,
                             int8_t *output,
                             const uint16_t output_wd,
                             const uint16_t output_ht,
                             const uint16_t stride_wd,
                             const uint16_t stride_ht,
                             const uint16_t filter_wd,
                             const uint16_t filter_ht,
                             const uint16_t pad_wd,
                             const uint16_t pad_ht,
                             const int32_t activation_min,
                             const int32_t activation_max,
                             const uint16_t channels);


/************************** Fully connected functions ***********************/

/**
 * @brief       fully connected (single shift/multiplier for all out channels)
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_fully_connected_s8_ansi(const int8_t *input_data,
                                    const int32_t input_offset,
                                    const uint16_t row_len,
                                    const int8_t *filter_data,
                                    const int32_t filter_offset,
                                    const int32_t *bias,
                                    int8_t *out_data,
                                    const uint16_t out_channels,
                                    const int32_t out_offset,
                                    const int32_t out_shift,
                                    const int32_t out_mult,
                                    const int32_t activation_min,
                                    const int32_t activation_max);

/**
 * @brief       fully connected, per-channel quantized variant
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *              out_mult, out_shift: int32_t* containing per-channel data
 */
void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data,
                                    const int32_t input_offset,
                                    const uint16_t row_len,
                                    const int8_t *filter_data,
                                    const int32_t filter_offset,
                                    const int32_t *bias,
                                    int8_t *out_data,
                                    const uint16_t out_channels,
                                    const int32_t out_offset,
                                    const int32_t* out_shift,
                                    const int32_t* out_mult,
                                    const int32_t activation_min,
                                    const int32_t activation_max);

/**
 * @brief   Get scratch buffer size needed by softmax function
 *
 * @param   width
 * @param   height
 * @return  size in bytes
 *
 * @note    buffer must be 4 byte aligned
 */
int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height);

/* ANSI C function to be hooked up when optimised version needed */
int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height);

/**
 * @brief   Set scratch buffer to be used by softmax function
 *
 * @param   buffer  this can be NULL if one needs to unset it
 *                  must be aligned to 4 bytes
 */
void esp_nn_set_softmax_scratch_buf_ansi(void *buffer);

/**
 * @brief       reference softmax function
 *
 * @note        inputs type: int8_t, output: int8_t
 *              softmax is computed per row (height rows of `width` elements)
 */
void esp_nn_softmax_s8_ansi(const int8_t *input_data,
                            const int32_t height,
                            const int32_t width,
                            const int32_t mult,
                            const int32_t shift,
                            const int32_t diff_min,
                            int8_t *output_data);


//////////////////////////// Generic optimisations /////////////////////////////

/************************** Convolution functions *****************************/

/**
 * @brief       2d-convolution channelwise optimized version
 *
 * @note        operation: result += (input + offset) * filter
 *
 *              inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_conv_s8_opt(const data_dims_t *input_dims,
                        const int8_t *input_data,
                        const data_dims_t *filter_dims,
                        const int8_t *filter_data,
                        const int32_t *bias,
                        const data_dims_t *output_dims,
                        int8_t *out_data,
                        const conv_params_t *conv_params,
                        const quant_data_t *quant_data);

/**
 * @brief       depthwise convolution per channel optimized version
 *
 * @note        inputs type: int8_t, output: int8_t
 *              Version used in tflite is per channel.
 *              This version follows the same approach.
 *              Meaning, it has per out_channel shift and multiplier for
 *              requantization
 *
 *              optimization notes: Though input_offset is int32 type,
 *              offset values are contained in 8 bits [-128, 127]
 */
void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,
                                  const int8_t *input_data,
                                  const data_dims_t *filter_dims,
                                  const int8_t *filter_data,
                                  const int32_t *bias,
                                  const data_dims_t *output_dims,
                                  int8_t *out_data,
                                  const dw_conv_params_t *conv_params,
                                  const quant_data_t *quant_data);

/* scratch-buffer helpers for the optimised conv/depthwise-conv kernels */
int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,
                                     const data_dims_t *filter_dims,
                                     const data_dims_t *output_dims,
                                     const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_opt(const void *buf);

int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims,
                                               const data_dims_t *filter_dims,
                                               const data_dims_t *output_dims,
                                               const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf);

/* ANSI C function to be hooked up when optimised version needed */
void esp_nn_set_softmax_scratch_buf_opt(void *buffer);

/**
 * @brief       optimised version of softmax function
 *
 * @note        the function uses extra buffer (4 * width bytes)
 *              hence, scratch buffers must be set before calling this.
 */
void esp_nn_softmax_s8_opt(const int8_t *input_data,
                           const int32_t height,
                           const int32_t width,
                           const int32_t mult,
                           const int32_t shift,
                           const int32_t diff_min,
                           int8_t *output_data);

/**
 * @brief       Get scratch buffer size for int8 logistic (sigmoid).
 * @return      256 (size of LUT in bytes)
 */
int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void);

/**
 * @brief       Prepare LUT for int8 logistic (sigmoid).
 *              Call once during model preparation after scratch is allocated.
 *
 * @param       scratch_buf         Scratch buffer (256 bytes, from get_scratch_size)
 * @param       input_zero_point    Input quantization zero point
 * @param       input_scale         Input quantization scale (float)
 *
 * @note        Output quantization is fixed: scale=1/256, zero_point=-128.
 */
void esp_nn_logistic_s8_prepare_ansi(int8_t *scratch_buf,
                                      int32_t input_zero_point,
                                      float input_scale);

/**
 * @brief       Apply int8 logistic (sigmoid) using precomputed LUT.
 *
 * @param       input       Input int8 data
 * @param       output      Output int8 data
 * @param       size        Number of elements
 * @param       scratch_buf 256-byte LUT from esp_nn_logistic_s8_prepare()
 */
void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output,
                              int32_t size, const int8_t *scratch_buf);


================================================
FILE: include/esp_nn_defs.h
================================================
// Copyright 2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include <stdint.h>

/**
 * @brief structure grouping tensor dimensions together
 * this structure can be used for input, output and filter
 */
typedef struct data_dims {
    int32_t width;
    int32_t height;
    int32_t channels;

    int32_t extra; // can be used as batch or any other param
} data_dims_t;

/**
 * @brief 2d data structure (width, height)
 * used for stride / padding / dilation pairs
 */
typedef struct data_2d {
    int32_t width;
    int32_t height;
} data_2d_t;

/**
 * @brief min/max activation (output clamp range)
 */
typedef struct act_params {
    int32_t min;
    int32_t max;
} act_params_t;

/**
 * @brief per channel quant data
 *
 * @note number of shift and mult elements are equal to output channels
 */
typedef struct quant_data {
    int32_t *shift; // per-channel requantization shifts
    int32_t *mult;  // per-channel requantization multipliers
} quant_data_t;

/**
 * @brief params specific to convolution 2d
 *
 */
typedef struct conv_params {
    int32_t in_offset;   // input zero-point offset
    int32_t out_offset;  // output zero-point offset
    data_2d_t stride;
    data_2d_t padding;
    data_2d_t dilation;
    act_params_t activation;
} conv_params_t;

/**
 * @brief params specific to depthwise convolution 2d
 *
 */
typedef struct dw_conv_params {
    int32_t in_offset;   // input zero-point offset
    int32_t out_offset;  // output zero-point offset
    int32_t ch_mult; // channel multiplier. (in_ch * ch_mult = out_ch)
    data_2d_t stride;
    data_2d_t padding;
    data_2d_t dilation;
    act_params_t activation;
} dw_conv_params_t;


================================================
FILE: include/esp_nn_esp32p4.h
================================================
/*
 * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @file        Header definitions to include for esp_nn optimized functions for
 *              the ESP32-P4 platform
 */

#pragma once

#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"

/**
 * @brief       2d - convolution channelwise (ESP32-P4 version)
 *
 * @note        operation: result += (input + offset) * filter
 *
 *              inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims,
                            const int8_t *input_data,
                            const data_dims_t *filter_dims,
                            const int8_t *filter_data,
                            const int32_t *bias,
                            const data_dims_t *output_dims,
                            int8_t *output_data,
                            const conv_params_t *conv_params,
                            const quant_data_t *quant_data);

/* scratch-buffer helpers: query required size, then set an allocated
 * buffer before calling the conv kernel */
int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims,
                                         const data_dims_t *filter_dims,
                                         const data_dims_t *output_dims,
                                         const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_esp32p4(const void *buf);

/********************** function defines ***************************/

/* broadcast-channel MUL has no P4-specific kernel; fall back to ANSI */

#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi

/* elementwise add, ESP32-P4 version; same contract as the _ansi variant */
void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
                                        const int8_t *input2_data,
                                        const int32_t input1_offset,
                                        const int32_t input2_offset,
                                        const int32_t input1_mult,
                                        const int32_t input2_mult,
                                        const int32_t input1_shift,
                                        const int32_t input2_shift,
                                        const int32_t left_shift,
                                        int8_t *output,
                                        const int32_t out_offset,
                                        const int32_t out_mult,
                                        const int32_t out_shift,
                                        const int32_t activation_min,
                                        const int32_t activation_max,
                                        const int32_t size);
#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32p4

/* elementwise mul, ESP32-P4 version; same contract as the _ansi variant */
void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,
                                        const int8_t *input2_data,
                                        const int32_t input1_offset,
                                        const int32_t input2_offset,
                                        int8_t *output,
                                        const int32_t out_offset,
                                        const int32_t out_mult,
                                        const int32_t out_shift,
                                        const int32_t activation_min,
                                        const int32_t activation_max,
                                        const int32_t size);
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32p4

/* depthwise conv, ESP32-P4 version, plus its scratch-buffer helpers */
void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims,
                                       const int8_t *input_data,
                                       const data_dims_t *filter_dims,
                                       const int8_t *filter_data,
                                       const int32_t *bias,
                                       const data_dims_t *output_dims,
                                       int8_t *out_data,
                                       const dw_conv_params_t *conv_params,
                                       const quant_data_t *quant_data);
int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims,
                                                    const data_dims_t *filter_dims,
                                                    const data_dims_t *output_dims,
                                                    const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf);
#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32p4

#define esp_nn_conv_s8 esp_nn_conv_s8_esp32p4

#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32p4
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32p4

#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32p4
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32p4

/* The kernels below carry the _esp32p4 suffix but may still wrap the ANSI C
 * reference implementations — P4 optimisation is work in progress
 * (NOTE(review): check each kernel's source to confirm which are assembly). */
void esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output,
                                   const int32_t size,
                                   const int16_t input_zero_point,
                                   const int16_t output_mult_fxp,
                                   const int16_t reluish_mult_fxp,
                                   const int32_t reluish_mult_exp,
                                   const int32_t output_mult_exp,
                                   const int16_t output_zero_point);
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32p4
/* hard_swish needs no scratch buffer on this target */
#define esp_nn_get_hard_swish_scratch_size() 0
#define esp_nn_set_hard_swish_scratch_buf(buf)

/* mean reduction over (H,W) for NHWC int8, see _ansi variant for contract */
void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output,
                                  const int32_t height, const int32_t width,
                                  const int32_t channels,
                                  const int32_t input_zero_point,
                                  const int32_t output_zero_point,
                                  const int32_t multiplier,
                                  const int32_t shift);
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32p4

/* in-place relu6 */
void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size);
#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32p4

/* pooling kernels; same parameter contract as the _ansi variants */
void esp_nn_avg_pool_s8_esp32p4(const int8_t *input,
                                 const uint16_t input_wd,
                                 const uint16_t input_ht,
                                 int8_t *output,
                                 const uint16_t output_wd,
                                 const uint16_t output_ht,
                                 const uint16_t stride_wd,
                                 const uint16_t stride_ht,
                                 const uint16_t filter_wd,
                                 const uint16_t filter_ht,
                                 const uint16_t pad_wd,
                                 const uint16_t pad_ht,
                                 const int32_t activation_min,
                                 const int32_t activation_max,
                                 const uint16_t channels);
#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32p4
void esp_nn_max_pool_s8_esp32p4(const int8_t *input,
                                 const uint16_t input_wd,
                                 const uint16_t input_ht,
                                 int8_t *output,
                                 const uint16_t output_wd,
                                 const uint16_t output_ht,
                                 const uint16_t stride_wd,
                                 const uint16_t stride_ht,
                                 const uint16_t filter_wd,
                                 const uint16_t filter_ht,
                                 const uint16_t pad_wd,
                                 const uint16_t pad_ht,
                                 const int32_t activation_min,
                                 const int32_t activation_max,
                                 const uint16_t channels);
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32p4

/* fully connected kernels: single-quant and per-channel-quant variants */
void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,
                                        const int32_t input_offset,
                                        const uint16_t row_len,
                                        const int8_t *filter_data,
                                        const int32_t filter_offset,
                                        const int32_t *bias,
                                        int8_t *out_data,
                                        const uint16_t out_channels,
                                        const int32_t out_offset,
                                        const int32_t out_shift,
                                        const int32_t out_mult,
                                        const int32_t activation_min,
                                        const int32_t activation_max);
void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,
                                        const int32_t input_offset,
                                        const uint16_t row_len,
                                        const int8_t *filter_data,
                                        const int32_t filter_offset,
                                        const int32_t *bias,
                                        int8_t *out_data,
                                        const uint16_t out_channels,
                                        const int32_t out_offset,
                                        const int32_t *out_shift,
                                        const int32_t *out_mult,
                                        const int32_t activation_min,
                                        const int32_t activation_max);
#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32p4
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32p4

/* softmax: scratch buffer must be sized/set before invoking the kernel */
int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height);
void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer);
void esp_nn_softmax_s8_esp32p4(const int8_t *input_data,
                                const int32_t height,
                                const int32_t width,
                                const int32_t mult,
                                const int32_t shift,
                                const int32_t diff_min,
                                int8_t *output_data);
#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32p4
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32p4
#define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32p4

/* logistic (sigmoid): no P4-specific kernel; fall back to ANSI LUT version */
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi


================================================
FILE: include/esp_nn_esp32s3.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @file        Header definitions to include for esp_nn optimized functions for
 *              the ESP32-S3 platform
 */

#pragma once

#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"

/************************** Basic math functions *****************************/


/**
 * @brief       elementwise addition
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *
 *              shift values are expected to be <= 0
 */
void esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data,
                                       const int8_t *input2_data,
                                       const int32_t input1_offset,
                                       const int32_t input2_offset,
                                       const int32_t input1_mult,
                                       const int32_t input2_mult,
                                       const int32_t input1_shift,
                                       const int32_t input2_shift,
                                       const int32_t left_shift,
                                       int8_t *output,
                                       const int32_t out_offset,
                                       const int32_t out_mult,
                                       const int32_t out_shift,
                                       const int32_t activation_min,
                                       const int32_t activation_max,
                                       const int32_t size);

/**
 * @brief       elementwise multiplication
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *
 *              output shift is expected to be <= 0
 */
void esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data,
                                       const int8_t *input2_data,
                                       const int32_t input1_offset,
                                       const int32_t input2_offset,
                                       int8_t *output,
                                       const int32_t out_offset,
                                       const int32_t out_mult,
                                       const int32_t out_shift,
                                       const int32_t activation_min,
                                       const int32_t activation_max,
                                       const int32_t size);


/************************** Convolution functions *****************************/

/**
 * @brief       depthwise convolution per channel
 *
 * @note        inputs type: int8_t, output: int8_t
 *              Version used in tflite is per channel.
 *              This version follows the same footprint.
 *              Meaning, it has per out_channel shift and multiplier for
 *              requantization
 *
 *              optimization notes: Though input_offset is int32 type,
 *              offset values are contained in 8 bits [-128, 127]
 */
void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims,
                                      const int8_t *input_data,
                                      const data_dims_t *filter_dims,
                                      const int8_t *filter_data,
                                      const int32_t *bias,
                                      const data_dims_t *output_dims,
                                      int8_t *output_data,
                                      const dw_conv_params_t *conv_params,
                                      const quant_data_t *quant_data);

/**
 * @brief       2d - convolution channelwise
 *
 * @note        operation: result += (input + offset) * filter
 *
 *              inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims,
                            const int8_t *input_data,
                            const data_dims_t *filter_dims,
                            const int8_t *filter_data,
                            const int32_t *bias,
                            const data_dims_t *output_dims,
                            int8_t *output_data,
                            const conv_params_t *conv_params,
                            const quant_data_t *quant_data);

int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
                                         const data_dims_t *filter_dims,
                                         const data_dims_t *output_dims,
                                         const conv_params_t *conv_params);
void esp_nn_set_conv_scratch_buf_esp32s3(const void *buf);

int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
                                                   const data_dims_t *filter_dims,
                                                   const data_dims_t *output_dims,
                                                   const dw_conv_params_t *conv_params);
void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf);

/************************** Pooling functions *****************************/

/**
 * @brief       max_pool
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_max_pool_s8_esp32s3(const int8_t *input,
                                const uint16_t input_wd,
                                const uint16_t input_ht,
                                int8_t *output,
                                const uint16_t output_wd,
                                const uint16_t output_ht,
                                const uint16_t stride_wd,
                                const uint16_t stride_ht,
                                const uint16_t filter_wd,
                                const uint16_t filter_ht,
                                const uint16_t pad_wd,
                                const uint16_t pad_ht,
                                const int32_t activation_min,
                                const int32_t activation_max,
                                const uint16_t channels);

/**
 * @brief       avg_pool
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 */
void esp_nn_avg_pool_s8_esp32s3(const int8_t *input,
                                const uint16_t input_wd,
                                const uint16_t input_ht,
                                int8_t *output,
                                const uint16_t output_wd,
                                const uint16_t output_ht,
                                const uint16_t stride_wd,
                                const uint16_t stride_ht,
                                const uint16_t filter_wd,
                                const uint16_t filter_ht,
                                const uint16_t pad_wd,
                                const uint16_t pad_ht,
                                const int32_t activation_min,
                                const int32_t activation_max,
                                const uint16_t channels);


/************************** Fully connected functions *****************************/

/**
 * @brief       fully connected
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *
 *              Current version works only on aligned input.
 *              row_len and channels should both be multiple of 8.
 */
void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,
                                       const int32_t input_offset,
                                       const uint16_t row_len,
                                       const int8_t *filter_data,
                                       const int32_t filter_offset,
                                       const int32_t *bias,
                                       int8_t *out_data,
                                       const uint16_t out_channels,
                                       const int32_t out_offset,
                                       const int32_t out_shift,
                                       const int32_t out_mult,
                                       const int32_t activation_min,
                                       const int32_t activation_max);

/**
 * @brief       fully connected - per channel
 *
 * @note        inputs type: int8_t, output: int8_t
 *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]
 *              out_mult, out_shift: int32_t* containing per-channel data
 *
 *              Current version works only on aligned input.
 *              row_len and channels should both be multiple of 8.
 */
void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data,
                                       const int32_t input_offset,
                                       const uint16_t row_len,
                                       const int8_t *filter_data,
                                       const int32_t filter_offset,
                                       const int32_t *bias,
                                       int8_t *out_data,
                                       const uint16_t out_channels,
                                       const int32_t out_offset,
                                       const int32_t* out_shift,
                                       const int32_t* out_mult,
                                       const int32_t activation_min,
                                       const int32_t activation_max);

/**
 * @brief       relu6
 *
 * @note        inout: int8_t
 */
void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size);

/********************** function defines ***************************/

#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3

void esp_nn_mul_broadcast_channel_s8_esp32s3(const int8_t *input1,
                                              const int8_t *input2_per_ch,
                                              const int32_t input1_offset,
                                              const int32_t input2_offset,
                                              int8_t *output,
                                              const int32_t output_offset,
                                              const int32_t output_mult,
                                              const int32_t output_shift,
                                              const int32_t activation_min,
                                              const int32_t activation_max,
                                              const int32_t total_spatial,
                                              const int32_t channels);
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_esp32s3

#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3

#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3

#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3

#define esp_nn_conv_s8 esp_nn_conv_s8_esp32s3

#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3

int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void);
void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf);
void esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output,
                                   const int32_t size,
                                   const int16_t input_zero_point,
                                   const int16_t output_mult_fxp,
                                   const int16_t reluish_mult_fxp,
                                   const int32_t reluish_mult_exp,
                                   const int32_t output_mult_exp,
                                   const int16_t output_zero_point);
#define esp_nn_get_hard_swish_scratch_size esp_nn_get_hard_swish_scratch_size_esp32s3
#define esp_nn_set_hard_swish_scratch_buf esp_nn_set_hard_swish_scratch_buf_esp32s3
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32s3

void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output,
                                  const int32_t height, const int32_t width,
                                  const int32_t channels,
                                  const int32_t input_zero_point,
                                  const int32_t output_zero_point,
                                  const int32_t multiplier,
                                  const int32_t shift);
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32s3

#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3

#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32s3

int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height);
void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer);
void esp_nn_softmax_s8_esp32s3(const int8_t *input_data, const int32_t height,
                                const int32_t width, const int32_t mult,
                                const int32_t shift, const int32_t diff_min,
                                int8_t *output_data);

#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32s3
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32s3
#define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32s3

/* Logistic (sigmoid) — LUT-based, same impl for all targets */
#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi


================================================
FILE: include/esp_nn_generic_opt.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/**
 * @file        Header definitions to include for esp_nn generic optimisations
 *              For functions that do not have optimisations, the _ansi versions are picked.
 */

#pragma once

#include "esp_nn_defs.h"
#include "esp_nn_ansi_headers.h"

#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi
#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi
#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi

#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_opt

#define esp_nn_conv_s8 esp_nn_conv_s8_opt

#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_opt
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_opt

#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_opt
#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_opt

#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi
#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi
#define esp_nn_get_hard_swish_scratch_size() 0
#define esp_nn_set_hard_swish_scratch_buf(buf)
#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi

#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi
#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi

#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi
#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi

#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt
#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt
#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt

#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi
#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi
#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi


================================================
FILE: src/activation_functions/esp_nn_hard_swish_ansi.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * HardSwish activation function: y = x * relu6(x + 3) / 6
 * Quantized int8 implementation using fixed-point arithmetic.
 */

#include <stdint.h>
#include <common_functions.h>

/*
 * Saturating left shift for int16.
 *
 * Implemented as a 64-bit multiply by 2^shift rather than `val << shift`,
 * because left-shifting a negative value is undefined behavior in C
 * (C11 6.5.7p4). The widened result is then clamped to the int16 range.
 */
static inline int16_t sat_left_shift_s16(int16_t val, int shift)
{
    /* (int64_t)1 << shift is well-defined for the small shift amounts the
     * hard-swish quantization path produces; the product cannot overflow
     * int64 for any int16 input. */
    int64_t result = (int64_t)val * ((int64_t)1 << shift);
    if (result > INT16_MAX) return INT16_MAX;
    if (result < INT16_MIN) return INT16_MIN;
    return (int16_t)result;
}

/*
 * SaturatingRoundingDoublingHighMul for int16 (gemmlowp convention):
 * take the 32-bit product, add the rounding nudge 2^14, keep the high
 * doubled half, i.e. (a * b + (1 << 14)) >> 15. The single overflowing
 * input pair (-32768, -32768) saturates to 32767.
 */
static inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b)
{
    if (a == -32768 && b == -32768) {
        return 32767; /* 2 * (-2^15)^2 does not fit in int16 */
    }
    const int32_t product = (int32_t)a * (int32_t)b;
    const int32_t nudge = 1 << 14;
    return (int16_t)((product + nudge) >> 15);
}

/*
 * SaturatingDoublingHighMul for int16 (NO rounding nudge).
 * The 32-bit product is divided by 2^15 — division truncates toward zero,
 * which differs from an arithmetic `>> 15` for negative products; this
 * matches TFLite's SaturatingDoublingHighMul bit-exactly. The single
 * overflowing pair (-32768, -32768) saturates to 32767.
 */
static inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b)
{
    if (a == -32768 && b == -32768) {
        return 32767;
    }
    const int32_t product = (int32_t)a * (int32_t)b;
    return (int16_t)(product / 32768);
}

/*
 * RoundingDivideByPOT for int16: divide by 2^exponent, rounding to the
 * nearest integer with ties rounded away from zero (gemmlowp convention).
 */
static inline int16_t rounding_div_pot_s16(int16_t val, int exponent)
{
    const int32_t mask = ((int32_t)1 << exponent) - 1;
    const int32_t remainder = (int32_t)val & mask;
    /* Negative values need a strictly-greater remainder to round up,
     * hence the +1 on the threshold. */
    int32_t threshold = mask >> 1;
    if (val < 0) {
        threshold += 1;
    }
    int32_t result = val >> exponent;
    if (remainder > threshold) {
        result += 1;
    }
    return (int16_t)result;
}

/*
 * Reference quantized hard-swish: y = x * relu6(x + 3) / 6 (see file header).
 *
 * Per element: remove the input zero point, widen by 2^7 into int16
 * fixed-point, compute (a) the input rescaled to the output scale and
 * (b) a "reluish" gate mapped from [-3, 3] to [-1, 1] then to [0, 1],
 * multiply the two, requantize by -output_mult_exp, and re-add the output
 * zero point with clamping to [-128, 127].
 *
 * Params:
 *   input/output        - int8 buffers of `size` elements (may alias? not
 *                         shown here — callers pass distinct buffers).
 *   input_zero_point    - zero point subtracted from each input value.
 *   output_mult_fxp     - Q15 multiplier taking input scale to output scale.
 *   reluish_mult_fxp    - Q15 multiplier for the reluish ([-3,3]->[-1,1]) map.
 *   reluish_mult_exp    - power-of-two exponent for the reluish map
 *                         (positive: extra left shift, negative: right shift).
 *   output_mult_exp     - power-of-two exponent for the final requantize;
 *                         expected <= 0 (used as -output_mult_exp below).
 *   output_zero_point   - zero point added to the result.
 */
void esp_nn_hard_swish_s8_ansi(const int8_t *input,
                                int8_t *output,
                                const int32_t size,
                                const int16_t input_zero_point,
                                const int16_t output_mult_fxp,
                                const int16_t reluish_mult_fxp,
                                const int32_t reluish_mult_exp,
                                const int32_t output_mult_exp,
                                const int16_t output_zero_point)
{
    for (int i = 0; i < size; i++) {
        /* Zero-point removed, then widened by 2^7 so the value occupies
         * most of the int16 range for the fixed-point math below. */
        const int16_t in_val = input[i] - input_zero_point;
        const int16_t in_hires = in_val * 128; /* << 7 */

        /* Scale input to output scale */
        const int16_t in_on_out_scale = sat_round_dbl_high_mul_s16(in_hires, output_mult_fxp);

        /* Compute reluish value: maps input from [-3,3] to [-1,1] */
        int16_t reluish = in_hires;
        if (reluish_mult_exp > 0) {
            /* Positive exponent is applied as (exp - 1) before the Q15
             * multiply and one more doubling after it, saturating at each
             * step so intermediate overflow clamps instead of wrapping. */
            reluish = sat_left_shift_s16(reluish, reluish_mult_exp - 1);
        }
        reluish = sat_round_dbl_high_mul_s16(reluish, reluish_mult_fxp);
        if (reluish_mult_exp > 0) {
            reluish = sat_left_shift_s16(reluish, 1);
        }
        if (reluish_mult_exp < 0) {
            /* Negative exponent: rounding right shift instead. */
            reluish = rounding_div_pot_s16(reluish, -reluish_mult_exp);
        }

        /* Convert from [-1,1] to [0,1] */
        /* (int promotion makes reluish + 32768 safe; 32767 maps to 32767) */
        reluish = (reluish + (1 << 15)) >> 1;

        /* Multiply: output = reluish * input_on_output_scale */
        const int16_t pre_out = sat_dbl_high_mul_s16(reluish, in_on_out_scale);

        /* Final shift and offset */
        int16_t out_val = rounding_div_pot_s16(pre_out, -output_mult_exp);
        out_val += output_zero_point;
        if (out_val > 127) out_val = 127;
        if (out_val < -128) out_val = -128;
        output[i] = (int8_t)out_val;
    }
}


================================================
FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-P4 optimized HardSwish with:
 * 1. Branch hoisting (borrowed from S3): dispatch on reluish_mult_exp ONCE
 * 2. 2x loop unrolling for better ILP on RISC-V pipeline
 * 3. All int16 arithmetic - no 64-bit multiply bottleneck
 */

#include <stdint.h>

/* Doubling high-half multiply with round-to-nearest nudge (gemmlowp's
 * SaturatingRoundingDoublingHighMul on int16). */
static inline __attribute__((always_inline))
int16_t sat_rnd_dbl_hi_mul(int16_t a, int16_t b) {
    const int32_t product = (int32_t)a * (int32_t)b;
    const int32_t nudge = 1 << 14;
    if (__builtin_expect(a == -32768 && b == -32768, 0)) {
        return 32767; /* only pair whose doubled product overflows int16 */
    }
    return (int16_t)((product + nudge) >> 15);
}

/* Doubling high-half multiply, NO rounding: truncating divide by 2^15.
 * Division (not `>> 15`) truncates negative products toward zero, matching
 * the ANSI reference sat_dbl_high_mul_s16 and TFLite's
 * SaturatingDoublingHighMul bit-exactly; an arithmetic shift would floor
 * negative products and differ by 1. */
static inline __attribute__((always_inline))
int16_t sat_dbl_hi_mul(int16_t a, int16_t b) {
    if (__builtin_expect(a == b && a == -32768, 0)) return 32767;
    return (int16_t)(((int32_t)a * (int32_t)b) / (1 << 15));
}

/* Saturating narrow from int32 to int16. Despite the name, the shift
 * itself is performed by the caller; this only clamps the widened value. */
static inline __attribute__((always_inline))
int16_t sat_left_shift_s16(int32_t val) {
    if (val >= 32767) {
        return 32767;
    }
    if (val <= -32768) {
        return -32768;
    }
    return (int16_t)val;
}

/* Rounding divide by 2^exp, ties rounded away from zero (gemmlowp
 * RoundingDivideByPOT on int16). */
static inline __attribute__((always_inline))
int16_t rounding_div_pot_s16(int16_t val, int exp) {
    const int32_t mask = ((int32_t)1 << exp) - 1;
    const int32_t rem = (int32_t)val & mask;
    /* Negative inputs round up only on a strictly larger remainder. */
    int32_t thr = mask >> 1;
    if (val < 0) {
        thr += 1;
    }
    int32_t out = val >> exp;
    if (rem > thr) {
        out += 1;
    }
    return (int16_t)out;
}

/* Shared tail of every path: multiply the [0,1] reluish gate with the
 * output-scaled input, requantize by 2^neg_out_exp, re-add the output zero
 * point, and clamp to the int8 range. */
static inline __attribute__((always_inline))
int8_t hard_swish_output(int16_t reluish, int16_t in_on_out_scale,
                          int neg_out_exp, int16_t output_zero_point) {
    const int16_t product = sat_dbl_hi_mul(reluish, in_on_out_scale);
    const int16_t requant = rounding_div_pot_s16(product, neg_out_exp);
    int32_t out = (int32_t)requant + output_zero_point;
    if (out > 127) {
        out = 127;
    } else if (out < -128) {
        out = -128;
    }
    return (int8_t)out;
}

/*
 * Quantized hard-swish (y = x * relu6(x + 3) / 6) over `size` int8 elements.
 *
 * Structure: the sign of reluish_mult_exp is tested ONCE, selecting one of
 * three specialized 2x-unrolled loops (positive / negative / zero exponent);
 * a scalar loop then handles the odd trailing element, re-checking the
 * exponent per element but reproducing the exact same arithmetic.
 * Parameters have the same meaning as in esp_nn_hard_swish_s8_ansi.
 */
void esp_nn_hard_swish_s8_esp32p4(const int8_t *input,
                                   int8_t *output,
                                   const int32_t size,
                                   const int16_t input_zero_point,
                                   const int16_t output_mult_fxp,
                                   const int16_t reluish_mult_fxp,
                                   const int32_t reluish_mult_exp,
                                   const int32_t output_mult_exp,
                                   const int16_t output_zero_point)
{
    /* Final requantize uses -output_mult_exp; output_mult_exp is <= 0. */
    const int neg_out_exp = -output_mult_exp;
    int i = 0;

    /* Branch on reluish_mult_exp ONCE - 3 specialized loops */
    if (reluish_mult_exp > 0) {
        /* Positive exponent: apply (exp - 1) before the Q15 multiply and
         * one more doubling after, saturating each step (as in the ANSI
         * reference). */
        const int ls1 = reluish_mult_exp - 1;

        for (; i <= size - 2; i += 2) {
            /* Zero-point removed, widened by 2^7 into int16 fixed-point. */
            int16_t iv0 = input[i] - input_zero_point;
            int16_t iv1 = input[i+1] - input_zero_point;
            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;

            /* Input rescaled onto the output scale. */
            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);
            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);

            /* NOTE(review): `(int32_t)hi << ls1` left-shifts a negative
             * value when hi is negative, which is undefined behavior in C —
             * works on GCC/Clang two's complement targets, but consider
             * a multiply as in the ANSI helper. */
            int16_t rv0 = sat_left_shift_s16((int32_t)hi0 << ls1);
            int16_t rv1 = sat_left_shift_s16((int32_t)hi1 << ls1);
            rv0 = sat_rnd_dbl_hi_mul(rv0, reluish_mult_fxp);
            rv1 = sat_rnd_dbl_hi_mul(rv1, reluish_mult_fxp);
            rv0 = sat_left_shift_s16((int32_t)rv0 * 2);
            rv1 = sat_left_shift_s16((int32_t)rv1 * 2);

            /* Map the reluish gate from [-1, 1] to [0, 1]. */
            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);
            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);

            output[i]   = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);
            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);
        }
    } else if (reluish_mult_exp < 0) {
        /* Negative exponent: rounding right shift after the Q15 multiply. */
        const int neg_relu_exp = -reluish_mult_exp;

        for (; i <= size - 2; i += 2) {
            int16_t iv0 = input[i] - input_zero_point;
            int16_t iv1 = input[i+1] - input_zero_point;
            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;

            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);
            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);

            int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp);
            int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp);
            rv0 = rounding_div_pot_s16(rv0, neg_relu_exp);
            rv1 = rounding_div_pot_s16(rv1, neg_relu_exp);

            /* Map the reluish gate from [-1, 1] to [0, 1]. */
            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);
            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);

            output[i]   = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);
            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);
        }
    } else {
        /* Zero exponent: the Q15 multiply alone maps [-3,3] to [-1,1]. */
        for (; i <= size - 2; i += 2) {
            int16_t iv0 = input[i] - input_zero_point;
            int16_t iv1 = input[i+1] - input_zero_point;
            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;

            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);
            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);
            int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp);
            int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp);

            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);
            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);

            output[i]   = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);
            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);
        }
    }

    /* Scalar remainder */
    for (; i < size; i++) {
        int16_t iv = input[i] - input_zero_point;
        int16_t hi = iv * 128;
        int16_t on_out = sat_rnd_dbl_hi_mul(hi, output_mult_fxp);

        int16_t rv = hi;
        if (reluish_mult_exp > 0)
            rv = sat_left_shift_s16((int32_t)rv << (reluish_mult_exp - 1));
        rv = sat_rnd_dbl_hi_mul(rv, reluish_mult_fxp);
        if (reluish_mult_exp > 0)
            rv = sat_left_shift_s16((int32_t)rv * 2);
        if (reluish_mult_exp < 0)
            rv = rounding_div_pot_s16(rv, -reluish_mult_exp);

        rv = (int16_t)(((int32_t)rv + 32768) >> 1);
        output[i] = hard_swish_output(rv, on_out, neg_out_exp, output_zero_point);
    }
}


================================================
FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-S3 optimized HardSwish using 256-byte lookup table.
 *
 * Key insight: HardSwish maps int8 -> int8 with fixed quantization parameters
 * per layer. Only 256 possible input values exist. We precompute the full
 * mapping once using the ANSI reference (bit-exact), then the inner loop
 * is a single byte load per element.
 *
 * Scratch buffer: 256 bytes (set via esp_nn_set_hard_swish_scratch_buf).
 */

#include <stdint.h>
#include <stddef.h>

/* Use ANSI C reference to build LUT — guarantees bit-exact match */
extern void esp_nn_hard_swish_s8_ansi(const int8_t *input,
                                       int8_t *output,
                                       const int32_t size,
                                       const int16_t input_zero_point,
                                       const int16_t output_mult_fxp,
                                       const int16_t reluish_mult_fxp,
                                       const int32_t reluish_mult_exp,
                                       const int32_t output_mult_exp,
                                       const int16_t output_zero_point);

static int8_t *hard_swish_scratch = NULL;

int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void)
{
    /* Two back-to-back 256-entry byte tables: the identity-input table
     * used to seed the LUT and the LUT itself. */
    return 2 * 256;
}

void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf)
{
    /* Remember the caller-provided scratch area (size as reported by
     * esp_nn_get_hard_swish_scratch_size_esp32s3). */
    hard_swish_scratch = buf;
}

void esp_nn_hard_swish_s8_esp32s3(const int8_t *input,
                                   int8_t *output,
                                   const int32_t size,
                                   const int16_t input_zero_point,
                                   const int16_t output_mult_fxp,
                                   const int16_t reluish_mult_fxp,
                                   const int32_t reluish_mult_exp,
                                   const int32_t output_mult_exp,
                                   const int16_t output_zero_point)
{
    int8_t *scratch = hard_swish_scratch;

    if (scratch == NULL) {
        /* Without a scratch buffer there is nowhere to host the LUT:
         * run the bit-exact ANSI implementation directly on the data. */
        esp_nn_hard_swish_s8_ansi(input, output, size,
                                   input_zero_point, output_mult_fxp,
                                   reluish_mult_fxp, reluish_mult_exp,
                                   output_mult_exp, output_zero_point);
        return;
    }

    /* First 256 bytes of scratch: every possible int8 bit pattern.
     * Next 256 bytes: its hard-swish image, computed with the ANSI
     * reference so the table is bit-exact for these quant params.
     * lut_out[(uint8_t)x] then yields hardswish(x) directly. */
    int8_t *lut_in = scratch;
    int8_t *lut_out = scratch + 256;
    for (int v = 0; v < 256; v++) {
        lut_in[v] = (int8_t)v;
    }
    esp_nn_hard_swish_s8_ansi(lut_in, lut_out, 256,
                               input_zero_point, output_mult_fxp,
                               reluish_mult_fxp, reluish_mult_exp,
                               output_mult_exp, output_zero_point);

    /* Translate the payload: one table lookup per element. */
    const int8_t *src = input;
    int8_t *dst = output;
    for (int32_t remaining = size; remaining > 0; remaining--) {
        *dst++ = lut_out[(uint8_t)*src++];
    }
}


================================================
FILE: src/activation_functions/esp_nn_relu_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdint.h>
#include <stdlib.h>

#include <common_functions.h>

/**
 * In-place ReLU6 on signed 8-bit data: clamp every element to [0, 6].
 *
 * @param data  buffer of `size` int8 values, modified in place
 * @param size  number of elements to process
 */
void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size)
{
    for (int32_t idx = 0; idx < size; idx++) {
        int32_t val = data[idx];

        /* lower bound first, then upper bound */
        if (val < 0) {
            val = 0;
        } else if (val > 6) {
            val = 6;
        }
        data[idx] = (int8_t) val;
    }
}


================================================
FILE: src/activation_functions/esp_nn_relu_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <stdint.h>

/**
 * In-place ReLU6 for s8 data using ESP32-P4 PIE SIMD.
 * Clamps each element to [0, 6].
 * Processes 16 elements per iteration via 128-bit vector ops.
 *
 * @param data  buffer of `size` int8 values, clamped in place
 * @param size  element count; a scalar tail handles the last size % 16
 */
void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size)
{
    /* Enable PIE */
    /* NOTE(review): the csr 0x7f2 write plus esp.movx.w.cfg presumably
     * switch on the PIE coprocessor before any vector instruction runs —
     * TODO confirm against the ESP32-P4 ISA reference. */
    asm volatile (
        "csrsi  0x7f2, 0b01        \n\t"
        "li     x29, 0b10          \n\t"
        "esp.movx.w.cfg x29        \n\t"
        ::: "x29"
    );

    int i = 0;  /* number of elements already processed */

    if (size >= 16) {
        /* Broadcast 0 into q2 and 6 into q3 */
        const int8_t zero_val = 0;  /* lower clamp bound */
        const int8_t six_val = 6;   /* upper clamp bound */

        /* vldbc broadcasts the addressed byte into every lane */
        asm volatile (
            "esp.vldbc.8.ip  q2, %0, 0   \n\t"
            "esp.vldbc.8.ip  q3, %1, 0   \n\t"
            :: "r"(&zero_val), "r"(&six_val)
        );

        int count = size >> 4;  /* full 16-byte chunks */
        int stride = 16;        /* bytes advanced per store */

        /* x30 walks the buffer, x31 counts chunks down to zero */
        asm volatile (
            "mv     x30, %[ptr]             \n\t"
            "mv     x31, %[cnt]             \n\t"

            "1:                             \n\t"
            "esp.vld.128.ip   q0, x30, 0    \n\t"  /* load 16 bytes, no auto-increment */
            "esp.vmax.s8      q0, q0, q2    \n\t"  /* max(val, 0) */
            "esp.vmin.s8      q0, q0, q3    \n\t"  /* min(val, 6) */
            "esp.vst.128.xp   q0, x30, %[stride] \n\t"  /* store and advance ptr by 16 */
            "addi   x31, x31, -1            \n\t"
            "bnez   x31, 1b                 \n\t"

            :
            : [ptr] "r"(data), [cnt] "r"(count), [stride] "r"(stride)
            : "x30", "x31", "memory"
        );

        i = count << 4;  /* elements consumed by the vector loop */
    }

    /* Handle remaining elements scalar */
    for (; i < size; i++) {
        int32_t val = data[i];
        if (val < 0) val = 0;
        if (val > 6) val = 6;
        data[i] = (int8_t) val;
    }
}


================================================
FILE: src/activation_functions/esp_nn_relu_s8_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


    .text
    .align  4
    .literal_position

# in place relu6 function. a2: data, a3: size
    # Program Unit: esp_nn_relu6_s8_esp32s3
    .type   esp_nn_relu6_s8_esp32s3, @function
    .align   4
    .global esp_nn_relu6_s8_esp32s3

// void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size)
// In-place clamp of every byte to [0, 6].
// Plan: 16-wide SIMD main loop, then an 8-wide SIMD loop, then a scalar tail.
// Register roles: a2/a9 = data pointer, a3/a7 = size, q1 = broadcast 6,
// q2 = all zeros, a4 = scalar 6, a8 = elements processed so far.
esp_nn_relu6_s8_esp32s3:
    entry   a1,48                       #
    mov.n   a9,a2                       # [0], data
    mov.n   a7,a3                       # [1], size

 // process multiple of 16
    movi.n      a4,6                    # [4]
// spill constant 6 to the stack so it can be broadcast into q1
    s8i         a4,a1,0                     # [5]  six
// a10 = size - 7: threshold used to decide if the 8-wide loop is worth entering
    addi        a10,a3,-7                   # [2]
    ee.vldbc.8  q1,a1               # [6]  id:72 six+0x0
// fewer than 16 elements: skip the 16-wide loop
    blti        a3,16,.Lt_0_5634            # [7]

// a8 = number of 16-byte iterations
    srai        a8,a3,4                     # [0]
    ee.zero.q   q2                      # [1]
    loopgtz     a8,.LBB37_esp_nn_relu6_s8_esp32s3   # [3]

// per iteration: clamp 16 bytes with max(x, 0) then min(x, 6)
    ee.vld.128.ip   q0,a2,0             # [0*II+0]  id:73
    ee.vmax.s8      q0,q0,q2            # [0*II+2]
    ee.vmin.s8      q0,q0,q1            # [0*II+3]
    ee.vst.128.ip   q0,a2,16            # [0*II+4]  id:74
.LBB37_esp_nn_relu6_s8_esp32s3: # 0x34

// a8 = count of elements handled by the 16-wide loop
    slli    a8,a8,4                     # [0]

 // remaining multiple of 8 data
    bge     a8,a10,.Lt_0_3586           # [1]

.Lt_0_3842: # 0x3a
// a6 = (size - processed) / 8 iterations for the 8-wide loop
    sub     a6,a7,a8                    # [0]
    srai    a6,a6,3                     # [1]
    loopgtz a6,.LBB52_esp_nn_relu6_s8_esp32s3   # [2]

// same clamp as above on the low 64 bits (8 bytes) per iteration
    ee.vld.l.64.ip  q0,a2,0         # [0*II+0]  id:75
    ee.vmax.s8      q0,q0,q2            # [0*II+2]
    ee.vmin.s8      q0,q0,q1            # [0*II+3]
    ee.vst.l.64.ip  q0,a2,8         # [0*II+4]  id:76

.LBB52_esp_nn_relu6_s8_esp32s3: # 0x4f
// processed += 8 * iterations
    addx8   a8,a6,a8                    # [0]

.Lt_0_3586: # 0x52
 // process leftover
    bge     a8,a7,.Lt_0_6402            # [0]

.Lt_0_4866: # 0x55
// scalar tail: the first element is peeled, the rest run under loopgtz
    movi.n  a5,0                    # [0]
    sub     a3,a7,a8                    # [1]
    add.n   a2,a8,a9                    # [2]
    l8ui    a6,a2,0                     # [3]  id:78
    addi.n  a3,a3,-1                # [4]
// sign-extend the loaded byte before the signed min/max
    sext    a6,a6,7
    max     a6,a5,a6                    # [6]
    min     a6,a4,a6                    # [7]
    s8i     a6,a2,0                     # [8]  id:79

    loopgtz a3,.LBB67_esp_nn_relu6_s8_esp32s3   # [9]

    l8ui    a3,a2,1                     # [0*II+0]  id:78
    addi.n  a2,a2,1                 # [1*II+1]
    sext    a3,a3,7
    max     a3,a5,a3                    # [0*II+3]
    min     a3,a4,a3                    # [0*II+4]
    s8i     a3,a2,0                     # [0*II+5]  id:79
.LBB67_esp_nn_relu6_s8_esp32s3: # 0x81

.Lt_0_6402: # 0x83
    retw.n                          # [0]

// size < 16: run the 8-wide loop if at least 8 elements remain
.Lt_0_5634: # 0x85
    blti    a10,1,.Lt_0_5890            # [0]

    movi.n  a8,0                    # [0]
    ee.zero.q   q2                      # [1]
    j   .Lt_0_3842                      # [2]

// size < 8: scalar tail only (return immediately when size == 0)
.Lt_0_5890: # 0x90
    beqz.n  a3,.Lt_0_6402           # [0]

    movi.n  a8,0                    # [0]
    j   .Lt_0_4866                      # [1]

    .size   esp_nn_relu6_s8_esp32s3, . - esp_nn_relu6_s8_esp32s3


================================================
FILE: src/basic_math/esp_nn_add_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdint.h>

#include <common_functions.h>

/**
 * Elementwise addition of two uint8 quantized tensors (portable reference).
 *
 * Per element: add the input offset, scale up by `left_shift`, rescale each
 * operand with its fixed-point multiplier and shift, sum the pair, requantize
 * the sum to the output scale, add the output offset, and clamp to the
 * activation range.
 */
void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
                                    const uint8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    uint8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    for (int idx = 0; idx < size; idx++) {
        /* bring both operands into the common left-shifted domain */
        int32_t v1 = (input1_data[idx] + input1_offset) << left_shift;
        int32_t v2 = (input2_data[idx] + input2_offset) << left_shift;

        /* rescale each operand via the shared fixed-point helpers */
        v1 = esp_nn_sat_round_doubling_high_mul(v1, input1_mult);
        v1 = esp_nn_div_by_power_of_two(v1, -input1_shift);
        v2 = esp_nn_sat_round_doubling_high_mul(v2, input2_mult);
        v2 = esp_nn_div_by_power_of_two(v2, -input2_shift);

        /* requantize the sum to the output scale and apply its offset */
        int32_t res = esp_nn_sat_round_doubling_high_mul(v1 + v2, out_mult);
        res = esp_nn_div_by_power_of_two(res, -out_shift) + out_offset;

        /* clamp: upper bound first, then lower — same order of operations
         * as max(activation_min, min(res, activation_max)) */
        if (res > activation_max) {
            res = activation_max;
        }
        if (res < activation_min) {
            res = activation_min;
        }
        output[idx] = (uint8_t) res;
    }
}

/**
 * Elementwise addition of two int8 quantized tensors (portable reference).
 *
 * Per element: add the input offset, scale up by `left_shift`, rescale each
 * operand with its fixed-point multiplier and shift, sum the pair, requantize
 * the sum to the output scale, add the output offset, and clamp to the
 * activation range.
 */
void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    int8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    for (int idx = 0; idx < size; idx++) {
        /* offset the raw values and move them into the common domain */
        int32_t v1 = (input1_data[idx] + input1_offset) << left_shift;
        int32_t v2 = (input2_data[idx] + input2_offset) << left_shift;

        /* per-operand requantization via the shared fixed-point helpers */
        v1 = esp_nn_sat_round_doubling_high_mul(v1, input1_mult);
        v1 = esp_nn_div_by_power_of_two(v1, -input1_shift);
        v2 = esp_nn_sat_round_doubling_high_mul(v2, input2_mult);
        v2 = esp_nn_div_by_power_of_two(v2, -input2_shift);

        /* requantize the sum to the output scale and apply its offset */
        int32_t res = esp_nn_sat_round_doubling_high_mul(v1 + v2, out_mult);
        res = esp_nn_div_by_power_of_two(res, -out_shift) + out_offset;

        /* clamp: upper bound first, then lower — same order of operations
         * as max(activation_min, min(res, activation_max)) */
        if (res > activation_max) {
            res = activation_max;
        }
        if (res < activation_min) {
            res = activation_min;
        }
        output[idx] = (int8_t) res;
    }
}


================================================
FILE: src/basic_math/esp_nn_add_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <stdint.h>
#include <common_functions.h>

/**
 * Optimized elementwise add for s8 on ESP32-P4.
 * Uses fast multiply-by-quantized-mult and 2x unrolling.
 */

/* Inline the core requantization to avoid function call overhead */
/* Inlined fast requant using explicit RISC-V mul/mulh to avoid
 * compiler generating 64-bit multiply helper calls */
/* Requantize one 32-bit value: 64-bit doubling-high multiply with a fixed
 * +2^30 nudge, then an optional round-to-nearest arithmetic right shift
 * (rounding bias reduced by one for negative intermediates).
 * NOTE(review): the nudge is +2^30 for both signs — presumably a deliberate
 * speed/accuracy trade-off vs. the sign-dependent gemmlowp nudge; confirm
 * if bit-exactness with the ANSI reference is required.
 * Kept always_inline so the compiler fuses it into the callers' loops. */
static inline __attribute__((always_inline))
int32_t add_requant(int32_t val, int32_t mult, int32_t neg_shift)
{
    /* single 64-bit product; at -O2 the compiler emits a mul/mulh pair */
    const int64_t acc = (int64_t)val * (int64_t)mult + (1LL << 30);
    int32_t res = (int32_t)(acc >> 31);

    if (neg_shift > 0) {
        const int32_t rnd = (1 << (neg_shift - 1)) - (res < 0 ? 1 : 0);
        res = (res + rnd) >> neg_shift;
    }
    return res;
}

/* Scalar requantization step for the elementwise add below: 64-bit
 * doubling-high multiply with a +2^30 nudge, then an optional
 * round-to-nearest arithmetic right shift (bias -1 for negatives).
 * Same math as the file-local add_requant helper, reproduced here so the
 * main loop below is self-contained. */
static inline __attribute__((always_inline))
int32_t esp_nn_add_s8_requant_one(int32_t value, int32_t mult, int32_t right_shift)
{
    int64_t wide = (int64_t)value * (int64_t)mult + (1LL << 30);
    int32_t out = (int32_t)(wide >> 31);
    if (right_shift > 0) {
        int32_t round = (1 << (right_shift - 1)) - (out < 0 ? 1 : 0);
        out = (out + round) >> right_shift;
    }
    return out;
}

/**
 * Optimized elementwise add for s8 on ESP32-P4.
 *
 * Per element: offset and left-shift each input, requantize each with its
 * multiplier/shift, requantize the sum to the output scale, add the output
 * offset and clamp to [activation_min, activation_max].
 */
void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
                                        const int8_t *input2_data,
                                        const int32_t input1_offset,
                                        const int32_t input2_offset,
                                        const int32_t input1_mult,
                                        const int32_t input2_mult,
                                        const int32_t input1_shift,
                                        const int32_t input2_shift,
                                        const int32_t left_shift,
                                        int8_t *output,
                                        const int32_t out_offset,
                                        const int32_t out_mult,
                                        const int32_t out_shift,
                                        const int32_t activation_min,
                                        const int32_t activation_max,
                                        const int32_t size)
{
    /* the shift parameters arrive negated relative to the right-shift amount */
    const int32_t rshift1 = -input1_shift;
    const int32_t rshift2 = -input2_shift;
    const int32_t rshift_out = -out_shift;

    for (int idx = 0; idx < size; idx++) {
        int32_t v1 = (input1_data[idx] + input1_offset) << left_shift;
        int32_t v2 = (input2_data[idx] + input2_offset) << left_shift;

        v1 = esp_nn_add_s8_requant_one(v1, input1_mult, rshift1);
        v2 = esp_nn_add_s8_requant_one(v2, input2_mult, rshift2);

        int32_t res = esp_nn_add_s8_requant_one(v1 + v2, out_mult, rshift_out)
                      + out_offset;

        /* clamp: upper bound first, then lower — same order of operations
         * as max(activation_min, min(res, activation_max)) */
        if (res > activation_max) {
            res = activation_max;
        }
        if (res < activation_min) {
            res = activation_min;
        }
        output[idx] = (int8_t) res;
    }
}


================================================
FILE: src/basic_math/esp_nn_add_s8_esp32s3.S
================================================
// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

    .text
    .align  4
    .literal_position
    .literal    .nudge_val, 1073741824

    # Program Unit: esp_nn_add_elementwise_s8_esp32s3
    .type   esp_nn_add_elementwise_s8_esp32s3, @function
    .align   4
    .global esp_nn_add_elementwise_s8_esp32s3

esp_nn_add_elementwise_s8_esp32s3:  # 0x4
    # temp_neg_out_shift = 0
    # temp_neg_input2_shift = 4
    # temp_neg_input1_shift = 8
    # gra_spill_temp_2 = 12
    # gra_spill_temp_3 = 16
    # gra_spill_temp_4 = 20
    # gra_spill_temp_5 = 24
    # gra_spill_temp_6 = 28
    # gra_spill_temp_7 = 32
    # gra_spill_temp_8 = 36
    # gra_spill_temp_9 = 40
    # gra_spill_temp_10 = 44
    # gra_spill_temp_11 = 48
    # gra_spill_temp_12 = 52
    # gra_spill_temp_13 = 56

 // a2 : *input1_data
 // a3 : *input2_data
 // a4 : input1_offset
 // a5 : input2_offset
 // a6 : input1_mult
 // a7 : input2_mult
 // On stack:
 // 80: input1_shift
 // 84: input2_shift
 // 88: left_shift
 // 92: *output
 // 96: out_offset
 // 100: out_mult, loaded in `a8`
 // 104: out_shift
 // 108: activation_min
 // 112: activation_max
 // 116: size

    entry       a1,80                      #
    s32i.n      a4,a1,48                    # [10]  gra_spill_temp_11, input1_offset
    s32i.n      a5,a1,52                    # [0]  gra_spill_temp_12, input2_offset
    s32i.n      a2,a1,32                 # [5]  gra_spill_temp_7, input1_data
    s32i.n      a3,a1,12                    # [3]  gra_spill_temp_2, input2_data

    l32i        a12,a1,116                  # [11]  id:720 size+0x0
    mov.n       a14,a2                      # [6]
    mov.n       a10,a3                      # [8]
    blti        a12,1,.exit           # [1] // exit

    l32i        a3,a1,80                   # [0]  id:721 input1_shift+0x0
    l32i        a13,a1,84                  # [1]  id:722 input2_shift+0x0
    l32i        a2,a1,104                   # [8]  id:723 out_shift+0x0
    l32i        a8,a1,100                   # [1]  out_mult

    neg         a3,a3                       # [12]
    neg         a13,a13                     # [7]
    neg         a2,a2                       # [11]

    s32i.n      a3,a1,8                    # [12]  temp_neg_input1_shift, -input1_shift
    s32i.n      a13,a1,4                   # [7]  temp_neg_input2_shift, -input2_shift
    s32i.n      a2,a1,0                    # [16]  temp_neg_out_shift, -out_shift

    movi.n      a5,1
    addi        a9,a3,-1
    ssl         a9
    sll         a15,a5
    s32i.n      a15,a1,16               # gra_spill_temp_3, 1 << (exponent - 1) for input1

    addi        a9,a13,-1
    ssl         a9
    sll         a15,a5
    s32i.n      a15,a1,20               # gra_spill_temp_4, 1 << (exponent - 1) for input2

    addi        a9,a2,-1
    ssl         a9
    sll         a15,a5
    s32i.n      a15,a1,24               # gra_spill_temp_5, 1 << (exponent - 1) for out

    movi.n      a2,0
    blti        a12,12,.process_leftover          # [23]

    // skip to leftover routine if inputs are unaligned
    or          a9,a14,a10
    extui       a9,a9,0,4
    bnez        a9,.process_leftover

    l32i        a9,a1,92                   # [17]  id:1279 output+0x0

    l32i        a13,a1,116                  # [20]
    srai        a13,a13,3                   # [21]
    s32i.n      a13,a1,56                   # [22]  gra_spill_temp_13

    movi.n      a13,8
    s32i.n      a13,a1,28               # gra_spill_temp_6, mult_of8 counter

    ee.zero.q       q6                      # [8]

.vector_loop: // process 8 values in one go
    l32i            a15,a1,88                  # [6]  left_shift
    ee.vld.l.64.ip  q0,a14,8        # [9]  id:729
    s32i.n          a9,a1,44                    # [10]  gra_spill_temp_10, out_ptr
    s32i.n          a14,a1,40                   # [20]  gra_spill_temp_9
    wsr.sar         a15                     # [21] load left shift

    addi.n          a15,a1,48                   # [14]
    ee.vldbc.16     q7,a15              # [21]  id:1277 input1_offset
    ee.vcmp.lt.s8   q5,q0,q6            # [29]
    ee.vzip.8       q0,q5                   # [31], 20 bits
    ee.vadds.s16    q0,q0,q7            # [34], add offset
    ee.vcmp.lt.s16  q2,q0,q6        # [36]
    ee.vzip.16      q0,q2               # [39], 32 bits
    ee.vsl.32       q0,q0                   # [41] left_shift
    ee.vsl.32       q2,q2                   # [42] left_shift

    l32r            a9,.nudge_val              # [15], nudge

// mulhi32 for q0
    ee.movi.32.a    q0,a3,2             # [44]
    ee.movi.32.a    q0,a4,3             # [45]
    ee.movi.32.a    q0,a14,1            # [46]
    ee.movi.32.a    q0,a5,0             # [62]

    mulsh           a13,a6,a3                   # [51]
    mull            a3,a6,a3                    # [53]

    mulsh           a12,a6,a4                   # [50]
    mull            a4,a6,a4                    # [55]

    mulsh           a15,a6,a14                  # [48]
    mull            a14,a6,a14                  # [49]

    ssai            31                          # [47]

    add             a3,a3,a9
    saltu           a2,a3,a9
    add.n           a13,a13,a2
    src             a13,a13,a3

    add             a4,a4,a9
    saltu           a2,a4,a9
    add.n           a12,a12,a2
    src             a12,a12,a4
    ee.movi.32.q    q0,a13,2            # [62]

    add             a14,a14,a9
    saltu           a2,a14,a9
    add.n           a15,a15,a2
    src             a15,a15,a14
    ee.movi.32.q    q0,a12,3            # [62]

    mulsh           a13,a6,a5                   # [51]
    mull            a5,a6,a5                    # [53]
    ee.movi.32.q    q0,a15,1            # [62]

    add             a5,a5,a9
    saltu           a2,a5,a9
    add.n           a13,a13,a2
    src             a13,a13,a5
    ee.movi.32.q    q0,a13,0            # [62]


// mulhi32 for q2
    ee.movi.32.a    q2,a3,2             # [44]
    ee.movi.32.a    q2,a4,3             # [45]
    ee.movi.32.a    q2,a14,1            # [46]
    ee.movi.32.a    q2,a5,0             # [62]

    mulsh           a13,a6,a3                   # [51]
    mull            a3,a6,a3                    # [53]

    mulsh           a12,a6,a4                   # [50]
    mull            a4,a6,a4                    # [55]

    mulsh           a15,a6,a14                  # [48]
    mull            a14,a6,a14                  # [49]

    ssai            31                          # [47]

    add             a3,a3,a9
    saltu           a2,a3,a9
    add.n           a13,a13,a2
    src             a13,a13,a3

    add             a4,a4,a9
    saltu           a2,a4,a9
    add.n           a12,a12,a2
    src             a12,a12,a4
    ee.movi.32.q    q2,a13,2            # [62]

    add             a14,a14,a9
    saltu           a2,a14,a9
    add.n           a15,a15,a2
    src             a15,a15,a14
    ee.movi.32.q    q2,a12,3            # [62]

    mulsh           a13,a6,a5                   # [51]
    mull            a5,a6,a5                    # [53]
    ee.movi.32.q    q2,a15,1            # [62]

    l32i            a3,a1,8                    # [12]  temp_neg_input1_shift, -input1_shift
    add             a5,a5,a9
    saltu           a2,a5,a9
    add.n           a13,a13,a2
    src             a13,a13,a5
    ee.movi.32.q    q2,a13,0            # [62]


    blti            a3,1, .skip_div_by2_in0

    addi.n          a13,a1,16
    ee.vcmp.lt.s32  q1,q0,q6
    ee.vcmp.lt.s32  q3,q2,q6
    ee.vldbc.32     q5,a13      // 1 << (exponent - 1)
    wsr.sar         a3          // load right_shift
    ee.vadds.s32    q0,q0,q1    // subtract 1 `if (val < 0)`
    ee.vadds.s32    q2,q2,q3    // subtract 1 `if (val < 0)`
    ee.vadds.s32    q0,q0,q5
    ee.vadds.s32    q2,q2,q5
    ee.vsr.32       q0,q0
    ee.vsr.32       q2,q2

.skip_div_by2_in0:


    ee.vld.l.64.ip  q1,a10,8        # [11]  id:1290
    addi.n          a15,a1,52                   # [12]
    ee.vldbc.16     q7,a15              # [19]  id:1278 input2_offset
    l32i            a15,a1,88                  # [6]  left_shift
    s32i            a10,a1,36                   # [14]  gra_spill_temp_8
    ee.vcmp.lt.s8   q3,q1,q6            # [271]
    wsr.sar         a15                     # [21], load shift for left shift
    ee.vzip.8       q1,q3                   # [274], 20 bits
    ee.vadds.s16    q1,q1,q7            # [281]
    ee.vcmp.lt.s16  q3,q1,q6        # [282]
    ee.vzip.16      q1,q3               # [283], 32 bits
    ee.vsl.32       q1,q1                   # [284]
    ee.vsl.32       q3,q3                   # [285]


// mulhi32 for q1
    ee.movi.32.a    q1,a3,2             # [44]
    ee.movi.32.a    q1,a4,3             # [45]
    ee.movi.32.a    q1,a14,1            # [46]
    ee.movi.32.a    q1,a5,0             # [62]

    mulsh           a13,a7,a3                   # [51]
    mull            a3,a7,a3                    # [53]

    mulsh           a12,a7,a4                   # [50]
    mull            a4,a7,a4                    # [55]

    mulsh           a15,a7,a14                  # [48]
    mull            a14,a7,a14                  # [49]

    ssai            31                          # [47]

    add             a3,a3,a9
    saltu           a2,a3,a9
    add.n           a13,a13,a2
    src             a13,a13,a3

    add             a4,a4,a9
    saltu           a2,a4,a9
    add.n           a12,a12,a2
    src             a12,a12,a4
    ee.movi.32.q    q1,a13,2            # [62]

    add             a14,a14,a9
    saltu           a2,a14,a9
    add.n           a15,a15,a2
    src             a15,a15,a14
    ee.movi.32.q    q1,a12,3            # [62]

    mulsh           a13,a7,a5                   # [51]
    mull            a5,a7,a5                    # [53]
    ee.movi.32.q    q1,a15,1            # [62]

    add             a5,a5,a9
    saltu           a2,a5,a9
    add.n           a13,a13,a2
    src             a13,a13,a5
    ee.movi.32.q    q1,a13,0            # [62]


// mulhi32 for q3
    ee.movi.32.a    q3,a3,2             # [44]
    ee.movi.32.a    q3,a4,3             # [45]
    ee.movi.32.a    q3,a14,1            # [46]
    ee.movi.32.a    q3,a5,0             # [62]

    mulsh           a13,a7,a3                   # [51]
    mull            a3,a7,a3                    # [53]

    mulsh           a12,a7,a4                   # [50]
    mull            a4,a7,a4                    # [55]

    mulsh           a15,a7,a14                  # [48]
    mull            a14,a7,a14                  # [49]

    ssai            31                          # [47]

    add             a3,a3,a9
    saltu           a2,a3,a9
    add.n           a13,a13,a2
    src             a13,a13,a3

    add             a4,a4,a9
    saltu           a2,a4,a9
    add.n           a12,a12,a2
    src             a12,a12,a4
    ee.movi.32.q    q3,a13,2            # [62]

    add             a14,a14,a9
    saltu           a2,a14,a9
    add.n           a15,a15,a2
    src             a15,a15,a14
    ee.movi.32.q    q3,a12,3            # [62]

    mulsh           a13,a7,a5                   # [51]
    mull            a5,a7,a5                    # [53]
    ee.movi.32.q    q3,a15,1            # [62]
    l32i            a14,a1,4                   # [7]  temp_neg_input2_shift, -input2_shift

    add             a5,a5,a9
    saltu           a2,a5,a9
    add.n           a13,a13,a2
    src             a13,a13,a5
    ee.movi.32.q    q3,a13,0            # [62]

    // multiplication results: q0-q2 & q1-q3


    blti            a14,1, .skip_div_by2_in1

    addi.n          a5,a1,20
    ee.vcmp.lt.s32  q4,q1,q6
    ee.vcmp.lt.s32  q5,q3,q6
    ee.vldbc.32     q7,a5       // 1 << (exponent - 1)
    wsr.sar         a14         // load right_shift
    ee.vadds.s32    q4,q4,q7    // subtract 1 `if (val < 0)`
    ee.vadds.s32    q5,q5,q7    // subtract 1 `if (val < 0)`
    ee.vadds.s32    q1,q1,q4
    ee.vadds.s32    q3,q3,q5
    ee.vsr.32       q1,q1
    ee.vsr.32       q3,q3

.skip_div_by2_in1:

    ee.vadds.s32        q0,q0,q1
    ee.vadds.s32        q1,q2,q3

// mulhi32 for q0
    ee.movi.32.a    q0,a3,2             # [44]
    ee.movi.32.a    q0,a4,3             # [45]
    ee.movi.32.a    q0,a14,1            # [46]
    ee.movi.32.a    q0,a5,0             # [62]

    mulsh           a13,a8,a3                   # [51]
    mull            a3,a8,a3                    # [53]

    mulsh           a12,a8,a4                   # [50]
    mull            a4,a8,a4                    # [55]

    mulsh           a15,a8,a14                  # [48]
    mull            a14,a8,a14                  # [49]

    ssai            31                          # [47]

    add             a3,a3,a9
    saltu           a2,a3,a9
    add.n           a13,a13,a2
    src             a13,a13,a3

    add             a4,a4,a9
    saltu           a2,a4,a9
    add.n           a12,a12,a2
    src             a12,a12,a4
    ee.movi.32.q    q0,a13,2            # [62]

    add             a14,a14,a9
    saltu           a2,a14,a9
    add.n           a15,a15,a2
    src             a15,a15,a14
    ee.movi.32.q    q0,a12,3            # [62]

    mulsh           a13,a8,a5                   # [51]
    mull            a5,a8,a5                    # [53]
    ee.movi.32.q    q0,a15,1            # [62]

    add             a5,a5,a9
    saltu           a2,a5,a9
    add.n           a13,a13,a2
    src             a13,a13,a5
    ee.movi.32.q    q0,a13,0            # [62]


// mulhi32 for q1
    ee.movi.32.a    q1,a3,2             # [44]
    ee.movi.32.a    q1,a4,3             # [45]
    ee.movi.32.a    q1,a14,1            # [46]
    ee.movi.32.a    q1,a5,0             # [62]

    mulsh           a13,a8,a3                   # [51]
    mull            a3,a8,a3                    # [53]

    mulsh           a12,a8,a4                   # [50]
    mull            a4,a8,a4                    # [55]

    mulsh           a15,a8,a14                  # [48]
    mull            a14,a8,a14                  # [49]

    ssai            31                          # [47]

    add             a3,a3,a9
    saltu           a2,a3,a9
    add.n           a13,a13,a2
    src             a13,a13,a3

    add             a4,a4,a9
    saltu           a2,a4,a9
    add.n           a12,a12,a2
    src             a12,a12,a4
    ee.movi.32.q    q1,a13,2            # [62]

    add             a14,a14,a9
    saltu           a2,a14,a9
    add.n           a15,a15,a2
    src             a15,a15,a14
    ee.movi.32.q    q1,a12,3            # [62]

    mulsh           a13,a8,a5                   # [51]
    mull            a5,a8,a5                    # [53]
    ee.movi.32.q    q1,a15,1            # [62]
    l32i            a14,a1,0                   # [738]  temp_neg_out_shift, -out_shift

    add             a5,a5,a9
    saltu           a2,a5,a9
    add.n           a13,a13,a2
    src             a13,a13,a5
    ee.movi.32.q    q1,a13,0            # [62]


    //q0-q1 has output

    blti            a14,1,.skip_div_by2_out
    addi.n          a5,a1,24
    ee.vcmp.lt.s32  q2,q0,q6
    ee.vcmp.lt.s32  q3,q1,q6
    ee.vldbc.32     q5,a5       // 1 << (exponent - 1)
    wsr.sar         a14         // load right shift
    ee.vadds.s32    q0,q0,q2    // subtract 1 `if (val < 0)`
    ee.vadds.s32    q1,q1,q3    // subtract 1 `if (val < 0)`
    ee.vadds.s32    q0,q0,q5
    ee.vadds.s32    q1,q1,q5
    ee.vsr.32       q0,q0
    ee.vsr.32       q1,q1

.skip_div_by2_out:

// add offset and apply activation
    addi            a15,a1,96
    ee.vldbc.32     q3,a15              # [809]  id:802 out_offset
    ee.vadds.s32    q0,q0,q3            # [811]
    ee.vadds.s32    q1,q1,q3            # [812]
    addi            a13,a1,108
    addi            a14,a1,112
    ee.vldbc.32     q3,a14              # [813]  id:803 activation_max
    ee.vmin.s32     q0,q0,q3            # [815]
    ee.vmin.s32     q1,q1,q3            # [816]
    ee.vldbc.32     q3,a13              # [817]  id:804 activation_min
    l32i            a13,a1,4                   # [818]  temp_neg_input2_shift
    ee.vmax.s32     q1,q1,q3            # [819]
    ee.vmax.s32     q0,q0,q3            # [820]

//pack the data and store
    l32i.n          a9,a1,44                    # [784]  gra_spill_temp_10
    ee.vunzip.16    q0,q1               # [821]
    ee.vunzip.8     q0,q1               # [822]
    l32i.n          a13,a1,28           # gra_spill_temp_6, multiple of 12 index
    ee.vst.l.64.ip  q0,a9,8             # [823]  id:805
    l32i            a15,a1,116                  # [1], size
    l32i.n          a14,a1,40                   # [20]  gra_spill_temp_9
    l32i.n          a10,a1,36                   # [14]  gra_spill_temp_8
    addi            a13,a13,8
    s32i.n          a13,a1,28           # gra_spill_temp_6
    bge             a15,a13,.vector_loop

    l32i.n  a2,a1,56                # [0]  gra_spill_temp_13

// check for leftover
    l32i    a10,a1,116                  # [1]
    slli    a2,a2,3                     # [2]
    bge     a2,a10,.exit          # [3] // done, exit

.process_leftover:
    l32i.n  a3,a1,48                    # [1]  gra_spill_temp_11
    l32i.n  a12,a1,52                   # [2]  gra_spill_temp_12

    l32i.n  a10,a1,12                   # [3]  gra_spill_temp_2
    l32i.n  a14,a1,32                # [8]  gra_spill_temp_7
    add.n   a10,a2,a10                  # [5]
    add.n   a14,a2,a14                  # [6]
    l8ui    a14,a14,0                   # [7]  id:809, input1
    l8ui    a10,a10,0                   # [12]  id:1370, input2

    sext    a14,a14,7                   # [9]
    sext    a10,a10,7                   # [10]
    add.n   a10,a10,a12                 # [11] // add offset2
    add.n   a14,a14,a3                  # [16] // add offset1
    l32i    a12,a1,88                  # [13]  left_shift

    // sat_round_doubling_high_mul step for input1 and input2
    ssl     a12                         # [15]
    sll     a10,a10                     # [20]
    sll     a14,a14                     # [17]

    l32r            a12,.nudge_val             # [0], nudge

    // a13,a3 are free, a12: nudge, a6:mult1
    mulsh           a13,a14,a6
    mull            a9,a14,a6
    ssai            31

    add             a9,a9,a12
    saltu           a3,a9,a12
    add.n           a13,a13,a3
    src             a14,a13,a9 //result in a14

    mulsh           a13,a10,a7
    mull            a9,a10,a7
    ssai            31

    add             a9,a9,a12
    saltu           a3,a9,a12
    add.n           a13,a13,a3
    src             a10,a13,a9 //result in a10

// divide_by_power_of2_step for input1 (a14), input2 (a10)
// free registers: a13, a12, a9, a3

    l32i.n          a12,a1,8   // -input1_shift
    l32i.n          a13,a1,4   // -input2_shift

    blti            a12,1,.skip_div_by2_in0_remain
    l32i.n          a3,a1,16    // 1 << (exponent - 1)
    extui           a9,a14,31,1
    ssr             a12         // load right_shift
    sub             a3,a3,a9    // 1 << (exponent - 1) - (val < 0)
    add             a14,a14,a3
    sra             a14,a14
.skip_div_by2_in0_remain:

    blti            a13,1,.skip_div_by2_in1_remain
    l32i.n          a3,a1,20    // 1 << (exponent - 1)
    extui           a9,a10,31,1
    ssr             a13         // load right_shift
    sub             a3,a3,a9    // 1 << (exponent - 1) - (val < 0)
    add             a10,a10,a3
    sra             a10,a10
.skip_div_by2_in1_remain:

// process output
    l32r            a12,.nudge_val             # [0], nudge
    l32i            a13,a1,0                   // -out_shift
    add.n           a10,a10,a14                 # [45]

// multiply and pick high32
    mulsh           a3,a10,a8
    mull            a10,a10,a8
    ssai            31                          # [0]
    add             a10,a10,a12
    saltu           a9,a10,a12
    add             a12,a3,a9
    src             a12,a12,a10

// div by power of 2 for output

    l32i            a9,a1,96                   # [31]  out_offset
    blti            a13,1,.skip_div_by2_out_remain
    l32i.n          a3,a1,24    // 1 << (exponent - 1)
    extui           a14,a12,31,1
    ssr             a13         // load right_shift
    sub             a3,a3,a14   // 1 << (exponent - 1) - (val < 0)
    add             a12,a12,a3
    sra             a12,a12
.skip_div_by2_out_remain:

// add offset
    add.n   a9,a9,a12                   # [33]

// apply activation
    l32i    a13,a1,112                  # [34]  activation_max
    l32i    a12,a1,108                  # [35]  activation_min
    min     a13,a13,a9                      # [36]
    l32i    a9,a1,92                   # [37]  output
    max     a13,a13,a12                     # [38]
    add.n   a9,a2,a9                    # [39]
    s8i     a13,a9,0                    # [40]  id:1371
    l32i    a12,a1,116
    addi.n  a2,a2,1                 # [41]
    blt     a2,a12,.process_leftover

.exit:
    retw.n                          # [0]

    .size   esp_nn_add_elementwise_s8_esp32s3, . - esp_nn_add_elementwise_s8_esp32s3


================================================
FILE: src/basic_math/esp_nn_mul_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdint.h>

#include <common_functions.h>

void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    int8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    /* Reference (portable C) elementwise s8 multiply:
     * out[i] = clamp(requant((in1[i] + off1) * (in2[i] + off2)) + out_offset)
     * Requantization is delegated to esp_nn_multiply_by_quantized_mult. */
    const int8_t *src1 = input1_data;
    const int8_t *src2 = input2_data;
    int8_t *dst = output;

    for (int32_t remaining = size; remaining > 0; remaining--) {
        int32_t lhs = (int32_t) *src1++ + input1_offset;
        int32_t rhs = (int32_t) *src2++ + input2_offset;

        int32_t acc = esp_nn_multiply_by_quantized_mult(lhs * rhs, out_mult, out_shift);
        acc += out_offset;

        /* Clamp to the activation range: min with upper bound first,
         * then max with lower bound. */
        acc = min(acc, activation_max);
        acc = max(acc, activation_min);
        *dst++ = (int8_t) acc;
    }
}

void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,
                                          const int8_t *input2_per_ch,
                                          const int32_t input1_offset,
                                          const int32_t input2_offset,
                                          int8_t *output,
                                          const int32_t output_offset,
                                          const int32_t output_mult,
                                          const int32_t output_shift,
                                          const int32_t activation_min,
                                          const int32_t activation_max,
                                          const int32_t total_spatial,
                                          const int32_t channels)
{
    /* Broadcast multiply for the [H,W,C] * [1,1,C] pattern: every spatial
     * position of input1 is multiplied by the same per-channel vector
     * input2_per_ch, then requantized and clamped. */
    const int8_t *row_in = input1;
    int8_t *row_out = output;

    for (int s = 0; s < total_spatial; s++) {
        for (int c = 0; c < channels; c++) {
            int32_t lhs = (int32_t) row_in[c] + input1_offset;
            int32_t rhs = (int32_t) input2_per_ch[c] + input2_offset;

            int32_t acc = esp_nn_multiply_by_quantized_mult(lhs * rhs,
                                                            output_mult,
                                                            output_shift);
            acc += output_offset;
            /* max with the lower bound first, then min with the upper
             * bound (same order as the original). */
            acc = max(acc, activation_min);
            acc = min(acc, activation_max);
            row_out[c] = (int8_t) acc;
        }
        /* Advance to the next spatial position; the per-channel vector
         * is re-read from the start each row. */
        row_in += channels;
        row_out += channels;
    }
}


================================================
FILE: src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S
================================================
// Copyright 2026 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Optimized broadcast MUL for SE-block pattern: [H,W,C] * [1,1,C]
// Processes 8 channels at a time using S3 SIMD.

    .text
    .align  4
    .literal_position
    .literal    .LC_nudge, 1073741824   // 1 << 30

    .type   esp_nn_mul_broadcast_channel_s8_esp32s3, @function
    .align  4
    .global esp_nn_mul_broadcast_channel_s8_esp32s3

// void esp_nn_mul_broadcast_channel_s8_esp32s3(
//     const int8_t *input1,           // a2
//     const int8_t *input2_per_ch,    // a3
//     const int32_t input1_offset,    // a4
//     const int32_t input2_offset,    // a5
//     int8_t *output,                 // a6
//     const int32_t output_offset,    // a7
//     const int32_t output_mult,      // stack+120
//     const int32_t output_shift,     // stack+124
//     const int32_t activation_min,   // stack+128
//     const int32_t activation_max,   // stack+132
//     const int32_t total_spatial,    // stack+136
//     const int32_t channels);        // stack+140

// Stack frame layout (entry a1, 120):
//  0: to_add (for div by power of 2)
//  4: input2_per_ch (saved)
//  8: output base (saved)
// 12: channels
// 16: input1 base (saved)
// 20: right_shift
// 24: input1_offset (saved)
// 28: input2_offset (saved)
// 32: spatial counter
// 36: out_ptr (current)
// 40: out_offset (from a7)
// 44: input1_offset (for vldbc)
// 48: input2_offset (for vldbc)

esp_nn_mul_broadcast_channel_s8_esp32s3:
    entry   a1, 120

    // Save args (windowed ABI: a2..a7 carry the first six arguments,
    // the rest are read from the caller frame at a1+120 onwards)
    s32i.n  a3, a1, 4               // input2_per_ch base
    s32i.n  a6, a1, 8               // output base
    s32i.n  a2, a1, 16              // input1 base
    s32i.n  a4, a1, 24              // input1_offset
    s32i.n  a5, a1, 28              // input2_offset
    s32i    a7, a1, 40              // out_offset

    l32i    a8, a1, 136             // total_spatial
    l32i    a9, a1, 140             // channels
    s32i.n  a9, a1, 12              // save channels

    blti    a8, 1, .Lexit           // no spatial positions
    blti    a9, 1, .Lexit           // no channels

    // Prepare shift values: a positive output_shift becomes a pre-multiply
    // left shift, a negative one becomes a rounding right shift applied
    // after the doubling-high multiply.
    l32i    a15, a1, 124            // output_shift
    movi.n  a11, 0
    max     a14, a15, a11           // left_shift = max(shift, 0)
    sub     a4, a14, a15            // right_shift = left_shift - shift
    s32i.n  a4, a1, 20              // save right_shift

    // Long-lived registers from here on:
    //   a13 = output_mult, a4 = nudge (1 << 30), a14 = left_shift
    l32i    a13, a1, 120            // output_mult
    l32r    a4, .LC_nudge           // nudge = 1 << 30

    // Store offsets for vldbc
    l32i    a8, a1, 136             // reload total_spatial
    s32i    a5, a1, 48              // input2_offset for vldbc
    l32i.n  a5, a1, 24              // input1_offset
    s32i    a5, a1, 44              // input1_offset for vldbc

    // Init spatial counter
    movi.n  a10, 0
    s32i    a10, a1, 32             // spatial counter = 0

    // Pointers: a2 = input1 (current), a3 = input2_per_ch (reloaded each row),
    //           a6 = output (current)

.Lspatial_loop:
    l32i    a8, a1, 136             // total_spatial
    l32i    a10, a1, 32             // spatial counter
    bge     a10, a8, .Lexit

    // Reset input2 pointer for each spatial position
    l32i.n  a3, a1, 4               // input2_per_ch base

    // Channel counter
    l32i.n  a9, a1, 12              // channels
    movi.n  a11, 0                  // channel index

    blti    a9, 8, .Lchannel_leftover

    // Check alignment for SIMD path: low 4 bits of the OR of all three
    // pointers must be zero (16-byte aligned), else take the scalar path.
    or      a8, a2, a3
    or      a8, a8, a6
    extui   a8, a8, 0, 4
    bnez    a8, .Lchannel_leftover

    // Setup SIMD constants
    ee.zero.q   q1                  // zero register
    addi    a8, a1, 44
    ee.vldbc.16 q0, a8              // input1_offset broadcast
    addi    a8, a1, 48
    ee.vldbc.16 q7, a8              // input2_offset broadcast
    st.qr   q0, a1, 64             // save for reload in loop (stack slot 64;
                                   // q0 itself is reused as a result register
                                   // inside the SIMD body)

.Lchannel_simd_loop:
    addi    a8, a9, -7              // channels - 7
    blt     a11, a8, .Lchannel_simd_body
    j       .Lchannel_leftover

.Lchannel_simd_body:
    // Widen 8 s8 lanes of each input to s16, add the input offsets, and
    // form the 32-bit products from two 16-bit multiplies (sar=16 picks the
    // high halves, sar=0 the low halves; vzip.16 later merges them).
    ld.qr           q4, a1, 64             // input1_offset
    ee.vld.l.64.ip  q2, a2, 8              // load 8 input1 values
    movi.n          a7, 16
    ee.vld.h.64.ip  q2, a3, 8              // load 8 input2 values (per-ch)
    wsr.sar         a7
    ee.vcmp.lt.s8   q5, q2, q1             // sign extend
    ee.vzip.8       q2, q5                 // interleave to 16-bit
    ee.vadds.s16    q5, q5, q7             // add input2_offset
    ee.vadds.s16    q4, q2, q4             // add input1_offset
    ee.vmul.s16     q3, q4, q5             // multiply (high part)
    ssai            0                      // sar = 0
    ee.vmul.s16     q2, q4, q5             // multiply (low part)

    // Requantize 8 results (same pattern as elementwise mul)
    wsr.sar         a14                     // left_shift
    ee.vzip.16      q2, q3
    ee.vsl.32       q6, q2                  // left shift first 4

    ssai            31

    // Per element: round-doubling-high-mul with output_mult, i.e.
    // (int64)(val * mult + (1 << 30)) >> 31.  mulsh/mull form the 64-bit
    // product, saltu recovers the carry of adding the nudge to the low
    // word, and src with sar=31 extracts bits 62..31.
    // Element 2 of q6
    ee.movi.32.a    q6, a8, 2
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a5, a8, a4
    add.n           a5, a5, a7
    src             a5, a5, a8
    // Element 3
    ee.movi.32.a    q6, a8, 3
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a12, a8, a4
    add.n           a12, a12, a7
    src             a12, a12, a8
    ee.movi.32.q    q2, a5, 2
    ee.movi.32.q    q2, a12, 3
    // Element 1
    ee.movi.32.a    q6, a8, 1
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a5, a8, a4
    add.n           a5, a5, a7
    src             a5, a5, a8
    // Element 0
    ee.movi.32.a    q6, a8, 0
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a12, a8, a4
    add.n           a12, a12, a7
    src             a12, a12, a8
    ee.movi.32.q    q2, a5, 1
    ee.movi.32.q    q2, a12, 0

    // Second group of 4 (q3); results collected into q0
    wsr.sar         a14                     // left_shift
    ee.vsl.32       q4, q3

    ssai            31

    ee.movi.32.a    q4, a8, 2
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a5, a8, a4
    add.n           a5, a5, a7
    src             a5, a5, a8
    ee.movi.32.a    q4, a8, 3
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a12, a8, a4
    add.n           a12, a12, a7
    src             a12, a12, a8
    ee.movi.32.q    q0, a5, 2
    ee.movi.32.q    q0, a12, 3
    ee.movi.32.a    q4, a8, 1
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a5, a8, a4
    add.n           a5, a5, a7
    src             a5, a5, a8
    ee.movi.32.a    q4, a8, 0
    mulsh           a7, a13, a8
    mull            a8, a13, a8
    add.n           a8, a4, a8
    saltu           a12, a8, a4
    add.n           a12, a12, a7
    src             a12, a12, a8
    ee.movi.32.q    q0, a5, 1
    ee.movi.32.q    q0, a12, 0

    // Divide by power of 2 (right_shift), rounding half away from zero:
    // add (1 << (right_shift - 1)) and subtract 1 for negative lanes
    // (the vcmp.lt masks are -1 for negative elements), then shift.
    l32i.n          a5, a1, 20              // right_shift
    movi.n          a7, 1

    blti            a5, 1, .Lskip_div

    ee.vcmp.lt.s32  q5, q2, q1
    ee.vcmp.lt.s32  q6, q0, q1
    addi.n          a8, a5, -1
    ssl             a8
    sll             a7, a7                  // to_add = 1 << (right_shift - 1)
    s32i.n          a7, a1, 0
    ee.vldbc.32     q4, a1                  // broadcast to_add
    wsr.sar         a5
    ee.vadds.s32    q5, q4, q5
    ee.vadds.s32    q5, q2, q5
    ee.vsr.32       q2, q5
    wsr.sar         a5
    ee.vadds.s32    q5, q4, q6
    ee.vadds.s32    q5, q0, q5
    ee.vsr.32       q0, q5

.Lskip_div:
    // Add output offset, apply activation
    addi            a8, a1, 132
    ee.vldbc.32     q4, a8                  // activation_max
    addi            a5, a1, 40
    ee.vldbc.32     q6, a5                  // output_offset
    addi            a7, a1, 128
    ee.vadds.s32    q0, q0, q6              // add offset
    ee.vadds.s32    q2, q2, q6
    ee.vldbc.32     q6, a7                  // activation_min
    ee.vmin.s32     q0, q0, q4
    ee.vmin.s32     q2, q2, q4
    ee.vmax.s32     q0, q0, q6
    ee.vmax.s32     q2, q2, q6

    // Pack 32-bit -> 8-bit and store
    ee.vunzip.16    q2, q0
    ee.vunzip.8     q2, q0
    ee.vst.l.64.ip  q2, a6, 8

    addi            a11, a11, 8             // channel index += 8
    j               .Lchannel_simd_loop

.Lchannel_leftover:
    // Process remaining channels one by one (also the full fallback path
    // when channels < 8 or the pointers are not 16-byte aligned)
    l32i.n  a9, a1, 12              // channels
    bge     a11, a9, .Lspatial_next

    ssl     a14                     // left_shift
    l32i.n  a8, a1, 24              // input1_offset
    l8ui    a10, a2, 0              // *input1
    sext    a10, a10, 7
    add.n   a10, a10, a8            // + input1_offset
    l32i.n  a8, a1, 28              // input2_offset
    l8ui    a12, a3, 0              // *input2_per_ch
    sext    a12, a12, 7
    add.n   a12, a12, a8            // + input2_offset
    mull    a10, a10, a12           // multiply

    // Requantize: same (val * mult + (1 << 30)) >> 31 sequence as the
    // SIMD path, done in scalar registers
    sll     a10, a10                // left shift

    l32i.n  a9, a1, 20              // right_shift
    mulsh   a8, a10, a13
    mull    a12, a10, a13
    ssai    31
    add.n   a12, a4, a12
    saltu   a10, a12, a4
    add.n   a10, a10, a8
    src     a10, a10, a12           // result

    blti    a9, 1, .Lskip_div_scalar

    addi    a8, a9, -1
    ssl     a8
    movi    a7, 1
    sll     a7, a7                  // to_add
    extui   a8, a10, 31, 1          // sign bit (1 if neg, 0 if pos)
    sub     a10, a10, a8            // val -= sign (fast rounding)
    add     a10, a10, a7
    ssr     a9
    sra     a10, a10

.Lskip_div_scalar:
    l32i    a8, a1, 40              // output_offset
    l32i    a7, a1, 128             // activation_min
    l32i    a12, a1, 132            // activation_max
    add.n   a10, a10, a8
    min     a10, a10, a12
    max     a10, a10, a7
    s8i     a10, a6, 0              // store

    addi    a2, a2, 1               // input1++
    addi    a3, a3, 1               // input2++
    addi    a6, a6, 1               // output++
    addi    a11, a11, 1             // channel index++
    j       .Lchannel_leftover

.Lspatial_next:
    l32i    a10, a1, 32             // spatial counter
    addi    a10, a10, 1
    s32i    a10, a1, 32
    j       .Lspatial_loop

.Lexit:
    retw.n

    .size   esp_nn_mul_broadcast_channel_s8_esp32s3, . - esp_nn_mul_broadcast_channel_s8_esp32s3


================================================
FILE: src/basic_math/esp_nn_mul_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include <stdint.h>
#include <common_functions.h>

/**
 * Elementwise multiply for s8 optimized for ESP32-P4.
 * Uses inlined fast requantization with 4x unrolled loop.
 * Interleaves independent computations to hide latency.
 */
/* Requantize one 32-bit product: left-shift, round-doubling-high-multiply
 * by `mult` with a fixed nudge of (1 << 30), then round-half-away-from-zero
 * divide by 2^right_shift.  This matches the fixed-nudge sequence used by
 * the assembly kernels in this directory.  The shift is performed on an
 * unsigned copy because left-shifting a signed int that overflows is
 * undefined behavior in C (the value is identical on two's complement). */
static inline int32_t mul_requantize_one(int32_t prod, int32_t mult,
                                         int32_t left_shift, int32_t right_shift)
{
    int32_t shifted = (int32_t) ((uint32_t) prod << left_shift);
    int32_t res = (int32_t) (((int64_t) shifted * mult + ((int64_t) 1 << 30)) >> 31);
    if (right_shift > 0) {
        res = (res + (1 << (right_shift - 1)) - (res < 0)) >> right_shift;
    }
    return res;
}

/* Clamp to the activation range, upper bound first then lower bound —
 * the same order as max(activation_min, min(val, activation_max)). */
static inline int32_t mul_clamp_one(int32_t val, int32_t lo, int32_t hi)
{
    int32_t res = val < hi ? val : hi;
    return res > lo ? res : lo;
}

/**
 * Elementwise multiply for s8 optimized for ESP32-P4.
 *
 * For every i:
 *   out[i] = clamp(requantize((in1[i] + input1_offset) *
 *                             (in2[i] + input2_offset),
 *                             out_mult, out_shift) + out_offset,
 *                  activation_min, activation_max)
 *
 * The main loop is unrolled 4x with four independent computations so the
 * compiler can interleave them and hide multiply latency.  The scalar tail
 * uses the same inlined requantization as the unrolled body so both paths
 * round identically (the previous tail called esp_nn_requantize, which
 * differs only in an unreachable INT32_MIN saturation corner case).
 */
void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,
                                        const int8_t *input2_data,
                                        const int32_t input1_offset,
                                        const int32_t input2_offset,
                                        int8_t *output,
                                        const int32_t out_offset,
                                        const int32_t out_mult,
                                        const int32_t out_shift,
                                        const int32_t activation_min,
                                        const int32_t activation_max,
                                        const int32_t size)
{
    /* Positive out_shift is applied before the multiply, negative after. */
    const int32_t left_shift = out_shift > 0 ? out_shift : 0;
    const int32_t right_shift = left_shift - out_shift;

    int i = 0;
    /* 4x unrolled main loop: lanes are fully independent. */
    for (; i <= size - 4; i += 4) {
        int32_t prod0 = (input1_data[i + 0] + input1_offset) * (input2_data[i + 0] + input2_offset);
        int32_t prod1 = (input1_data[i + 1] + input1_offset) * (input2_data[i + 1] + input2_offset);
        int32_t prod2 = (input1_data[i + 2] + input1_offset) * (input2_data[i + 2] + input2_offset);
        int32_t prod3 = (input1_data[i + 3] + input1_offset) * (input2_data[i + 3] + input2_offset);

        int32_t r0 = mul_requantize_one(prod0, out_mult, left_shift, right_shift);
        int32_t r1 = mul_requantize_one(prod1, out_mult, left_shift, right_shift);
        int32_t r2 = mul_requantize_one(prod2, out_mult, left_shift, right_shift);
        int32_t r3 = mul_requantize_one(prod3, out_mult, left_shift, right_shift);

        output[i + 0] = (int8_t) mul_clamp_one(r0 + out_offset, activation_min, activation_max);
        output[i + 1] = (int8_t) mul_clamp_one(r1 + out_offset, activation_min, activation_max);
        output[i + 2] = (int8_t) mul_clamp_one(r2 + out_offset, activation_min, activation_max);
        output[i + 3] = (int8_t) mul_clamp_one(r3 + out_offset, activation_min, activation_max);
    }

    /* Scalar tail for the remaining 0..3 elements. */
    for (; i < size; i++) {
        int32_t prod = (input1_data[i] + input1_offset) * (input2_data[i] + input2_offset);
        int32_t res = mul_requantize_one(prod, out_mult, left_shift, right_shift);
        output[i] = (int8_t) mul_clamp_one(res + out_offset, activation_min, activation_max);
    }
}


================================================
FILE: src/basic_math/esp_nn_mul_s8_esp32s3.S
================================================
// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

    .text
    .align  4
    .literal_position
    .literal    .LC0_26_123, 1073741824 // `1 << 30`

    # Program Unit: esp_nn_mul_elementwise_s8_esp32s3
    .type   esp_nn_mul_elementwise_s8_esp32s3, @function
    .align   4
    .global esp_nn_mul_elementwise_s8_esp32s3

esp_nn_mul_elementwise_s8_esp32s3:  # 0x4
    # to_add = 0
    # gra_spill_temp_0 = 4
    # gra_spill_temp_1 = 8
    # gra_spill_temp_2 = 12
    # gra_spill_temp_3 = 16
    # gra_spill_temp_4 = 20
    # gra_spill_temp_5 = 24
    # gra_spill_temp_6 = 28
    # gra_spill_temp_7 = 32
    # gra_spill_temp_8 = 36
    # gra_spill_temp_<> = 40
    # gra_spill_temp_<> = 44
    # gra_spill_temp_<> = 48
    # gra_spill_temp_13 = 64

 // registers:
 // a2: const int8_t *input1_data
 // a3: const int8_t *input2_data
 // a4: const int32_t input1_offset
 // a5: const int32_t input2_offset
 // a6: int8_t *output
 // a7: const int32_t out_offset

 // on stack:
 // 120: const int32_t out_mult
 // 124: const int32_t out_shift
 // 128: const int32_t activation_min
 // 132: const int32_t activation_max
 // 136: const int32_t size

    entry   a1,120                      #
    s32i.n  a4,a1,24                # [0]  gra_spill_temp_5, input1_offset
    s32i.n  a5,a1,28                # [1]  gra_spill_temp_12, input2_offset

    s32i.n  a3,a1,4                     # [5]  gra_spill_temp_0, input2
    mov.n   a10,a3                      # [6]
    l32i    a3,a1,136                   # [18]  id:361 size+0x0
    mov.n   a9,a6                       # [2] // out_addr
    blti    a3,1,.exit            # [0] // exit

    s32i.n  a2,a1,16                    # [9]  gra_spill_temp_3, input1
    s32i    a7,a1,40                    # [4]  id:358 out_offset+0x0
    movi.n  a11,0                       # [3]
    mov.n   a12,a2                      # [10]
    s32i    a4,a1,44                # [13]  id:356 input1_offset+0x0
    s32i    a5,a1,48                # [14]  id:357 input2_offset+0x0
    movi.n  a2,1                    # [15]

    l32i    a15,a1,124                  # [3]  id:362 out_shift+0x0
    l32i    a13,a1,120                  # [4]  id:363 out_mult+0x0
    s32i.n  a6,a1,8                 # [1]  gra_spill_temp_1, out_addr
    max     a14,a15,a11                 # [11] left_shift
    sub     a4,a14,a15              # right_shift
    s32i.n  a4,a1,20                # [9]  gra_spill_temp_4

    blti    a3,8,.process_leftover             # [20]

    // skip to leftover routine if inputs are unaligned
    or          a6,a12,a10
    extui       a6,a6,0,4
    bnez        a6,.process_leftover

    // `size > 8`, s3 optimisation path...
    ee.zero.q   q1                      # [0]
    addi    a4,a1,44                # [7]
    addi    a8,a1,48                    # [8]
    ee.vldbc.16 q0,a4               # [17]  id:359 input1_offset
    ee.vldbc.16 q7,a8               # [16]  id:360 input2_offset
    l32r    a4,.LC0_26_123              # [12]
    movi    a8, 8
    st.qr   q0,a1,64                    # [19]  gra_spill_temp_13
    s32i.n  a8,a1,12                # [6]  gra_spill_temp_2

.Lt_0_7682: # 0x60
    s32i            a9,a1,36                    # [1]  gra_spill_temp_8, out_addr
    ld.qr           q4,a1,64                    # [2]  gra_spill_temp_13, input1_offset
    ee.vld.l.64.ip  q2,a12,8        # [4]  id:367, input1_ptr
    movi.n          a7,16                   # [3]
    ee.vld.h.64.ip  q2,a10,8        # [5]  id:368, input2_ptr
    wsr.sar         a7                      # [6]
    ee.vcmp.lt.s8   q5,q2,q1            # [7]
    ee.vzip.8       q2,q5               # [8]
    ee.vadds.s16    q5,q5,q7            # [9] input2_offset
    ee.vadds.s16    q4,q2,q4            # [10] input1_offset
    ee.vmul.s16     q3,q4,q5            # [11]
    wsr.sar         a11                         # [12]
    ee.vmul.s16     q2,q4,q5            # [13]

    wsr.sar         a14                     # [14] left_shift
    ee.vzip.16      q2,q3               # [15]
    ee.vsl.32       q6,q2                   # [16] left_shift
    ssai            31                          # [17]

    ee.movi.32.a    q6,a3,2             # [18]
    ee.movi.32.a    q6,a8,3             # [26]

    mulsh           a6,a13,a3                   # [19]
    mull            a3,a13,a3                   # [20]
    mulsh           a7,a13,a8                   # [27]
    add.n           a3,a4,a3                    # [22]
    saltu           a2,a3,a4                    # [23]
    add.n           a2,a2,a6                    # [24]
    src             a2,a2,a3                    # [25]

    mull            a6,a13,a8                   # [28]
    add.n           a6,a4,a6                    # [30]
    saltu           a9,a6,a4                    # [31]
    add.n           a9,a9,a7                    # [32]
    src             a9,a9,a6                    # [33]
    ee.movi.32.q    q2,a2,2             # [53]
    ee.movi.32.q    q2,a9,3             # [54]

    ee.movi.32.a    q6,a6,1             # [34]
    mulsh           a7,a13,a6                   # [35]
    mull            a6,a13,a6                   # [36]
    add.n           a6,a4,a6                    # [38]
    saltu           a3,a6,a4                    # [39]
    add.n           a3,a3,a7                    # [16]
    src             a3,a3,a6                    # [41]
    ee.movi.32.a    q6,a2,0             # [42]
    mulsh           a8,a13,a2                   # [43]
    mull            a7,a13,a2                   # [4]
    add.n           a7,a4,a7                    # [46]
    saltu           a6,a7,a4                    # [47]
    add.n           a6,a6,a8                    # [24]
    src             a6,a6,a7                    # [49]
    ee.movi.32.q    q2,a3,1             # [28]
    ee.movi.32.q    q2,a6,0             # [50]

    wsr.sar         a14                     # [10]
    ee.vsl.32       q4,q3                   # [11]
    ee.movi.32.a    q4,a2,2             # [13]
    mulsh           a3,a13,a2                   # [14]
    mull            a2,a13,a2                   # [15]
    ssai            31                          # [12]
    add.n           a2,a4,a2                    # [17]
    saltu           a5,a2,a4                # [18]
    add.n           a5,a5,a3                # [19]
    src             a5,a5,a2                    # [20]
    ee.movi.32.a    q4,a3,3             # [21]
    mulsh           a6,a13,a3                   # [22]
    mull            a3,a13,a3                   # [23]
    add.n           a3,a4,a3                    # [25]
    saltu           a8,a3,a4                    # [26]
    add.n           a8,a8,a6                    # [27]
    src             a8,a8,a3                    # [28]
    ee.movi.32.q    q0,a5,2             # [24]
    ee.movi.32.q    q0,a8,3             # [51]

    ee.movi.32.a    q4,a7,1             # [29]
    mulsh           a6,a13,a7                   # [30]
    mull            a3,a13,a7                   # [31]
    add.n           a3,a4,a3                    # [33]
    saltu           a2,a3,a4                    # [34]
    add.n           a2,a2,a6                    # [35]
    src             a2,a2,a3                    # [36]
    ee.movi.32.a    q4,a6,0             # [37]
    mulsh           a7,a13,a6                   # [38]
    mull            a6,a13,a6                   # [39]
    add.n           a6,a4,a6                    # [41]
    saltu           a3,a6,a4                    # [42]
    add.n           a3,a3,a7                    # [43]
    src             a3,a3,a6                    # [4]
    ee.movi.32.q    q0,a2,1             # [47]
    ee.movi.32.q    q0,a3,0             # [46]

    l32i.n          a5,a1,20                # [0]  gra_spill_temp_4, right_shift
    movi.n          a7,1                    # [51]

    blti            a5,1,.skip_div_by_pow_of_2
// divide by power of 2
    ee.vcmp.lt.s32  q5,q2,q1        # [56]
    ee.vcmp.lt.s32  q6,q0,q1        # [28]

    addi.n          a8,a5,-1                # [1]
    ssl             a8                          # [2]
    sll             a7,a7                       # [3]
    s32i.n          a7,a1,0                 # [4]  to_add
    ee.vldbc.32     q4,a1               # [5]  id:376 to_add

    wsr.sar         a5                      # [6]
    ee.vadds.s32    q5,q4,q5            # [7]
    ee.vadds.s32    q5,q2,q5            # [8]
    ee.vsr.32       q2,q5                   # [9]

    wsr.sar         a5                      # [5]
    ee.vadds.s32    q5,q4,q6            # [9]
    ee.vadds.s32    q5,q0,q5            # [11]
    ee.vsr.32       q0,q5                   # [12]
.skip_div_by_pow_of_2:

// add offset, apply activation
    addi            a8,a1,132                   # [54]
    ee.vldbc.32     q4,a8               # [55]  id:385 activation_max
    addi            a5,a1,40                    # [8]
    ee.vldbc.32     q6,a5               # [10]  id:384 out_offset
    addi            a7,a1,128                   # [4]
    ee.vadds.s32    q0,q0,q6            # [13] // add out_offset
    ee.vadds.s32    q2,q2,q6            # [14] // add out_offset
    ee.vldbc.32     q6,a7               # [16]  id:386 activation_min
    ee.vmin.s32     q0,q0,q4            # [17]
    ee.vmin.s32     q2,q2,q4            # [15]
    ee.vmax.s32     q0,q0,q6            # [18]
    ee.vmax.s32     q2,q2,q6            # [19]

// pack and store
    ee.vunzip.16    q2,q0               # [20]
    ee.vunzip.8     q2,q0               # [21]
    l32i.n          a7,a1,12 // count
    l32i            a9,a1,36                    # [55]  gra_spill_temp_8
    l32i.n          a3,a1,136               # [1] , size
    ee.vst.l.64.ip  q2,a9,8         # [22]  id:387
    addi            a7,a7,8
    s32i.n          a7,a1,12 // increment count
    bge             a3,a7,.Lt_0_7682

    addi            a11,a7,-8
    bge             a11,a3,.exit  # [3] // exit

.process_leftover:
    sub     a8,a3,a11                   # [1]
    loopgtz a8,.LBB33_esp_nn_mul_elementwise_s8_esp32s3     # [9]

    ssl     a14                         # [0] left_shift
    l32i.n  a8,a1,24                # [1]  gra_spill_temp_5, input1_offset
    l32i.n  a10,a1,4                # [2]  gra_spill_temp_0, input2
    l32i.n  a12,a1,16               # [3]  gra_spill_temp_3, input1
    add.n   a10,a11,a10                 # [4], input2
    add.n   a12,a11,a12                 # [5], input1
    l8ui    a12,a12,0                   # [6]  id:390
    l8ui    a10,a10,0                   # [7]  id:391
    sext    a12,a12,7                   # [8]
    add.n   a12,a12,a8                  # [9]
    l32i.n  a8,a1,28                # [10]  gra_spill_temp_12, input2_offset
    sext    a10,a10,7                   # [11]
    add.n   a10,a10,a8                  # [12]
    mull    a10,a12,a10                 # [13] // multiplication result

// multiply by quantised mult
    l32i.n  a9,a1,20                # [0]  gra_spill_temp_4, load right_shift

    sll     a10,a10                     # [15] // left shift

    mulsh   a3,a10,a13                  # [1]
    mull    a8,a10,a13                  # [6]
    ssai    31                          # [0]
    add.n   a6,a8,a4                    # [8]
    saltu   a8,a6,a8                    # [9]
    add.n   a8,a8,a3                    # [10]
    src     a3,a8,a6                    # [19] // result

    blti    a9, 1, .skip_div_by_pow_of_2_remains
// divide by power of 2
    // calculate to_add = `1 << (exponent - 1)`
    addi    a6,a9,-1
    ssl     a6                          # [23]
    movi    a7,1
    sll     a7,a7                       // to_add

    extui   a8,a3,31,1                  # [24], sign
    add     a3,a3,a8            // add sign
    add     a3,a3,a7            // add to_add

    ssr     a9                          # [20] load right_shift
    sra     a3,a3               // right shift
.skip_div_by_pow_of_2_remains:

    l32i.n  a6,a1,40                    # [32], out_offset
    l32i.n  a8,a1,132                   # [35], act_max
    l32i.n  a7,a1,128                   # [36], act_min

// add offset and apply activation
    add.n   a3,a3,a6                    # [34], offset added
    min     a8,a8,a3                    # [37]
    l32i.n  a3,a1,8                 # [38]  gra_spill_temp_1, load base out_addr
    max     a8,a8,a7                    # [39]

// store
    add.n   a3,a11,a3                   # [16], add index from `a11`
    s8i     a8,a3,0                     # [41]  id:392 // store
    addi.n  a11,a11,1               # [42]  // inc index

.LBB33_esp_nn_mul_elementwise_s8_esp32s3:   # 0x2ed
.exit:
    retw.n                          # [0]

    .size   esp_nn_mul_elementwise_s8_esp32s3, . - esp_nn_mul_elementwise_s8_esp32s3


================================================
FILE: src/common/common_functions.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#pragma once

#include <stdint.h>
#include <stdbool.h>
#include <string.h>

/**
 * c99 standard still doesn't strictly inline functions
 * We need to use attribute as well to do this.
 * (`__attribute__` is the documented GCC/Clang spelling.)
 */
#define __NN_FORCE_INLINE__ __attribute__((always_inline)) static inline

/* min/max macros (GNU statement-expression form: each argument is
 * evaluated exactly once).
 * Guarded individually: a platform header may provide only one of the
 * two, and a shared guard would then either skip `min` entirely or
 * redefine it. */
#ifndef max
#define max(a, b) ({            \
    __typeof__ (a) _a = (a);    \
    __typeof__ (b) _b = (b);    \
    _a > _b ? _a : _b;          \
})
#endif

#ifndef min
#define min(a, b) ({            \
    __typeof__ (a) _a = (a);    \
    __typeof__ (b) _b = (b);    \
    _a < _b ? _a : _b;          \
})
#endif

/**
 * Count leading zeros of a 32-bit value.
 *
 * Returns 32 for an input of 0 on every path: Xtensa NSAU and the
 * portable fallback already do so, and the GCC builtin path is guarded
 * because __builtin_clz(0) is undefined behavior.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    /* NSAU (normalization shift amount, unsigned) returns 32 for 0. */
    __asm__ volatile("nsau %0, %0" : "+r" (in));
    return in;
#elif defined(__GNUC__)
    /* Guard the zero case: __builtin_clz(0) is undefined. */
    return (in != 0) ? __builtin_clz(in) : 32;
#else
    /* Portable binary search: halve the window until x fits in 2 bits. */
    int32_t count = 32;
    uint32_t x = in, y = in >> 16;
    if (y != 0) {
        count -= 16;
        x = y;
    }
    y = x >> 8;
    if (y != 0) {
        count -= 8;
        x = y;
    }
    y = x >> 4;
    if (y != 0) {
        count -= 4;
        x = y;
    }
    y = x >> 2;
    if (y != 0) {
        count -= 2;
        x = y;
    }
    /* x is now in [0, 3]. */
    y = x >> 1;
    if (y != 0) {
        return count - 2;
    }
    return count - x;   /* x is 0 or 1; returns 32 for in == 0 */
#endif
}

/**
 * Saturate a 32-bit signed value into the int8 range [-128, 127],
 * keeping the clamped result in a 32-bit variable.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    /* Xtensa CLAMPS clamps to [-2^7, 2^7 - 1] in one instruction. */
    __asm__ volatile("clamps %0, %0, 7" : "+a"(in));
    return in;
#else
    return min(INT8_MAX, max(in, INT8_MIN));
#endif
}

/**
 * Take the high word (bits 31..62) of a 64-bit value, with the
 * arithmetic shift biased so negative inputs round toward zero
 * instead of toward negative infinity.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64)
{
    /* All-ones when val64 is negative, zero otherwise. */
    const int32_t sign_mask = (int32_t) (val64 >> 63);
    /* Bias negative inputs by 2^31 - 1 before the shift. */
    const int32_t bias = sign_mask & INT32_MAX;
    return (int32_t) ((int64_t) (val64 + bias) >> 31);
}

/**
 * Saturating rounding doubling high multiply: the high 32 bits of
 * 2 * in0 * in1 with rounding, saturating the single overflowing
 * combination INT32_MIN * INT32_MIN to INT32_MAX.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1)
{
    /* Only INT32_MIN * INT32_MIN escapes the representable range. */
    const bool overflow = (in0 == in1) && (in0 == (int32_t) INT32_MIN);

    /* Rounding nudge: +2^30 for a non-negative product, 1 - 2^30 for a
     * negative one. */
    const int64_t nudge = ((in0 < 0) ^ (in1 < 0)) ? (int64_t) 1 - (1 << 30)
                                                  : (int64_t) (1 << 30);

    /* Widening multiply, then add the nudge. */
    const int64_t prod = (int64_t) in0 * in1 + nudge;

    /* Keep bits [62:31] (equivalent to doubling, then the high word). */
    const int32_t high = esp_nn_pick_sat_high32_of64(prod);

    return overflow ? INT32_MAX : high;
}

/**
 * Fast rounding divide by 2^exponent (round half away from zero).
 * Not exact for values within `1 << (exponent - 1)` of INT32_MAX or
 * INT32_MIN, where the bias addition can overflow. Acceptable here
 * because this runs at the very last filter stage and the final output
 * is only 8 bits wide.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent)
{
    /* Bias by 2^(exponent-1); one less for negatives so the arithmetic
     * shift rounds halves away from zero. */
    int32_t bias = 1 << (exponent - 1);
    if (val < 0) {
        bias -= 1;
    }
    return (int32_t) ((val + bias) >> exponent);
}

/**
 * Bit-exact rounding divide by 2^exponent (round half away from zero),
 * matching the TFLite reference behavior for all int32 inputs.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent)
{
    const int32_t mask = (1 << exponent) - 1;
    int32_t quotient = val >> exponent;       /* floor division */
    const int32_t remainder = val & mask;     /* non-negative by construction */

    /* Round up when the remainder is past the halfway point; for a
     * negative quotient the tie also rounds up (away from zero). */
    const int32_t threshold = (mask >> 1) + (quotient < 0);
    if (remainder > threshold) {
        quotient += 1;
    }
    return quotient;
}

/**
 * Requantize x by the (mult, shift) pair, bit-exact with the TFLite
 * reference: a positive shift is applied before the doubling high
 * multiply, a negative shift becomes a rounding right shift afterwards.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift)
{
    int32_t left_shift = 0;
    int32_t right_shift = 0;
    if (shift > 0) {
        left_shift = shift;
    } else {
        right_shift = -shift;
    }
    const int32_t scaled = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult);
    return esp_nn_div_by_power_of_two(scaled, right_shift);
}

#if CONFIG_IDF_TARGET_ESP32P4
/** PIE enable macro - call once before using any esp.* instructions.
 *  Sets a bit in CSR 0x7f2 and writes 0b10 to the vector config register
 *  via esp.movx.w.cfg. Clobbers x29. */
#define ESP_NN_PIE_ENABLE() do { \
    asm volatile ( \
        "csrsi  0x7f2, 0b01        \n\t" \
        "li     x29, 0b10          \n\t" \
        "esp.movx.w.cfg x29        \n\t" \
        ::: "x29" \
    ); \
} while(0)

/** Extract 16 int32 per-lane results from QACC into array.
 *  `dst` must point to 64 writable bytes (16 x int32): the four 128-bit
 *  stores walk QACC low/high halves in order, advancing x30 by 16 bytes
 *  each time. Clobbers x30. */
#define ESP_NN_QACC_EXTRACT_S32(dst) do { \
    asm volatile ( \
        "mv                      x30, %0     \n\t" \
        "esp.st.qacc.l.l.128.ip  x30, 16     \n\t" \
        "esp.st.qacc.l.h.128.ip  x30, 16     \n\t" \
        "esp.st.qacc.h.l.128.ip  x30, 16     \n\t" \
        "esp.st.qacc.h.h.128.ip  x30, 0      \n\t" \
        :: "r"(dst) \
        : "x30", "memory" \
    ); \
} while(0)
#endif /* CONFIG_IDF_TARGET_ESP32P4 - PIE_ENABLE and QACC_EXTRACT */

/**
 * 2-wide interleaved requant macro for ESP32-P4 RISC-V.
 * Interleaves mulh across two independent elements for pipeline fill.
 * Outputs r0, r1 as requantized int32 values (before offset/clamp).
 *
 * Per element: left-shift when s > 0, 64-bit multiply by the quantized
 * multiplier (mulh/mul pair) with a +2^30 rounding nudge whose carry is
 * propagated manually from the low word, extract bits [62:31], then a
 * rounding right shift by the remaining (negative-shift) amount.
 *
 * NOTE(review): unlike the bit-exact reference path, there is no
 * saturation of the INT32_MIN * INT32_MIN case and the nudge is not
 * sign-dependent — confirm callers tolerate the fast-path rounding.
 */
#if CONFIG_IDF_TARGET_ESP32P4
#define ESP_NN_REQUANT_2X(x0, x1, m0, m1, s0, s1, r0, r1) do { \
    int32_t _ls0 = (s0) > 0 ? (s0) : 0; \
    int32_t _ls1 = (s1) > 0 ? (s1) : 0; \
    int32_t _v0 = (x0) << _ls0; \
    int32_t _v1 = (x1) << _ls1; \
    int32_t _rs0 = _ls0 - (s0); \
    int32_t _rs1 = _ls1 - (s1); \
    int32_t _hi0, _lo0, _hi1, _lo1; \
    asm volatile ( \
        "mulh  %[h0], %[v0], %[mm0]  \n\t" \
        "mulh  %[h1], %[v1], %[mm1]  \n\t" \
        "mul   %[l0], %[v0], %[mm0]  \n\t" \
        "mul   %[l1], %[v1], %[mm1]  \n\t" \
        : [h0] "=&r"(_hi0), [h1] "=&r"(_hi1), \
          [l0] "=&r"(_lo0), [l1] "=&r"(_lo1) \
        : [v0] "r"(_v0), [v1] "r"(_v1), \
          [mm0] "r"((int32_t)(m0)), [mm1] "r"((int32_t)(m1)) \
    ); \
    /* Add nudge (1<<30) and extract bits [31:62] */ \
    uint32_t _n = 0x40000000u; \
    uint32_t _a0 = (uint32_t)_lo0 + _n; \
    _hi0 += (_a0 < (uint32_t)_lo0); \
    (r0) = (_hi0 << 1) | (_a0 >> 31); \
    uint32_t _a1 = (uint32_t)_lo1 + _n; \
    _hi1 += (_a1 < (uint32_t)_lo1); \
    (r1) = (_hi1 << 1) | (_a1 >> 31); \
    /* Right shift with rounding */ \
    if (_rs0) { (r0) = ((r0) + (1 << (_rs0 - 1)) - ((r0) < 0)) >> _rs0; } \
    if (_rs1) { (r1) = ((r1) + (1 << (_rs1 - 1)) - ((r1) < 0)) >> _rs1; } \
} while(0)
#endif

/**
 * Fast requantize by the (mult, shift) pair.
 * Skips the sign-dependent nudge and the INT32_MIN saturation of the
 * reference path, so it is not bit-exact near the extremes; fine for
 * the final 8-bit output stage.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift)
{
    const int32_t left_shift = (shift > 0) ? shift : 0;
    const int32_t right_shift = left_shift - shift;

    /* Widening multiply with a constant +2^30 rounding nudge, then take
     * bits [62:31]. */
    const int64_t acc = (int64_t) (x << left_shift) * mult + (int64_t) (1 << 30);
    int32_t out = (int32_t) (acc >> 31);

    if (right_shift) {
        out = esp_nn_div_by_power_of_two_fast(out, right_shift);
    }
    return out;
}

/*
 * Unified requantize wrapper. Defining either SKIP_NUDGE (legacy) or
 * CONFIG_NN_SKIP_NUDGE (Kconfig-driven) selects the faster path, which
 * is not bit-exact with the TFLite reference near INT32 extremes;
 * otherwise the bit-exact TFLite-reference path is used.
 */
#if defined(SKIP_NUDGE) || defined(CONFIG_NN_SKIP_NUDGE)
#define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult_fast((x), (m), (s))
#else
#define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult((x), (m), (s))
#endif

/**
 * Copy an input_ht x input_wd x channels int8 image into a destination
 * buffer padded by pad_wd columns on each side and pad_ht rows at top
 * and bottom, filling the border with pad_val (truncated to a byte).
 *
 * dst must hold (input_wd + 2*pad_wd) * (input_ht + 2*pad_ht) * channels
 * bytes.
 */
static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst,
                                             const uint16_t input_wd,
                                             const uint16_t input_ht,
                                             const uint16_t channels,
                                             const int32_t pad_val,
                                             const uint16_t pad_wd,
                                             const uint16_t pad_ht)
{
    const int32_t padded_wd = input_wd + 2 * pad_wd;
    const int32_t padded_ht = input_ht + 2 * pad_ht;
    const int32_t row_bytes = input_wd * channels;

    /* Fill the whole destination with the pad value first, then
     * overwrite the interior of each row with the source data. */
    memset(dst, pad_val, padded_wd * padded_ht * channels);

    int8_t *row = dst + padded_wd * pad_ht * channels;
    for (int32_t i = 0; i < input_ht; i++) {
        memcpy(row + pad_wd * channels, src, row_bytes);
        src += row_bytes;
        row += padded_wd * channels;
    }
}

/**
 * Copy an input_ht x input_wd x channels int8 image into dst, appending
 * pad_wd columns of pad_val after each row and pad_ht full rows of
 * pad_val at the end (end-only padding, no leading border).
 *
 * dst must hold (input_wd + pad_wd) * (input_ht + pad_ht) * channels
 * bytes.
 */
static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst,
                                                 const uint16_t input_wd,
                                                 const uint16_t input_ht,
                                                 const uint16_t channels,
                                                 const int32_t pad_val,
                                                 const uint16_t pad_wd,
                                                 const uint16_t pad_ht)
{
    const int32_t row_bytes = input_wd * channels;
    const int32_t pad_bytes = pad_wd * channels;

    for (int32_t i = 0; i < input_ht; i++) {
        memcpy(dst, src, row_bytes);
        src += row_bytes;
        dst += row_bytes;
        if (pad_wd) {
            memset(dst, pad_val, pad_bytes);
            dst += pad_bytes;
        }
    }
    /* Trailing rows of padding. */
    if (pad_ht) {
        memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels);
    }
}

/**
 * @brief       convert 8 bit input data to 16 bit, adding an offset
 *
 * @param       src int8_t source data
 * @param       dst int16_t dst data
 * @param       size length of data
 * @param       offset  offset to be added to src data. Range: [-128, 127]
 *
 * Bug fix: the pair loop previously ran while `i < size`, so for an odd
 * `size` the last iteration read src[size] and wrote dst[size], one
 * element out of bounds, and the tail `if` below was dead code. The
 * bound is now `size - 1` so the tail handles the odd element.
 */
__NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst,
                                                      const int size, const int32_t offset)
{
    int i = 0;
    /* Process complete pairs only. */
    for (; i < size - 1; i += 2) {
        dst[i + 0] = src[i + 0] + offset;
        dst[i + 1] = src[i + 1] + offset;
    }
    /* Odd-size tail element. */
    if (i < size) {
        dst[i] = src[i] + offset;
    }
}

/**
 * @brief       convert 8 bit input data to 16 bit
 *
 * @param       src int8_t source data
 * @param       dst int16_t dst data
 * @param       size length of data
 *
 * Bug fix: the pair loop previously ran while `i < size`, so for an odd
 * `size` the last iteration read src[size] and wrote dst[size], one
 * element out of bounds, and the tail `if` below was dead code. The
 * bound is now `size - 1` so the tail handles the odd element.
 */
__NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size)
{
    int i = 0;
    /* Process complete pairs only. */
    for (; i < size - 1; i += 2) {
        dst[i + 0] = src[i + 0];
        dst[i + 1] = src[i + 1];
    }
    /* Odd-size tail element. */
    if (i < size) {
        dst[i] = src[i];
    }
}

#if CONFIG_IDF_TARGET_ESP32S3
/**
 * @brief       s8 dot product — both pointers 16-byte aligned.
 *              Uses ACCX accumulator with fused MAC+load.
 *
 * @param       a       input data (16-byte aligned)
 * @param       b       filter data (16-byte aligned)
 * @param       len     number of elements (must be multiple of 16, >= 16)
 * @return      int32_t dot product result
 */
extern int32_t esp_nn_dot_s8_aligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len);

/**
 * @brief       s8 dot product — input aligned, filter may be unaligned.
 *              Uses USAR+QUP pattern for filter data.
 *
 * @param       a       input data (16-byte aligned)
 * @param       b       filter data (may be unaligned)
 * @param       len_div16  number of 16-element chunks (>= 1)
 * @return      int32_t dot product result
 */
extern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16);
#endif


================================================
FILE: src/common/esp_nn_common_functions_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

	.text

	# Program Unit: esp_nn_aligned_s8_to_s16_with_offset_esp32s3
	.type	esp_nn_aligned_s8_to_s16_with_offset_esp32s3, @function
	.align	 4
	.global esp_nn_aligned_s8_to_s16_with_offset_esp32s3

esp_nn_aligned_s8_to_s16_with_offset_esp32s3:	# 0x30d

// int8 -> int16 conversion with an offset added to every element.
// Windowed-ABI arguments:
//   a2: src (int8_t *; the vector path loads 16-byte chunks, so the
//       name suggests 16-byte alignment — confirm with callers)
//   a3: dst (int16_t *)
//   a4: size (element count)
//   a5: offset, added after sign extension
// Vector path (size >= 32): sign bytes are produced by comparing each
// chunk against zero (q5) and interleaved via ee.vzip.8 to widen to
// 16 bits, then the broadcast offset (q4) is added.

	entry	a1,48                   	#
	mov.n	a10,a2                  	# // src
	mov.n	a9,a3                   	# // dst
	mov.n	a8,a4                   	# // size
	s32i.n	a5,a1,12               	# [3] // offset spilled; reloaded by scalar tails
	addi.n	a2,a1,12               	# [4] // a2 = &offset for the 16-bit broadcast

	blti	a4,32,.Lt_2_6402         	# [5] if (size < 32) goto unopt

// Preamble: convert the first 16 elements and prime q0/q1.
	addi.n	a6,a8,-1               	# [0] // a6 = size - 1 (pair-loop bound)
	ee.zero.q	q5                  	# [1] // q5 = 0, comparand for sign bytes
	ee.vldbc.16	q4,a2             	# [2]  id:136 offset // broadcast to 8 lanes
	mov.n	a3,a10                  	# [3]
	mov.n	a2,a9                   	# [4]
	ee.vld.128.ip	q0,a3,16        	# [5]  id:137
	ee.vld.128.ip	q1,a3,16        	# [6]  id:138
	ee.vcmp.lt.s8	q2,q0,q5        	# [7] // lanes of q0 < 0 -> sign bytes
	ee.vzip.8	q0,q2               	# [8] // interleave -> sign-extended int16
	ee.vadds.s16	q0,q0,q4         	# [9] // add offset to low half
	ee.vadds.s16.st.incp	q0,a2,q0,q2,q4 	# [10]  id:139 // store low half, add offset to high half
	blti	a4,64,.Lt_2_7170         	# [11]

// Main loop: 32 elements per iteration, software pipelined (stores of
// the previous chunk overlap conversion of the next).
	addi	a5,a4,-32                	# [0]
	srai	a5,a5,5                  	# [1] // iterations = (size - 32) / 32
	slli	a4,a5,5                  	# [2]
	loopgtz	a5,.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 	# [3]

	ee.vst.128.ip	q0,a2,16        	# [0*II+0]  id:140
	ee.vcmp.lt.s8	q0,q1,q5        	# [0*II+1]
	ee.vzip.8	q1,q0               	# [0*II+2]
	ee.vadds.s16.ld.incp	q2,a3,q3,q1,q4 	# [0*II+3]  id:141
	ee.vadds.s16.st.incp	q3,a2,q0,q0,q4 	# [0*II+4]  id:142
	ee.vcmp.lt.s8	q3,q2,q5        	# [0*II+5]
	ee.vst.128.ip	q0,a2,16        	# [0*II+6]  id:143
	ee.vzip.8	q2,q3               	# [0*II+7]
	ee.vadds.s16.ld.incp	q1,a3,q0,q2,q4 	# [0*II+8]  id:144
	ee.vadds.s16.st.incp	q0,a2,q0,q3,q4 	# [0*II+9]  id:145

.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3:	# 0x36d
	addi	a4,a4,32                 	# [0] // a4 = elements handled so far

// Epilogue: flush the last pipelined 16-element chunk.
.Lt_2_3842:	# 0x370
	ee.vst.128.ip	q0,a2,16        	# [0]  id:146
	ee.vcmp.lt.s8	q2,q1,q5        	# [1]
	ee.vzip.8	q1,q2               	# [2]
	ee.vadds.s16	q2,q2,q4         	# [3]
	ee.vadds.s16	q3,q1,q4         	# [4]
	ee.vst.128.ip	q3,a2,16        	# [5]  id:147
	ee.vst.128.ip	q2,a2,16        	# [6]  id:148
	bge	a4,a6,.Lt_2_4866          	# [7] // fewer than 2 elements left?

	l32i.n	a5,a1,12               	# [0]  id:135 offset+0x0

// Scalar pair loop: two elements per iteration (only entered with at
// least two elements remaining, see the size-1 bound above).
.Lt_2_5122:	# 0x38a
	mov.n	a11,a4                  	# [0]
	add.n	a2,a4,a10               	# [1]
 # 576          dst[i + 0] = src[i + 0] + offset;
	l8ui	a7,a2,0                  	# [2]  id:149
	addx2	a6,a4,a9                	# [3]
	sext	a7,a7,7                  	# [4]
	add.n	a7,a7,a5                	# [5]
	s16i	a7,a6,0                  	# [6]  id:150
 # 577          dst[i + 1] = src[i + 1] + offset;
	l8ui	a3,a2,1                  	# [7]  id:151
	sub	a7,a8,a4                  	# [8]
	addi.n	a2,a2,2                	# [9]
	srai	a7,a7,1                  	# [10] // a7 = remaining pairs (truncated)
	sext	a3,a3,7                  	# [11]
	add.n	a3,a3,a5                	# [12]
	s16i	a3,a6,2                  	# [13]  id:152
	addi.n	a3,a7,-1               	# [14]
	loopgtz	a3,.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 	# [15]

	l8ui	a3,a2,0                  	# [0*II+0]  id:149
	addi.n	a6,a6,4                	# [1*II+1]
	sext	a3,a3,7                  	# [0*II+2]
	add.n	a3,a3,a5                	# [0*II+3]
	s16i	a3,a6,0                  	# [0*II+4]  id:150
	l8ui	a3,a2,1                  	# [0*II+5]  id:151
	addi.n	a2,a2,2                	# [0*II+6]
	sext	a3,a3,7                  	# [0*II+7]
	add.n	a3,a3,a5                	# [0*II+8]
	s16i	a3,a6,2                  	# [0*II+9]  id:152

.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3:	# 0x3ce
	addx2	a4,a7,a11               	# [0] // i = start + 2 * pairs

.Lt_2_4866:	# 0x3d1
	bge	a4,a8,.Lt_2_7682          	# [0] // no odd tail element?

 # 580          dst[i] = src[i] + offset;
	addx2	a11,a4,a9               	# [0]
	add.n	a8,a4,a10               	# [1]
	l8ui	a8,a8,0                  	# [2]  id:153
	l32i.n	a12,a1,12              	# [3]  id:135 offset+0x0
	sext	a8,a8,7                  	# [4]
	add.n	a8,a8,a12               	# [5]
	s16i	a8,a11,0                 	# [6]  id:154
	retw.n                        	# [7]

// size < 32: use the scalar pair loop (or single-element path below).
.Lt_2_6402:	# 0x3e8
	blti	a4,2,.Lt_2_6658          	# [0]

	movi.n	a4,0                   	# [0]
	j	.Lt_2_5122                  	# [1]

.Lt_2_7682:	# 0x3f0
	retw.n                        	# [0]

// size is 0 or 1.
.Lt_2_6658:	# 0x3f2
	blti	a4,1,.Lt_2_7682          	# [0]

	l8ui	a11,a10,0                	# [0]  id:153
	sext	a11,a11,7                	# [2]
	add.n	a11,a11,a5              	# [3]
	s16i	a11,a3,0                 	# [4]  id:154
	retw.n                        	# [5]

// 32 <= size < 64: skip the main loop, go straight to the epilogue.
.Lt_2_7170:	# 0x402
	movi.n	a4,32                  	# [0]
	j	.Lt_2_3842                  	# [1]

	.size	esp_nn_aligned_s8_to_s16_with_offset_esp32s3, . - esp_nn_aligned_s8_to_s16_with_offset_esp32s3


	.literal_position

	# Program Unit: esp_nn_s8_to_s16_esp32s3
	.type	esp_nn_s8_to_s16_esp32s3, @function
	.align	 4
	.global esp_nn_s8_to_s16_esp32s3

esp_nn_s8_to_s16_esp32s3:	# 0x40b

// Plain int8 -> int16 conversion (sign extension only, no offset).
//   a2: src (may be unaligned; SAR_BYTE + ee.src.q.qup realigns chunks)
//   a3: dst (int16_t *)
//   a4: size (element count)

	entry	a1,32                   	#
	mov.n	a9,a2 // src
	mov.n	a8,a3 // dst
	mov.n	a7,a4 // size
    blti	a4,1,.Lt_3_4866  // size == 0
	blti	a4,16,.Lt_3_4610 // if (size < 16) jump to unopt path

 // load align_len to sar_byte
	extui	a2,a2,0,4               	# [0] // src misalignment = low 4 address bits
	wur.sar_byte	a2               	# [1]
	mov.n	a2,a9                   	# [2]

 // preload
	ee.vld.128.ip	q0,a2,16
	ee.vld.128.ip	q1,a2,16
    ee.zero.q	    q4
 # 672
 # 673      for (i = 16; i < size - 15; i += 16) {
	blti	a4,32,.Lt_3_5378         	# [5]
	addi	a6,a4,-16                	# [1]
	srai	a6,a6,4                  	# [2] // iterations = (size - 16) / 16
	slli	a4,a6,4                  	# [3]
	loopgtz	a6,.LBB35_esp_nn_s8_to_s16_esp32s3 	# [4]

	ee.src.q.qup	q2,q0,q1         	# [0*II+0] // realign 16 source bytes
	ee.vcmp.lt.s8	q3,q2,q4        	# [0*II+1] // sign
	ee.vld.128.ip	q1,a2,16        	# [0*II+2] // for next iteration
	ee.vzip.8	q2,q3               	# [0*II+3] // widen to int16
	ee.vst.128.ip	q2,a3,16        	# [0*II+4]  id:93
	ee.vst.128.ip	q3,a3,16        	# [0*II+5]  id:94

.LBB35_esp_nn_s8_to_s16_esp32s3:	# 0x449
	addi	a4,a4,16                 	# [0] // a4 = elements done by vector code

// Epilogue: convert the last primed 16-byte chunk.
.Lt_3_2050:	# 0x44c
	ee.src.q.qup	q5,q0,q1         	# [0]
	ee.vcmp.lt.s8	q3,q5,q4        	# [1]
	ee.vzip.8	q5,q3               	# [2]
	ee.vst.128.ip	q5,a3,16        	# [3]  id:96
	ee.vst.128.ip	q3,a3,16        	# [4]  id:97
 # 687
 # 688  skip_to_remains_s8_to_s16:
 # 689      for (; i < size; i += 2) {
	bge	a4,a7,.Lt_3_4866          	# [5]

// Scalar pair loop for the tail.
// NOTE(review): the pair count is rounded UP ((remaining + 1) >> 1) and
// dst[i + 1] is stored unconditionally, so an odd `size` writes one
// int16 past the end of dst and reads one byte past src — the same
// overrun as the C helper's `i < size` pair loop. Confirm callers only
// pass even sizes, or bound the loop at size - 1.
.Lt_3_3330:	# 0x45e
	mov.n	a11,a4                  	# [0]
	add.n	a2,a4,a9                	# [1]
 # 690          dst[i + 0] = src[i + 0];
	l8ui	a10,a2,0                 	# [2]  id:98
	addx2	a5,a4,a8                	# [3]
	sext	a10,a10,7                	# [4]
	s16i	a10,a5,0                 	# [5]  id:99
 # 691          dst[i + 1] = src[i + 1];
	l8ui	a3,a2,1                  	# [6]  id:100
	sub	a10,a7,a4                 	# [7]
	addi.n	a2,a2,2                	# [8]
	addi.n	a10,a10,1              	# [9]
	srai	a10,a10,1                	# [10] // a10 = remaining pairs, rounded up
	sext	a3,a3,7                  	# [11]
	s16i	a3,a5,2                  	# [12]  id:101
	addi.n	a3,a10,-1              	# [13]
	loopgtz	a3,.LBB50_esp_nn_s8_to_s16_esp32s3 	# [14]

	l8ui	a3,a2,0                  	# [0*II+0]  id:98
	addi.n	a5,a5,4                	# [1*II+1]
	sext	a3,a3,7                  	# [0*II+2]
	s16i	a3,a5,0                  	# [0*II+3]  id:99
	l8ui	a3,a2,1                  	# [0*II+4]  id:100
	addi.n	a2,a2,2                	# [0*II+5]
	sext	a3,a3,7                  	# [0*II+6]
	s16i	a3,a5,2                  	# [0*II+7]  id:101

.LBB50_esp_nn_s8_to_s16_esp32s3:	# 0x49c
	addx2	a4,a10,a11              	# [0] // i = start + 2 * pairs
 # 692      }
 # 693      if(i < size) {
	bge	a4,a7,.Lt_3_4866          	# [1]

 # 694          dst[i] = src[i];
	add.n	a11,a4,a9               	# [0]
	l8ui	a11,a11,0                	# [1]  id:102
	addx2	a12,a4,a8               	# [2]
	sext	a11,a11,7                	# [3]
	s16i	a11,a12,0                	# [4]  id:103
	retw.n                        	# [5]

// size < 16: scalar loop handles everything.
.Lt_3_4610:	# 0x4b2
	movi.n	a4,0                   	# [0]
	j	.Lt_3_3330                  	# [1]

.Lt_3_4866:	# 0x4ba
	retw.n                        	# [0]

// 16 <= size < 32: skip the main loop, convert the primed chunk only.
.Lt_3_5378:	# 0x4bc
	movi.n	a4,16                  	# [1]
	j	.Lt_3_2050                  	# [2]

	.size	esp_nn_s8_to_s16_esp32s3, . - esp_nn_s8_to_s16_esp32s3


================================================
FILE: src/common/esp_nn_dot_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

//
// Reusable s8 dot product kernels for ESP32-S3.
// Used by conv im2col, FC, and any kernel that reduces to a dot product.
//
// esp_nn_dot_s8_aligned_esp32s3:
//   Both input and filter 16-byte aligned. Uses ee.vld.128.ip + fused MAC.
//
// esp_nn_dot_s8_unaligned_esp32s3:
//   Input aligned, filter may be unaligned. Uses USAR+QUP for filter.
//

    .text
    .align  4

// ============================================================
// esp_nn_dot_s8_aligned_esp32s3
// Both pointers must be 16-byte aligned.
// a2: input_data (aligned)
// a3: filter_data (aligned)
// a4: len (must be multiple of 16, >= 16)
// Returns: int32_t dot product in a2
// ============================================================
    .type   esp_nn_dot_s8_aligned_esp32s3, @function
    .align  4
    .global esp_nn_dot_s8_aligned_esp32s3

esp_nn_dot_s8_aligned_esp32s3:
    entry   a1, 32

    // ACCX is cleared first so a zero/short length still returns 0.
    ee.zero.accx
    beqz    a4, .Lalign_done

    // Compute loop count; lengths below 16 contribute nothing.
    srli    a5, a4, 4               // a5 = len / 16
    beqz    a5, .Lalign_done

    // Prime: load first input/filter chunk pair
    ee.vld.128.ip   q0, a2, 16
    ee.vld.128.ip   q1, a3, 16
    addi            a5, a5, -1
    beqz            a5, .Lalign_last

    // Main loop: fused MAC + load (accumulate q0*q1, then q0 <- next
    // input chunk); the filter chunk is loaded separately into q1.
    loopgtz a5, .Lalign_loop_end
    ee.vmulas.s8.accx.ld.ip  q0, a2, 16, q0, q1
    ee.vld.128.ip   q1, a3, 16
.Lalign_loop_end:

.Lalign_last:
    // Final MAC for the last primed pair
    ee.vmulas.s8.accx  q0, q1

.Lalign_done:
    // Read lower 32 bits of ACCX (sufficient for int8 dot products);
    // the two nops keep a gap between the last MAC and the ACCX read.
    nop
    nop
    rur.accx_0 a2

    retw.n

    .size   esp_nn_dot_s8_aligned_esp32s3, . - esp_nn_dot_s8_aligned_esp32s3


// ============================================================
// esp_nn_dot_s8_unaligned_esp32s3
// Input must be 16-byte aligned. Filter can be unaligned.
// Uses USAR+QUP pattern for filter loads.
// a2: input_data (aligned)
// a3: filter_data (may be unaligned)
// a4: len_div16 (>= 1)
// Returns: int32_t dot product in a2
// ============================================================
    .type   esp_nn_dot_s8_unaligned_esp32s3, @function
    .align  4
    .global esp_nn_dot_s8_unaligned_esp32s3

esp_nn_dot_s8_unaligned_esp32s3:
    entry   a1, 32

    ee.zero.accx
    beqz    a4, .Lunalign_done

    // Prime: first unaligned filter load (sets SAR_BYTE)
    ee.ld.128.usar.ip   q0, a3, 16

    // Check if we can do 2x unrolled (need >= 2 iterations)
    srai    a5, a4, 1               // a5 = len_div16 / 2
    beqz    a5, .Lunalign_single

    // Load first input + filter pair for unrolled loop
    ee.vld.128.ip       q1, a2, 16
    ee.ld.128.usar.ip   q2, a3, 16

    // 2x unrolled main loop. Aligning filter chunk i needs raw chunks
    // i and i+1, so the loop always keeps two raw filter chunks (q0/q2)
    // in flight.
    // NOTE(review): for even len_div16 the final `ee.ld.128.usar.ip q2`
    // of the last iteration reads 16 filter bytes that are never
    // consumed — confirm the filter buffer tolerates this read-ahead.
    loopgtz a5, .Lunalign_loop2_end

    ee.src.q.qup        q4, q0, q2         // align filter[i]
    ee.vld.128.ip       q3, a2, 16         // input[i+1]
    ee.vmulas.s8.accx   q4, q1             // MAC filter[i] * input[i]
    ee.ld.128.usar.ip   q0, a3, 16         // filter chunk[i+2]
    ee.src.q.qup        q5, q2, q0         // align filter[i+1]
    ee.vld.128.ip       q1, a2, 16         // input[i+2] (primed for next)
    ee.vmulas.s8.accx   q5, q3             // MAC filter[i+1] * input[i+1]
    ee.ld.128.usar.ip   q2, a3, 16         // filter chunk[i+3]

.Lunalign_loop2_end:

    // Check if there's a remaining single iteration (odd len_div16)
    bbci    a4, 0, .Lunalign_done_mac

    // Odd remainder: the loop already primed everything needed — q1
    // holds the last input chunk and q0/q2 the two raw filter chunks
    // spanning it. Align the filter and issue the final MAC.
    ee.src.q.qup        q4, q0, q2
    ee.vmulas.s8.accx   q4, q1
    j                   .Lunalign_done_mac

.Lunalign_single:
    // Called when len_div16 < 2 (single chunk only)
    ee.vld.128.ip       q1, a2, 16
    ee.ld.128.usar.ip   q2, a3, 16
    ee.src.q.qup        q4, q0, q2
    ee.vmulas.s8.accx   q4, q1

.Lunalign_done_mac:
.Lunalign_done:
    // 2-cycle gap before ACCX read
    movi.n  a3, 0
    nop
    ee.srs.accx a2, a3, 0

    retw.n

    .size   esp_nn_dot_s8_unaligned_esp32s3, . - esp_nn_dot_s8_unaligned_esp32s3


================================================
FILE: src/common/esp_nn_mean_ansi.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * Quantized mean reduction over spatial dimensions (axes 1,2).
 * Specialized for 4D tensors [N, H, W, C] → [N, 1, 1, C].
 * This is the common case in Squeeze-and-Excite blocks.
 */

#include <stdint.h>
#include <common_functions.h>

/**
 * Quantized mean over the spatial axes of an NHWC int8 tensor:
 * [1, H, W, C] -> [1, 1, 1, C]. Each channel is summed over all H*W
 * positions, corrected by the input zero point, requantized with the
 * (multiplier, shift) pair, offset by the output zero point and clamped
 * to the int8 range.
 */
void esp_nn_mean_nhwc_s8_ansi(const int8_t *input,
                               int8_t *output,
                               const int32_t height,
                               const int32_t width,
                               const int32_t channels,
                               const int32_t input_zero_point,
                               const int32_t output_zero_point,
                               const int32_t multiplier,
                               const int32_t shift)
{
    const int32_t num_elements = height * width;
    /* Zero-point correction is constant for the whole spatial sum. */
    const int32_t zp_total = num_elements * input_zero_point;

    for (int c = 0; c < channels; c++) {
        /* In NHWC, consecutive spatial samples of one channel are
         * `channels` bytes apart; walk them with a strided pointer. */
        const int8_t *chan_ptr = input + c;
        int32_t acc = 0;
        for (int32_t n = 0; n < num_elements; n++) {
            acc += *chan_ptr;
            chan_ptr += channels;
        }
        acc -= zp_total;

        /* Requantize, add output zero point, clamp to [-128, 127]. */
        int32_t out_val = esp_nn_multiply_by_quantized_mult(acc, multiplier, shift);
        out_val += output_zero_point;
        if (out_val < -128) {
            out_val = -128;
        }
        if (out_val > 127) {
            out_val = 127;
        }
        output[c] = (int8_t) out_val;
    }
}


================================================
FILE: src/common/esp_nn_mean_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-P4 optimized spatial mean reduction using QACC per-lane accumulation.
 * Processes 16 channels in parallel via esp.vmulas.s8.qacc (same pattern as avg_pool).
 */

#include <stdint.h>
#include <common_functions.h>

/**
 * ESP32-P4 spatial mean over an NHWC int8 tensor, producing one int8 value
 * per channel.
 *
 * Strategy: channels are processed in blocks of 16.  For each block, the PIE
 * instruction esp.vmulas.s8.qacc multiplies a 16-lane input vector by a
 * vector of ones (q7) and accumulates per lane into QACC, so QACC ends up
 * holding the 16 per-channel sums over all H*W spatial positions.  Leftover
 * channels (channels % 16) fall back to a plain scalar loop.
 *
 * @param input             pointer to H*W*C int8 values, NHWC layout
 * @param output            pointer to C int8 results
 * @param height            spatial height H
 * @param width             spatial width W
 * @param channels          channel count C
 * @param input_zero_point  zero point of the input quantization
 * @param output_zero_point zero point of the output quantization
 * @param multiplier        fixed-point requantization multiplier
 * @param shift             requantization shift
 */
void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input,
                                  int8_t *output,
                                  const int32_t height,
                                  const int32_t width,
                                  const int32_t channels,
                                  const int32_t input_zero_point,
                                  const int32_t output_zero_point,
                                  const int32_t multiplier,
                                  const int32_t shift)
{
    /* Values averaged per channel, and number of full 16-channel blocks. */
    const int32_t num_elements = height * width;
    const int32_t ch_16 = channels >> 4;

    const int8_t one_val = 1;
    if (ch_16 > 0) {
        /* Enable PIE and broadcast 1 into q7.
         * NOTE(review): the per-block asm below re-broadcasts q7 itself, so
         * this initial broadcast looks redundant — confirm before removing,
         * in case other code relies on q7 being set up here. */
        asm volatile (
            "csrsi  0x7f2, 0b01        \n\t"
            "li     x29, 0b10          \n\t"
            "esp.movx.w.cfg x29        \n\t"
            ::: "x29"
        );
        asm volatile (
            "mv     x30, %0             \n\t"
            "esp.vldbc.8.ip q7, x30, 0  \n\t"
            :: "r"(&one_val) : "x30"
        );
    }

    /* Process all channels - QACC for 16-channel blocks, scalar for remainder */
    int ch = 0;
    for (int ch_blk = 0; ch_blk < ch_16; ch_blk++, ch += 16) {
        /* Single asm block: broadcast ones, zero QACC, accumulate all spatial
         * positions. Keeping in one block prevents compiler from clobbering
         * q7 between the broadcast and the MAC loop. */
        const int8_t *base_ptr = input + ch;
        asm volatile (
            /* Broadcast 1 into q7 */
            "mv     x30, %[one]             \n\t"
            "esp.vldbc.8.ip q7, x30, 0      \n\t"
            /* Zero QACC */
            "esp.zero.qacc                   \n\t"
            /* Accumulate loop: stride = channels between spatial positions */
            "mv     x30, %[base]            \n\t"
            "mv     s7,  %[cnt]             \n\t"
            "1:                             \n\t"
            "esp.vld.128.ip  q0, x30, 0     \n\t"
            "esp.vmulas.s8.qacc q0, q7      \n\t"
            "add    x30, x30, %[stride]     \n\t"
            "addi   s7, s7, -1              \n\t"
            "bnez   s7, 1b                  \n\t"
            :
            : [one] "r"(&one_val), [base] "r"(base_ptr),
              [cnt] "r"(num_elements), [stride] "r"((int32_t)channels)
            : "x30", "s7"
        );

        /* Pull the 16 per-lane sums out of QACC into a 16-byte-aligned
         * scratch array.  ESP_NN_QACC_EXTRACT_S32 is defined in
         * common_functions.h; presumably it widens each QACC lane to
         * int32 — verify against the macro definition. */
        int32_t sums[16] __attribute__((aligned(16)));
        ESP_NN_QACC_EXTRACT_S32(sums);

        /* Each lane accumulated num_elements samples, so the zero-point
         * correction is the same for all 16 channels of the block. */
        int32_t zp_correction = num_elements * input_zero_point;
        for (int k = 0; k < 16; k++) {
            int32_t result = sums[k] - zp_correction;
            result = esp_nn_multiply_by_quantized_mult(result, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);   /* clamp to int8 range */
            result = min(result, 127);
            output[ch + k] = (int8_t)result;
        }
    }

    /* Remaining channels scalar */
    for (; ch < channels; ch++) {
        int32_t sum = 0;
        for (int hw = 0; hw < num_elements; hw++) {
            sum += input[hw * channels + ch];
        }
        sum -= num_elements * input_zero_point;
        int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
        result += output_zero_point;
        result = max(result, -128);
        result = min(result, 127);
        output[ch] = (int8_t)result;
    }
}


================================================
FILE: src/common/esp_nn_mean_s8_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-S3 optimized mean reduction for NHWC int8 tensors.
 * Uses int16 accumulation for small spatial sizes (H*W <= 256),
 * int32 for larger. Accumulates all channels at once per spatial position.
 */

#include <stdint.h>
#include <string.h>
#include <common_functions.h>

/**
 * ESP32-S3 spatial mean over an NHWC int8 tensor.
 *
 * Three strategies, selected by problem size:
 *  - H*W <= 256 and C <= 512: per-channel int16 accumulators (the worst-case
 *    magnitudes 256*127 = 32512 and 256*(-128) = -32768 both fit in int16),
 *    accumulating a whole spatial row of channels at a time so the compiler
 *    can vectorize the inner loop.
 *  - C <= 512 with larger H*W: same layout with int32 accumulators.
 *  - C > 512: per-channel scalar fallback to bound stack usage.
 *
 * @param input             pointer to H*W*C int8 values, NHWC layout
 * @param output            pointer to C int8 results
 * @param height            spatial height H
 * @param width             spatial width W
 * @param channels          channel count C
 * @param input_zero_point  zero point of the input quantization
 * @param output_zero_point zero point of the output quantization
 * @param multiplier        fixed-point requantization multiplier
 * @param shift             requantization shift
 */
void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input,
                                  int8_t *output,
                                  const int32_t height,
                                  const int32_t width,
                                  const int32_t channels,
                                  const int32_t input_zero_point,
                                  const int32_t output_zero_point,
                                  const int32_t multiplier,
                                  const int32_t shift)
{
    const int32_t spatial_count = height * width;
    const int32_t offset_sum = spatial_count * input_zero_point;

    if (spatial_count <= 256 && channels <= 512) {
        /* Narrow (int16) accumulation path. */
        int16_t totals16[channels];
        for (int32_t c = 0; c < channels; c++) {
            totals16[c] = 0;
        }

        const int8_t *row = input;
        for (int32_t pos = 0; pos < spatial_count; pos++, row += channels) {
            /* Inner loop — compiler should auto-vectorize with -O2 */
            for (int32_t c = 0; c < channels; c++) {
                totals16[c] += (int16_t)row[c];
            }
        }

        /* Requantize each channel's sum. */
        for (int32_t c = 0; c < channels; c++) {
            int32_t val = (int32_t)totals16[c] - offset_sum;
            val = esp_nn_multiply_by_quantized_mult(val, multiplier, shift);
            val += output_zero_point;
            val = max(val, -128);
            val = min(val, 127);
            output[c] = (int8_t)val;
        }
    } else if (channels <= 512) {
        /* Wide (int32) accumulation for larger spatial extents. */
        int32_t totals32[channels];
        for (int32_t c = 0; c < channels; c++) {
            totals32[c] = 0;
        }

        const int8_t *row = input;
        for (int32_t pos = 0; pos < spatial_count; pos++, row += channels) {
            for (int32_t c = 0; c < channels; c++) {
                totals32[c] += row[c];
            }
        }

        for (int32_t c = 0; c < channels; c++) {
            int32_t val = totals32[c] - offset_sum;
            val = esp_nn_multiply_by_quantized_mult(val, multiplier, shift);
            val += output_zero_point;
            val = max(val, -128);
            val = min(val, 127);
            output[c] = (int8_t)val;
        }
    } else {
        /* Huge channel counts: process one channel at a time, no scratch. */
        for (int32_t c = 0; c < channels; c++) {
            int32_t val = 0;
            for (int32_t pos = 0; pos < spatial_count; pos++) {
                val += input[pos * channels + c];
            }
            val -= offset_sum;
            val = esp_nn_multiply_by_quantized_mult(val, multiplier, shift);
            val += output_zero_point;
            val = max(val, -128);
            val = min(val, 127);
            output[c] = (int8_t)val;
        }
    }
}


================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * Fast 2-wide requantization for ESP32-P4 (RISC-V).
 * Interleaves mul/mulh across 2 elements for better pipeline utilization.
 * Uses a0-a7 and t0-t6 only (no callee-saved registers needed).
 *
 * void esp_nn_requant_2x_esp32p4(
 *     int32_t x0,       // a0
 *     int32_t x1,       // a1
 *     int32_t mult0,    // a2
 *     int32_t mult1,    // a3
 *     int32_t shift0,   // a4
 *     int32_t shift1,   // a5
 *     int32_t *out      // a6: pointer to store 2 results
 * );
 */

    .text
    .align  4
    .global esp_nn_requant_2x_esp32p4
    .type   esp_nn_requant_2x_esp32p4, @function

esp_nn_requant_2x_esp32p4:
    /*
     * Each shift argument is split into a left-shift (applied before the
     * fixed-point multiply when shift > 0) and a rounding right-shift
     * (applied after, when shift < 0):
     *   ls = max(shift, 0), rs = ls - shift = max(-shift, 0)
     * This mirrors TFLite's MultiplyByQuantizedMultiplier.
     */
    /* Compute left_shift and apply */
    mv      t0, a0              /* x0 */
    mv      t1, a1              /* x1 */
    bgez    a4, .Lls0_pos
    mv      t6, zero            /* ls0 = 0 */
    j       .Lls0_done
.Lls0_pos:
    sll     t0, t0, a4          /* x0 <<= shift0 (positive = left shift) */
    mv      t6, a4              /* ls0 = shift0 */
.Lls0_done:
    sub     a4, t6, a4          /* rs0 = ls0 - shift0 */

    bgez    a5, .Lls1_pos
    mv      t6, zero
    j       .Lls1_done
.Lls1_pos:
    sll     t1, t1, a5
    mv      t6, a5
.Lls1_done:
    sub     a5, t6, a5          /* rs1 = ls1 - shift1 */

    /* ---- Interleaved 64-bit multiply ---- */
    /* mulh first (both elements), then mul (both elements) */
    /* RV32 needs two instructions per 64-bit product: mulh gives bits
     * [63:32], mul gives bits [31:0].  Interleaving the two elements hides
     * multiplier latency. */
    mulh    t2, t0, a2          /* hi0 */
    mulh    t3, t1, a3          /* hi1 */
    mul     t0, t0, a2          /* lo0 */
    mul     t1, t1, a3          /* lo1 */

    /* Add nudge and combine: result = ((hi:lo) + (1<<30)) >> 31 */
    /* This is gemmlowp's SaturatingRoundingDoublingHighMul, built from
     * 32-bit parts: add the nudge to lo with carry into hi, then take
     * bits [62:31] of the 64-bit sum as (hi << 1) | (lo_sum >> 31).
     * NOTE(review): the INT32_MIN * INT32_MIN overflow case is not
     * saturated here (the reference implementation clamps it to
     * INT32_MAX) — presumably quantized multipliers never equal
     * INT32_MIN; confirm with callers. */
    li      t4, 0x40000000      /* nudge = 1 << 30 */

    add     t5, t0, t4          /* lo0 + nudge */
    sltu    t6, t5, t0          /* carry0 (unsigned overflow of the add) */
    add     t2, t2, t6          /* hi0 += carry0 */
    srli    t5, t5, 31          /* (lo0+nudge) >> 31 */
    slli    t0, t2, 1           /* hi0 << 1 */
    or      t0, t0, t5          /* result0 */

    add     t5, t1, t4          /* lo1 + nudge */
    sltu    t6, t5, t1          /* carry1 */
    add     t3, t3, t6          /* hi1 += carry1 */
    srli    t5, t5, 31
    slli    t1, t3, 1
    or      t1, t1, t5          /* result1 */

    /* ---- Right shift with rounding ---- */
    /* Rounding divide by power of two (round-half-away-from-zero):
     * add (1 << (rs-1)), minus 1 when the value is negative, then
     * arithmetic shift.  Skipped entirely when rs == 0. */
    li      t4, 1

    beqz    a4, .Lskip_rs0
    addi    t5, a4, -1
    sll     t5, t4, t5          /* round0 = 1 << (rs0-1) */
    srai    t6, t0, 31          /* -1 if negative, 0 otherwise */
    add     t5, t5, t6          /* round0 += sign */
    add     t0, t0, t5
    sra     t0, t0, a4
.Lskip_rs0:

    beqz    a5, .Lskip_rs1
    addi    t5, a5, -1
    sll     t5, t4, t5
    srai    t6, t1, 31
    add     t5, t5, t6
    add     t1, t1, t5
    sra     t1, t1, a5
.Lskip_rs1:

    /* Store results */
    sw      t0, 0(a6)
    sw      t1, 4(a6)
    ret

    .size   esp_nn_requant_2x_esp32p4, . - esp_nn_requant_2x_esp32p4


================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// The macro `use_nudge` enables adding a rounding factor, matching the TFLite
// implementation; enabling it barely changes accuracy.
// Define SKIP_NUDGE to disable it for better performance.

#ifndef SKIP_NUDGE
    # set SKIP_NUDGE flag for ~20% faster (but not bit-exact) quantisation
    .set use_nudge, 1
#endif

    .text
    .literal_position
    .literal    .nudge_val, 1073741824          # 1 << 30

    .type   esp_nn_multiply_by_quantized_mult_asm_esp32s3, @function
    .align   4
    .global esp_nn_multiply_by_quantized_mult_asm_esp32s3

esp_nn_multiply_by_quantized_mult_asm_esp32s3:  # 0x4
    # to_add = 4

    entry       a1,32
    wsr.sar     a3
    ee.zero.q   q2

    bltz        a3,     .skip_left_shift
    ee.vsl.32   q0,q0                   # [13]
.skip_left_shift:

    ssai    31                      # [15]

# move data to general purpose registers
    ee.movi.32.a    q0,a12,0            # [17]
    ee.movi.32.a    q0,a13,1            # [16]
    ee.movi.32.a    q0,a14,2   
Download .txt
gitextract__zjpraf8/

├── .github/
│   └── workflows/
│       └── upload_component.yml
├── .gitignore
├── .gitlab-ci.yml
├── CMakeLists.txt
├── CONTRIBUTING.md
├── Kconfig.projbuild
├── LICENSE
├── README.md
├── idf_component.yml
├── include/
│   ├── esp_nn.h
│   ├── esp_nn_ansi_c.h
│   ├── esp_nn_ansi_headers.h
│   ├── esp_nn_defs.h
│   ├── esp_nn_esp32p4.h
│   ├── esp_nn_esp32s3.h
│   └── esp_nn_generic_opt.h
├── src/
│   ├── activation_functions/
│   │   ├── esp_nn_hard_swish_ansi.c
│   │   ├── esp_nn_hard_swish_s8_esp32p4.c
│   │   ├── esp_nn_hard_swish_s8_esp32s3.c
│   │   ├── esp_nn_relu_ansi.c
│   │   ├── esp_nn_relu_s8_esp32p4.c
│   │   └── esp_nn_relu_s8_esp32s3.S
│   ├── basic_math/
│   │   ├── esp_nn_add_ansi.c
│   │   ├── esp_nn_add_s8_esp32p4.c
│   │   ├── esp_nn_add_s8_esp32s3.S
│   │   ├── esp_nn_mul_ansi.c
│   │   ├── esp_nn_mul_broadcast_s8_esp32s3.S
│   │   ├── esp_nn_mul_s8_esp32p4.c
│   │   └── esp_nn_mul_s8_esp32s3.S
│   ├── common/
│   │   ├── common_functions.h
│   │   ├── esp_nn_common_functions_esp32s3.S
│   │   ├── esp_nn_dot_s8_esp32s3.S
│   │   ├── esp_nn_mean_ansi.c
│   │   ├── esp_nn_mean_s8_esp32p4.c
│   │   ├── esp_nn_mean_s8_esp32s3.c
│   │   ├── esp_nn_multiply_by_quantized_mult_esp32p4.S
│   │   ├── esp_nn_multiply_by_quantized_mult_esp32s3.S
│   │   └── esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S
│   ├── convolution/
│   │   ├── esp_nn_conv_ansi.c
│   │   ├── esp_nn_conv_esp32p4.c
│   │   ├── esp_nn_conv_esp32s3.c
│   │   ├── esp_nn_conv_opt.c
│   │   ├── esp_nn_conv_s16_mult4_1x1_esp32s3.S
│   │   ├── esp_nn_conv_s16_mult8_esp32s3.S
│   │   ├── esp_nn_conv_s8_1x1_esp32s3.c
│   │   ├── esp_nn_conv_s8_3x3_opt_esp32s3.c
│   │   ├── esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S
│   │   ├── esp_nn_conv_s8_mult8_1x1_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_ansi.c
│   │   ├── esp_nn_depthwise_conv_esp32p4.c
│   │   ├── esp_nn_depthwise_conv_opt.c
│   │   ├── esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult1_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult4_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s16_mult8_esp32s3.S
│   │   ├── esp_nn_depthwise_conv_s8_esp32s3.c
│   │   └── esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S
│   ├── fully_connected/
│   │   ├── esp_nn_fc_s8_mac16_esp32s3.S
│   │   ├── esp_nn_fully_connected_ansi.c
│   │   ├── esp_nn_fully_connected_esp32s3.c
│   │   ├── esp_nn_fully_connected_per_ch_s8_esp32s3.S
│   │   ├── esp_nn_fully_connected_s8_esp32p4.c
│   │   └── esp_nn_fully_connected_s8_esp32s3.S
│   ├── logistic/
│   │   └── esp_nn_logistic_ansi.c
│   ├── pooling/
│   │   ├── esp_nn_avg_pool_ansi.c
│   │   ├── esp_nn_avg_pool_s8_esp32p4.c
│   │   ├── esp_nn_avg_pool_s8_esp32s3.S
│   │   ├── esp_nn_avg_pool_s8_esp32s3.c
│   │   ├── esp_nn_max_pool_ansi.c
│   │   ├── esp_nn_max_pool_s8_esp32p4.c
│   │   └── esp_nn_max_pool_s8_esp32s3.S
│   └── softmax/
│       ├── esp_nn_softmax_ansi.c
│       ├── esp_nn_softmax_opt.c
│       ├── esp_nn_softmax_s8_esp32p4.c
│       ├── esp_nn_softmax_s8_esp32s3.c
│       └── softmax_common.h
├── test_app/
│   ├── CMakeLists.txt
│   ├── Makefile
│   ├── main/
│   │   ├── CMakeLists.txt
│   │   ├── component.mk
│   │   └── main.c
│   ├── sdkconfig.defaults
│   ├── sdkconfig.defaults.esp32p4
│   └── sdkconfig.defaults.esp32s3
└── tests/
    ├── CMakeLists.txt
    ├── README.md
    ├── component.mk
    ├── include/
    │   ├── test_functions.h
    │   └── test_utils.h
    └── src/
        ├── basic_math_test.c
        ├── convolution_test.c
        ├── fully_connected_test.c
        ├── hard_swish_test.c
        ├── mean_test.c
        ├── pooling_test.c
        ├── relu_test.c
        └── softmax_test.c
Download .txt
SYMBOL INDEX (143 symbols across 47 files)

FILE: include/esp_nn_defs.h
  type data_dims_t (line 23) | typedef struct data_dims {
  type data_2d_t (line 35) | typedef struct data_2d {
  type act_params_t (line 43) | typedef struct act_params {
  type quant_data_t (line 53) | typedef struct quant_data {
  type conv_params_t (line 62) | typedef struct conv_params {
  type dw_conv_params_t (line 75) | typedef struct dw_conv_params {

FILE: src/activation_functions/esp_nn_hard_swish_ansi.c
  function sat_left_shift_s16 (line 18) | static inline int16_t sat_left_shift_s16(int16_t val, int shift)
  function sat_round_dbl_high_mul_s16 (line 29) | static inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b)
  function sat_dbl_high_mul_s16 (line 39) | static inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b)
  function rounding_div_pot_s16 (line 48) | static inline int16_t rounding_div_pot_s16(int16_t val, int exponent)
  function esp_nn_hard_swish_s8_ansi (line 56) | void esp_nn_hard_swish_s8_ansi(const int8_t *input,

FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c
  function sat_rnd_dbl_hi_mul (line 16) | static inline __attribute__((always_inline))
  function sat_dbl_hi_mul (line 22) | static inline __attribute__((always_inline))
  function sat_left_shift_s16 (line 28) | static inline __attribute__((always_inline))
  function rounding_div_pot_s16 (line 35) | static inline __attribute__((always_inline))
  function hard_swish_output (line 44) | static inline __attribute__((always_inline))
  function esp_nn_hard_swish_s8_esp32p4 (line 55) | void esp_nn_hard_swish_s8_esp32p4(const int8_t *input,

FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c
  function esp_nn_get_hard_swish_scratch_size_esp32s3 (line 34) | int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void)
  function esp_nn_set_hard_swish_scratch_buf_esp32s3 (line 39) | void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf)
  function esp_nn_hard_swish_s8_esp32s3 (line 44) | void esp_nn_hard_swish_s8_esp32s3(const int8_t *input,

FILE: src/activation_functions/esp_nn_relu_ansi.c
  function esp_nn_relu6_s8_ansi (line 20) | void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size)

FILE: src/activation_functions/esp_nn_relu_s8_esp32p4.c
  function esp_nn_relu6_s8_esp32p4 (line 14) | void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size)

FILE: src/basic_math/esp_nn_add_ansi.c
  function esp_nn_add_elementwise_u8_ansi (line 19) | void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
  function esp_nn_add_elementwise_s8_ansi (line 59) | void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,

FILE: src/basic_math/esp_nn_add_s8_esp32p4.c
  function add_requant (line 18) | static inline __attribute__((always_inline))
  function esp_nn_add_elementwise_s8_esp32p4 (line 32) | void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,

FILE: src/basic_math/esp_nn_mul_ansi.c
  function esp_nn_mul_elementwise_s8_ansi (line 19) | void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,
  function esp_nn_mul_broadcast_channel_s8_ansi (line 44) | void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,

FILE: src/basic_math/esp_nn_mul_s8_esp32p4.c
  function esp_nn_mul_elementwise_s8_esp32p4 (line 15) | void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,

FILE: src/common/common_functions.h
  function __NN_FORCE_INLINE__ (line 34) | __NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)
  function __NN_FORCE_INLINE__ (line 74) | __NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in)
  function __NN_FORCE_INLINE__ (line 84) | __NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64)
  function __NN_FORCE_INLINE__ (line 91) | __NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t i...
  function __NN_FORCE_INLINE__ (line 118) | __NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val,...
  function __NN_FORCE_INLINE__ (line 124) | __NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int3...
  function __NN_FORCE_INLINE__ (line 140) | __NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x,...
  function __NN_FORCE_INLINE__ (line 211) | __NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32...
  function esp_nn_aligned_s8_pad_with_value (line 239) | static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *...
  function esp_nn_aligned_s8_pad_end_with_value (line 260) | static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8...
  function __NN_FORCE_INLINE__ (line 291) | __NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src,...
  function __NN_FORCE_INLINE__ (line 311) | __NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *ds...

FILE: src/common/esp_nn_mean_ansi.c
  function esp_nn_mean_nhwc_s8_ansi (line 16) | void esp_nn_mean_nhwc_s8_ansi(const int8_t *input,

FILE: src/common/esp_nn_mean_s8_esp32p4.c
  function esp_nn_mean_nhwc_s8_esp32p4 (line 15) | void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input,

FILE: src/common/esp_nn_mean_s8_esp32s3.c
  function esp_nn_mean_nhwc_s8_esp32s3 (line 17) | void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input,

FILE: src/convolution/esp_nn_conv_ansi.c
  function esp_nn_get_conv_scratch_size_ansi (line 19) | int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,
  function esp_nn_set_conv_scratch_buf_ansi (line 27) | void esp_nn_set_conv_scratch_buf_ansi(const void *buf)
  function esp_nn_conv_u8_ansi (line 37) | void esp_nn_conv_u8_ansi(const uint8_t *input_data,
  function esp_nn_conv_s8_ansi (line 109) | void esp_nn_conv_s8_ansi(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_conv_esp32p4.c
  function pie_dot_s8 (line 63) | static inline __attribute__((always_inline))
  function conv_1x1_batch16 (line 143) | __attribute__((noinline))
  function esp_nn_conv_s8_1x1 (line 217) | __attribute__ ((noinline))
  function esp_nn_conv_s8_padded (line 384) | __attribute__ ((noinline))
  function esp_nn_conv_s8_im2col (line 604) | __attribute__ ((noinline))
  function esp_nn_conv_s8_tiled (line 706) | __attribute__ ((noinline))
  function esp_nn_get_conv_scratch_size_esp32p4 (line 869) | int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims,
  function esp_nn_set_conv_scratch_buf_esp32p4 (line 970) | void esp_nn_set_conv_scratch_buf_esp32p4(void *buf)
  function esp_nn_conv_s8_esp32p4 (line 985) | void esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_conv_esp32s3.c
  function esp_nn_conv_s8_im2col_s3 (line 176) | __attribute__ ((noinline))
  function esp_nn_get_conv_scratch_size_esp32s3 (line 315) | int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
  function esp_nn_set_conv_scratch_buf_esp32s3 (line 388) | void esp_nn_set_conv_scratch_buf_esp32s3(void *buf)
  function esp_nn_conv_s8_esp32s3 (line 393) | void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_conv_opt.c
  function esp_nn_get_conv_scratch_size_opt (line 20) | int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,
  function esp_nn_set_conv_scratch_buf_opt (line 28) | void esp_nn_set_conv_scratch_buf_opt(const void *buf)
  function esp_nn_conv_s8_1x1 (line 33) | __attribute__ ((noinline))
  function esp_nn_conv_s8_opt (line 95) | void esp_nn_conv_s8_opt(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_conv_s8_1x1_esp32s3.c
  function esp_nn_conv_s8_1x1_scratch_size (line 17) | int esp_nn_conv_s8_1x1_scratch_size(int out_channels)
  function transpose_8x8_s16_c (line 28) | static inline void transpose_8x8_s16_c(const int8_t *input, int stride,
  function transpose_8x8_s16_simd (line 46) | static inline void transpose_8x8_s16_simd(const int8_t *input, int stride,
  function mac_8pos_8ch_simd (line 121) | static inline void mac_8pos_8ch_simd(const int16_t *data_buf, const int8...
  function esp_nn_conv_s8_1x1 (line 155) | void esp_nn_conv_s8_1x1(const int8_t *input,

FILE: src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c
  function esp_nn_conv_s8_3x3_can_use (line 34) | int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht,
  function esp_nn_conv_s8_3x3_scratch_size (line 46) | int esp_nn_conv_s8_3x3_scratch_size(int in_channels, int out_channels)
  function esp_nn_conv_s8_3x3_opt (line 57) | void esp_nn_conv_s8_3x3_opt(const int8_t *input,

FILE: src/convolution/esp_nn_depthwise_conv_ansi.c
  function esp_nn_get_depthwise_conv_scratch_size_ansi (line 18) | int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input...
  function esp_nn_set_depthwise_conv_scratch_buf_ansi (line 26) | void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf)
  function esp_nn_depthwise_conv_s8_ansi (line 31) | void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_depthwise_conv_esp32p4.c
  function esp_nn_get_depthwise_conv_scratch_size_esp32p4 (line 25) | int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *in...
  function esp_nn_set_depthwise_conv_scratch_buf_esp32p4 (line 33) | void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf)
  function depthwise_conv_s8_ch1_pie (line 42) | __attribute__ ((noinline))
  function esp_nn_depthwise_conv_s8_esp32p4 (line 262) | void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_depthwise_conv_opt.c
  function esp_nn_get_depthwise_conv_scratch_size_opt (line 18) | int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_...
  function esp_nn_set_depthwise_conv_scratch_buf_opt (line 26) | void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf)
  function esp_nn_depthwise_conv_s8_ch_mult_1 (line 32) | __attribute__ ((noinline))
  function esp_nn_depthwise_conv_s8_opt (line 160) | void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,

FILE: src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c
  function esp_nn_depthwise_conv_s8_unrolled (line 158) | static void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data,
  function esp_nn_depthwise_conv_s8_ch_mult1 (line 288) | void esp_nn_depthwise_conv_s8_ch_mult1(const int8_t *input_data,
  function esp_nn_get_depthwise_conv_scratch_size_esp32s3 (line 348) | int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *in...
  function esp_nn_set_depthwise_conv_scratch_buf_esp32s3 (line 445) | void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf)
  function esp_nn_depthwise_conv_s8_esp32s3 (line 471) | void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims,

FILE: src/fully_connected/esp_nn_fully_connected_ansi.c
  function esp_nn_fully_connected_s8_ansi (line 19) | void esp_nn_fully_connected_s8_ansi(const int8_t *input_data,
  function esp_nn_fully_connected_per_ch_s8_ansi (line 52) | void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data,

FILE: src/fully_connected/esp_nn_fully_connected_esp32s3.c
  function esp_nn_fully_connected_s8_esp32s3 (line 52) | void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,
  function esp_nn_fully_connected_per_ch_s8_esp32s3 (line 120) | void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data,

FILE: src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c
  function fc_dot_s8_pie (line 24) | static inline __attribute__((always_inline))
  function esp_nn_fully_connected_s8_esp32p4 (line 114) | void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,
  function esp_nn_fully_connected_per_ch_s8_esp32p4 (line 163) | void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,

FILE: src/logistic/esp_nn_logistic_ansi.c
  function esp_nn_get_logistic_s8_scratch_size_ansi (line 21) | int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void)
  function esp_nn_logistic_s8_prepare_ansi (line 26) | void esp_nn_logistic_s8_prepare_ansi(int8_t *lut,
  function esp_nn_logistic_s8_ansi (line 51) | void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output,

FILE: src/pooling/esp_nn_avg_pool_ansi.c
  function esp_nn_avg_pool_s8_ansi (line 19) | void esp_nn_avg_pool_s8_ansi(const int8_t *input,

FILE: src/pooling/esp_nn_avg_pool_s8_esp32p4.c
  function esp_nn_avg_pool_s8_esp32p4 (line 18) | void esp_nn_avg_pool_s8_esp32p4(const int8_t *input,

FILE: src/pooling/esp_nn_avg_pool_s8_esp32s3.c
  function esp_nn_avg_pool_s8_esp32s3 (line 34) | void esp_nn_avg_pool_s8_esp32s3(const int8_t *input,

FILE: src/pooling/esp_nn_max_pool_ansi.c
  function esp_nn_max_pool_s8_ansi (line 19) | void esp_nn_max_pool_s8_ansi(const int8_t *input,

FILE: src/pooling/esp_nn_max_pool_s8_esp32p4.c
  function esp_nn_max_pool_s8_esp32p4 (line 16) | void esp_nn_max_pool_s8_esp32p4(const int8_t *input,

FILE: src/softmax/esp_nn_softmax_ansi.c
  function esp_nn_get_softmax_scratch_size_ansi (line 17) | int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const ...
  function esp_nn_set_softmax_scratch_buf_ansi (line 24) | void esp_nn_set_softmax_scratch_buf_ansi(void *buffer)
  function esp_nn_softmax_s8_ansi (line 30) | void esp_nn_softmax_s8_ansi(const int8_t *input_data,

FILE: src/softmax/esp_nn_softmax_opt.c
  function esp_nn_get_softmax_scratch_size_opt (line 29) | int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const i...
  function esp_nn_set_softmax_scratch_buf_opt (line 41) | void esp_nn_set_softmax_scratch_buf_opt(void *buffer)
  function esp_nn_softmax_s8_opt (line 46) | void esp_nn_softmax_s8_opt(const int8_t *input_data,

FILE: src/softmax/esp_nn_softmax_s8_esp32p4.c
  function esp_nn_get_softmax_scratch_size_esp32p4 (line 13) | int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, con...
  function esp_nn_set_softmax_scratch_buf_esp32p4 (line 19) | void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer)
  function esp_nn_softmax_s8_esp32p4 (line 36) | void esp_nn_softmax_s8_esp32p4(const int8_t *input_data,

FILE: src/softmax/esp_nn_softmax_s8_esp32s3.c
  function esp_nn_get_softmax_scratch_size_esp32s3 (line 16) | int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, con...
  function esp_nn_set_softmax_scratch_buf_esp32s3 (line 22) | void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer)
  function find_max_s8 (line 28) | static inline int8_t find_max_s8(const int8_t *data, int32_t len)
  function esp_nn_softmax_s8_esp32s3 (line 74) | void esp_nn_softmax_s8_esp32s3(const int8_t *input_data,

FILE: src/softmax/softmax_common.h
  function __NN_FORCE_INLINE__ (line 24) | __NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp)
  function __NN_FORCE_INLINE__ (line 44) | __NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int3...
  function __NN_FORCE_INLINE__ (line 66) | __NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val)

FILE: test_app/main/main.c
  function profile_c_start (line 30) | void profile_c_start()
  function profile_c_end (line 36) | uint32_t profile_c_end()
  function profile_opt_start (line 43) | void profile_opt_start()
  function profile_opt_end (line 49) | uint32_t profile_opt_end()
  function print_profile (line 56) | static void print_profile(const char *kernel)
  function app_main (line 63) | void app_main()

FILE: tests/src/basic_math_test.c
  function esp_nn_add_elementwise_s8_test (line 52) | void esp_nn_add_elementwise_s8_test()
  function esp_nn_mul_elementwise_s8_test (line 243) | void esp_nn_mul_elementwise_s8_test()
  function esp_nn_mul_broadcast_channel_s8_test (line 381) | void esp_nn_mul_broadcast_channel_s8_test()

FILE: tests/src/convolution_test.c
  function esp_nn_depthwise_conv_s8_test (line 17) | void esp_nn_depthwise_conv_s8_test()
  function esp_nn_conv_s8_test (line 340) | void esp_nn_conv_s8_test()

FILE: tests/src/fully_connected_test.c
  function esp_nn_fully_connected_s8_test (line 17) | void esp_nn_fully_connected_s8_test()
  function esp_nn_fully_connected_per_ch_s8_test (line 160) | void esp_nn_fully_connected_per_ch_s8_test()

FILE: tests/src/hard_swish_test.c
  function esp_nn_hard_swish_s8_test (line 16) | void esp_nn_hard_swish_s8_test()

FILE: tests/src/mean_test.c
  function esp_nn_mean_nhwc_s8_test (line 17) | void esp_nn_mean_nhwc_s8_test()

FILE: tests/src/pooling_test.c
  function run_avg_pool_test (line 17) | static void run_avg_pool_test(uint16_t input_wd, uint16_t input_ht, uint...
  function esp_nn_avg_pool_s8_test (line 76) | void esp_nn_avg_pool_s8_test()
  function run_max_pool_test (line 100) | static void run_max_pool_test(uint16_t input_wd, uint16_t input_ht, uint...
  function esp_nn_max_pool_s8_test (line 159) | void esp_nn_max_pool_s8_test()

FILE: tests/src/relu_test.c
  function run_relu6_test (line 16) | static void run_relu6_test(int size, int iter)
  function esp_nn_relu6_s8_test (line 59) | void esp_nn_relu6_s8_test()

FILE: tests/src/softmax_test.c
  function run_softmax_test (line 17) | static void run_softmax_test(int32_t height, int32_t width, int32_t mult,
  function esp_nn_softmax_s8_test (line 80) | void esp_nn_softmax_s8_test()
Condensed preview — 99 files, each entry showing the file path, its character count, and a short content snippet. Download the .json file or copy it to your clipboard to get the full structured content (851K chars).
[
  {
    "path": ".github/workflows/upload_component.yml",
    "chars": 428,
    "preview": "name: Push esp-nn to IDF Component Registry\n\non:\n  push:\n    branches:\n      - master\n\njobs:\n  upload_components:\n    ru"
  },
  {
    "path": ".gitignore",
    "chars": 654,
    "preview": ".config\n*.o\n*.i\n*.s\n*.orig\n*.pyc\n\n# gtags\nGTAGS\nGRTAGS\nGPATH\n\n# emacs\n.dir-locals.el\n\n# emacs temp file suffixes\n*~\n.#*\n"
  },
  {
    "path": ".gitlab-ci.yml",
    "chars": 2156,
    "preview": "stages:\n  - build\n\n# Avoid running duplicate pipeline\nworkflow:\n  rules:\n    - if: '$CI_PIPELINE_SOURCE == \"merge_reques"
  },
  {
    "path": "CMakeLists.txt",
    "chars": 4015,
    "preview": "cmake_minimum_required(VERSION 3.5)\n\nset(c_srcs\n    \"src/activation_functions/esp_nn_relu_ansi.c\"\n    \"src/activation_fu"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 1701,
    "preview": "# Contributing\n\nContributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welc"
  },
  {
    "path": "Kconfig.projbuild",
    "chars": 1320,
    "preview": "menu \"ESP-NN\"\n\nchoice NN_OPTIMIZATIONS\n   bool \"Optimization for nn functions\"\n   default NN_OPTIMIZED\n   help\n      Use"
  },
  {
    "path": "LICENSE",
    "chars": 11358,
    "preview": "\n                                 Apache License\n                           Version 2.0, January 2004\n                  "
  },
  {
    "path": "README.md",
    "chars": 6247,
    "preview": "# ESP-NN\n\nThe library contains optimised NN (Neural Network) functions for various Espressif chips.\n\n* Supported platfor"
  },
  {
    "path": "idf_component.yml",
    "chars": 321,
    "preview": "version: \"1.2.3\"\ndescription: Optimized NN (Neural Network) functions for Espressif chips\nurl: https://github.com/espres"
  },
  {
    "path": "include/esp_nn.h",
    "chars": 1345,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "include/esp_nn_ansi_c.h",
    "chars": 1877,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "include/esp_nn_ansi_headers.h",
    "chars": 17533,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "include/esp_nn_defs.h",
    "chars": 1950,
    "preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
  },
  {
    "path": "include/esp_nn_esp32p4.h",
    "chars": 11013,
    "preview": "/*\n * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "include/esp_nn_esp32s3.h",
    "chars": 14349,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "include/esp_nn_generic_opt.h",
    "chars": 1903,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "src/activation_functions/esp_nn_hard_swish_ansi.c",
    "chars": 3279,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c",
    "chars": 6113,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c",
    "chars": 3185,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/activation_functions/esp_nn_relu_ansi.c",
    "chars": 891,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/activation_functions/esp_nn_relu_s8_esp32p4.c",
    "chars": 1894,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/activation_functions/esp_nn_relu_s8_esp32s3.S",
    "chars": 3677,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/basic_math/esp_nn_add_ansi.c",
    "chars": 4214,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/basic_math/esp_nn_add_s8_esp32p4.c",
    "chars": 3669,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/basic_math/esp_nn_add_s8_esp32s3.S",
    "chars": 21647,
    "preview": "// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/basic_math/esp_nn_mul_ansi.c",
    "chars": 3168,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S",
    "chars": 11279,
    "preview": "// Copyright 2026 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
  },
  {
    "path": "src/basic_math/esp_nn_mul_s8_esp32p4.c",
    "chars": 3187,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/basic_math/esp_nn_mul_s8_esp32s3.S",
    "chars": 13293,
    "preview": "// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/common/common_functions.h",
    "chars": 10801,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "src/common/esp_nn_common_functions_esp32s3.S",
    "chars": 8793,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/common/esp_nn_dot_s8_esp32s3.S",
    "chars": 4359,
    "preview": "//\n// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n//\n"
  },
  {
    "path": "src/common/esp_nn_mean_ansi.c",
    "chars": 1591,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/common/esp_nn_mean_s8_esp32p4.c",
    "chars": 3743,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/common/esp_nn_mean_s8_esp32s3.c",
    "chars": 3328,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S",
    "chars": 2980,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S",
    "chars": 4069,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S",
    "chars": 6841,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_conv_ansi.c",
    "chars": 8851,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_conv_esp32p4.c",
    "chars": 45228,
    "preview": "/*\n * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "src/convolution/esp_nn_conv_esp32s3.c",
    "chars": 27082,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "src/convolution/esp_nn_conv_opt.c",
    "chars": 8540,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S",
    "chars": 15381,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S",
    "chars": 22050,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_1x1_esp32s3.c",
    "chars": 11187,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c",
    "chars": 6809,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S",
    "chars": 7386,
    "preview": "//\n// SPDX-FileCopyrightText: 2023-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S",
    "chars": 21309,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_ansi.c",
    "chars": 4881,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_esp32p4.c",
    "chars": 13248,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_opt.c",
    "chars": 14879,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S",
    "chars": 18148,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S",
    "chars": 16628,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S",
    "chars": 14717,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S",
    "chars": 18952,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S",
    "chars": 21304,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S",
    "chars": 19951,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c",
    "chars": 48323,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S",
    "chars": 23834,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S",
    "chars": 2546,
    "preview": "//\n// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n//\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_ansi.c",
    "chars": 3799,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_esp32s3.c",
    "chars": 8025,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S",
    "chars": 8441,
    "preview": "//\n// SPDX-FileCopyrightText: 2025-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c",
    "chars": 8680,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S",
    "chars": 8101,
    "preview": "//\n// SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
  },
  {
    "path": "src/logistic/esp_nn_logistic_ansi.c",
    "chars": 1951,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_ansi.c",
    "chars": 3258,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_s8_esp32p4.c",
    "chars": 6113,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_s8_esp32s3.S",
    "chars": 29214,
    "preview": "//\n// SPDX-FileCopyrightText: 2021-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_s8_esp32s3.c",
    "chars": 4084,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/pooling/esp_nn_max_pool_ansi.c",
    "chars": 2975,
    "preview": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/pooling/esp_nn_max_pool_s8_esp32p4.c",
    "chars": 5822,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/pooling/esp_nn_max_pool_s8_esp32s3.S",
    "chars": 19966,
    "preview": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"L"
  },
  {
    "path": "src/softmax/esp_nn_softmax_ansi.c",
    "chars": 3421,
    "preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
  },
  {
    "path": "src/softmax/esp_nn_softmax_opt.c",
    "chars": 3909,
    "preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
  },
  {
    "path": "src/softmax/esp_nn_softmax_s8_esp32p4.c",
    "chars": 4729,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "src/softmax/esp_nn_softmax_s8_esp32s3.c",
    "chars": 5249,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*"
  },
  {
    "path": "src/softmax/softmax_common.h",
    "chars": 4287,
    "preview": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"Licens"
  },
  {
    "path": "test_app/CMakeLists.txt",
    "chars": 319,
    "preview": "# The following lines of boilerplate have to be in your project's\n# CMakeLists in this exact order for cmake to work cor"
  },
  {
    "path": "test_app/Makefile",
    "chars": 384,
    "preview": "#\n# This is a project Makefile. It is assumed the directory this Makefile resides in is a\n# project subdirectory.\n#\n\nPRO"
  },
  {
    "path": "test_app/main/CMakeLists.txt",
    "chars": 132,
    "preview": "\nset(COMPONENT_SRCS \"main.c\")\nset(COMPONENT_ADD_INCLUDEDIRS \"\")\n\nset(COMPONENT_PRIV_REQUIRES tests esp_timer)\n\nregister_"
  },
  {
    "path": "test_app/main/component.mk",
    "chars": 316,
    "preview": "#\n# Main component makefile.\n#\n# This Makefile can be left empty. By default, it will take the sources in the \n# src/ di"
  },
  {
    "path": "test_app/main/main.c",
    "chars": 2668,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "test_app/sdkconfig.defaults",
    "chars": 36,
    "preview": "\n#\n# esp-nn\n#\nCONFIG_NN_OPTIMIZED=y\n"
  },
  {
    "path": "test_app/sdkconfig.defaults.esp32p4",
    "chars": 659,
    "preview": "# Enables high speed SPIRAM and other options\nCONFIG_IDF_EXPERIMENTAL_FEATURES=y\n\n#\n# ESP System Settings\n#\nCONFIG_ESP_D"
  },
  {
    "path": "test_app/sdkconfig.defaults.esp32s3",
    "chars": 225,
    "preview": "# Default configurations for ESP32-S3\n\nCONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y\n# CONFIG_ESP32S3_SPIRAM_SUPPORT is not set\n"
  },
  {
    "path": "tests/CMakeLists.txt",
    "chars": 531,
    "preview": "\nset(COMPONENT_ADD_INCLUDEDIRS ./include/)\nset(COMPONENT_SRCS \"src/basic_math_test.c\"\n                   \"src/convolutio"
  },
  {
    "path": "tests/README.md",
    "chars": 129,
    "preview": "# Tests for esp_nn library\n\n- Include these in your test framework and run the framework.\n- For IDF test please refer `t"
  },
  {
    "path": "tests/component.mk",
    "chars": 74,
    "preview": "#FIXME\n\nCOMPONENT_ADD_INCLUDEDIRS := include/\n\nCOMPONENT_SRCDIRS :=  src/\n"
  },
  {
    "path": "tests/include/test_functions.h",
    "chars": 1060,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/include/test_utils.h",
    "chars": 3813,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/src/basic_math_test.c",
    "chars": 21293,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/src/convolution_test.c",
    "chars": 31742,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/src/fully_connected_test.c",
    "chars": 9977,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/src/hard_swish_test.c",
    "chars": 3297,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "tests/src/mean_test.c",
    "chars": 2813,
    "preview": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#i"
  },
  {
    "path": "tests/src/pooling_test.c",
    "chars": 7725,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/src/relu_test.c",
    "chars": 2403,
    "preview": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  },
  {
    "path": "tests/src/softmax_test.c",
    "chars": 4120,
    "preview": "/*\n * SPDX-FileCopyrightText: 2022-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n *"
  }
]

About this extraction

This page contains the full source code of the espressif/esp-nn GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 99 files (801.9 KB), approximately 247.9k tokens, and a symbol index with 143 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — a free GitHub-repository-to-text converter for AI. Built by Nikandr Surkov.

Copied to clipboard!