Repository: espressif/esp-nn Branch: master Commit: d45b843ca5f8 Files: 99 Total size: 801.9 KB Directory structure: gitextract__zjpraf8/ ├── .github/ │ └── workflows/ │ └── upload_component.yml ├── .gitignore ├── .gitlab-ci.yml ├── CMakeLists.txt ├── CONTRIBUTING.md ├── Kconfig.projbuild ├── LICENSE ├── README.md ├── idf_component.yml ├── include/ │ ├── esp_nn.h │ ├── esp_nn_ansi_c.h │ ├── esp_nn_ansi_headers.h │ ├── esp_nn_defs.h │ ├── esp_nn_esp32p4.h │ ├── esp_nn_esp32s3.h │ └── esp_nn_generic_opt.h ├── src/ │ ├── activation_functions/ │ │ ├── esp_nn_hard_swish_ansi.c │ │ ├── esp_nn_hard_swish_s8_esp32p4.c │ │ ├── esp_nn_hard_swish_s8_esp32s3.c │ │ ├── esp_nn_relu_ansi.c │ │ ├── esp_nn_relu_s8_esp32p4.c │ │ └── esp_nn_relu_s8_esp32s3.S │ ├── basic_math/ │ │ ├── esp_nn_add_ansi.c │ │ ├── esp_nn_add_s8_esp32p4.c │ │ ├── esp_nn_add_s8_esp32s3.S │ │ ├── esp_nn_mul_ansi.c │ │ ├── esp_nn_mul_broadcast_s8_esp32s3.S │ │ ├── esp_nn_mul_s8_esp32p4.c │ │ └── esp_nn_mul_s8_esp32s3.S │ ├── common/ │ │ ├── common_functions.h │ │ ├── esp_nn_common_functions_esp32s3.S │ │ ├── esp_nn_dot_s8_esp32s3.S │ │ ├── esp_nn_mean_ansi.c │ │ ├── esp_nn_mean_s8_esp32p4.c │ │ ├── esp_nn_mean_s8_esp32s3.c │ │ ├── esp_nn_multiply_by_quantized_mult_esp32p4.S │ │ ├── esp_nn_multiply_by_quantized_mult_esp32s3.S │ │ └── esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S │ ├── convolution/ │ │ ├── esp_nn_conv_ansi.c │ │ ├── esp_nn_conv_esp32p4.c │ │ ├── esp_nn_conv_esp32s3.c │ │ ├── esp_nn_conv_opt.c │ │ ├── esp_nn_conv_s16_mult4_1x1_esp32s3.S │ │ ├── esp_nn_conv_s16_mult8_esp32s3.S │ │ ├── esp_nn_conv_s8_1x1_esp32s3.c │ │ ├── esp_nn_conv_s8_3x3_opt_esp32s3.c │ │ ├── esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S │ │ ├── esp_nn_conv_s8_mult8_1x1_esp32s3.S │ │ ├── esp_nn_depthwise_conv_ansi.c │ │ ├── esp_nn_depthwise_conv_esp32p4.c │ │ ├── esp_nn_depthwise_conv_opt.c │ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S │ │ ├── 
esp_nn_depthwise_conv_s16_mult1_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult4_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s16_mult8_esp32s3.S │ │ ├── esp_nn_depthwise_conv_s8_esp32s3.c │ │ └── esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S │ ├── fully_connected/ │ │ ├── esp_nn_fc_s8_mac16_esp32s3.S │ │ ├── esp_nn_fully_connected_ansi.c │ │ ├── esp_nn_fully_connected_esp32s3.c │ │ ├── esp_nn_fully_connected_per_ch_s8_esp32s3.S │ │ ├── esp_nn_fully_connected_s8_esp32p4.c │ │ └── esp_nn_fully_connected_s8_esp32s3.S │ ├── logistic/ │ │ └── esp_nn_logistic_ansi.c │ ├── pooling/ │ │ ├── esp_nn_avg_pool_ansi.c │ │ ├── esp_nn_avg_pool_s8_esp32p4.c │ │ ├── esp_nn_avg_pool_s8_esp32s3.S │ │ ├── esp_nn_avg_pool_s8_esp32s3.c │ │ ├── esp_nn_max_pool_ansi.c │ │ ├── esp_nn_max_pool_s8_esp32p4.c │ │ └── esp_nn_max_pool_s8_esp32s3.S │ └── softmax/ │ ├── esp_nn_softmax_ansi.c │ ├── esp_nn_softmax_opt.c │ ├── esp_nn_softmax_s8_esp32p4.c │ ├── esp_nn_softmax_s8_esp32s3.c │ └── softmax_common.h ├── test_app/ │ ├── CMakeLists.txt │ ├── Makefile │ ├── main/ │ │ ├── CMakeLists.txt │ │ ├── component.mk │ │ └── main.c │ ├── sdkconfig.defaults │ ├── sdkconfig.defaults.esp32p4 │ └── sdkconfig.defaults.esp32s3 └── tests/ ├── CMakeLists.txt ├── README.md ├── component.mk ├── include/ │ ├── test_functions.h │ └── test_utils.h └── src/ ├── basic_math_test.c ├── convolution_test.c ├── fully_connected_test.c ├── hard_swish_test.c ├── mean_test.c ├── pooling_test.c ├── relu_test.c └── softmax_test.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/upload_component.yml ================================================ name: Push esp-nn to IDF Component Registry on: push: branches: - master jobs: upload_components: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Upload esp-nn to IDF Component 
Registry uses: espressif/upload-components-ci-action@v1 with: namespace: "espressif" name: "esp-nn" api_token: ${{ secrets.IDF_COMPONENT_API_TOKEN }} ================================================ FILE: .gitignore ================================================ .config *.o *.i *.s *.orig *.pyc # gtags GTAGS GRTAGS GPATH # emacs .dir-locals.el # emacs temp file suffixes *~ .#* \#*# # eclipse setting .settings # MacOS directory files .DS_Store # Example project files examples/**/sdkconfig examples/**/sdkconfig.old examples/**/build # Test app files test_app/build test_app/sdkconfig test_app/sdkconfig.old # Doc build artifacts docs/_build/ docs/doxygen-warning-log.txt docs/sphinx-warning-log.txt docs/sphinx-warning-log-sanitized.txt docs/xml/ docs/xml_in/ docs/man/ docs/doxygen_sqlite3.db TEST_LOGS # gcov coverage reports *.gcda *.gcno coverage.info coverage_report/ # VS Code Settings .vscode/ ================================================ FILE: .gitlab-ci.yml ================================================ stages: - build # Avoid running duplicate pipeline workflow: rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH' variables: GIT_STRATEGY: fetch GIT_SUBMODULE_STRATEGY: recursive before_script: - mkdir -p ~/.ssh - chmod 700 ~/.ssh - echo -n $GITLAB_KEY_TMP > ~/.ssh/id_rsa_base64 - base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa - chmod 600 ~/.ssh/id_rsa - echo -e "Host gitlab.espressif.cn\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config - | if [ -n "$IDF_COMPONENT_MGR_VER" ]; then pip install idf-component-manager==$IDF_COMPONENT_MGR_VER fi .test_build: &test_build # Build examples - for TARGET in $EXAMPLE_TARGETS; do - idf.py set-target $TARGET build - done .build_template: stage: build image: espressif/idf:latest tags: - build variables: PEDANTIC_FLAGS: "-Werror -Wno-error=cpp -Werror=unused-variable -Werror=unused-but-set-variable -Werror=unused-function" EXTRA_CFLAGS: 
"${PEDANTIC_FLAGS}" EXTRA_CXXFLAGS: "${PEDANTIC_FLAGS}" rules: - if: '$CI_PIPELINE_SOURCE == "schedule"' when: never - when: always script: - cd ${CI_PROJECT_DIR}/test_app # build examples - *test_build - cd ${CI_PROJECT_DIR} build_idf_v5.5: extends: .build_template image: espressif/idf:release-v5.5 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 esp32p4 build_idf_v5.2: extends: .build_template image: espressif/idf:release-v5.2 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 build_idf_v5.0: extends: .build_template image: espressif/idf:release-v5.0 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 build_idf_v4.4: extends: .build_template image: espressif/idf:release-v4.4 variables: EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 IDF_COMPONENT_MGR_VER: "1.2.0" build_idf_v4.3: extends: .build_template image: espressif/idf:release-v4.3 variables: EXAMPLE_TARGETS: esp32 build_idf_v4.2: extends: .build_template image: espressif/idf:release-v4.2 variables: EXAMPLE_TARGETS: esp32 ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.5) set(c_srcs "src/activation_functions/esp_nn_relu_ansi.c" "src/activation_functions/esp_nn_hard_swish_ansi.c" "src/common/esp_nn_mean_ansi.c" "src/basic_math/esp_nn_add_ansi.c" "src/basic_math/esp_nn_mul_ansi.c" "src/convolution/esp_nn_conv_ansi.c" "src/convolution/esp_nn_conv_opt.c" "src/convolution/esp_nn_depthwise_conv_ansi.c" "src/convolution/esp_nn_depthwise_conv_opt.c" "src/fully_connected/esp_nn_fully_connected_ansi.c" "src/softmax/esp_nn_softmax_ansi.c" "src/softmax/esp_nn_softmax_opt.c" "src/logistic/esp_nn_logistic_ansi.c" "src/pooling/esp_nn_avg_pool_ansi.c" "src/pooling/esp_nn_max_pool_ansi.c") if(CONFIG_IDF_TARGET_ESP32S3) set(s3_srcs "src/common/esp_nn_common_functions_esp32s3.S" "src/common/esp_nn_dot_s8_esp32s3.S" "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S" "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S" 
"src/activation_functions/esp_nn_relu_s8_esp32s3.S" "src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c" "src/common/esp_nn_mean_s8_esp32s3.c" "src/basic_math/esp_nn_add_s8_esp32s3.S" "src/basic_math/esp_nn_mul_s8_esp32s3.S" "src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S" "src/convolution/esp_nn_conv_esp32s3.c" "src/convolution/esp_nn_conv_s8_1x1_esp32s3.c" "src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c" "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c" "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S" "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S" "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S" "src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S" "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S" "src/fully_connected/esp_nn_fully_connected_esp32s3.c" "src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S" "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S" "src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S" "src/pooling/esp_nn_max_pool_s8_esp32s3.S" "src/pooling/esp_nn_avg_pool_s8_esp32s3.c" "src/pooling/esp_nn_avg_pool_s8_esp32s3.S" "src/softmax/esp_nn_softmax_s8_esp32s3.c") endif() if(CONFIG_IDF_TARGET_ESP32P4) set(p4_srcs "src/common/esp_nn_mean_s8_esp32p4.c" "src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S" "src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c" "src/activation_functions/esp_nn_relu_s8_esp32p4.c" "src/basic_math/esp_nn_add_s8_esp32p4.c" "src/basic_math/esp_nn_mul_s8_esp32p4.c" "src/convolution/esp_nn_conv_esp32p4.c" "src/convolution/esp_nn_depthwise_conv_esp32p4.c" 
"src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c" "src/pooling/esp_nn_avg_pool_s8_esp32p4.c" "src/pooling/esp_nn_max_pool_s8_esp32p4.c" "src/softmax/esp_nn_softmax_s8_esp32p4.c") endif() idf_component_register(SRCS "${c_srcs}" "${s3_srcs}" "${p4_srcs}" INCLUDE_DIRS "include" "src/common") if(CONFIG_IDF_TARGET_ESP32S3) target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function) else() target_compile_options(${COMPONENT_LIB} PRIVATE -O2 -Wno-unused-function) endif() if(CONFIG_NN_SKIP_NUDGE) target_compile_definitions(${COMPONENT_LIB} PRIVATE SKIP_NUDGE) endif() ================================================ FILE: CONTRIBUTING.md ================================================ # Contributing Contributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welcome! This document covers various topics related to contributions to the ESP-NN projects. Please read it if you plan to submit a PR! ## CLA We require accepting the contributor's license agreement for all pull requests. When opening a pull request the first time you will be prompted to sign the CLA by the [CLA Assistant](https://cla-assistant.io/) service. ## Large-scale Changes If you'd like to propose a change to the existing APIs or a large-scale refactoring of the implementation, we recommend opening an issue first to discuss this. ## Updating the Benchmarks Table The benchmarks table in [README.md](README.md) contains benchmarks for ESP32-S3. The benchmarks are collected by running the app in [test_app](test_app/) directory. Please update this table if you have changed the implementations of some of the functions or added the new ones. ## Releasing a new version Maintainers should follow the steps below to release a new version of ESP-NN component. Assuming the new version is `vX.Y.Z`: 1. Ensure you are on the latest `master` branch: ```bash git checkout master git pull --ff-only origin master ``` 1. 
Create the new tag: ```bash git tag -s -a -m "vX.Y.Z" vX.Y.Z ``` 1. Push the tag and the branch to the internal repository: ```bash git push origin vX.Y.Z ``` 1. CI will automatically push the tag to Github and will upload the new version to the IDF Component Registry. 1. Go to https://github.com/espressif/esp-nn/releases and create a release from the tag vX.Y.Z. 1. Write the release notes and publish the release. ================================================ FILE: Kconfig.projbuild ================================================ menu "ESP-NN" choice NN_OPTIMIZATIONS bool "Optimization for nn functions" default NN_OPTIMIZED help Use ANSI-C versions for verification and debug purpose. Optimisations are automatically picked up for a chipset. For ESP32-S3, assembly optimisations are selected. For other platforms(viz., ESP32, ESP32-C3), generic optimisations are used. config NN_ANSI_C bool "ANSI C" help ANSI C versions for verification and debug purposes. config NN_OPTIMIZED bool "Optimized versions" help Optimisations are automatically picked up for a chipset. For ESP32-S3, assembly optimisations are selected. For other platforms(viz., ESP32, ESP32-C3), generic optimisations are used. endchoice config NN_OPTIMIZATIONS int default 0 if NN_ANSI_C default 1 if NN_OPTIMIZED config NN_SKIP_NUDGE bool "Use fast (non-bit-exact) requantization" depends on NN_OPTIMIZED default n help When enabled, kernels use a faster requantize path that may differ from the TFLite reference by +/-1 LSB at half-shift boundaries. On ESP32-S3, this also skips the nudge addition in the assembly requantize for ~20% speedup. Leave disabled for bit-exact behavior (recommended for tests and for matching reference outputs). endmenu ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. 
"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # ESP-NN The library contains optimised NN (Neural Network) functions for various Espressif chips. * Supported platforms: * TensorFlow Lite Micro (TFLite Micro). 
Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples) * Supported ESP chips include: * ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3) * ESP32-P4 (Optimised using PIE/QACC SIMD instructions) * ESP32 (Generic optimisations) * ESP32-C3 (Generic optimisations) ## Performance ### Kernelwise performance for s8 versions: * Kernelwise performance on ESP32-P4 chip * Numbers are ticks taken for kernel to execute * Chip config: 360MHz, SPI-RAM: HEX 200MHz, L2-Cache: 128KB | Function | ANSI C | Optimized | Opt Ratio | Data info | Memory | | ----------------| --------|---------|---------|-------------|-----------| | elementwise_add | 190786 | 88451 | 2.16 | size = 1615 | External | | elementwise_mul | 76585 | 47601 | 1.60 | size = 1615 | External | | convolution | 4005512 | 572459 | 7.00 | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External | | convolution | 249700 | 71104 | 3.51 | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External | | convolution | 816975 | 533318 | 1.53 | input(10,10), filter(64x3x3x3), pad(0,0), stride(1,1) | External | | depthwise conv | 962834 | 482389 | 2.00 | input (16, 16), pad(0,0), stride(1,1) filter: 1x3x3x16 | External | | depthwise conv | 1365066 | 703989 | 1.94 | input (12, 12), pad(1,1), stride(1,1) filter: 8x5x5x4 | External | | max pool | 482184 | 24178 | 19.94 | input(16,16), filter (1x3x3x16) | Internal | | avg pool | 303210 | 84401 | 3.59 | input(16,16), filter (1x3x3x16) | Internal | | fully connected | 7650 | 915 | 8.36 | len: 271, ch = 3 | Internal | | prelu (relu6) | 1195 | 154 | 7.76 | size, 1615 | Internal | | softmax | 14260 | 8587 | 1.66 | width: 256 | Internal | | hard_swish | 703970 | 516582 | 1.36 | size: 12544 | External | | mean | 10113 | 4686 | 2.16 | 7x7x16 | Internal | * Kernelwise performance on ESP32-S3 chip * Numbers are ticks taken for kernel to execute * Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB | Function | ANSI C | 
Optimized | Opt Ratio | Data info | Memory | | ----------------| ---------|-----------|-----------|-------------|-----------| | elementwise_add | 281337 | 74440 | 3.78 | size = 1615 | External | | elementwise_mul | 122703 | 35002 | 3.51 | size = 1615 | External | | convolution | 4712500 | 331008 | 14.24 | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External | | convolution | 312754 | 39022 | 8.01 | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External | | convolution | 2193289 | 394842 | 5.55 | input(8,8), filter(64x3x3x3), pad(0,0), stride(1,1) | External | | depthwise conv | 1159831 | 184176 | 6.30 | input(18,18), pad(0,0), stride(1,1), filter: 1x3x3x16 | External | | depthwise conv | 1671363 | 372435 | 4.49 | input(12,12), pad(1,1), stride(1,1), filter: 8x5x5x4 | External | | max pool | 376294 | 48069 | 7.83 | input(16,16), filter(1x3x3x16) | Internal | | avg pool | 427293 | 118052 | 3.62 | input(16,16), filter(1x3x3x16) | Internal | | fully connected | 8443 | 1078 | 7.83 | len: 271, ch = 3 | Internal | | softmax | 15209 | 11107 | 1.37 | h: 8, w: 32 | Internal | | prelu (relu6) | 1125 | 98 | 11.48 | size: 1615 | Internal | ### Model-level performance: * **Person Detection** (Visual Wake Words, INT8 quantized — from [esp-tflite-micro](https://github.com/espressif/esp-tflite-micro)) * Numbers are time (ms) for `invoke()` call, using internal memory | Chip | CPU Freq | without ESP-NN | with ESP-NN | | -------- | -------- | -------------- | ----------- | | ESP32-P4 | 360MHz | 1395ms | 73ms | | ESP32-S3 | 240MHz | 2300ms | 54ms | | ESP32 | 240MHz | 4084ms | 380ms | | ESP32-C3 | 160MHz | 3355ms | 426ms | * **MobileNetV3 Small** (INT8 quantized, 224x224x3, 1000 classes) | Chip | CPU Freq | without ESP-NN | with ESP-NN | | -------- | -------- | -------------- | ----------- | | ESP32-S3 | 240MHz | 26000ms | 1434ms | | ESP32-P4 | 360MHz | 11600ms | 1050ms | > **Note**: - The above is time taken for execution of the `invoke()` call - SPIRAM used for 
TensorArena. - Person detection on ESP32-S3 with internal RAM: 47ms - ESP32-P4 optimisation is work in progress - `Without ESP-NN` case is when `esp-nn` is completely disabled by removing below flag from [CMakeLists.txt](CMakeLists.txt): ```cmake # enable ESP-NN optimizations by Espressif target_compile_options(${COMPONENT_LIB} PRIVATE -DESP_NN) ``` ## Configuration * To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS` * There are two options presented: * Optimized versions * ANSI C * Default selection is for `Optimized versions`. For ESP32-S3 and ESP32-P4, assembly versions are automatically selected, whereas for other chips (viz., ESP32, ESP32-C3), generic optimisations are selected. * For debugging purposes, you may want to select `ANSI C` reference versions. ## Contributing If you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on the Github. For general questions related to this library, please use the esp32.com forum. Please check [CONTRIBUTING.md](CONTRIBUTING.md) for further information if you'd like to contribute to ESP-NN. ## Copyrights and License All original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE. 
================================================ FILE: idf_component.yml ================================================ version: "1.2.3" description: Optimized NN (Neural Network) functions for Espressif chips url: https://github.com/espressif/esp-nn repository: https://github.com/espressif/esp-nn.git issues: https://github.com/espressif/esp-nn/issues dependencies: idf: version: ">=4.2" files: exclude: - test_app - tests ================================================ FILE: include/esp_nn.h ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once #if defined(CONFIG_NN_OPTIMIZED) // select apt optimisations #ifdef CONFIG_IDF_TARGET_ESP32P4 #define ARCH_ESP32_P4 1 #endif #ifdef CONFIG_IDF_TARGET_ESP32S3 #define ARCH_ESP32_S3 1 #endif #ifdef CONFIG_IDF_TARGET_ESP32 #define ARCH_ESP32 1 #endif #endif #ifdef __cplusplus extern "C" { #endif /* reference kernels included by default */ #include "esp_nn_ansi_headers.h" #if defined(CONFIG_NN_OPTIMIZED) #if defined(ARCH_ESP32_P4) #include "esp_nn_esp32p4.h" #elif defined(ARCH_ESP32_S3) #include "esp_nn_esp32s3.h" #else // for other platforms use generic optimisations #include "esp_nn_generic_opt.h" #endif // #if defined(ARCH_ESP32_S3) #else #include "esp_nn_ansi_c.h" #endif #ifdef __cplusplus } #endif ================================================ FILE: include/esp_nn_ansi_c.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for ANSI C versions. * These are just typedefs to pick up ANSI versions. 
*/ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi #define esp_nn_conv_s8 esp_nn_conv_s8_ansi #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi #define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi #define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi #define esp_nn_get_hard_swish_scratch_size() 0 #define esp_nn_set_hard_swish_scratch_buf(buf) #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi #define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi #define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi #define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi #define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: include/esp_nn_ansi_headers.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #pragma once /** * @file Header definitions to include for esp_nn reference functions */ #include "esp_nn_defs.h" /************************** 
Basic math functions ****************************/ /** * @brief elementwise addition * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * shift values are expected to be <= 0 */ void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); /** * @brief elementwise multiplication * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * output shift is expected to be <= 0 */ void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); /** * @brief broadcast MUL for [H,W,C] * [1,1,C] pattern (SE-block) * * @note input2_per_ch has `channels` elements, broadcast to all spatial positions. * Uses fast requantization (constant nudge). 
*/ void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1, const int8_t *input2_per_ch, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t output_offset, const int32_t output_mult, const int32_t output_shift, const int32_t activation_min, const int32_t activation_max, const int32_t total_spatial, const int32_t channels); /************************** Convolution functions *****************************/ /** * @brief depthwise convolution per channel * * @note inputs type: int8_t, output: int8_t * Version used in tflite is per channel. * This version follows the same footprints. * Meaning, it has per out_channel shift and multiplier for * requantization * * optimization notes: Though input_offset is int32 type, * offset values are contained in 8 bits [-128, 127] */ void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); /** * @brief 2d-convolution channelwise * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_ansi(const void *buf); int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t 
*conv_params); void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf); /************************** Activation functions *****************************/ /** * @brief relu6 * * @note inout: int8_t */ void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size); /** * @brief hard_swish activation: y = x * relu6(x + 3) / 6 * * @note Quantized int8 fixed-point implementation */ void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); /** * @brief mean reduction over spatial dims (H,W) for NHWC int8 tensor * * @note Specialized for 4D [N,H,W,C] → [N,1,1,C] reduction. * Used by Squeeze-and-Excite in MobileNetV3. */ void esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift); /************************** Pooling functions *****************************/ /** * @brief max_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_max_pool_s8_ansi(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /** * @brief avg_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_avg_pool_s8_ansi(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t 
output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /************************** Fully connected functions ***********************/ /** * @brief fully connected * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief fully connected * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * out_mult, out_shift: int32_t* containing per-channel data */ void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t* out_shift, const int32_t* out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief Get scratch buffer size needed by softmax function * * @param width * @param height * @return size in bytes * * @note buffer must be 4 byte aligned */ int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height); /* ANSI C function to be hooked up when optimised version needed */ int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height); /** * @brief Set scratch buffer to be used by softmax function * * @param buffer this can be NULL if one needs to 
unset it * must be aligned to 4 bytes */ void esp_nn_set_softmax_scratch_buf_ansi(void *buffer); /** * @brief reference softmax function * * @note inputs type: int8_t, output: int8_t */ void esp_nn_softmax_s8_ansi(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); //////////////////////////// Generic optimisations ///////////////////////////// /************************** Convolution functions *****************************/ /** * @brief 2d-convolution channelwise optimized version * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data); /** * @brief depthwise convolution per channel optimized version * * @note inputs type: int8_t, output: int8_t * Version used in tflite is per channel. * This version follows the same footprints. 
* Meaning, it has per out_channel shift and multiplier for * requantization * * optimization notes: Though input_offset is int32 type, * offset values are contained in 8 bits [-128, 127] */ void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_opt(const void *buf); int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params); void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf); /* ANSI C function to be hooked up when optimised version needed */ void esp_nn_set_softmax_scratch_buf_opt(void *buffer); /** * @brief optimised version of softmax function * * @note the function uses extra buffer (4 * width bytes) * hence, scratch buffers must be set before calling this. */ void esp_nn_softmax_s8_opt(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); /** * @brief Get scratch buffer size for int8 logistic (sigmoid). * @return 256 (size of LUT in bytes) */ int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void); /** * @brief Prepare LUT for int8 logistic (sigmoid). * Call once during model preparation after scratch is allocated. * * @param scratch_buf Scratch buffer (256 bytes, from get_scratch_size) * @param input_zero_point Input quantization zero point * @param input_scale Input quantization scale (float) * * @note Output quantization is fixed: scale=1/256, zero_point=-128. 
*/ void esp_nn_logistic_s8_prepare_ansi(int8_t *scratch_buf, int32_t input_zero_point, float input_scale); /** * @brief Apply int8 logistic (sigmoid) using precomputed LUT. * * @param input Input int8 data * @param output Output int8 data * @param size Number of elements * @param scratch_buf 256-byte LUT from esp_nn_logistic_s8_prepare() */ void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output, int32_t size, const int8_t *scratch_buf); ================================================ FILE: include/esp_nn_defs.h ================================================ // Copyright 2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#pragma once #include <stdint.h> /** * @brief structure to club data dims * this structure can be used for input, output and filter */ typedef struct data_dims { int32_t width; int32_t height; int32_t channels; int32_t extra; // can be used as batch or any other param } data_dims_t; /** * @brief 2d data structure (width, height) * */ typedef struct data_2d { int32_t width; int32_t height; } data_2d_t; /** * @brief min/max activation */ typedef struct act_params { int32_t min; int32_t max; } act_params_t; /** * @brief per channel quant data * * @note number of shift and mult elements are equal to output channels */ typedef struct quant_data { int32_t *shift; int32_t *mult; } quant_data_t; /** * @brief params specific to convolution 2d * */ typedef struct conv_params { int32_t in_offset; int32_t out_offset; data_2d_t stride; data_2d_t padding; data_2d_t dilation; act_params_t activation; } conv_params_t; /** * @brief params specific to depthwise convolution 2d * */ typedef struct dw_conv_params { int32_t in_offset; int32_t out_offset; int32_t ch_mult; // channel multiplier. 
(in_ch * ch_mult = out_ch) data_2d_t stride; data_2d_t padding; data_2d_t dilation; act_params_t activation; } dw_conv_params_t; ================================================ FILE: include/esp_nn_esp32p4.h ================================================ /* * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for esp_nn optimized functions for * the ESP32-P4 platform */ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" /** * @brief 2d - convolution channelwise * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *output_data, const conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_esp32p4(const void *buf); /********************** function defines ***************************/ #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32p4 void 
esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32p4 void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params); void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf); #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32p4 #define esp_nn_conv_s8 esp_nn_conv_s8_esp32p4 #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32p4 #define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32p4 #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32p4 #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32p4 /* Functions not yet optimized for P4 - use ANSI fallback */ void esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32p4 #define esp_nn_get_hard_swish_scratch_size() 0 #define esp_nn_set_hard_swish_scratch_buf(buf) void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const 
int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift); #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32p4 void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size); #define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32p4 void esp_nn_avg_pool_s8_esp32p4(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32p4 void esp_nn_max_pool_s8_esp32p4(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32p4 void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max); #define 
esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32p4 #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32p4 int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height); void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer); void esp_nn_softmax_s8_esp32p4(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); #define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32p4 #define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32p4 #define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32p4 #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: include/esp_nn_esp32s3.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for esp_nn optimized functions for * the ESP32-S3 platform */ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" /************************** Basic math functions *****************************/ /** * @brief elementwise addition * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * shift values are expected to be <= 0 */ void esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t 
activation_min, const int32_t activation_max, const int32_t size); /** * @brief elementwise multiplication * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * output shift is expected to be <= 0 */ void esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size); /************************** Convolution functions *****************************/ /** * @brief depthwise convolution per channel * * @note inputs type: int8_t, output: int8_t * Version used in tflite is per channel. * This version follows the same footprints. * Meaning, it has per out_channel shift and multiplier for * requantization * * optimization notes: Though input_offset is int32 type, * offset values are contained in 8 bits [-128, 127] */ void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *output_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); /** * @brief 2d - convolution channelwise * * @note operation: result += (input + offset) * filter * * inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *output_data, const conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const 
conv_params_t *conv_params); void esp_nn_set_conv_scratch_buf_esp32s3(const void *buf); int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params); void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf); /************************** Pooling functions *****************************/ /** * @brief max_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_max_pool_s8_esp32s3(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /** * @brief avg_pool * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] */ void esp_nn_avg_pool_s8_esp32s3(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); /************************** Fully connected functions *****************************/ /** * @brief fully connected * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * * Current version works only on aligned input. * row_len and channels should both be multiple of 8. 
*/ void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief fully connected - per channel * * @note inputs type: int8_t, output: int8_t * input offsets: although int32_t, they are contained in 8 bits [-128, 127] * out_mult, out_shift: int32_t* containing per-channel data * * Current version works only on aligned input. * row_len and channels should both be multiple of 8. */ void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t* out_shift, const int32_t* out_mult, const int32_t activation_min, const int32_t activation_max); /** * @brief relu6 * * @note inout: int8_t */ void esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size); /********************** function defines ***************************/ #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3 #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3 void esp_nn_mul_broadcast_channel_s8_esp32s3(const int8_t *input1, const int8_t *input2_per_ch, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t output_offset, const int32_t output_mult, const int32_t output_shift, const int32_t activation_min, const int32_t activation_max, const int32_t total_spatial, const int32_t channels); #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_esp32s3 #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3 #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3 
#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3 #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3 #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3 #define esp_nn_conv_s8 esp_nn_conv_s8_esp32s3 #define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3 int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void); void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf); void esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); #define esp_nn_get_hard_swish_scratch_size esp_nn_get_hard_swish_scratch_size_esp32s3 #define esp_nn_set_hard_swish_scratch_buf esp_nn_set_hard_swish_scratch_buf_esp32s3 #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32s3 void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift); #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32s3 #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3 #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3 #define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3 #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32s3 int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height); void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer); void esp_nn_softmax_s8_esp32s3(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data); #define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32s3 #define 
esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32s3 #define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32s3 /* Logistic (sigmoid) — LUT-based, same impl for all targets */ #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: include/esp_nn_generic_opt.h ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * @file Header definitions to include for esp_nn generic optimisations * For functions which not having optimisations, _ansi versions are picked. */ #pragma once #include "esp_nn_defs.h" #include "esp_nn_ansi_headers.h" #define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi #define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi #define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi #define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_opt #define esp_nn_conv_s8 esp_nn_conv_s8_opt #define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_opt #define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_opt #define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_opt #define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_opt #define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi #define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi #define esp_nn_get_hard_swish_scratch_size() 0 #define esp_nn_set_hard_swish_scratch_buf(buf) #define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi #define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi #define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi #define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi #define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi #define 
esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt #define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt #define esp_nn_softmax_s8 esp_nn_softmax_s8_opt #define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi #define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi #define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi ================================================ FILE: src/activation_functions/esp_nn_hard_swish_ansi.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * HardSwish activation function: y = x * relu6(x + 3) / 6 * Quantized int8 implementation using fixed-point arithmetic. */ #include #include /* * Saturating left shift for int16 */ static inline int16_t sat_left_shift_s16(int16_t val, int shift) { int32_t result = (int32_t)val << shift; if (result > 32767) return 32767; if (result < -32768) return -32768; return (int16_t)result; } /* * SaturatingRoundingDoublingHighMul for int16: (a * b + (1<<14)) >> 15 */ static inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b) { if (a == b && a == -32768) return 32767; int32_t ab = (int32_t)a * (int32_t)b; return (int16_t)((ab + (1 << 14)) >> 15); } /* * SaturatingDoublingHighMul (NOT rounding): (a * b) >> 15 */ static inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b) { if (a == b && a == -32768) return 32767; return (int16_t)(((int32_t)a * (int32_t)b) / (1 << 15)); } /* * RoundingDivideByPOT for int16 */ static inline int16_t rounding_div_pot_s16(int16_t val, int exponent) { int32_t mask = (1 << exponent) - 1; int32_t remainder = val & mask; int32_t threshold = (mask >> 1) + (val < 0 ? 1 : 0); return (int16_t)((val >> exponent) + (remainder > threshold ? 
1 : 0)); } void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point) { for (int i = 0; i < size; i++) { const int16_t in_val = input[i] - input_zero_point; const int16_t in_hires = in_val * 128; /* << 7 */ /* Scale input to output scale */ const int16_t in_on_out_scale = sat_round_dbl_high_mul_s16(in_hires, output_mult_fxp); /* Compute reluish value: maps input from [-3,3] to [-1,1] */ int16_t reluish = in_hires; if (reluish_mult_exp > 0) { reluish = sat_left_shift_s16(reluish, reluish_mult_exp - 1); } reluish = sat_round_dbl_high_mul_s16(reluish, reluish_mult_fxp); if (reluish_mult_exp > 0) { reluish = sat_left_shift_s16(reluish, 1); } if (reluish_mult_exp < 0) { reluish = rounding_div_pot_s16(reluish, -reluish_mult_exp); } /* Convert from [-1,1] to [0,1] */ reluish = (reluish + (1 << 15)) >> 1; /* Multiply: output = reluish * input_on_output_scale */ const int16_t pre_out = sat_dbl_high_mul_s16(reluish, in_on_out_scale); /* Final shift and offset */ int16_t out_val = rounding_div_pot_s16(pre_out, -output_mult_exp); out_val += output_zero_point; if (out_val > 127) out_val = 127; if (out_val < -128) out_val = -128; output[i] = (int8_t)out_val; } } ================================================ FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * ESP32-P4 optimized HardSwish with: * 1. Branch hoisting (borrowed from S3): dispatch on reluish_mult_exp ONCE * 2. 2x loop unrolling for better ILP on RISC-V pipeline * 3. 
All int16 arithmetic - no 64-bit multiply bottleneck */ #include static inline __attribute__((always_inline)) int16_t sat_rnd_dbl_hi_mul(int16_t a, int16_t b) { if (__builtin_expect(a == b && a == -32768, 0)) return 32767; return (int16_t)(((int32_t)a * (int32_t)b + (1 << 14)) >> 15); } static inline __attribute__((always_inline)) int16_t sat_dbl_hi_mul(int16_t a, int16_t b) { if (__builtin_expect(a == b && a == -32768, 0)) return 32767; return (int16_t)(((int32_t)a * (int32_t)b) >> 15); } static inline __attribute__((always_inline)) int16_t sat_left_shift_s16(int32_t val) { if (val > 32767) return 32767; if (val < -32768) return -32768; return (int16_t)val; } static inline __attribute__((always_inline)) int16_t rounding_div_pot_s16(int16_t val, int exp) { int32_t mask = (1 << exp) - 1; int32_t remainder = val & mask; int32_t threshold = (mask >> 1) + (val < 0 ? 1 : 0); return (int16_t)((val >> exp) + (remainder > threshold ? 1 : 0)); } /* Core output computation shared by all paths */ static inline __attribute__((always_inline)) int8_t hard_swish_output(int16_t reluish, int16_t in_on_out_scale, int neg_out_exp, int16_t output_zero_point) { int16_t pre = sat_dbl_hi_mul(reluish, in_on_out_scale); int16_t ov = rounding_div_pot_s16(pre, neg_out_exp); int32_t result = ov + output_zero_point; if (result > 127) result = 127; if (result < -128) result = -128; return (int8_t)result; } void esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point) { const int neg_out_exp = -output_mult_exp; int i = 0; /* Branch on reluish_mult_exp ONCE - 3 specialized loops */ if (reluish_mult_exp > 0) { const int ls1 = reluish_mult_exp - 1; for (; i <= size - 2; i += 2) { int16_t iv0 = input[i] - input_zero_point; int16_t iv1 = input[i+1] - input_zero_point; int16_t hi0 = 
iv0 * 128, hi1 = iv1 * 128; int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp); int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp); int16_t rv0 = sat_left_shift_s16((int32_t)hi0 << ls1); int16_t rv1 = sat_left_shift_s16((int32_t)hi1 << ls1); rv0 = sat_rnd_dbl_hi_mul(rv0, reluish_mult_fxp); rv1 = sat_rnd_dbl_hi_mul(rv1, reluish_mult_fxp); rv0 = sat_left_shift_s16((int32_t)rv0 * 2); rv1 = sat_left_shift_s16((int32_t)rv1 * 2); rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1); rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1); output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point); output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point); } } else if (reluish_mult_exp < 0) { const int neg_relu_exp = -reluish_mult_exp; for (; i <= size - 2; i += 2) { int16_t iv0 = input[i] - input_zero_point; int16_t iv1 = input[i+1] - input_zero_point; int16_t hi0 = iv0 * 128, hi1 = iv1 * 128; int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp); int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp); int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp); int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp); rv0 = rounding_div_pot_s16(rv0, neg_relu_exp); rv1 = rounding_div_pot_s16(rv1, neg_relu_exp); rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1); rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1); output[i] = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point); output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point); } } else { for (; i <= size - 2; i += 2) { int16_t iv0 = input[i] - input_zero_point; int16_t iv1 = input[i+1] - input_zero_point; int16_t hi0 = iv0 * 128, hi1 = iv1 * 128; int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp); int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp); int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp); int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp); rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1); rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1); output[i] = 
hard_swish_output(rv0, on0, neg_out_exp, output_zero_point); output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point); } } /* Scalar remainder */ for (; i < size; i++) { int16_t iv = input[i] - input_zero_point; int16_t hi = iv * 128; int16_t on_out = sat_rnd_dbl_hi_mul(hi, output_mult_fxp); int16_t rv = hi; if (reluish_mult_exp > 0) rv = sat_left_shift_s16((int32_t)rv << (reluish_mult_exp - 1)); rv = sat_rnd_dbl_hi_mul(rv, reluish_mult_fxp); if (reluish_mult_exp > 0) rv = sat_left_shift_s16((int32_t)rv * 2); if (reluish_mult_exp < 0) rv = rounding_div_pot_s16(rv, -reluish_mult_exp); rv = (int16_t)(((int32_t)rv + 32768) >> 1); output[i] = hard_swish_output(rv, on_out, neg_out_exp, output_zero_point); } } ================================================ FILE: src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * ESP32-S3 optimized HardSwish using 256-byte lookup table. * * Key insight: HardSwish maps int8 -> int8 with fixed quantization parameters * per layer. Only 256 possible input values exist. We precompute the full * mapping once using the ANSI reference (bit-exact), then the inner loop * is a single byte load per element. * * Scratch buffer: 256 bytes (set via esp_nn_set_hard_swish_scratch_buf). 
*/ #include #include /* Use ANSI C reference to build LUT — guarantees bit-exact match */ extern void esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point); static int8_t *hard_swish_scratch = NULL; int32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void) { return 512; /* 256 for lut_input + 256 for lut output */ } void esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf) { hard_swish_scratch = (int8_t *)buf; } void esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output, const int32_t size, const int16_t input_zero_point, const int16_t output_mult_fxp, const int16_t reluish_mult_fxp, const int32_t reluish_mult_exp, const int32_t output_mult_exp, const int16_t output_zero_point) { if (!hard_swish_scratch) { /* No scratch — fall through to ANSI */ esp_nn_hard_swish_s8_ansi(input, output, size, input_zero_point, output_mult_fxp, reluish_mult_fxp, reluish_mult_exp, output_mult_exp, output_zero_point); return; } /* Build 256-byte LUT using ANSI reference (bit-exact). * lut[i] = hardswish((int8_t)i) for the given quant params. * Indexed by (uint8_t)input_val for direct lookup. 
*/ int8_t *lut_input = hard_swish_scratch; int8_t *lut = hard_swish_scratch + 256; for (int i = 0; i < 256; i++) { lut_input[i] = (int8_t)i; } esp_nn_hard_swish_s8_ansi(lut_input, lut, 256, input_zero_point, output_mult_fxp, reluish_mult_fxp, reluish_mult_exp, output_mult_exp, output_zero_point); /* Apply LUT — one byte load per element */ for (int i = 0; i < size; i++) { output[i] = lut[(uint8_t)input[i]]; } } ================================================ FILE: src/activation_functions/esp_nn_relu_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include #include void esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size) { int32_t i; for (i = 0; i < size; i++) { int32_t ip = data[i]; ip = max(ip, 0); data[i] = min(ip, 6); } } ================================================ FILE: src/activation_functions/esp_nn_relu_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include /** * In-place ReLU6 for s8 data using ESP32-P4 PIE SIMD. * Clamps each element to [0, 6]. * Processes 16 elements per iteration via 128-bit vector ops. 
*/ void esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size) { /* Enable PIE */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); int i = 0; if (size >= 16) { /* Broadcast 0 into q2 and 6 into q3 */ const int8_t zero_val = 0; const int8_t six_val = 6; asm volatile ( "esp.vldbc.8.ip q2, %0, 0 \n\t" "esp.vldbc.8.ip q3, %1, 0 \n\t" :: "r"(&zero_val), "r"(&six_val) ); int count = size >> 4; int stride = 16; asm volatile ( "mv x30, %[ptr] \n\t" "mv x31, %[cnt] \n\t" "1: \n\t" "esp.vld.128.ip q0, x30, 0 \n\t" /* load 16 bytes, no auto-increment */ "esp.vmax.s8 q0, q0, q2 \n\t" /* max(val, 0) */ "esp.vmin.s8 q0, q0, q3 \n\t" /* min(val, 6) */ "esp.vst.128.xp q0, x30, %[stride] \n\t" /* store and advance ptr by 16 */ "addi x31, x31, -1 \n\t" "bnez x31, 1b \n\t" : : [ptr] "r"(data), [cnt] "r"(count), [stride] "r"(stride) : "x30", "x31", "memory" ); i = count << 4; } /* Handle remaining elements scalar */ for (; i < size; i++) { int32_t val = data[i]; if (val < 0) val = 0; if (val > 6) val = 6; data[i] = (int8_t) val; } } ================================================ FILE: src/activation_functions/esp_nn_relu_s8_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .align 4 .literal_position # in place relu6 function. 
# a2: data, a3: size
# Program Unit: esp_nn_relu6_s8_esp32s3
# Three paths: 16-wide vector loop, 8-wide vector loop for the remainder
# multiple of 8, then a scalar loop for the final leftover bytes.
    .type esp_nn_relu6_s8_esp32s3, @function
    .align 4
    .global esp_nn_relu6_s8_esp32s3
esp_nn_relu6_s8_esp32s3:
    entry a1,48 #
    mov.n a9,a2 # [0], data
    mov.n a7,a3 # [1], size

// process multiple of 16
    movi.n a4,6 # [4]
    s8i a4,a1,0 # [5] six
    addi a10,a3,-7 # [2]
    ee.vldbc.8 q1,a1 # [6] id:72 six+0x0
    blti a3,16,.Lt_0_5634 # [7]
    srai a8,a3,4 # [0]
    ee.zero.q q2 # [1]
    loopgtz a8,.LBB37_esp_nn_relu6_s8_esp32s3 # [3]
    ee.vld.128.ip q0,a2,0 # [0*II+0] id:73
    ee.vmax.s8 q0,q0,q2 # [0*II+2]
    ee.vmin.s8 q0,q0,q1 # [0*II+3]
    ee.vst.128.ip q0,a2,16 # [0*II+4] id:74
.LBB37_esp_nn_relu6_s8_esp32s3: # 0x34
    slli a8,a8,4 # [0]

// remaining multiple of 8 data
    bge a8,a10,.Lt_0_3586 # [1]
.Lt_0_3842: # 0x3a
    sub a6,a7,a8 # [0]
    srai a6,a6,3 # [1]
    loopgtz a6,.LBB52_esp_nn_relu6_s8_esp32s3 # [2]
    ee.vld.l.64.ip q0,a2,0 # [0*II+0] id:75
    ee.vmax.s8 q0,q0,q2 # [0*II+2]
    ee.vmin.s8 q0,q0,q1 # [0*II+3]
    ee.vst.l.64.ip q0,a2,8 # [0*II+4] id:76
.LBB52_esp_nn_relu6_s8_esp32s3: # 0x4f
    addx8 a8,a6,a8 # [0]
.Lt_0_3586: # 0x52

// process leftover
    bge a8,a7,.Lt_0_6402 # [0]
.Lt_0_4866: # 0x55
    movi.n a5,0 # [0]
    sub a3,a7,a8 # [1]
    add.n a2,a8,a9 # [2]
    l8ui a6,a2,0 # [3] id:78
    addi.n a3,a3,-1 # [4]
    sext a6,a6,7
    max a6,a5,a6 # [6]
    min a6,a4,a6 # [7]
    s8i a6,a2,0 # [8] id:79
    loopgtz a3,.LBB67_esp_nn_relu6_s8_esp32s3 # [9]
    l8ui a3,a2,1 # [0*II+0] id:78
    addi.n a2,a2,1 # [1*II+1]
    sext a3,a3,7
    max a3,a5,a3 # [0*II+3]
    min a3,a4,a3 # [0*II+4]
    s8i a3,a2,0 # [0*II+5] id:79
.LBB67_esp_nn_relu6_s8_esp32s3: # 0x81
.Lt_0_6402: # 0x83
    retw.n # [0]
.Lt_0_5634: # 0x85
    blti a10,1,.Lt_0_5890 # [0]
    movi.n a8,0 # [0]
    ee.zero.q q2 # [1]
    j .Lt_0_3842 # [2]
.Lt_0_5890: # 0x90
    beqz.n a3,.Lt_0_6402 # [0]
    movi.n a8,0 # [0]
    j .Lt_0_4866 # [1]
    .size esp_nn_relu6_s8_esp32s3, . - esp_nn_relu6_s8_esp32s3


================================================
FILE: src/basic_math/esp_nn_add_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// NOTE(review): the two include names below were lost in extraction
// ("#include #include"); restore from upstream (this file uses the
// esp_nn_* requant helpers and the max/min macros).
#include #include

/**
 * @brief Element-wise quantized add (u8) reference.
 *
 * Per element: add the input offset, shift both operands up by left_shift,
 * requantize each with its own mult/shift, add, requantize the sum, add the
 * output offset, then clamp to [activation_min, activation_max].
 */
void esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,
                                    const uint8_t *input2_data,
                                    const int32_t input1_offset,
                                    const int32_t input2_offset,
                                    const int32_t input1_mult,
                                    const int32_t input2_mult,
                                    const int32_t input1_shift,
                                    const int32_t input2_shift,
                                    const int32_t left_shift,
                                    uint8_t *output,
                                    const int32_t out_offset,
                                    const int32_t out_mult,
                                    const int32_t out_shift,
                                    const int32_t activation_min,
                                    const int32_t activation_max,
                                    const int32_t size)
{
    for (int i = 0; i < size; i++) {
        int32_t tmp1 = input1_data[i] + input1_offset;
        int32_t tmp2 = input2_data[i] + input2_offset;

        tmp1 <<= left_shift;
        tmp2 <<= left_shift;

        tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult);
        tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult);

        /* shifts are non-positive by convention, hence the negation */
        tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift);
        tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift);

        int32_t out = tmp1 + tmp2;
        out = esp_nn_sat_round_doubling_high_mul(out, out_mult);
        out = esp_nn_div_by_power_of_two(out, -out_shift);
        out = out + out_offset;

        out = max(activation_min, min(out, activation_max));
        output[i] = (uint8_t) out;
    }
}

/* s8 variant — identical pipeline to the u8 version above, signed I/O. */
void esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,
                                    const int8_t *input2_data,
                                    const
int32_t input1_offset, const int32_t input2_offset, const int32_t input1_mult, const int32_t input2_mult, const int32_t input1_shift, const int32_t input2_shift, const int32_t left_shift, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size) { for (int i = 0; i < size; i++) { int32_t tmp1 = input1_data[i] + input1_offset; int32_t tmp2 = input2_data[i] + input2_offset; tmp1 <<= left_shift; tmp2 <<= left_shift; tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult); tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult); tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift); tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift); int32_t out = tmp1 + tmp2; out = esp_nn_sat_round_doubling_high_mul(out, out_mult); out = esp_nn_div_by_power_of_two(out, -out_shift); out = out + out_offset; out = max(activation_min, min(out, activation_max)); output[i] = (int8_t) out; } } ================================================ FILE: src/basic_math/esp_nn_add_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /** * Optimized elementwise add for s8 on ESP32-P4. * Uses fast multiply-by-quantized-mult and 2x unrolling. 
/* Inline the core requantization to avoid function call overhead */

/* Fallback definitions: in-tree these come from common_functions.h (whose
 * include line was lost to extraction); the guards keep this harmless when
 * the real header is present. */
#ifndef max
#define max(a, b) ((a) > (b) ? (a) : (b))
#endif
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif

/* Inlined fast requant using explicit RISC-V mul/mulh to avoid
 * compiler generating 64-bit multiply helper calls.
 *
 * High half of (val * mult) doubled, with a +2^30 rounding nudge, then an
 * optional rounding right shift by neg_shift (half away from zero).
 * NOTE(review): the nudge is always +2^30; gemmlowp's reference
 * SaturatingRoundingDoublingHighMul uses a sign-dependent nudge
 * (1 - 2^30 for negative products) — confirm this matches the project's
 * esp_nn_sat_round_doubling_high_mul before relying on bit-exact parity
 * with the ANSI path. */
static inline __attribute__((always_inline))
int32_t add_requant(int32_t val, int32_t mult, int32_t neg_shift)
{
    /* Use C 64-bit multiply - compiler already generates mul+mulh pair at -O2 */
    int64_t prod64 = (int64_t)val * mult + ((int64_t)1 << 30);
    int32_t result = (int32_t)(prod64 >> 31);
    if (neg_shift > 0) {
        /* rounding divide by 2^neg_shift, rounding half away from zero */
        int32_t rnd = (1 << (neg_shift - 1)) - (result < 0);
        result = (result + rnd) >> neg_shift;
    }
    return result;
}

/**
 * @brief Element-wise quantized add (s8), ESP32-P4 tuned.
 *
 * Requantizes both inputs to a common scale, adds, requantizes the sum,
 * then applies out_offset and clamps to [activation_min, activation_max].
 * The main loop is 2x unrolled; a scalar tail handles odd sizes.
 */
void esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,
                                       const int8_t *input2_data,
                                       const int32_t input1_offset,
                                       const int32_t input2_offset,
                                       const int32_t input1_mult,
                                       const int32_t input2_mult,
                                       const int32_t input1_shift,
                                       const int32_t input2_shift,
                                       const int32_t left_shift,
                                       int8_t *output,
                                       const int32_t out_offset,
                                       const int32_t out_mult,
                                       const int32_t out_shift,
                                       const int32_t activation_min,
                                       const int32_t activation_max,
                                       const int32_t size)
{
    /* shift parameters are non-positive by convention; negate once */
    const int32_t neg_in1_shift = -input1_shift;
    const int32_t neg_in2_shift = -input2_shift;
    const int32_t neg_out_shift = -out_shift;

    int i = 0;
    /* Process 2 at a time - C inline requant lets compiler optimize across calls */
    for (; i <= size - 2; i += 2) {
        int32_t a0 = (input1_data[i + 0] + input1_offset) << left_shift;
        int32_t b0 = (input2_data[i + 0] + input2_offset) << left_shift;
        a0 = add_requant(a0, input1_mult, neg_in1_shift);
        b0 = add_requant(b0, input2_mult, neg_in2_shift);
        int32_t out0 = add_requant(a0 + b0, out_mult, neg_out_shift) + out_offset;
        out0 = max(activation_min, min(out0, activation_max));

        int32_t a1 = (input1_data[i + 1] + input1_offset) << left_shift;
        int32_t b1 = (input2_data[i + 1] + input2_offset) << left_shift;
        a1 = add_requant(a1, input1_mult, neg_in1_shift);
        b1 = add_requant(b1, input2_mult, neg_in2_shift);
        int32_t out1 = add_requant(a1 + b1, out_mult, neg_out_shift) + out_offset;
        out1 = max(activation_min, min(out1, activation_max));

        output[i + 0] = (int8_t) out0;
        output[i + 1] = (int8_t) out1;
    }
    /* scalar tail for odd size */
    for (; i < size; i++) {
        int32_t tmp1 = (input1_data[i] + input1_offset) << left_shift;
        int32_t tmp2 = (input2_data[i] + input2_offset) << left_shift;
        tmp1 = add_requant(tmp1, input1_mult, neg_in1_shift);
        tmp2 = add_requant(tmp2, input2_mult, neg_in2_shift);
        int32_t out = add_requant(tmp1 + tmp2, out_mult, neg_out_shift) + out_offset;
        out = max(activation_min, min(out, activation_max));
        output[i] = (int8_t) out;
    }
}

// ================================================
// FILE: src/basic_math/esp_nn_add_s8_esp32s3.S
// ================================================
// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
.text .align 4 .literal_position .literal .nudge_val, 1073741824 # Program Unit: esp_nn_add_elementwise_s8_esp32s3 .type esp_nn_add_elementwise_s8_esp32s3, @function .align 4 .global esp_nn_add_elementwise_s8_esp32s3 esp_nn_add_elementwise_s8_esp32s3: # 0x4 # temp_neg_out_shift = 0 # temp_neg_input2_shift = 4 # temp_neg_input1_shift = 8 # gra_spill_temp_2 = 12 # gra_spill_temp_3 = 16 # gra_spill_temp_4 = 20 # gra_spill_temp_5 = 24 # gra_spill_temp_6 = 28 # gra_spill_temp_7 = 32 # gra_spill_temp_8 = 36 # gra_spill_temp_9 = 40 # gra_spill_temp_10 = 44 # gra_spill_temp_11 = 48 # gra_spill_temp_12 = 52 # gra_spill_temp_13 = 56 // a2 : *input1_data // a3 : *input2_data // a4 : input1_offset // a5 : input2_offset // a6 : input1_mult // a7 : input2_mult // On stack: // 80: input1_shift // 84: input2_shift // 88: left_shift // 92: *output // 96: out_offset // 100: out_mult, loaded in `a8` // 104: out_shift // 108: activation_min // 112: activation_max // 116: size entry a1,80 # s32i.n a4,a1,48 # [10] gra_spill_temp_11, input1_offset s32i.n a5,a1,52 # [0] gra_spill_temp_12, input2_offset s32i.n a2,a1,32 # [5] gra_spill_temp_7, input1_data s32i.n a3,a1,12 # [3] gra_spill_temp_2, input2_data l32i a12,a1,116 # [11] id:720 size+0x0 mov.n a14,a2 # [6] mov.n a10,a3 # [8] blti a12,1,.exit # [1] // exit l32i a3,a1,80 # [0] id:721 input1_shift+0x0 l32i a13,a1,84 # [1] id:722 input2_shift+0x0 l32i a2,a1,104 # [8] id:723 out_shift+0x0 l32i a8,a1,100 # [1] out_mult neg a3,a3 # [12] neg a13,a13 # [7] neg a2,a2 # [11] s32i.n a3,a1,8 # [12] temp_neg_input1_shift, -input1_shift s32i.n a13,a1,4 # [7] temp_neg_input2_shift, -input2_shift s32i.n a2,a1,0 # [16] temp_neg_out_shift, -out_shift movi.n a5,1 addi a9,a3,-1 ssl a9 sll a15,a5 s32i.n a15,a1,16 # gra_spill_temp_3, 1 << (exponent - 1) for input1 addi a9,a13,-1 ssl a9 sll a15,a5 s32i.n a15,a1,20 # gra_spill_temp_4, 1 << (exponent - 1) for input2 addi a9,a2,-1 ssl a9 sll a15,a5 s32i.n a15,a1,24 # gra_spill_temp_5, 1 << (exponent - 1) for 
out movi.n a2,0 blti a12,12,.process_leftover # [23] // skip to leftover routine if inputs are unaligned or a9,a14,a10 extui a9,a9,0,4 bnez a9,.process_leftover l32i a9,a1,92 # [17] id:1279 output+0x0 l32i a13,a1,116 # [20] srai a13,a13,3 # [21] s32i.n a13,a1,56 # [22] gra_spill_temp_13 movi.n a13,8 s32i.n a13,a1,28 # gra_spill_temp_6, mult_of8 counter ee.zero.q q6 # [8] .vector_loop: // process 8 values in one go l32i a15,a1,88 # [6] left_shift ee.vld.l.64.ip q0,a14,8 # [9] id:729 s32i.n a9,a1,44 # [10] gra_spill_temp_10, out_ptr s32i.n a14,a1,40 # [20] gra_spill_temp_9 wsr.sar a15 # [21] load left shift addi.n a15,a1,48 # [14] ee.vldbc.16 q7,a15 # [21] id:1277 input1_offset ee.vcmp.lt.s8 q5,q0,q6 # [29] ee.vzip.8 q0,q5 # [31], 20 bits ee.vadds.s16 q0,q0,q7 # [34], add offset ee.vcmp.lt.s16 q2,q0,q6 # [36] ee.vzip.16 q0,q2 # [39], 32 bits ee.vsl.32 q0,q0 # [41] left_shift ee.vsl.32 q2,q2 # [42] left_shift l32r a9,.nudge_val # [15], nudge // mulhi32 for q0 ee.movi.32.a q0,a3,2 # [44] ee.movi.32.a q0,a4,3 # [45] ee.movi.32.a q0,a14,1 # [46] ee.movi.32.a q0,a5,0 # [62] mulsh a13,a6,a3 # [51] mull a3,a6,a3 # [53] mulsh a12,a6,a4 # [50] mull a4,a6,a4 # [55] mulsh a15,a6,a14 # [48] mull a14,a6,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q0,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q0,a12,3 # [62] mulsh a13,a6,a5 # [51] mull a5,a6,a5 # [53] ee.movi.32.q q0,a15,1 # [62] add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q0,a13,0 # [62] // mulhi32 for q2 ee.movi.32.a q2,a3,2 # [44] ee.movi.32.a q2,a4,3 # [45] ee.movi.32.a q2,a14,1 # [46] ee.movi.32.a q2,a5,0 # [62] mulsh a13,a6,a3 # [51] mull a3,a6,a3 # [53] mulsh a12,a6,a4 # [50] mull a4,a6,a4 # [55] mulsh a15,a6,a14 # [48] mull a14,a6,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu 
a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q2,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q2,a12,3 # [62] mulsh a13,a6,a5 # [51] mull a5,a6,a5 # [53] ee.movi.32.q q2,a15,1 # [62] l32i a3,a1,8 # [12] temp_neg_input1_shift, -input1_shift add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q2,a13,0 # [62] blti a3,1, .skip_div_by2_in0 addi.n a13,a1,16 ee.vcmp.lt.s32 q1,q0,q6 ee.vcmp.lt.s32 q3,q2,q6 ee.vldbc.32 q5,a13 // 1 << (exponent - 1) wsr.sar a3 // load right_shift ee.vadds.s32 q0,q0,q1 // subtract 1 `if (val < 0)` ee.vadds.s32 q2,q2,q3 // subtract 1 `if (val < 0)` ee.vadds.s32 q0,q0,q5 ee.vadds.s32 q2,q2,q5 ee.vsr.32 q0,q0 ee.vsr.32 q2,q2 .skip_div_by2_in0: ee.vld.l.64.ip q1,a10,8 # [11] id:1290 addi.n a15,a1,52 # [12] ee.vldbc.16 q7,a15 # [19] id:1278 input2_offset l32i a15,a1,88 # [6] left_shift s32i a10,a1,36 # [14] gra_spill_temp_8 ee.vcmp.lt.s8 q3,q1,q6 # [271] wsr.sar a15 # [21], load shift for left shift ee.vzip.8 q1,q3 # [274], 20 bits ee.vadds.s16 q1,q1,q7 # [281] ee.vcmp.lt.s16 q3,q1,q6 # [282] ee.vzip.16 q1,q3 # [283], 32 bits ee.vsl.32 q1,q1 # [284] ee.vsl.32 q3,q3 # [285] // mulhi32 for q1 ee.movi.32.a q1,a3,2 # [44] ee.movi.32.a q1,a4,3 # [45] ee.movi.32.a q1,a14,1 # [46] ee.movi.32.a q1,a5,0 # [62] mulsh a13,a7,a3 # [51] mull a3,a7,a3 # [53] mulsh a12,a7,a4 # [50] mull a4,a7,a4 # [55] mulsh a15,a7,a14 # [48] mull a14,a7,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q1,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q1,a12,3 # [62] mulsh a13,a7,a5 # [51] mull a5,a7,a5 # [53] ee.movi.32.q q1,a15,1 # [62] add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q1,a13,0 # [62] // mulhi32 for q3 ee.movi.32.a q3,a3,2 # [44] ee.movi.32.a q3,a4,3 # [45] ee.movi.32.a q3,a14,1 # [46] ee.movi.32.a q3,a5,0 # [62] 
mulsh a13,a7,a3 # [51] mull a3,a7,a3 # [53] mulsh a12,a7,a4 # [50] mull a4,a7,a4 # [55] mulsh a15,a7,a14 # [48] mull a14,a7,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q3,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q3,a12,3 # [62] mulsh a13,a7,a5 # [51] mull a5,a7,a5 # [53] ee.movi.32.q q3,a15,1 # [62] l32i a14,a1,4 # [7] temp_neg_input2_shift, -input2_shift add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q3,a13,0 # [62] // multiplication results: q0-q2 & q1-q3 blti a14,1, .skip_div_by2_in1 addi.n a5,a1,20 ee.vcmp.lt.s32 q4,q1,q6 ee.vcmp.lt.s32 q5,q3,q6 ee.vldbc.32 q7,a5 // 1 << (exponent - 1) wsr.sar a14 // load right_shift ee.vadds.s32 q4,q4,q7 // subtract 1 `if (val < 0)` ee.vadds.s32 q5,q5,q7 // subtract 1 `if (val < 0)` ee.vadds.s32 q1,q1,q4 ee.vadds.s32 q3,q3,q5 ee.vsr.32 q1,q1 ee.vsr.32 q3,q3 .skip_div_by2_in1: ee.vadds.s32 q0,q0,q1 ee.vadds.s32 q1,q2,q3 // mulhi32 for q0 ee.movi.32.a q0,a3,2 # [44] ee.movi.32.a q0,a4,3 # [45] ee.movi.32.a q0,a14,1 # [46] ee.movi.32.a q0,a5,0 # [62] mulsh a13,a8,a3 # [51] mull a3,a8,a3 # [53] mulsh a12,a8,a4 # [50] mull a4,a8,a4 # [55] mulsh a15,a8,a14 # [48] mull a14,a8,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q0,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q0,a12,3 # [62] mulsh a13,a8,a5 # [51] mull a5,a8,a5 # [53] ee.movi.32.q q0,a15,1 # [62] add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q0,a13,0 # [62] // mulhi32 for q1 ee.movi.32.a q1,a3,2 # [44] ee.movi.32.a q1,a4,3 # [45] ee.movi.32.a q1,a14,1 # [46] ee.movi.32.a q1,a5,0 # [62] mulsh a13,a8,a3 # [51] mull a3,a8,a3 # [53] mulsh a12,a8,a4 # [50] mull a4,a8,a4 # [55] mulsh a15,a8,a14 # [48] mull 
a14,a8,a14 # [49] ssai 31 # [47] add a3,a3,a9 saltu a2,a3,a9 add.n a13,a13,a2 src a13,a13,a3 add a4,a4,a9 saltu a2,a4,a9 add.n a12,a12,a2 src a12,a12,a4 ee.movi.32.q q1,a13,2 # [62] add a14,a14,a9 saltu a2,a14,a9 add.n a15,a15,a2 src a15,a15,a14 ee.movi.32.q q1,a12,3 # [62] mulsh a13,a8,a5 # [51] mull a5,a8,a5 # [53] ee.movi.32.q q1,a15,1 # [62] l32i a14,a1,0 # [738] temp_neg_out_shift, -out_shift add a5,a5,a9 saltu a2,a5,a9 add.n a13,a13,a2 src a13,a13,a5 ee.movi.32.q q1,a13,0 # [62] //q0-q1 has output blti a14,1,.skip_div_by2_out addi.n a5,a1,24 ee.vcmp.lt.s32 q2,q0,q6 ee.vcmp.lt.s32 q3,q1,q6 ee.vldbc.32 q5,a5 // 1 << (exponent - 1) wsr.sar a14 // load right shift ee.vadds.s32 q0,q0,q2 // subtract 1 `if (val < 0)` ee.vadds.s32 q1,q1,q3 // subtract 1 `if (val < 0)` ee.vadds.s32 q0,q0,q5 ee.vadds.s32 q1,q1,q5 ee.vsr.32 q0,q0 ee.vsr.32 q1,q1 .skip_div_by2_out: // add offset and apply activation addi a15,a1,96 ee.vldbc.32 q3,a15 # [809] id:802 out_offset ee.vadds.s32 q0,q0,q3 # [811] ee.vadds.s32 q1,q1,q3 # [812] addi a13,a1,108 addi a14,a1,112 ee.vldbc.32 q3,a14 # [813] id:803 activation_max ee.vmin.s32 q0,q0,q3 # [815] ee.vmin.s32 q1,q1,q3 # [816] ee.vldbc.32 q3,a13 # [817] id:804 activation_min l32i a13,a1,4 # [818] temp_neg_input2_shift ee.vmax.s32 q1,q1,q3 # [819] ee.vmax.s32 q0,q0,q3 # [820] //pack the data and store l32i.n a9,a1,44 # [784] gra_spill_temp_10 ee.vunzip.16 q0,q1 # [821] ee.vunzip.8 q0,q1 # [822] l32i.n a13,a1,28 # gra_spill_temp_6, multiple of 12 index ee.vst.l.64.ip q0,a9,8 # [823] id:805 l32i a15,a1,116 # [1], size l32i.n a14,a1,40 # [20] gra_spill_temp_9 l32i.n a10,a1,36 # [14] gra_spill_temp_8 addi a13,a13,8 s32i.n a13,a1,28 # gra_spill_temp_6 bge a15,a13,.vector_loop l32i.n a2,a1,56 # [0] gra_spill_temp_13 // check for leftover l32i a10,a1,116 # [1] slli a2,a2,3 # [2] bge a2,a10,.exit # [3] // done, exit .process_leftover: l32i.n a3,a1,48 # [1] gra_spill_temp_11 l32i.n a12,a1,52 # [2] gra_spill_temp_12 l32i.n a10,a1,12 # [3] gra_spill_temp_2 
l32i.n a14,a1,32 # [8] gra_spill_temp_7 add.n a10,a2,a10 # [5] add.n a14,a2,a14 # [6] l8ui a14,a14,0 # [7] id:809, input1 l8ui a10,a10,0 # [12] id:1370, input2 sext a14,a14,7 # [9] sext a10,a10,7 # [10] add.n a10,a10,a12 # [11] // add offset2 add.n a14,a14,a3 # [16] // add offset1 l32i a12,a1,88 # [13] left_shift // sat_round_doubling_high_mul step for input1 and input2 ssl a12 # [15] sll a10,a10 # [20] sll a14,a14 # [17] l32r a12,.nudge_val # [0], nudge // a13,a3 are free, a12: nudge, a6:mult1 mulsh a13,a14,a6 mull a9,a14,a6 ssai 31 add a9,a9,a12 saltu a3,a9,a12 add.n a13,a13,a3 src a14,a13,a9 //result in a14 mulsh a13,a10,a7 mull a9,a10,a7 ssai 31 add a9,a9,a12 saltu a3,a9,a12 add.n a13,a13,a3 src a10,a13,a9 //result in a10 // divide_by_power_of2_step for input1 (a14), input2 (a10) // free registers: a13, a12, a9, a3 l32i.n a12,a1,8 // -input1_shift l32i.n a13,a1,4 // -input2_shift blti a12,1,.skip_div_by2_in0_remain l32i.n a3,a1,16 // 1 << (exponent - 1) extui a9,a14,31,1 ssr a12 // load right_shift sub a3,a3,a9 // 1 << (exponent - 1) - (val < 0) add a14,a14,a3 sra a14,a14 .skip_div_by2_in0_remain: blti a13,1,.skip_div_by2_in1_remain l32i.n a3,a1,20 // 1 << (exponent - 1) extui a9,a10,31,1 ssr a13 // load right_shift sub a3,a3,a9 // 1 << (exponent - 1) - (val < 0) add a10,a10,a3 sra a10,a10 .skip_div_by2_in1_remain: // process output l32r a12,.nudge_val # [0], nudge l32i a13,a1,0 // -out_shift add.n a10,a10,a14 # [45] // multiply and pick high32 mulsh a3,a10,a8 mull a10,a10,a8 ssai 31 # [0] add a10,a10,a12 saltu a9,a10,a12 add a12,a3,a9 src a12,a12,a10 // div by power of 2 for output l32i a9,a1,96 # [31] out_offset blti a13,1,.skip_div_by2_out_remain l32i.n a3,a1,24 // 1 << (exponent - 1) extui a14,a12,31,1 ssr a13 // load right_shift sub a3,a3,a14 // 1 << (exponent - 1) - (val < 0) add a12,a12,a3 sra a12,a12 .skip_div_by2_out_remain: // add offset add.n a9,a9,a12 # [33] // apply activation l32i a13,a1,112 # [34] activation_max l32i a12,a1,108 # [35] 
activation_min min a13,a13,a9 # [36] l32i a9,a1,92 # [37] output max a13,a13,a12 # [38] add.n a9,a2,a9 # [39] s8i a13,a9,0 # [40] id:1371 l32i a12,a1,116 addi.n a2,a2,1 # [41] blt a2,a12,.process_leftover .exit: retw.n # [0] .size esp_nn_add_elementwise_s8_esp32s3, . - esp_nn_add_elementwise_s8_esp32s3 ================================================ FILE: src/basic_math/esp_nn_mul_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include #include void esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size) { for (int i = 0; i < size; i++) { int32_t tmp1 = input1_data[i] + input1_offset; int32_t tmp2 = input2_data[i] + input2_offset; int32_t out = tmp1 * tmp2; out = esp_nn_multiply_by_quantized_mult(out, out_mult, out_shift); out = out + out_offset; out = max(activation_min, min(out, activation_max)); output[i] = (int8_t) out; } } void esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1, const int8_t *input2_per_ch, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t output_offset, const int32_t output_mult, const int32_t output_shift, const int32_t activation_min, const int32_t activation_max, const int32_t total_spatial, const int32_t channels) { for (int s = 0; s < total_spatial; s++) { const int8_t *in_row = input1 + s * channels; int8_t *out_row = output + s * channels; for (int c = 0; c < channels; c++) { int32_t val = ((int32_t)in_row[c] + input1_offset) * ((int32_t)input2_per_ch[c] + input2_offset); val = esp_nn_multiply_by_quantized_mult(val, output_mult, output_shift); val += output_offset; val = max(val, activation_min); val = min(val, activation_max); out_row[c] = (int8_t)val; } } } ================================================ FILE: src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S ================================================ // Copyright 2026 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Optimized broadcast MUL for SE-block pattern: [H,W,C] * [1,1,C]
// Processes 8 channels at a time using S3 SIMD.
//
// Pipeline per element (matches the ANSI reference esp_nn_mul_broadcast_channel_s8_ansi):
//   (in1 + input1_offset) * (in2_per_ch + input2_offset)
//   -> << left_shift, * output_mult (rounded high 32 via nudge 1<<30)
//   -> rounding >> right_shift, + output_offset, clamp, narrow to s8.
// The SIMD path is taken only when all three pointers are 16-byte aligned
// and channels >= 8; everything else goes through the scalar leftover loop.

.text
.align 4
.literal_position
.literal .LC_nudge, 1073741824 // 1 << 30

.type esp_nn_mul_broadcast_channel_s8_esp32s3, @function
.align 4
.global esp_nn_mul_broadcast_channel_s8_esp32s3

// void esp_nn_mul_broadcast_channel_s8_esp32s3(
//             const int8_t *input1,           // a2
//             const int8_t *input2_per_ch,    // a3
//             const int32_t input1_offset,    // a4
//             const int32_t input2_offset,    // a5
//             int8_t *output,                 // a6
//             const int32_t output_offset,    // a7
//             const int32_t output_mult,      // stack+120
//             const int32_t output_shift,     // stack+124
//             const int32_t activation_min,   // stack+128
//             const int32_t activation_max,   // stack+132
//             const int32_t total_spatial,    // stack+136
//             const int32_t channels);        // stack+140

// Stack frame layout (entry a1, 120):
//   0:  to_add (for div by power of 2)
//   4:  input2_per_ch (saved)
//   8:  output base (saved)
//   12: channels
//   16: input1 base (saved)
//   20: right_shift
//   24: input1_offset (saved)
//   28: input2_offset (saved)
//   32: spatial counter
//   36: out_ptr (current)
//   40: out_offset (from a7)
//   44: input1_offset (for vldbc)
//   48: input2_offset (for vldbc)

esp_nn_mul_broadcast_channel_s8_esp32s3:
    entry a1, 120

    // Save args
    s32i.n a3, a1, 4        // input2_per_ch base
    s32i.n a6, a1, 8        // output base
    s32i.n a2, a1, 16       // input1 base
    s32i.n a4, a1, 24       // input1_offset
    s32i.n a5, a1, 28       // input2_offset
    s32i a7, a1, 40         // out_offset
    l32i a8, a1, 136        // total_spatial
    l32i a9, a1, 140        // channels
    s32i.n a9, a1, 12       // save channels
    blti a8, 1, .Lexit      // no spatial positions
    blti a9, 1, .Lexit      // no channels

    // Prepare shift values: split output_shift into left/right parts
    l32i a15, a1, 124       // output_shift
    movi.n a11, 0
    max a14, a15, a11       // left_shift = max(shift, 0)
    sub a4, a14, a15        // right_shift = left_shift - shift
    s32i.n a4, a1, 20       // save right_shift
    l32i a13, a1, 120       // output_mult
    l32r a4, .LC_nudge      // nudge = 1 << 30

    // Store offsets for vldbc
    l32i a8, a1, 136        // reload total_spatial
    s32i a5, a1, 48         // input2_offset for vldbc
    l32i.n a5, a1, 24       // input1_offset
    s32i a5, a1, 44         // input1_offset for vldbc

    // Init spatial counter
    movi.n a10, 0
    s32i a10, a1, 32        // spatial counter = 0

    // Pointers: a2 = input1 (current), a3 = input2_per_ch (reloaded each row),
    //           a6 = output (current)
.Lspatial_loop:
    l32i a8, a1, 136        // total_spatial
    l32i a10, a1, 32        // spatial counter
    bge a10, a8, .Lexit

    // Reset input2 pointer for each spatial position
    l32i.n a3, a1, 4        // input2_per_ch base

    // Channel counter
    l32i.n a9, a1, 12       // channels
    movi.n a11, 0           // channel index
    blti a9, 8, .Lchannel_leftover

    // Check alignment for SIMD path
    or a8, a2, a3
    or a8, a8, a6
    extui a8, a8, 0, 4
    bnez a8, .Lchannel_leftover

    // Setup SIMD constants
    ee.zero.q q1            // zero register
    addi a8, a1, 44
    ee.vldbc.16 q0, a8      // input1_offset broadcast
    addi a8, a1, 48
    ee.vldbc.16 q7, a8      // input2_offset broadcast
    st.qr q0, a1, 64        // save for reload in loop

.Lchannel_simd_loop:
    addi a8, a9, -7         // channels - 7
    blt a11, a8, .Lchannel_simd_body
    j .Lchannel_leftover
.Lchannel_simd_body:
    ld.qr q4, a1, 64        // input1_offset
    ee.vld.l.64.ip q2, a2, 8 // load 8 input1 values
    movi.n a7, 16
    ee.vld.h.64.ip q2, a3, 8 // load 8 input2 values (per-ch)
    wsr.sar a7
    ee.vcmp.lt.s8 q5, q2, q1 // sign extend
    ee.vzip.8 q2, q5        // interleave to 16-bit
    ee.vadds.s16 q5, q5, q7 // add input2_offset
    ee.vadds.s16 q4, q2, q4 // add input1_offset
    ee.vmul.s16 q3, q4, q5  // multiply (high part)
    ssai 0                  // sar = 0
    ee.vmul.s16 q2, q4, q5  // multiply (low part)

    // Requantize 8 results (same pattern as elementwise mul):
    // per lane: (val << left_shift) * output_mult, + nudge, keep bits [62:31]
    wsr.sar a14             // left_shift
    ee.vzip.16 q2, q3
    ee.vsl.32 q6, q2        // left shift first 4
    ssai 31
    // Element 2 of q6
    ee.movi.32.a q6, a8, 2
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    // Element 3
    ee.movi.32.a q6, a8, 3
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q2, a5, 2
    ee.movi.32.q q2, a12, 3
    // Element 1
    ee.movi.32.a q6, a8, 1
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    // Element 0
    ee.movi.32.a q6, a8, 0
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q2, a5, 1
    ee.movi.32.q q2, a12, 0

    // Second group of 4 (q3)
    wsr.sar a14             // left_shift
    ee.vsl.32 q4, q3
    ssai 31
    ee.movi.32.a q4, a8, 2
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    ee.movi.32.a q4, a8, 3
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q0, a5, 2
    ee.movi.32.q q0, a12, 3
    ee.movi.32.a q4, a8, 1
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a5, a8, a4
    add.n a5, a5, a7
    src a5, a5, a8
    ee.movi.32.a q4, a8, 0
    mulsh a7, a13, a8
    mull a8, a13, a8
    add.n a8, a4, a8
    saltu a12, a8, a4
    add.n a12, a12, a7
    src a12, a12, a8
    ee.movi.32.q q0, a5, 1
    ee.movi.32.q q0, a12, 0

    // Divide by power of 2 (right_shift), rounding: val - (val<0) + (1 << (rs-1))
    l32i.n a5, a1, 20       // right_shift
    movi.n a7, 1
    blti a5, 1, .Lskip_div
    ee.vcmp.lt.s32 q5, q2, q1
    ee.vcmp.lt.s32 q6, q0, q1
    addi.n a8, a5, -1
    ssl a8
    sll a7, a7              // to_add = 1 << (right_shift - 1)
    s32i.n a7, a1, 0
    ee.vldbc.32 q4, a1      // broadcast to_add
    wsr.sar a5
    ee.vadds.s32 q5, q4, q5
    ee.vadds.s32 q5, q2, q5
    ee.vsr.32 q2, q5
    wsr.sar a5
    ee.vadds.s32 q5, q4, q6
    ee.vadds.s32 q5, q0, q5
    ee.vsr.32 q0, q5
.Lskip_div:

    // Add output offset, apply activation
    addi a8, a1, 132
    ee.vldbc.32 q4, a8      // activation_max
    addi a5, a1, 40
    ee.vldbc.32 q6, a5      // output_offset
    addi a7, a1, 128
    ee.vadds.s32 q0, q0, q6 // add offset
    ee.vadds.s32 q2, q2, q6
    ee.vldbc.32 q6, a7      // activation_min
    ee.vmin.s32 q0, q0, q4
    ee.vmin.s32 q2, q2, q4
    ee.vmax.s32 q0, q0, q6
    ee.vmax.s32 q2, q2, q6

    // Pack 32-bit -> 8-bit and store
    ee.vunzip.16 q2, q0
    ee.vunzip.8 q2, q0
    ee.vst.l.64.ip q2, a6, 8
    addi a11, a11, 8        // channel index += 8
    j .Lchannel_simd_loop

.Lchannel_leftover:
    // Process remaining channels one by one
    l32i.n a9, a1, 12       // channels
    bge a11, a9, .Lspatial_next
    ssl a14                 // left_shift
    l32i.n a8, a1, 24       // input1_offset
    l8ui a10, a2, 0         // *input1
    sext a10, a10, 7
    add.n a10, a10, a8      // + input1_offset
    l32i.n a8, a1, 28       // input2_offset
    l8ui a12, a3, 0         // *input2_per_ch
    sext a12, a12, 7
    add.n a12, a12, a8      // + input2_offset
    mull a10, a10, a12      // multiply

    // Requantize
    sll a10, a10            // left shift
    l32i.n a9, a1, 20       // right_shift
    mulsh a8, a10, a13
    mull a12, a10, a13
    ssai 31
    add.n a12, a4, a12
    saltu a10, a12, a4
    add.n a10, a10, a8
    src a10, a10, a12       // result
    blti a9, 1, .Lskip_div_scalar
    addi a8, a9, -1
    ssl a8
    movi a7, 1
    sll a7, a7              // to_add
    extui a8, a10, 31, 1    // sign bit (1 if neg, 0 if pos)
    sub a10, a10, a8        // val -= sign (fast rounding)
    add a10, a10, a7
    ssr a9
    sra a10, a10
.Lskip_div_scalar:
    l32i a8, a1, 40         // output_offset
    l32i a7, a1, 128        // activation_min
    l32i a12, a1, 132       // activation_max
    add.n a10, a10, a8
    min a10, a10, a12
    max a10, a10, a7
    s8i a10, a6, 0          // store
    addi a2, a2, 1          // input1++
    addi a3, a3, 1          // input2++
    addi a6, a6, 1          // output++
    addi a11, a11, 1        // channel index++
    j .Lchannel_leftover

.Lspatial_next:
    l32i a10, a1, 32        // spatial counter
    addi a10, a10, 1
    s32i a10, a1, 32
    j .Lspatial_loop

.Lexit:
    retw.n
    .size esp_nn_mul_broadcast_channel_s8_esp32s3, .
- esp_nn_mul_broadcast_channel_s8_esp32s3 ================================================ FILE: src/basic_math/esp_nn_mul_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /** * Elementwise multiply for s8 optimized for ESP32-P4. * Uses inlined fast requantization with 4x unrolled loop. * Interleaves independent computations to hide latency. */ void esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data, const int8_t *input2_data, const int32_t input1_offset, const int32_t input2_offset, int8_t *output, const int32_t out_offset, const int32_t out_mult, const int32_t out_shift, const int32_t activation_min, const int32_t activation_max, const int32_t size) { const int32_t left_shift = out_shift > 0 ? out_shift : 0; const int32_t right_shift = left_shift - out_shift; const int64_t nudge = (int64_t)1 << 30; int i = 0; for (; i <= size - 4; i += 4) { int32_t prod0 = (input1_data[i+0] + input1_offset) * (input2_data[i+0] + input2_offset); int32_t prod1 = (input1_data[i+1] + input1_offset) * (input2_data[i+1] + input2_offset); int32_t prod2 = (input1_data[i+2] + input1_offset) * (input2_data[i+2] + input2_offset); int32_t prod3 = (input1_data[i+3] + input1_offset) * (input2_data[i+3] + input2_offset); int32_t s0 = prod0 << left_shift; int32_t s1 = prod1 << left_shift; int32_t s2 = prod2 << left_shift; int32_t s3 = prod3 << left_shift; int32_t r0 = (int32_t)(((int64_t)s0 * out_mult + nudge) >> 31); int32_t r1 = (int32_t)(((int64_t)s1 * out_mult + nudge) >> 31); int32_t r2 = (int32_t)(((int64_t)s2 * out_mult + nudge) >> 31); int32_t r3 = (int32_t)(((int64_t)s3 * out_mult + nudge) >> 31); if (right_shift > 0) { int32_t rnd = (1 << (right_shift - 1)); r0 = (r0 + rnd - (r0 < 0)) >> right_shift; r1 = (r1 + rnd - (r1 < 0)) >> right_shift; r2 = (r2 + rnd - (r2 < 0)) >> right_shift; r3 = (r3 + rnd - (r3 < 0)) >> right_shift; } r0 = 
max(activation_min, min(r0 + out_offset, activation_max)); r1 = max(activation_min, min(r1 + out_offset, activation_max)); r2 = max(activation_min, min(r2 + out_offset, activation_max)); r3 = max(activation_min, min(r3 + out_offset, activation_max)); output[i+0] = (int8_t) r0; output[i+1] = (int8_t) r1; output[i+2] = (int8_t) r2; output[i+3] = (int8_t) r3; } for (; i < size; i++) { int32_t prod = (input1_data[i] + input1_offset) * (input2_data[i] + input2_offset); int32_t out = esp_nn_requantize(prod, out_mult, out_shift); out = max(activation_min, min(out + out_offset, activation_max)); output[i] = (int8_t) out; } } ================================================ FILE: src/basic_math/esp_nn_mul_s8_esp32s3.S ================================================ // Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
.text
.align 4
.literal_position
.literal .LC0_26_123, 1073741824 // `1 << 30`

# Program Unit: esp_nn_mul_elementwise_s8_esp32s3
.type esp_nn_mul_elementwise_s8_esp32s3, @function
.align 4
.global esp_nn_mul_elementwise_s8_esp32s3

// Elementwise s8 multiply, requantize, offset and clamp — SIMD path does 8
// elements per iteration; unaligned pointers or size < 8 use the scalar loop.

esp_nn_mul_elementwise_s8_esp32s3: # 0x4
    # to_add = 0
    # gra_spill_temp_0 = 4
    # gra_spill_temp_1 = 8
    # gra_spill_temp_2 = 12
    # gra_spill_temp_3 = 16
    # gra_spill_temp_4 = 20
    # gra_spill_temp_5 = 24
    # gra_spill_temp_6 = 28
    # gra_spill_temp_7 = 32
    # gra_spill_temp_8 = 36
    # gra_spill_temp_<> = 40
    # gra_spill_temp_<> = 44
    # gra_spill_temp_<> = 48
    # gra_spill_temp_13 = 64

    // registers:
    // a2: const int8_t *input1_data
    // a3: const int8_t *input2_data
    // a4: const int32_t input1_offset
    // a5: const int32_t input2_offset
    // a6: int8_t *output
    // a7: const int32_t out_offset

    // on stack:
    // 120: const int32_t out_mult
    // 124: const int32_t out_shift
    // 128: const int32_t activation_min
    // 132: const int32_t activation_max
    // 136: const int32_t size

    entry a1,120 #
    s32i.n a4,a1,24 # [0] gra_spill_temp_5, input1_offset
    s32i.n a5,a1,28 # [1] gra_spill_temp_12, input2_offset
    s32i.n a3,a1,4 # [5] gra_spill_temp_0, input2
    mov.n a10,a3 # [6]
    l32i a3,a1,136 # [18] id:361 size+0x0
    mov.n a9,a6 # [2] // out_addr
    blti a3,1,.exit # [0] // exit
    s32i.n a2,a1,16 # [9] gra_spill_temp_3, input1
    s32i a7,a1,40 # [4] id:358 out_offset+0x0
    movi.n a11,0 # [3]
    mov.n a12,a2 # [10]
    s32i a4,a1,44 # [13] id:356 input1_offset+0x0
    s32i a5,a1,48 # [14] id:357 input2_offset+0x0
    movi.n a2,1 # [15]
    l32i a15,a1,124 # [3] id:362 out_shift+0x0
    l32i a13,a1,120 # [4] id:363 out_mult+0x0
    s32i.n a6,a1,8 # [1] gra_spill_temp_1, out_addr
    max a14,a15,a11 # [11] left_shift
    sub a4,a14,a15 # right_shift
    s32i.n a4,a1,20 # [9] gra_spill_temp_4
    blti a3,8,.process_leftover # [20]

    // skip to leftover routine if inputs are unaligned
    or a6,a12,a10
    extui a6,a6,0,4
    bnez a6,.process_leftover

    // `size > 8`, s3 optimisation path...
    ee.zero.q q1 # [0]
    addi a4,a1,44 # [7]
    addi a8,a1,48 # [8]
    ee.vldbc.16 q0,a4 # [17] id:359 input1_offset
    ee.vldbc.16 q7,a8 # [16] id:360 input2_offset
    l32r a4,.LC0_26_123 # [12]
    movi a8, 8
    st.qr q0,a1,64 # [19] gra_spill_temp_13
    s32i.n a8,a1,12 # [6] gra_spill_temp_2

.Lt_0_7682: # 0x60
    s32i a9,a1,36 # [1] gra_spill_temp_8, out_addr
    ld.qr q4,a1,64 # [2] gra_spill_temp_13, input1_offset
    ee.vld.l.64.ip q2,a12,8 # [4] id:367, input1_ptr
    movi.n a7,16 # [3]
    ee.vld.h.64.ip q2,a10,8 # [5] id:368, input2_ptr
    wsr.sar a7 # [6]
    ee.vcmp.lt.s8 q5,q2,q1 # [7]
    ee.vzip.8 q2,q5 # [8]
    ee.vadds.s16 q5,q5,q7 # [9] input2_offset
    ee.vadds.s16 q4,q2,q4 # [10] input1_offset
    ee.vmul.s16 q3,q4,q5 # [11]
    wsr.sar a11 # [12]
    ee.vmul.s16 q2,q4,q5 # [13]
    wsr.sar a14 # [14] left_shift
    ee.vzip.16 q2,q3 # [15]
    ee.vsl.32 q6,q2 # [16] left_shift
    ssai 31 # [17]
    ee.movi.32.a q6,a3,2 # [18]
    ee.movi.32.a q6,a8,3 # [26]
    mulsh a6,a13,a3 # [19]
    mull a3,a13,a3 # [20]
    mulsh a7,a13,a8 # [27]
    add.n a3,a4,a3 # [22]
    saltu a2,a3,a4 # [23]
    add.n a2,a2,a6 # [24]
    src a2,a2,a3 # [25]
    mull a6,a13,a8 # [28]
    add.n a6,a4,a6 # [30]
    saltu a9,a6,a4 # [31]
    add.n a9,a9,a7 # [32]
    src a9,a9,a6 # [33]
    ee.movi.32.q q2,a2,2 # [53]
    ee.movi.32.q q2,a9,3 # [54]
    ee.movi.32.a q6,a6,1 # [34]
    mulsh a7,a13,a6 # [35]
    mull a6,a13,a6 # [36]
    add.n a6,a4,a6 # [38]
    saltu a3,a6,a4 # [39]
    add.n a3,a3,a7 # [16]
    src a3,a3,a6 # [41]
    ee.movi.32.a q6,a2,0 # [42]
    mulsh a8,a13,a2 # [43]
    mull a7,a13,a2 # [4]
    add.n a7,a4,a7 # [46]
    saltu a6,a7,a4 # [47]
    add.n a6,a6,a8 # [24]
    src a6,a6,a7 # [49]
    ee.movi.32.q q2,a3,1 # [28]
    ee.movi.32.q q2,a6,0 # [50]
    wsr.sar a14 # [10]
    ee.vsl.32 q4,q3 # [11]
    ee.movi.32.a q4,a2,2 # [13]
    mulsh a3,a13,a2 # [14]
    mull a2,a13,a2 # [15]
    ssai 31 # [12]
    add.n a2,a4,a2 # [17]
    saltu a5,a2,a4 # [18]
    add.n a5,a5,a3 # [19]
    src a5,a5,a2 # [20]
    ee.movi.32.a q4,a3,3 # [21]
    mulsh a6,a13,a3 # [22]
    mull a3,a13,a3 # [23]
    add.n a3,a4,a3 # [25]
    saltu a8,a3,a4 # [26]
    add.n a8,a8,a6 # [27]
    src a8,a8,a3 # [28]
    ee.movi.32.q q0,a5,2 # [24]
    ee.movi.32.q q0,a8,3 # [51]
    ee.movi.32.a q4,a7,1 # [29]
    mulsh a6,a13,a7 # [30]
    mull a3,a13,a7 # [31]
    add.n a3,a4,a3 # [33]
    saltu a2,a3,a4 # [34]
    add.n a2,a2,a6 # [35]
    src a2,a2,a3 # [36]
    ee.movi.32.a q4,a6,0 # [37]
    mulsh a7,a13,a6 # [38]
    mull a6,a13,a6 # [39]
    add.n a6,a4,a6 # [41]
    saltu a3,a6,a4 # [42]
    add.n a3,a3,a7 # [43]
    src a3,a3,a6 # [4]
    ee.movi.32.q q0,a2,1 # [47]
    ee.movi.32.q q0,a3,0 # [46]
    l32i.n a5,a1,20 # [0] gra_spill_temp_4, right_shift
    movi.n a7,1 # [51]
    blti a5,1,.skip_div_by_pow_of_2

    // divide by power of 2
    ee.vcmp.lt.s32 q5,q2,q1 # [56]
    ee.vcmp.lt.s32 q6,q0,q1 # [28]
    addi.n a8,a5,-1 # [1]
    ssl a8 # [2]
    sll a7,a7 # [3]
    s32i.n a7,a1,0 # [4] to_add
    ee.vldbc.32 q4,a1 # [5] id:376 to_add
    wsr.sar a5 # [6]
    ee.vadds.s32 q5,q4,q5 # [7]
    ee.vadds.s32 q5,q2,q5 # [8]
    ee.vsr.32 q2,q5 # [9]
    wsr.sar a5 # [5]
    ee.vadds.s32 q5,q4,q6 # [9]
    ee.vadds.s32 q5,q0,q5 # [11]
    ee.vsr.32 q0,q5 # [12]

.skip_div_by_pow_of_2:
    // add offset, apply activation
    addi a8,a1,132 # [54]
    ee.vldbc.32 q4,a8 # [55] id:385 activation_max
    addi a5,a1,40 # [8]
    ee.vldbc.32 q6,a5 # [10] id:384 out_offset
    addi a7,a1,128 # [4]
    ee.vadds.s32 q0,q0,q6 # [13] // add out_offset
    ee.vadds.s32 q2,q2,q6 # [14] // add out_offset
    ee.vldbc.32 q6,a7 # [16] id:386 activation_min
    ee.vmin.s32 q0,q0,q4 # [17]
    ee.vmin.s32 q2,q2,q4 # [15]
    ee.vmax.s32 q0,q0,q6 # [18]
    ee.vmax.s32 q2,q2,q6 # [19]

    // pack and store
    ee.vunzip.16 q2,q0 # [20]
    ee.vunzip.8 q2,q0 # [21]
    l32i.n a7,a1,12 // count
    l32i a9,a1,36 # [55] gra_spill_temp_8
    l32i.n a3,a1,136 # [1] , size
    ee.vst.l.64.ip q2,a9,8 # [22] id:387
    addi a7,a7,8
    s32i.n a7,a1,12 // increment count
    bge a3,a7,.Lt_0_7682
    addi a11,a7,-8
    bge a11,a3,.exit # [3] // exit

.process_leftover:
    sub a8,a3,a11 # [1]
    loopgtz a8,.LBB33_esp_nn_mul_elementwise_s8_esp32s3 # [9]
    ssl a14 # [0] left_shift
    l32i.n a8,a1,24 # [1] gra_spill_temp_5, input1_offset
    l32i.n a10,a1,4 # [2] gra_spill_temp_0, input2
    l32i.n a12,a1,16 # [3] gra_spill_temp_3, input1
    add.n a10,a11,a10 # [4], input2
    add.n a12,a11,a12 # [5], input1
    l8ui a12,a12,0 # [6] id:390
    l8ui a10,a10,0 # [7] id:391
    sext a12,a12,7 # [8]
    add.n a12,a12,a8 # [9]
    l32i.n a8,a1,28 # [10] gra_spill_temp_12, input2_offset
    sext a10,a10,7 # [11]
    add.n a10,a10,a8 # [12]
    mull a10,a12,a10 # [13] // multiplication result

    // multiply by quantised mult
    l32i.n a9,a1,20 # [0] gra_spill_temp_4, load right_shift
    sll a10,a10 # [15] // left shift
    mulsh a3,a10,a13 # [1]
    mull a8,a10,a13 # [6]
    ssai 31 # [0]
    add.n a6,a8,a4 # [8]
    saltu a8,a6,a8 # [9]
    add.n a8,a8,a3 # [10]
    src a3,a8,a6 # [19] // result
    blti a9, 1, .skip_div_by_pow_of_2_remains

    // divide by power of 2
    // calculate to_add = `1 << (exponent - 1)`
    addi a6,a9,-1
    ssl a6 # [23]
    movi a7,1
    sll a7,a7 // to_add
    extui a8,a3,31,1 # [24], sign
    // FIX: rounding is `val - (val < 0) + to_add`, matching the C reference
    // esp_nn_div_by_power_of_two_fast and the SIMD path above; the sign bit
    // must be subtracted here, not added.
    sub a3,a3,a8 // subtract sign
    add a3,a3,a7 // add to_add
    ssr a9 # [20] load right_shift
    sra a3,a3 // right shift

.skip_div_by_pow_of_2_remains:
    l32i.n a6,a1,40 # [32], out_offset
    l32i.n a8,a1,132 # [35], act_max
    l32i.n a7,a1,128 # [36], act_min
    // add offset and apply activation
    add.n a3,a3,a6 # [34], offset added
    min a8,a8,a3 # [37]
    l32i.n a3,a1,8 # [38] gra_spill_temp_1, load base out_addr
    max a8,a8,a7 # [39]
    // store
    add.n a3,a11,a3 # [16], add index from `a11`
    s8i a8,a3,0 # [41] id:392 // store
    addi.n a11,a11,1 # [42] // inc index

.LBB33_esp_nn_mul_elementwise_s8_esp32s3: # 0x2ed
.exit:
    retw.n # [0]
    .size esp_nn_mul_elementwise_s8_esp32s3, . - esp_nn_mul_elementwise_s8_esp32s3

================================================
FILE: src/common/common_functions.h
================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */
#pragma once

#include #include #include /* NOTE(review): include targets lost in extraction — restore before building */

/**
 * c99 standard still doesn't strictly inline functions
 * We need to use attribute as well to do this.
 */
#define __NN_FORCE_INLINE__ __attribute((always_inline)) static inline

/* min/max macros */
#ifndef max
#define max(a, b) ({ \ __typeof__ (a) _a = (a); \ __typeof__ (b) _b = (b); \ _a > _b ?
_a : _b; \ })
#define min(a, b) ({ \
__typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b; \
})
#endif

/* Count leading zeros of a 32-bit value.
 * NOTE(review): the __builtin_clz fallback is undefined for in == 0 (GCC docs);
 * the software fallback returns 32 for 0 — confirm callers never pass 0. */
__NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    __asm__ volatile("nsau %0, %0" : "+r" (in));
    return in;
#elif defined(__GNUC__)
    return __builtin_clz(in);
#else
    /* portable binary-search fallback */
    int32_t count = 32;
    uint32_t x = in, y = in >> 16;
    if (y != 0) {
        count -= 16;
        x = y;
    }
    y = x >> 8;
    if (y != 0) {
        count -= 8;
        x = y;
    }
    y = x >> 4;
    if (y != 0) {
        count -= 4;
        x = y;
    }
    y = x >> 2;
    if (y != 0) {
        count -= 2;
        x = y;
    }
    y = x >> 1;
    if (y != 0) {
        return count - 2;
    }
    return count - x;
#endif
}

/**
 * Signed saturate a 32 bit value to 8 bits keeping output in 32 bit variable.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in)
{
#if CONFIG_IDF_TARGET_ARCH_XTENSA
    __asm__ volatile("clamps %0, %0, 7" : "+a"(in));
    return in;
#else
    return max(INT8_MIN, min(in, INT8_MAX));
#endif
}

/* Extract bits [62:31] of a 64-bit value; for negative inputs 2^31 - 1 is
 * added first so the arithmetic shift rounds toward zero. */
__NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64)
{
    int32_t sign = (int32_t) (val64 >> 63);
    int32_t to_add = sign & ((1ul << 31) - 1);
    return (int32_t) ((int64_t) (val64 + to_add) >> 31);
}

/* Saturating rounding doubling high multiply (gemmlowp/TFLite-style):
 * returns high 32 bits of 2*in0*in1 with rounding; the single overflow case
 * INT32_MIN * INT32_MIN saturates to INT32_MAX. */
__NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1)
{
    int32_t result;
    int64_t in0_64 = (int64_t) in0;
    bool overflow = (in0 == in1) && (in0 == (int32_t) INT32_MIN);

    /* Nudge value */
    int64_t nudge_val = 1 << 30;
    if ((in0 < 0) ^ (in1 < 0)) {
        nudge_val = 1 - nudge_val;
    }

    /* Multiply and add nudge */
    int64_t mult = in0_64 * in1 + nudge_val;

    /* Round and pickup 32 bits */
    result = esp_nn_pick_sat_high32_of64(mult);

    return overflow ? INT32_MAX : result;
}

/**
 * fast version
 * this will fail for values closer to INT32_MAX and INT32_MIN by `1 << (exponent - 1)`.
 * We can afford to do this because we are at the very last stage of filter.
 * Also it is pretty rare condition as our output is going to be 8 bit.
 */
__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent)
{
    int32_t to_add = (1 << (exponent - 1)) - (val < 0);
    return (int32_t) ((val + to_add) >> exponent);
}

/* Rounding divide by 2^exponent; the sign-adjusted threshold reproduces the
 * bit-exact TFLite reference rounding. */
__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent)
{
    int32_t result;
    const int32_t mask = (1 << exponent) - 1;
    const int32_t remainder = val & mask;

    result = val >> exponent;
    int32_t threshold = (mask >> 1) + (result < 0);

    if (remainder > threshold) {
        result += 1;
    }
    return result;
}

/* Bit-exact requantize: pre-shift left for positive `shift`, then doubling
 * high-mul by `mult`, then rounding right-shift for negative `shift`. */
__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift)
{
    int32_t left_shift = shift > 0 ? shift : 0;
    int32_t right_shift = shift > 0 ? 0 : -shift;
    int32_t result = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult);
    return esp_nn_div_by_power_of_two(result, right_shift);
}

#if CONFIG_IDF_TARGET_ESP32P4
/** PIE enable macro - call once before using any esp.* instructions */
#define ESP_NN_PIE_ENABLE() do { \
    asm volatile ( \
        "csrsi 0x7f2, 0b01 \n\t" \
        "li x29, 0b10 \n\t" \
        "esp.movx.w.cfg x29 \n\t" \
        ::: "x29" \
    ); \
} while(0)

/** Extract 16 int32 per-lane results from QACC into array */
#define ESP_NN_QACC_EXTRACT_S32(dst) do { \
    asm volatile ( \
        "mv x30, %0 \n\t" \
        "esp.st.qacc.l.l.128.ip x30, 16 \n\t" \
        "esp.st.qacc.l.h.128.ip x30, 16 \n\t" \
        "esp.st.qacc.h.l.128.ip x30, 16 \n\t" \
        "esp.st.qacc.h.h.128.ip x30, 0 \n\t" \
        :: "r"(dst) \
        : "x30", "memory" \
    ); \
} while(0)
#endif /* CONFIG_IDF_TARGET_ESP32P4 - PIE_ENABLE and QACC_EXTRACT */

/**
 * 2-wide interleaved requant macro for ESP32-P4 RISC-V.
 * Interleaves mulh across two independent elements for pipeline fill.
 * Outputs r0, r1 as requantized int32 values (before offset/clamp).
 */
#if CONFIG_IDF_TARGET_ESP32P4
#define ESP_NN_REQUANT_2X(x0, x1, m0, m1, s0, s1, r0, r1) do { \
    int32_t _ls0 = (s0) > 0 ? (s0) : 0; \
    int32_t _ls1 = (s1) > 0 ? (s1) : 0; \
    int32_t _v0 = (x0) << _ls0; \
    int32_t _v1 = (x1) << _ls1; \
    int32_t _rs0 = _ls0 - (s0); \
    int32_t _rs1 = _ls1 - (s1); \
    int32_t _hi0, _lo0, _hi1, _lo1; \
    asm volatile ( \
        "mulh %[h0], %[v0], %[mm0] \n\t" \
        "mulh %[h1], %[v1], %[mm1] \n\t" \
        "mul %[l0], %[v0], %[mm0] \n\t" \
        "mul %[l1], %[v1], %[mm1] \n\t" \
        : [h0] "=&r"(_hi0), [h1] "=&r"(_hi1), \
          [l0] "=&r"(_lo0), [l1] "=&r"(_lo1) \
        : [v0] "r"(_v0), [v1] "r"(_v1), \
          [mm0] "r"((int32_t)(m0)), [mm1] "r"((int32_t)(m1)) \
    ); \
    /* Add nudge (1<<30) and extract bits [31:62] */ \
    uint32_t _n = 0x40000000u; \
    uint32_t _a0 = (uint32_t)_lo0 + _n; \
    _hi0 += (_a0 < (uint32_t)_lo0); \
    (r0) = (_hi0 << 1) | (_a0 >> 31); \
    uint32_t _a1 = (uint32_t)_lo1 + _n; \
    _hi1 += (_a1 < (uint32_t)_lo1); \
    (r1) = (_hi1 << 1) | (_a1 >> 31); \
    /* Right shift with rounding */ \
    if (_rs0) { (r0) = ((r0) + (1 << (_rs0 - 1)) - ((r0) < 0)) >> _rs0; } \
    if (_rs1) { (r1) = ((r1) + (1 << (_rs1 - 1)) - ((r1) < 0)) >> _rs1; } \
} while(0)
#endif

/* Fast requantize: always nudges by +1<<30 (not sign-dependent) and skips the
 * INT32_MIN saturation check, so it is not bit-exact versus the function above. */
__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift)
{
    int32_t left_shift = max(shift, 0);
    int32_t right_shift = left_shift - shift;
    int64_t nudge_val = 1 << 30;
    int64_t in0_64 = (int64_t) (x << left_shift);

    /* Multiply and add nudge */
    int64_t mult_64 = in0_64 * mult + nudge_val;
    int32_t result = (int32_t) (mult_64 >> 31);
    if (right_shift) {
        result = esp_nn_div_by_power_of_two_fast(result, right_shift);
    }
    return result;
}

/*
 * Unified requantize wrapper. Defining either SKIP_NUDGE (legacy) or
 * CONFIG_NN_SKIP_NUDGE (Kconfig-driven) selects the faster, non-bit-exact
 * path; otherwise the bit-exact TFLite-reference path is used.
 */
*/ #if defined(SKIP_NUDGE) || defined(CONFIG_NN_SKIP_NUDGE) #define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult_fast((x), (m), (s)) #else #define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult((x), (m), (s)) #endif static void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst, const uint16_t input_wd, const uint16_t input_ht, const uint16_t channels, const int32_t pad_val, const uint16_t pad_wd, const uint16_t pad_ht) { /* memset with pad_val */ memset(dst, pad_val, ((input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht)) * channels); dst += (pad_wd + input_wd + pad_wd) * pad_ht * channels; for (int i = 0; i < input_ht; i++) { dst += pad_wd * channels; for (int j = 0; j < input_wd * channels; j++) { *dst++ = *src++; } dst += pad_wd * channels; } } static void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst, const uint16_t input_wd, const uint16_t input_ht, const uint16_t channels, const int32_t pad_val, const uint16_t pad_wd, const uint16_t pad_ht) { for (int i = 0; i < input_ht; i++) { for (int j = 0; j < input_wd * channels; j++) { *dst++ = *src++; } if (pad_wd) { memset(dst, pad_val, pad_wd * channels); dst += pad_wd * channels; } } /* pad end `pad_ht` lines at end */ if (pad_ht) { memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels); } } /** * @brief convert 8 bit input data to 16 bit * * @param src int8_t source data * @param dst int16_t dst data * @param size length of data * @param offset offset to be added to src data. 
Range: [-128, 127] */ __NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst, const int size, const int32_t offset) { int i = 0; for (; i < size; i += 2) { dst[i + 0] = src[i + 0] + offset; dst[i + 1] = src[i + 1] + offset; } if(i < size) { dst[i] = src[i] + offset; } } /** * @brief convert 8 bit input data to 16 bit * * @param src int8_t source data * @param dst int16_t dst data * @param size length of data */ __NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size) { int i = 0; for (; i < size; i += 2) { dst[i + 0] = src[i + 0]; dst[i + 1] = src[i + 1]; } if(i < size) { dst[i] = src[i]; } } #if CONFIG_IDF_TARGET_ESP32S3 /** * @brief s8 dot product — both pointers 16-byte aligned. * Uses ACCX accumulator with fused MAC+load. * * @param a input data (16-byte aligned) * @param b filter data (16-byte aligned) * @param len number of elements (must be multiple of 16, >= 16) * @return int32_t dot product result */ extern int32_t esp_nn_dot_s8_aligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len); /** * @brief s8 dot product — input aligned, filter may be unaligned. * Uses USAR+QUP pattern for filter data. * * @param a input data (16-byte aligned) * @param b filter data (may be unaligned) * @param len_div16 number of 16-element chunks (>= 1) * @return int32_t dot product result */ extern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16); #endif ================================================ FILE: src/common/esp_nn_common_functions_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

	.text

# Program Unit: esp_nn_aligned_s8_to_s16_with_offset_esp32s3
#
# args: a2 = src (int8), a3 = dst (int16), a4 = size, a5 = offset
# Widens int8 -> int16 while adding a broadcast `offset`, 32 elements per
# iteration on the EE.* SIMD unit (vcmp.lt.s8 derives the sign bytes,
# vzip.8 interleaves them, vadds.s16 adds the offset). A scalar pair loop
# plus single-element tail handles sizes < 32 and remainders.
	.type	esp_nn_aligned_s8_to_s16_with_offset_esp32s3, @function
	.align	4
	.global	esp_nn_aligned_s8_to_s16_with_offset_esp32s3
esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x30d
	entry	a1,48 #
	mov.n	a10,a2 # // src
	mov.n	a9,a3 # // dst
	mov.n	a8,a4 # // size
	s32i.n	a5,a1,12 # [3] // offset
	addi.n	a2,a1,12 # [4]
	blti	a4,32,.Lt_2_6402 # [5] if (size < 32) goto unopt
	addi.n	a6,a8,-1 # [0]
	ee.zero.q	q5 # [1]
	ee.vldbc.16	q4,a2 # [2] id:136 offset
	mov.n	a3,a10 # [3]
	mov.n	a2,a9 # [4]
	ee.vld.128.ip	q0,a3,16 # [5] id:137
	ee.vld.128.ip	q1,a3,16 # [6] id:138
	ee.vcmp.lt.s8	q2,q0,q5 # [7]
	ee.vzip.8	q0,q2 # [8]
	ee.vadds.s16	q0,q0,q4 # [9]
	ee.vadds.s16.st.incp	q0,a2,q0,q2,q4 # [10] id:139
	blti	a4,64,.Lt_2_7170 # [11]
	addi	a5,a4,-32 # [0]
	srai	a5,a5,5 # [1]
	slli	a4,a5,5 # [2]
	# zero-overhead loop: 32 elements per iteration, software-pipelined
	loopgtz	a5,.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 # [3]
	ee.vst.128.ip	q0,a2,16 # [0*II+0] id:140
	ee.vcmp.lt.s8	q0,q1,q5 # [0*II+1]
	ee.vzip.8	q1,q0 # [0*II+2]
	ee.vadds.s16.ld.incp	q2,a3,q3,q1,q4 # [0*II+3] id:141
	ee.vadds.s16.st.incp	q3,a2,q0,q0,q4 # [0*II+4] id:142
	ee.vcmp.lt.s8	q3,q2,q5 # [0*II+5]
	ee.vst.128.ip	q0,a2,16 # [0*II+6] id:143
	ee.vzip.8	q2,q3 # [0*II+7]
	ee.vadds.s16.ld.incp	q1,a3,q0,q2,q4 # [0*II+8] id:144
	ee.vadds.s16.st.incp	q0,a2,q0,q3,q4 # [0*II+9] id:145
.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x36d
	addi	a4,a4,32 # [0]
.Lt_2_3842: # 0x370
	# epilogue of the software pipeline: flush the last vector results
	ee.vst.128.ip	q0,a2,16 # [0] id:146
	ee.vcmp.lt.s8	q2,q1,q5 # [1]
	ee.vzip.8	q1,q2 # [2]
	ee.vadds.s16	q2,q2,q4 # [3]
	ee.vadds.s16	q3,q1,q4 # [4]
	ee.vst.128.ip	q3,a2,16 # [5] id:147
	ee.vst.128.ip	q2,a2,16 # [6] id:148
	bge	a4,a6,.Lt_2_4866 # [7]
	l32i.n	a5,a1,12 # [0] id:135 offset+0x0
.Lt_2_5122: # 0x38a
	# scalar pair loop for the remaining elements
	mov.n	a11,a4 # [0]
	add.n	a2,a4,a10 # [1]
# 576 dst[i + 0] = src[i + 0] + offset;
	l8ui	a7,a2,0 # [2] id:149
	addx2	a6,a4,a9 # [3]
	sext	a7,a7,7 # [4]
	add.n	a7,a7,a5 # [5]
	s16i	a7,a6,0 # [6] id:150
# 577 dst[i + 1] = src[i + 1] + offset;
	l8ui	a3,a2,1 # [7] id:151
	sub	a7,a8,a4 # [8]
	addi.n	a2,a2,2 # [9]
	srai	a7,a7,1 # [10]
	sext	a3,a3,7 # [11]
	add.n	a3,a3,a5 # [12]
	s16i	a3,a6,2 # [13] id:152
	addi.n	a3,a7,-1 # [14]
	loopgtz	a3,.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 # [15]
	l8ui	a3,a2,0 # [0*II+0] id:149
	addi.n	a6,a6,4 # [1*II+1]
	sext	a3,a3,7 # [0*II+2]
	add.n	a3,a3,a5 # [0*II+3]
	s16i	a3,a6,0 # [0*II+4] id:150
	l8ui	a3,a2,1 # [0*II+5] id:151
	addi.n	a2,a2,2 # [0*II+6]
	sext	a3,a3,7 # [0*II+7]
	add.n	a3,a3,a5 # [0*II+8]
	s16i	a3,a6,2 # [0*II+9] id:152
.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3: # 0x3ce
	addx2	a4,a7,a11 # [0]
.Lt_2_4866: # 0x3d1
	bge	a4,a8,.Lt_2_7682 # [0]
# 580 dst[i] = src[i] + offset;
	addx2	a11,a4,a9 # [0]
	add.n	a8,a4,a10 # [1]
	l8ui	a8,a8,0 # [2] id:153
	l32i.n	a12,a1,12 # [3] id:135 offset+0x0
	sext	a8,a8,7 # [4]
	add.n	a8,a8,a12 # [5]
	s16i	a8,a11,0 # [6] id:154
	retw.n # [7]
.Lt_2_6402: # 0x3e8
	blti	a4,2,.Lt_2_6658 # [0]
	movi.n	a4,0 # [0]
	j	.Lt_2_5122 # [1]
.Lt_2_7682: # 0x3f0
	retw.n # [0]
.Lt_2_6658: # 0x3f2
	blti	a4,1,.Lt_2_7682 # [0]
	l8ui	a11,a10,0 # [0] id:153
	sext	a11,a11,7 # [2]
	add.n	a11,a11,a5 # [3]
	s16i	a11,a3,0 # [4] id:154
	retw.n # [5]
.Lt_2_7170: # 0x402
	movi.n	a4,32 # [0]
	j	.Lt_2_3842 # [1]
	.size	esp_nn_aligned_s8_to_s16_with_offset_esp32s3, . - esp_nn_aligned_s8_to_s16_with_offset_esp32s3

	.literal_position

# Program Unit: esp_nn_s8_to_s16_esp32s3
#
# args: a2 = src (int8), a3 = dst (int16), a4 = size
# Sign-extends int8 -> int16, 16 elements per iteration. Handles an
# unaligned src via SAR_BYTE + ee.src.q.qup realignment; scalar pair loop
# plus single-element tail for sizes < 16 and remainders.
# NOTE(review): dst appears to be assumed 16-byte aligned for the
# ee.vst.128.ip stores — confirm against callers.
	.type	esp_nn_s8_to_s16_esp32s3, @function
	.align	4
	.global	esp_nn_s8_to_s16_esp32s3
esp_nn_s8_to_s16_esp32s3: # 0x40b
	entry	a1,32 #
	mov.n	a9,a2 // src
	mov.n	a8,a3 // dst
	mov.n	a7,a4 // size
	blti	a4,1,.Lt_3_4866 // size == 0
	blti	a4,16,.Lt_3_4610 // if (size < 16) jump to unopt path

	// load align_len to sar_byte
	extui	a2,a2,0,4 # [0]
	wur.sar_byte	a2 # [1]
	mov.n	a2,a9 # [2]

	// preload
	ee.vld.128.ip	q0,a2,16
	ee.vld.128.ip	q1,a2,16
	ee.zero.q	q4
# 672
# 673 for (i = 16; i < size - 15; i += 16) {
	blti	a4,32,.Lt_3_5378 # [5]
	addi	a6,a4,-16 # [1]
	srai	a6,a6,4 # [2]
	slli	a4,a6,4 # [3]
	loopgtz	a6,.LBB35_esp_nn_s8_to_s16_esp32s3 # [4]
	ee.src.q.qup	q2,q0,q1 # [0*II+0]
	ee.vcmp.lt.s8	q3,q2,q4 # [0*II+1] // sign
	ee.vld.128.ip	q1,a2,16 # [0*II+2] // for next iteration
	ee.vzip.8	q2,q3 # [0*II+3]
	ee.vst.128.ip	q2,a3,16 # [0*II+4] id:93
	ee.vst.128.ip	q3,a3,16 # [0*II+5] id:94
.LBB35_esp_nn_s8_to_s16_esp32s3: # 0x449
	addi	a4,a4,16 # [0]
.Lt_3_2050: # 0x44c
	# flush the final preloaded vector
	ee.src.q.qup	q5,q0,q1 # [0]
	ee.vcmp.lt.s8	q3,q5,q4 # [1]
	ee.vzip.8	q5,q3 # [2]
	ee.vst.128.ip	q5,a3,16 # [3] id:96
	ee.vst.128.ip	q3,a3,16 # [4] id:97
# 687
# 688 skip_to_remains_s8_to_s16:
# 689 for (; i < size; i += 2) {
	bge	a4,a7,.Lt_3_4866 # [5]
.Lt_3_3330: # 0x45e
	mov.n	a11,a4 # [0]
	add.n	a2,a4,a9 # [1]
# 690 dst[i + 0] = src[i + 0];
	l8ui	a10,a2,0 # [2] id:98
	addx2	a5,a4,a8 # [3]
	sext	a10,a10,7 # [4]
	s16i	a10,a5,0 # [5] id:99
# 691 dst[i + 1] = src[i + 1];
	l8ui	a3,a2,1 # [6] id:100
	sub	a10,a7,a4 # [7]
	addi.n	a2,a2,2 # [8]
	addi.n	a10,a10,1 # [9]
	srai	a10,a10,1 # [10]
	sext	a3,a3,7 # [11]
	s16i	a3,a5,2 # [12] id:101
	addi.n	a3,a10,-1 # [13]
	loopgtz	a3,.LBB50_esp_nn_s8_to_s16_esp32s3 # [14]
	l8ui	a3,a2,0 # [0*II+0] id:98
	addi.n	a5,a5,4 # [1*II+1]
	sext	a3,a3,7 # [0*II+2]
	s16i	a3,a5,0 # [0*II+3] id:99
	l8ui	a3,a2,1 # [0*II+4] id:100
	addi.n	a2,a2,2 # [0*II+5]
	sext	a3,a3,7 # [0*II+6]
	s16i	a3,a5,2 # [0*II+7] id:101
.LBB50_esp_nn_s8_to_s16_esp32s3: # 0x49c
	addx2	a4,a10,a11 # [0]
# 692 }
# 693 if(i < size) {
	bge	a4,a7,.Lt_3_4866 # [1]
# 694 dst[i] = src[i];
	add.n	a11,a4,a9 # [0]
	l8ui	a11,a11,0 # [1] id:102
	addx2	a12,a4,a8 # [2]
	sext	a11,a11,7 # [3]
	s16i	a11,a12,0 # [4] id:103
	retw.n # [5]
.Lt_3_4610: # 0x4b2
	movi.n	a4,0 # [0]
	j	.Lt_3_3330 # [1]
.Lt_3_4866: # 0x4ba
	retw.n # [0]
.Lt_3_5378: # 0x4bc
	movi.n	a4,16 # [1]
	j	.Lt_3_2050 # [2]
	.size	esp_nn_s8_to_s16_esp32s3, . - esp_nn_s8_to_s16_esp32s3

================================================
FILE: src/common/esp_nn_dot_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

//
// Reusable s8 dot product kernels for ESP32-S3.
// Used by conv im2col, FC, and any kernel that reduces to a dot product.
//
// esp_nn_dot_s8_aligned_esp32s3:
//   Both input and filter 16-byte aligned. Uses ee.vld.128.ip + fused MAC.
//
// esp_nn_dot_s8_unaligned_esp32s3:
//   Input aligned, filter may be unaligned. Uses USAR+QUP for filter.
//

	.text
	.align 4

// ============================================================
// esp_nn_dot_s8_aligned_esp32s3
// Both pointers must be 16-byte aligned.
// a2: input_data (aligned)
// a3: filter_data (aligned)
// a4: len (must be multiple of 16, >= 16)
// Returns: int32_t dot product in a2
// ============================================================
	.type	esp_nn_dot_s8_aligned_esp32s3, @function
	.align	4
	.global	esp_nn_dot_s8_aligned_esp32s3
esp_nn_dot_s8_aligned_esp32s3:
	entry	a1, 32
	ee.zero.accx
	beqz	a4, .Lalign_done

	// Compute loop count and remainder
	srli	a5, a4, 4 // a5 = len / 16
	beqz	a5, .Lalign_done

	// Prime: load first pair
	ee.vld.128.ip	q0, a2, 16
	ee.vld.128.ip	q1, a3, 16
	addi	a5, a5, -1
	beqz	a5, .Lalign_last

	// Main loop: fused MAC + load
	// (len/16 - 1) fused MACs here plus the final MAC below = len/16 total;
	// each iteration loads the next input chunk fused with the MAC.
	loopgtz	a5, .Lalign_loop_end
	ee.vmulas.s8.accx.ld.ip	q0, a2, 16, q0, q1
	ee.vld.128.ip	q1, a3, 16
.Lalign_loop_end:
.Lalign_last:
	// Final MAC
	ee.vmulas.s8.accx	q0, q1
.Lalign_done:
	// Read lower 32 bits of ACCX (sufficient for int8 dot products)
	nop
	nop
	rur.accx_0	a2
	retw.n
	.size	esp_nn_dot_s8_aligned_esp32s3, . - esp_nn_dot_s8_aligned_esp32s3

// ============================================================
// esp_nn_dot_s8_unaligned_esp32s3
// Input must be 16-byte aligned. Filter can be unaligned.
// Uses USAR+QUP pattern for filter loads.
// a2: input_data (aligned)
// a3: filter_data (may be unaligned)
// a4: len_div16 (>= 1)
// Returns: int32_t dot product in a2
//
// NOTE(review): the pipelined ee.ld.128.usar.ip pattern issues filter loads
// one chunk ahead, so it can read up to 16 bytes past the last filter chunk
// — confirm callers provide padded/readable filter buffers.
// ============================================================
	.type	esp_nn_dot_s8_unaligned_esp32s3, @function
	.align	4
	.global	esp_nn_dot_s8_unaligned_esp32s3
esp_nn_dot_s8_unaligned_esp32s3:
	entry	a1, 32
	ee.zero.accx
	beqz	a4, .Lunalign_done

	// Prime: first unaligned filter load (sets SAR_BYTE)
	ee.ld.128.usar.ip	q0, a3, 16

	// Check if we can do 2x unrolled (need >= 2 iterations)
	srai	a5, a4, 1 // a5 = len_div16 / 2
	beqz	a5, .Lunalign_single

	// Load first input + filter pair for unrolled loop
	ee.vld.128.ip	q1, a2, 16
	ee.ld.128.usar.ip	q2, a3, 16

	// 2x unrolled main loop
	loopgtz	a5, .Lunalign_loop2_end
	ee.src.q.qup	q4, q0, q2 // align filter[i]
	ee.vld.128.ip	q3, a2, 16 // input[i+1]
	ee.vmulas.s8.accx	q4, q1 // MAC filter[i] * input[i]
	ee.ld.128.usar.ip	q0, a3, 16 // filter chunk[i+2]
	ee.src.q.qup	q5, q2, q0 // align filter[i+1]
	ee.vld.128.ip	q1, a2, 16 // input[i+2] (primed for next)
	ee.vmulas.s8.accx	q5, q3 // MAC filter[i+1] * input[i+1]
	ee.ld.128.usar.ip	q2, a3, 16 // filter chunk[i+3]
.Lunalign_loop2_end:

	// Check if there's a remaining single iteration (odd len_div16)
	bbci	a4, 0, .Lunalign_done_mac

	// Odd remainder: the 2x loop already loaded q0/q2 for the next chunk.
	// Just qup the filter and MAC with the primed input (q1).
	// But q1 was loaded as input[i+2] in the last loop iteration — we need
	// to re-read the correct input. Actually, q1 is already the right input.
	// q0 and q2 are the filter chunks ready for qup.
	ee.src.q.qup	q4, q0, q2
	ee.vmulas.s8.accx	q4, q1
	j	.Lunalign_done_mac

.Lunalign_single:
	// Called when len_div16 < 2 (single chunk only)
	ee.vld.128.ip	q1, a2, 16
	ee.ld.128.usar.ip	q2, a3, 16
	ee.src.q.qup	q4, q0, q2
	ee.vmulas.s8.accx	q4, q1

.Lunalign_done_mac:
.Lunalign_done:
	// 2-cycle gap before ACCX read
	movi.n	a3, 0
	nop
	ee.srs.accx	a2, a3, 0
	retw.n
	.size	esp_nn_dot_s8_unaligned_esp32s3, .
- esp_nn_dot_s8_unaligned_esp32s3 ================================================ FILE: src/common/esp_nn_mean_ansi.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * Quantized mean reduction over spatial dimensions (axes 1,2). * Specialized for 4D tensors [N, H, W, C] → [N, 1, 1, C]. * This is the common case in Squeeze-and-Excite blocks. */ #include #include void esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output, const int32_t height, const int32_t width, const int32_t channels, const int32_t input_zero_point, const int32_t output_zero_point, const int32_t multiplier, const int32_t shift) { const int32_t num_elements = height * width; for (int c = 0; c < channels; c++) { /* Sum over spatial dimensions */ int32_t sum = 0; for (int h = 0; h < height; h++) { for (int w = 0; w < width; w++) { sum += input[(h * width + w) * channels + c]; } } /* Apply zero point correction */ sum -= num_elements * input_zero_point; /* Requantize: multiply_by_quantized_mult(sum, multiplier, shift) */ int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift); result += output_zero_point; result = max(result, -128); result = min(result, 127); output[c] = (int8_t)result; } } ================================================ FILE: src/common/esp_nn_mean_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * ESP32-P4 optimized spatial mean reduction using QACC per-lane accumulation. * Processes 16 channels in parallel via esp.vmulas.s8.qacc (same pattern as avg_pool). 
*/

#include
#include

/* ESP32-P4 NHWC mean: 16-channel blocks are accumulated in the PIE QACC
 * (16 per-lane int32 sums via esp.vmulas.s8.qacc against a broadcast 1),
 * remaining channels fall back to a scalar loop. Requantization is the
 * bit-exact esp_nn_multiply_by_quantized_mult path. */
void esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output,
                                 const int32_t height, const int32_t width,
                                 const int32_t channels,
                                 const int32_t input_zero_point,
                                 const int32_t output_zero_point,
                                 const int32_t multiplier,
                                 const int32_t shift)
{
    const int32_t num_elements = height * width;
    const int32_t ch_16 = channels >> 4;
    const int8_t one_val = 1;

    if (ch_16 > 0) {
        /* Enable PIE and broadcast 1 into q7 */
        asm volatile (
            "csrsi 0x7f2, 0b01 \n\t"
            "li x29, 0b10 \n\t"
            "esp.movx.w.cfg x29 \n\t"
            ::: "x29"
        );
        asm volatile (
            "mv x30, %0 \n\t"
            "esp.vldbc.8.ip q7, x30, 0 \n\t"
            :: "r"(&one_val)
            : "x30"
        );
    }

    /* Process all channels - QACC for 16-channel blocks, scalar for remainder */
    int ch = 0;
    for (int ch_blk = 0; ch_blk < ch_16; ch_blk++, ch += 16) {
        /* Single asm block: broadcast ones, zero QACC, accumulate all spatial
         * positions. Keeping in one block prevents compiler from clobbering
         * q7 between the broadcast and the MAC loop. */
        const int8_t *base_ptr = input + ch;
        asm volatile (
            /* Broadcast 1 into q7 */
            "mv x30, %[one] \n\t"
            "esp.vldbc.8.ip q7, x30, 0 \n\t"
            /* Zero QACC */
            "esp.zero.qacc \n\t"
            /* Accumulate loop: stride = channels between spatial positions */
            "mv x30, %[base] \n\t"
            "mv s7, %[cnt] \n\t"
            "1: \n\t"
            "esp.vld.128.ip q0, x30, 0 \n\t"
            "esp.vmulas.s8.qacc q0, q7 \n\t"
            "add x30, x30, %[stride] \n\t"
            "addi s7, s7, -1 \n\t"
            "bnez s7, 1b \n\t"
            :
            : [one] "r"(&one_val), [base] "r"(base_ptr),
              [cnt] "r"(num_elements), [stride] "r"((int32_t)channels)
            : "x30", "s7"   /* s7 is callee-saved; listing it as a clobber
                               makes the compiler save/restore it */
        );
        /* Spill the 16 per-lane QACC sums and requantize each lane. */
        int32_t sums[16] __attribute__((aligned(16)));
        ESP_NN_QACC_EXTRACT_S32(sums);
        int32_t zp_correction = num_elements * input_zero_point;
        for (int k = 0; k < 16; k++) {
            int32_t result = sums[k] - zp_correction;
            result = esp_nn_multiply_by_quantized_mult(result, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[ch + k] = (int8_t)result;
        }
    }

    /* Remaining channels scalar */
    for (; ch < channels; ch++) {
        int32_t sum = 0;
        for (int hw = 0; hw < num_elements; hw++) {
            sum += input[hw * channels + ch];
        }
        sum -= num_elements * input_zero_point;
        int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
        result += output_zero_point;
        result = max(result, -128);
        result = min(result, 127);
        output[ch] = (int8_t)result;
    }
}

================================================
FILE: src/common/esp_nn_mean_s8_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-S3 optimized mean reduction for NHWC int8 tensors.
 * Uses int16 accumulation for small spatial sizes (H*W <= 256),
 * int32 for larger. Accumulates all channels at once per spatial position.
 */

#include
#include
#include

/* NHWC mean with channel-parallel accumulation: all channels are summed in
 * one pass over the spatial positions (cache/vectorizer friendly), then
 * requantized per channel. Accumulator width is chosen by H*W so the int16
 * path cannot overflow. NOTE(review): the acc16/acc arrays are VLAs — up to
 * 1 KB / 2 KB of stack at channels == 512; confirm task stack budgets. */
void esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output,
                                 const int32_t height, const int32_t width,
                                 const int32_t channels,
                                 const int32_t input_zero_point,
                                 const int32_t output_zero_point,
                                 const int32_t multiplier,
                                 const int32_t shift)
{
    const int32_t num_elements = height * width;
    const int32_t zp_correction = num_elements * input_zero_point;

    if (num_elements <= 256 && channels <= 512) {
        /* int16 accumulation (safe: 256 * 127 = 32,512 < 32,767) */
        /* Process 8 channels at a time using int16 accumulators */
        int16_t acc16[channels];
        memset(acc16, 0, channels * sizeof(int16_t));

        const int8_t *ptr = input;
        for (int i = 0; i < num_elements; i++) {
            /* Inner loop — compiler should auto-vectorize with -O2 */
            for (int c = 0; c < channels; c++) {
                acc16[c] += (int16_t)ptr[c];
            }
            ptr += channels;
        }

        /* Requantize per channel */
        for (int c = 0; c < channels; c++) {
            int32_t sum = (int32_t)acc16[c] - zp_correction;
            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[c] = (int8_t)result;
        }
    } else if (channels <= 512) {
        /* int32 accumulation for larger spatial sizes */
        int32_t acc[channels];
        memset(acc, 0, channels * sizeof(int32_t));

        const int8_t *ptr = input;
        for (int i = 0; i < num_elements; i++) {
            for (int c = 0; c < channels; c++) {
                acc[c] += ptr[c];
            }
            ptr += channels;
        }

        for (int c = 0; c < channels; c++) {
            int32_t sum = acc[c] - zp_correction;
            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[c] = (int8_t)result;
        }
    } else {
        /* Per-channel fallback for huge channel counts */
        for (int c = 0; c < channels; c++) {
            int32_t sum = 0;
            for (int i = 0; i < num_elements; i++) {
                sum += input[i * channels + c];
            }
            sum -= zp_correction;
            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);
            result += output_zero_point;
            result = max(result, -128);
            result = min(result, 127);
            output[c] = (int8_t)result;
        }
    }
}

================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * Fast 2-wide requantization for ESP32-P4 (RISC-V).
 * Interleaves mul/mulh across 2 elements for better pipeline utilization.
 * Uses a0-a7 and t0-t6 only (no callee-saved registers needed).
 *
 * void esp_nn_requant_2x_esp32p4(
 *     int32_t x0,      // a0
 *     int32_t x1,      // a1
 *     int32_t mult0,   // a2
 *     int32_t mult1,   // a3
 *     int32_t shift0,  // a4
 *     int32_t shift1,  // a5
 *     int32_t *out     // a6: pointer to store 2 results
 * );
 */

	.text
	.align	4
	.global	esp_nn_requant_2x_esp32p4
	.type	esp_nn_requant_2x_esp32p4, @function
esp_nn_requant_2x_esp32p4:
	/* Compute left_shift and apply */
	/* ls = max(shift, 0); value is pre-shifted left, rs = ls - shift */
	mv	t0, a0 /* x0 */
	mv	t1, a1 /* x1 */
	bgez	a4, .Lls0_pos
	mv	t6, zero /* ls0 = 0 */
	j	.Lls0_done
.Lls0_pos:
	sll	t0, t0, a4 /* x0 <<= shift0 (positive = left shift) */
	mv	t6, a4 /* ls0 = shift0 */
.Lls0_done:
	sub	a4, t6, a4 /* rs0 = ls0 - shift0 */
	bgez	a5, .Lls1_pos
	mv	t6, zero
	j	.Lls1_done
.Lls1_pos:
	sll	t1, t1, a5
	mv	t6, a5
.Lls1_done:
	sub	a5, t6, a5 /* rs1 = ls1 - shift1 */

	/* ---- Interleaved 64-bit multiply ---- */
	/* mulh first (both elements), then mul (both elements) */
	mulh	t2, t0, a2 /* hi0 */
	mulh	t3, t1, a3 /* hi1 */
	mul	t0, t0, a2 /* lo0 */
	mul	t1, t1, a3 /* lo1 */

	/* Add nudge and combine: result = ((hi:lo) + (1<<30)) >> 31 */
	li	t4, 0x40000000 /* nudge = 1 << 30 */
	add	t5, t0, t4 /* lo0 + nudge */
	sltu	t6, t5, t0 /* carry0 */
	add	t2, t2, t6 /* hi0 += carry0 */
	srli	t5, t5, 31 /* (lo0+nudge) >> 31 */
	slli	t0, t2, 1 /* hi0 << 1 */
	or	t0, t0, t5 /* result0 */
	add	t5, t1, t4 /* lo1 + nudge */
	sltu	t6, t5, t1 /* carry1 */
	add	t3, t3, t6 /* hi1 += carry1 */
	srli	t5, t5, 31
	slli	t1, t3, 1
	or	t1, t1, t5 /* result1 */

	/* ---- Right shift with rounding ---- */
	/* matches esp_nn_div_by_power_of_two_fast: add (1<<(rs-1)) - (v<0) */
	li	t4, 1
	beqz	a4, .Lskip_rs0
	addi	t5, a4, -1
	sll	t5, t4, t5 /* round0 = 1 << (rs0-1) */
	srai	t6, t0, 31 /* -1 if negative, 0 otherwise */
	add	t5, t5, t6 /* round0 += sign */
	add	t0, t0, t5
	sra	t0, t0, a4
.Lskip_rs0:
	beqz	a5, .Lskip_rs1
	addi	t5, a5, -1
	sll	t5, t4, t5
	srai	t6, t1, 31
	add	t5, t5, t6
	add	t1, t1, t5
	sra	t1, t1, a5
.Lskip_rs1:
	/* Store results */
	sw	t0, 0(a6)
	sw	t1, 4(a6)
	ret
	.size	esp_nn_requant_2x_esp32p4, . - esp_nn_requant_2x_esp32p4

================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// the macro `use_nudge` enables adding rounding factor similar to tflite implementation
// this barely changes any accuracy
// keep this disabled for better performance
#ifndef SKIP_NUDGE
# set SKIP_NUDGE flag for ~20% faster (but not bit-exact) quantisation
	.set	use_nudge, 1
#endif

	.text
	.literal_position
	.literal .nudge_val, 1073741824 # 1 << 30

# Vectorized requantize of the 4 int32 lanes in q0 by a single multiplier
# (a2) and a single signed shift (a3). Left shift (a3 >= 0) is applied via
# ee.vsl.32; the 64-bit multiply + optional nudge + rounding right shift is
# done per lane in scalar registers, results returned in q0.
	.type	esp_nn_multiply_by_quantized_mult_asm_esp32s3, @function
	.align	4
	.global	esp_nn_multiply_by_quantized_mult_asm_esp32s3
esp_nn_multiply_by_quantized_mult_asm_esp32s3: # 0x4
# to_add = 4
	entry	a1,32
	wsr.sar	a3
	ee.zero.q	q2
	bltz	a3, .skip_left_shift
	ee.vsl.32	q0,q0 # [13]
.skip_left_shift:
	ssai	31 # [15]
# move data to general purpose registers
	ee.movi.32.a	q0,a12,0 # [17]
	ee.movi.32.a	q0,a13,1 # [16]
	ee.movi.32.a	q0,a14,2 # [18]
	ee.movi.32.a	q0,a15,3 # [19]
.ifdef use_nudge
	l32r	a6,.nudge_val
.endif
# perform 64 bit mult
	mulsh	a4,a2,a12 # [22]
	mulsh	a11,a2,a13 # [23]
	mulsh	a10,a2,a14 # [21]
	mulsh	a8,a2,a15 # [20]
	mull	a12,a2,a12 # [24]
	mull	a13,a2,a13 # [25]
	mull	a14,a2,a14 # [26]
	mull	a15,a2,a15 # [27]
# add nudge_val and discard low31
.ifdef use_nudge
	add.n	a14,a6,a14 # [41]
	saltu	a2,a14,a6 # [44]
	add.n	a10,a10,a2 # [45]
	add.n	a13,a6,a13 # [47]
	saltu	a9,a13,a6 # [50]
	add.n	a11,a11,a9 # [51]
.endif
	src	a10,a10,a14 # [88]
	src	a11,a11,a13 # [78]
	ee.movi.32.q	q0,a10,2
	ee.movi.32.q	q0,a11,1
.ifdef use_nudge
	add.n	a15,a6,a15 # [36]
	saltu	a2,a15,a6 # [39]
	add.n	a8,a8,a2 # [40]
	add.n	a12,a6,a12 # [54]
	saltu	a10,a12,a6 # [57]
	add.n	a4,a4,a10 # [58]
.endif
	src	a8,a8,a15 # [95]
	src	a4,a4,a12 # [69]
# discard lower 31 bits
	ee.movi.32.q	q0,a8,3
	ee.movi.32.q	q0,a4,0
	bgez	a3, .skip_div_by_power_of_2
# negative shift: vector rounding divide-by-power-of-2 on all 4 lanes
	neg	a5,a3 # [0] right_shift/exponent = -shift
	ee.vcmp.lt.s32	q2,q0,q2 # [97]
	addi.n	a7,a5,-1 # [0] exponent - 1
	ssl	a7 # [1]
	movi.n	a6,1 # [92]
	sll	a6,a6 # [2]
	s32i.n	a6,a1,4 # [3] to_add
	addi.n	a4,a1,4 # [94] to_add_addr
	ee.vldbc.32	q1,a4 # [4] id:148 to_add
	wsr.sar	a5
	ee.vadds.s32	q1,q1,q2
	ee.vadds.s32	q0,q0,q1
	ee.vsr.32	q0,q0
.skip_div_by_power_of_2:
	retw.n # [9]
	.size	esp_nn_multiply_by_quantized_mult_asm_esp32s3, . - esp_nn_multiply_by_quantized_mult_asm_esp32s3

================================================
FILE: src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// quantisation version where we deal with different shifts and mults.
	.set	use_nudge, 1
	.text
	.literal_position
	.literal .LC3_19_48, 1073741824

# Program Unit: esp_nn_multiply_by_quantized_mult_ver1_esp32s3
#
# Per-channel variant: requantizes the 4 int32 lanes of q0 using 4 distinct
# multipliers (array at a2) and 4 distinct shifts (array at a3). Each lane:
# left shift by max(shift, 0), 64-bit multiply with +2^30 nudge, take bits
# [31:62], then a vectorized rounding right-shift step. Result in q0.
	.type	esp_nn_multiply_by_quantized_mult_ver1_esp32s3, @function
	.align	4
	.global	esp_nn_multiply_by_quantized_mult_ver1_esp32s3
esp_nn_multiply_by_quantized_mult_ver1_esp32s3: # 0x1ee
	entry	a1,32 #
	ee.zero.q	q3 # [0]
	l32i.n	a8,a3,0 # [5] id:200 // shift0
	l32i.n	a7,a3,4 # [2] id:201 // shift1
	l32i.n	a12,a2,0 # [3] id:204 // mult0
	l32i.n	a15,a2,4 # [1] id:205 // mult1
	movi.n	a10,0 # [7]
	max	a6,a10,a8 # [1] // left_shift0
	max	a5,a10,a7 # [7] // left_shift1
	sub	a8,a6,a8 # [2] // right_shift0
	sub	a7,a5,a7 # [8] // right_shift1
	ee.movi.32.a	q0,a9,0 # [4]
	ee.movi.32.a	q0,a11,1 # [11]
	ssl	a6 # [3]
	sll	a9,a9 # [4]
	mulsh	a4,a12,a9 # [6]
	mull	a12,a12,a9 # [9]
	ssl	a5 # [10]
	sll	a11,a11 # [12]
	mulsh	a14,a15,a11 # [14]
	mull	a15,a15,a11 # [16]
	l32r	a13,.LC3_19_48 # [23]
	ee.movi.32.q	q0,a9,0 # [5]
	ee.movi.32.q	q0,a11,1 # [15]
	l32i.n	a6,a3,8 # [6] id:202 // shift2
	l32i.n	a9,a2,8 # [19] id:206 // mult2
	max	a5,a10,a6 # [0] // left_shift2
	sub	a6,a5,a6 # [24] // right_shift2
	ee.movi.32.a	q0,a11,2 # [17]
	ssl	a5 # [13]
	sll	a11,a11 # [18]
	ee.movi.32.q	q0,a11,2 # [20]
	mulsh	a5,a9,a11 # [21]
	mull	a9,a9,a11 # [22]
	mov	a11, a5
// add nudge to result0 & result1
	add.n	a12,a13,a12 # [25]
	saltu	a5,a12,a13 # [26]
	add.n	a15,a13,a15 # [27]
	add.n	a5,a5,a4 # [28]
	saltu	a4,a15,a13 # [29]
	add.n	a4,a4,a14 # [30]
	l32i.n	a14,a3,12 # [31] id:203 // shift3
	add.n	a9,a13,a9 # [32] // add nudge low2
	max	a10,a10,a14 # [33] // left_shift3
	sub	a14,a10,a14 # [34] // right_shift3
	ssl	a10 # [35]
	ee.movi.32.a	q0,a10,3 # [36]
	sll	a10,a10 # [37]
// select high32 from result0 and resul1
	ssai	31 # [39]
	src	a5,a5,a12 # [40]
	src	a4,a4,a15 # [41]
	movi.n	a12,1 # [42]
	ee.movi.32.q	q0,a5,0 # [43]
	saltu	a15,a9,a13 # [44]
	add.n	a15,a15,a11 # [45]
	ee.movi.32.q	q0,a4,1 # [46]
	l32i.n	a11,a2,12 # [47] id:207 // mult3
	src	a15,a15,a9 # [48]
	ee.movi.32.q	q0,a15,2 # [49]
	mull	a9,a11,a10 # [50]
	mulsh	a11,a11,a10 # [51]
	add.n	a9,a13,a9 # [52]
	saltu	a13,a9,a13 # [53]
	add.n	a13,a13,a11 # [54]
	src	a13,a13,a9 # [55]
	ee.movi.32.q	q0,a13,3 # [57]
// divide_by_power_of2_step
# builds per-lane masks (1 << rs) - 1 in q2, applies per-lane arithmetic
# right shifts in scalar regs, then the vector compare/subtract sequence
# performs the rounding correction on all 4 lanes at once.
	ssl	a8 # [56]
	sll	a9,a12 # [58]
	ssl	a7 # [59]
	addi.n	a9,a9,-1 # [60]
	ee.movi.32.q	q2,a9,0 # [61]
	sll	a11,a12 # [62]
	addi.n	a11,a11,-1 # [63]
	ssl	a6 # [64]
	sll	a10,a12 # [65]
	ee.movi.32.q	q2,a11,1 # [66]
	ssl	a14 # [67]
	addi.n	a10,a10,-1 # [68]
	ee.movi.32.q	q2,a10,2 # [69]
	sll	a9,a12 # [70]
	addi.n	a9,a9,-1 # [71]
	ee.movi.32.q	q2,a9,3 # [74]
	ee.andq	q1,q0,q2 # [75]
	ssr	a8 # [72]
	sra	a5,a5 # [73]
	ssr	a7 # [76]
	sra	a4,a4 # [78]
	ssr	a6 # [79]
	sra	a15,a15 # [81]
	ssr	a14 # [82]
	sra	a13,a13 # [84]
	wsr.sar	a12 # [85]
	ee.movi.32.q	q7,a5,0 # [77]
	ee.movi.32.q	q7,a4,1 # [80]
	ee.movi.32.q	q7,a15,2 # [83]
	ee.movi.32.q	q7,a13,3 # [86]
	ee.vcmp.lt.s32	q3,q7,q3 # [87]
	ee.vsr.32	q2,q2 # [88]
	ee.vsubs.s32	q2,q2,q3 # [89]
	ee.vcmp.gt.s32	q1,q1,q2 # [90]
	ee.vsubs.s32	q0,q7,q1 # [91]
// return
	retw.n # [92]
	.size	esp_nn_multiply_by_quantized_mult_ver1_esp32s3, . - esp_nn_multiply_by_quantized_mult_ver1_esp32s3

================================================
FILE: src/convolution/esp_nn_conv_ansi.c
================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include #include int esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { return 0; } void esp_nn_set_conv_scratch_buf_ansi(const void *buf) { } /** * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dialation width = 1 */ void esp_nn_conv_u8_ansi(const uint8_t *input_data, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const uint16_t pad_wd, const uint16_t pad_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint8_t *filter_data, const uint16_t filter_wd, const uint16_t filter_ht, const int32_t filter_offset, const int32_t *bias, uint8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max) { for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; for (int out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {//channel_loop int32_t result = 0; /* Select filter so as the point doesn't lie outside block */ int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; for (int in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) { int32_t input_index = (idx_y * input_wd + idx_x) * in_channels + in_ch_idx; int32_t filter_index = 
((out_ch_idx * filter_ht + filter_y_idx) * filter_wd + filter_x_idx) * in_channels + in_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index] + filter_offset; result += input_val * filter_val; } } } if (bias) { result += bias[out_ch_idx]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); int out_index = (out_y * out_wd + out_x) * out_channels + out_ch_idx; out_data[out_index] = (uint8_t) result; } } } } /** * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dialation width = 1 */ void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Fall back to in_channels when filter_dims->channels is unset (legacy callers). 
*/ const uint16_t filter_ch = filter_dims->channels ? filter_dims->channels : in_channels; const int32_t groups = in_channels / filter_ch; const int32_t filters_per_group = out_channels / groups; int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx; for (out_y = 0; out_y < out_ht; out_y++) { for (out_x = 0; out_x < out_wd; out_x++) { for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0; const int32_t group = out_ch_idx / filters_per_group; const int32_t in_ch_start = group * filter_ch; const int32_t base_y = stride_ht * out_y - pad_ht; const int32_t base_x = stride_wd * out_x - pad_wd; const int32_t filter_y_start = max(0, -base_y); const int32_t filter_x_start = max(0, -base_x); const int32_t filter_y_end = min(filter_ht, input_ht - base_y); const int32_t filter_x_end = min(filter_wd, input_wd - base_x); for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x + filter_x_idx; int32_t input_base_offset = (in_row * input_wd + in_col) * in_channels + in_ch_start; int32_t filter_base_offset = out_ch_idx * filter_ch * filter_ht * filter_wd + (filter_y_idx * filter_wd + filter_x_idx) * filter_ch; for (in_ch_idx = 0; in_ch_idx < filter_ch; in_ch_idx++) { conv_out += (input_data[input_base_offset + in_ch_idx] + input_offset) * filter_data[filter_base_offset + in_ch_idx]; } } } if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } ================================================ FILE: src/convolution/esp_nn_conv_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 
2024-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * Optimizations strategies used: * Below optimizations are capable of any size of input/filter: * * 1. For filter wdxht = 1x1 (Refer esp_nn_conv_s8_mult8_1x1_esp32p4 function) * - For this specific version, the strategy we employ: * > This particular filter has only the channel * dimension and we have `out_ch` number of such filters. * > We take 8 input lines at a time and transpose those. * > Keep loading and multiplying filter values one by one, * to produce 8 outputs in parallel * * 2. General version: (Refer esp_nn_conv_s8_filter_aligned_input_padded_esp32p4) * - For all other cases: * > Consider `filter_wd * in_ch` as a single row. These many values can * be continuosly loaded from inputs as well. * > multiply accumulate into a single filter output. * > To speed things up further, we pre-calculate * (filter * in_offset + bias term) earlier and add it at the end of filter * * About ((filter * in_offset + bias term)) accumulate term: * > The conv operation before requantization is as follows: * for i in filter_size: * conv_out += (input + input_offset) * filter; * conv_out += bias * * > where input_offset is constant term hence, we can see that * this term can be precalculated as: * for i in filter_size: * acc_term += input_offset * filter[i]; * acc_term += bias * OR * for i in filter_size: * acc_term += filter[i]; // accumulate filter values * acc_term = acc_term * input_offset + bias * * * In both the above versions we align the filter if needed, pad the input with * -input_offset if needed and extend the channels to make those multiple * of 8/16 as per function needs */ #include #include #include #include "esp_nn_generic_opt.h" #include static int16_t *scratch_buffer = NULL; /** * Reusable PIE-accelerated dot product (same as FC version). * Processes 32 elements/iter (double-pump) for len >= 32, * 16 elements/iter for len >= 16, scalar remainder. 
*/ static inline __attribute__((always_inline)) int32_t pie_dot_s8(const int8_t *a, const int8_t *b, int32_t len) { int32_t result = 0; int32_t idx = 0; if (len >= 32) { asm volatile ( "esp.zero.xacc \n\t" "mv x30, %[in] \n\t" "mv x31, %[flt] \n\t" "li %[idx], 32 \n\t" "addi s7, %[len], -31 \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q2, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vld.128.ip q3, x31, 16 \n\t" "j 2f \n\t" "1: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vmulas.s8.xacc.ld.ip q2, x30, 16, q2, q3 \n\t" "esp.vld.128.ip q3, x31, 16 \n\t" "addi %[idx], %[idx], 32 \n\t" "2: \n\t" "blt %[idx], s7, 1b \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "esp.vmulas.s8.xacc q2, q3 \n\t" "addi s7, %[len], -15 \n\t" "bge %[idx], s7, 3f \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "addi %[idx], %[idx], 16 \n\t" "3: \n\t" "esp.movx.r.xacc.l x30 \n\t" "mv %[res], x30 \n\t" : [idx] "+r"(idx), [res] "=r"(result) : [in] "r"(a), [flt] "r"(b), [len] "r"(len) : "x30", "x31", "s7" ); } else if (len >= 16) { asm volatile ( "esp.zero.xacc \n\t" "mv x30, %[in] \n\t" "mv x31, %[flt] \n\t" "li %[idx], 16 \n\t" "addi s7, %[len], -15 \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "j 5f \n\t" "4: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "addi %[idx], %[idx], 16 \n\t" "5: \n\t" "blt %[idx], s7, 4b \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "esp.movx.r.xacc.l x30 \n\t" "mv %[res], x30 \n\t" : [idx] "+r"(idx), [res] "=r"(result) : [in] "r"(a), [flt] "r"(b), [len] "r"(len) : "x30", "x31", "s7" ); } for (; idx < len; idx++) { result += (int32_t)a[idx] * (int32_t)b[idx]; } return result; } /** * Batched 1x1 conv using QACC per-lane: processes 16 pixels simultaneously. * Transposes input so each QACC lane = one pixel, then broadcasts filter * coefficients for per-lane accumulation. 
Critical for small in_ch where * XACC can't be used (in_ch < 16). * * For in_ch=8: 4.5x faster than scalar per-pixel approach. */ __attribute__((noinline)) static void conv_1x1_batch16(const int8_t *pixel_ptrs[16], const int8_t *filter_data, const int32_t *filter_sum, const int32_t *bias, int8_t *out_ptrs[16], int32_t in_ch, int32_t out_ch, int32_t out_offset, const int32_t *out_mult, const int32_t *out_shift, int32_t act_min, int32_t act_max) { /* Ensure PIE is enabled (might be lost across noinline function call) */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); /* Transpose: arrange 16 pixels' data as ch0[p0..p15], ch1[p0..p15], ... */ int8_t transposed[16 * 16] __attribute__((aligned(16))); /* in_ch <= 16 for this path */ for (int c = 0; c < in_ch; c++) { for (int p = 0; p < 16; p++) { transposed[c * 16 + p] = pixel_ptrs[p][c]; } } /* For each output channel: QACC per-lane MAC with broadcast filter. * Use single asm block for zero + accumulate loop to prevent * q register clobber between separate asm blocks. */ const int8_t *filt = filter_data; for (int32_t oc = 0; oc < out_ch; oc++) { /* Single asm: zero QACC, then loop over in_ch channels: * broadcast filter[ch], load 16 transposed pixels, MAC per-lane */ asm volatile ( "esp.zero.qacc \n\t" "mv x30, %[trans] \n\t" /* transposed base */ "mv x31, %[flt] \n\t" /* filter base */ "mv s7, %[cnt] \n\t" /* in_ch count */ "1: \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" /* load 16 pixel values, advance by 16 */ "esp.vldbc.8.ip q1, x31, 1 \n\t" /* broadcast filter[ch], advance by 1 */ "esp.vmulas.s8.qacc q0, q1 \n\t" "addi s7, s7, -1 \n\t" "bnez s7, 1b \n\t" : : [trans] "r"(transposed), [flt] "r"(filt), [cnt] "r"(in_ch) : "x30", "x31", "s7" ); /* Extract 16 results */ int32_t results[16] __attribute__((aligned(16))); ESP_NN_QACC_EXTRACT_S32(results); /* Add filter_sum + bias, requant, clamp, store for each pixel */ int32_t fs = filter_sum[oc]; int32_t b = bias ? 
bias[oc] : 0; int32_t combined = fs + b; int32_t m = out_mult[oc]; int32_t s = out_shift[oc]; for (int p = 0; p < 16; p++) { int32_t r = results[p] + combined; r = esp_nn_multiply_by_quantized_mult(r, m, s); r += out_offset; r = max(r, act_min); r = min(r, act_max); out_ptrs[p][oc] = (int8_t) r; } filt += in_ch; } } __attribute__ ((noinline)) static void esp_nn_conv_s8_1x1(const data_dims_t *input_dims, const int8_t *input_data, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; int32_t *filter_sum = (int32_t *) scratch; // alignment of 4 bytes assumed /* pre-calculate filter_sum * input_offset */ const int8_t *filter_ptr = filter_data; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t sum = 0; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { sum += *filter_ptr++; } filter_sum[out_ch_idx] = sum * input_offset; } /* When in_ch < 16: use QACC batch path (16 pixels at once) or channel padding. * QACC batch: transpose pixels, broadcast filter, per-lane MAC. * Channel pad: pad in/filter to 16 ch for XACC. */ /* When in_ch < 16: use QACC batch (16 pixels at a time with broadcast filter). * Falls back to channel-padding for remaining pixels. 
*/ if (in_channels < 16) { /* Enable PIE for QACC */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); int32_t total_pixels = out_wd * out_ht; int32_t pix = 0; /* Process batches of 16 pixels using QACC per-lane */ for (; pix <= total_pixels - 16; pix += 16) { const int8_t *pp[16]; int8_t *op[16]; for (int p = 0; p < 16; p++) { pp[p] = input_data + (pix + p) * in_channels; op[p] = out_data + (pix + p) * out_channels; } conv_1x1_batch16(pp, filter_data, filter_sum, bias, op, in_channels, out_channels, out_offset, quant_data->mult, quant_data->shift, activation_min, activation_max); } /* Remaining pixels (< 16): scalar fallback */ for (; pix < total_pixels; pix++) { const int8_t *inp = input_data + pix * in_channels; filter_ptr = filter_data; for (int32_t oc = 0; oc < out_channels; oc++) { int32_t conv_out = 0; for (int32_t ic = 0; ic < in_channels; ic++) { conv_out += inp[ic] * filter_ptr[ic]; } conv_out += filter_sum[oc]; if (bias) conv_out += bias[oc]; conv_out = esp_nn_multiply_by_quantized_mult(conv_out, quant_data->mult[oc], quant_data->shift[oc]); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); out_data[pix * out_channels + oc] = (int8_t) conv_out; filter_ptr += in_channels; } } return; } for (int32_t in_row = 0; in_row < out_ht; in_row++) { for (int32_t in_col = 0; in_col < out_wd; in_col++) { const int32_t *out_mult = quant_data->mult; const int32_t *out_shift = quant_data->shift; filter_ptr = filter_data; const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { /* initializations */ int32_t conv_out = 0; const int8_t *input_ptr = input_base_ptr; int32_t in_ch_idx = 0; #if 1 // inline asm // for now check for the alignment as well if (in_channels < 16) {// || ((uint32_t) input_ptr & 15) || ((uint32_t) filter_ptr & 15)) { goto skip_asm; } asm volatile 
( "li %0, 16 \n\t" "addi s7, %4, -15 \n\t" "mv x30, %1 \n\t" "mv x31, %2 \n\t" "esp.zero.xacc \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "j .loop16_end \n\t" ".loop16_start: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "addi %0, %0, 16 \n\t" // in_ch_idx += 16 ".loop16_end: \n\t" "blt %0, s7, .loop16_start \n\t" // if in_ch_idx < `in_channels - 15` abort // move input_ptr, filter_ptr and conv_out "mv %1, x30 \n\t" "mv %2, x31 \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" "esp.movx.r.xacc.l %3 \n\t" : "+r" (in_ch_idx), "+r" (input_ptr), "+r" (filter_ptr), "=r" (conv_out) : "r"(in_channels) : "x30", "x31", "s7" ); skip_asm: #endif for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += *input_ptr++ * *filter_ptr++; conv_out += *input_ptr++ * *filter_ptr++; conv_out += *input_ptr++ * *filter_ptr++; conv_out += *input_ptr++ * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx++) { conv_out += *input_ptr++ * *filter_ptr++; } conv_out = conv_out + filter_sum[out_ch_idx]; if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } __attribute__ ((noinline)) static void esp_nn_conv_s8_padded( const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = 
conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */ if (in_channels != filter_dims->channels) { esp_nn_conv_s8_ansi(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } int32_t *filter_sum = (int32_t *) scratch; // alignment of 4 bytes assumed /* pre-calculate filter_sum * input_offset */ const int8_t *filter_ptr = filter_data; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t sum = 0; int32_t filter_len = filter_wd * filter_ht * in_channels; int32_t filter_idx = 0; for (; filter_idx < filter_len - 3; filter_idx += 4) { sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; sum += *filter_ptr++; } for (; filter_idx < filter_len; filter_idx++) { sum += *filter_ptr++; } filter_sum[out_ch_idx] = sum * input_offset; } const int32_t row_size = filter_wd * in_channels; bool right_pad = max(0, ((out_wd - 1) * stride_wd + filter_wd - input_wd)); bool bottom_pad = max(0, ((out_ht - 1) * stride_ht + filter_ht - input_ht)); for (int32_t out_y = 0; out_y < out_ht - bottom_pad; out_y++) { for (int32_t out_x = 0; out_x < out_wd - right_pad; out_x++) { const int32_t base_y = stride_ht * out_y; const int32_t base_x = stride_wd * out_x; const int32_t *out_mult_ptr = out_mult; const int32_t *out_shift_ptr = out_shift; const int32_t *bias_ptr = bias; const int8_t *filter_data_ptr = filter_data; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0, 
filter_y_idx; if (row_size >= 16) { asm volatile("esp.zero.xacc \n\t"); } for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x; const int8_t *input_data_ptr = input_data + (in_row * input_wd + in_col) * in_channels; int32_t row_idx = 0; #if 1 // inline asm // for now check for the alignment as well if (row_size < 16) {// || ((uint32_t) input_ptr & 15) || ((uint32_t) filter_ptr & 15)) { goto skip_asm_pad0; } asm volatile ( "li %0, 16 \n\t" "addi s7, %3, -15 \n\t" "mv x30, %1 \n\t" "mv x31, %2 \n\t" "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "j .loop16_pad0_end \n\t" ".loop16_pad0_start: \n\t" "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "addi %0, %0, 16 \n\t" // in_ch_idx += 16 ".loop16_pad0_end: \n\t" "blt %0, s7, .loop16_pad0_start \n\t" // if in_ch_idx < `in_channels - 15` abort // move input_ptr, filter_ptr and conv_out "mv %1, x30 \n\t" "mv %2, x31 \n\t" "esp.vmulas.s8.xacc q0, q1 \n\t" : "+r" (row_idx), "+r" (input_data_ptr), "+r" (filter_data_ptr) : "r"(row_size) : "x30", "x31", "s7" ); skip_asm_pad0: #endif for (; row_idx < row_size - 3; row_idx += 4) { conv_out += *input_data_ptr++ * *filter_data_ptr++; conv_out += *input_data_ptr++ * *filter_data_ptr++; conv_out += *input_data_ptr++ * *filter_data_ptr++; conv_out += *input_data_ptr++ * *filter_data_ptr++; } for (; row_idx < row_size; row_idx++) { conv_out += *input_data_ptr++ * *filter_data_ptr++; } } if (row_size >= 16) { asm volatile ( "esp.movx.r.xacc.l x30 \n\t" "add %0, %0, x30 \n\t" : "+r" (conv_out) : : "x30" ); } /* add input_offset term */ conv_out += filter_sum[out_ch_idx]; if (bias) { conv_out += *bias_ptr++; } conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } for (int32_t 
out_x = out_wd - right_pad; out_x < out_wd; out_x++) { const int32_t base_y = stride_ht * out_y; const int32_t base_x = stride_wd * out_x; const int32_t *out_mult_ptr = out_mult; const int32_t *out_shift_ptr = out_shift; const int32_t *bias_ptr = bias; for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0, filter_y_idx; for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) { for (int32_t filter_x_idx = 0; filter_x_idx < filter_wd - right_pad; filter_x_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x + filter_x_idx; const int8_t *input_ptr = input_data + (in_row * input_wd + in_col) * in_channels; const int8_t *filter_ptr = filter_data + out_ch_idx * in_channels * filter_ht * filter_wd + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } } } if (bias) { conv_out += *bias_ptr++; } conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } // Calculate the last row if needed if (bottom_pad) { int in_row = input_dims->height - filter_dims->height + 1; esp_nn_conv_s8_opt(&(data_dims_t){input_dims->width, 2, input_dims->channels, 0}, input_data + in_row * input_dims->width * input_dims->channels, filter_dims, filter_data, bias, &(data_dims_t){output_dims->width, 1, output_dims->channels, 0}, out_data, conv_params, quant_data); } } /* L1D cache budget: use half of 64KB to leave room for filter streaming */ 
#define L1D_BUDGET 32768 /** * Im2col convolution for small in_ch where filter_wd * in_ch < 16. * * Instead of padding channels (81% wasted MACs for in_ch=3), * concatenates the entire filter window into one contiguous vector: * window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27) * * For each output pixel: copy the input window into a contiguous scratch * buffer, then use PIE dot product on the full window. No wasted MACs. * * Scratch layout: [filter_sum | im2col_buf] * im2col_buf = filter_wd * filter_ht * in_ch bytes */ __attribute__ ((noinline)) static void esp_nn_conv_s8_im2col( const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const int32_t window_len = filter_wd * filter_ht * in_ch; const int8_t pad_val = (int8_t)(-input_offset); /* Scratch: filter_sum[out_ch] + im2col_buf[window_len] */ int32_t *filter_sum = (int32_t *)scratch; int8_t *im2col_buf = (int8_t *)scratch + out_ch * sizeof(int32_t); /* Pre-compute filter_sum * input_offset */ const 
int8_t *fptr = filter_data; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t sum = 0; for (int32_t fi = 0; fi < window_len; fi++) { sum += *fptr++; } filter_sum[oc] = sum * input_offset; } /* Process each output pixel */ int8_t *out_ptr = out_data; for (int32_t out_y = 0; out_y < out_ht; out_y++) { for (int32_t out_x = 0; out_x < out_wd; out_x++) { const int32_t base_y = out_y * stride_ht - pad_ht; const int32_t base_x = out_x * stride_wd - pad_wd; /* Copy input window into contiguous im2col buffer */ int8_t *buf = im2col_buf; for (int32_t fy = 0; fy < filter_ht; fy++) { int32_t in_y = base_y + fy; for (int32_t fx = 0; fx < filter_wd; fx++) { int32_t in_x = base_x + fx; if (in_y >= 0 && in_y < input_ht && in_x >= 0 && in_x < input_wd) { const int8_t *src = input_data + (in_y * input_wd + in_x) * in_ch; for (int c = 0; c < in_ch; c++) { *buf++ = src[c]; } } else { /* Padding pixel */ for (int c = 0; c < in_ch; c++) { *buf++ = pad_val; } } } } /* Dot product against each output channel's filter */ const int32_t *out_mult = quant_data->mult; const int32_t *out_shift = quant_data->shift; const int8_t *filter_ptr = filter_data; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t conv_out = pie_dot_s8(im2col_buf, filter_ptr, window_len); conv_out += filter_sum[oc]; if (bias) conv_out += bias[oc]; conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_ptr++ = (int8_t) conv_out; filter_ptr += window_len; } } } } /** * Tiled convolution: process T output rows at a time. * Converts padded conv into a series of no-pad sub-problems by * copying/padding input tiles into the scratch buffer. * * This keeps the working set in L1D for large input tensors. * Reuses the existing esp_nn_conv_s8_padded PIE inner loop per tile. 
__attribute__ ((noinline))
static void esp_nn_conv_s8_tiled(
        const data_dims_t *input_dims, const int8_t *input_data,
        const data_dims_t *filter_dims, const int8_t *filter_data,
        const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data,
        const conv_params_t *conv_params, const quant_data_t *quant_data,
        void *scratch)
{
    const uint16_t input_wd = input_dims->width;
    const uint16_t input_ht = input_dims->height;
    const uint16_t in_ch = input_dims->channels;
    const uint16_t filter_wd = filter_dims->width;
    const uint16_t filter_ht = filter_dims->height;
    const uint16_t out_wd = output_dims->width;
    const uint16_t out_ht = output_dims->height;
    const uint16_t out_ch = output_dims->channels;
    const uint16_t pad_wd = conv_params->padding.width;
    const uint16_t pad_ht = conv_params->padding.height;
    const uint16_t stride_ht = conv_params->stride.height;
    const int32_t input_offset = conv_params->in_offset;

    /* Check if we need channel padding for PIE (row_size must be >= 16) */
    int new_ch = in_ch;
    int need_ch_pad = 0;
    if (filter_wd * in_ch < 16) {
        new_ch = (16 + filter_wd - 1) / filter_wd; /* minimum channels for PIE */
        new_ch = (new_ch + 15) & ~15;              /* align to 16 */
        need_ch_pad = 1;
    }

    /* tiles are built with left/right padding already materialized */
    int padded_input_wd = input_wd + 2 * pad_wd;

    /* Scratch layout:
     * [0] filter_sum: out_ch * 4 bytes
     * [after filter_sum] aligned_filter (if ch padding): filter_wd * filter_ht * new_ch * out_ch
     * [after filter] tile_input_buf: variable per tile
     */
    int32_t *filter_sum = (int32_t *) scratch;
    int filter_sum_size = out_ch * sizeof(int32_t);

    /* Pre-compute filter_sum * input_offset (once for entire layer) */
    const int8_t *fptr = filter_data;
    for (int32_t oc = 0; oc < out_ch; oc++) {
        int32_t sum = 0;
        int32_t flen = filter_wd * filter_ht * in_ch;
        for (int32_t fi = 0; fi < flen; fi++) {
            sum += *fptr++;
        }
        filter_sum[oc] = sum * input_offset;
    }

    /* Channel-pad filter if needed (pad with 0s - doesn't affect filter_sum).
     * Zero filter taps also cancel whatever value sits in the padded input
     * channels, so the extra lanes contribute nothing to the MAC. */
    int8_t *aligned_filter = NULL;
    int aligned_filter_size = 0;
    if (need_ch_pad) {
        aligned_filter = (int8_t *)scratch + filter_sum_size;
        aligned_filter_size = filter_wd * filter_ht * new_ch * out_ch;
        memset(aligned_filter, 0, aligned_filter_size);
        const int8_t *src_f = filter_data;
        int8_t *dst_f = aligned_filter;
        for (int oc = 0; oc < out_ch; oc++) {
            for (int fh = 0; fh < filter_ht; fh++) {
                for (int fw = 0; fw < filter_wd; fw++) {
                    memcpy(dst_f, src_f, in_ch);
                    src_f += in_ch;
                    dst_f += new_ch; /* zero-padded channels */
                }
            }
        }
    }

    /* Tile input buffer starts after filter_sum + aligned_filter */
    int8_t *tile_buf = (int8_t *)scratch + filter_sum_size + aligned_filter_size;

    /* Use effective channel count for tile buffer sizing */
    int eff_ch = need_ch_pad ? new_ch : in_ch;
    int tile_input_row_bytes = padded_input_wd * eff_ch;

    /* Compute tile height T (output rows per tile): largest T whose
     * input rows (filter_ht + (T-1)*stride_ht) fit in the L1D budget.
     * If even filter_ht rows don't fit, fall through with T = out_ht
     * (single untiled pass). */
    int tile_T = out_ht;
    int total_input_bytes = padded_input_wd * (input_ht + 2 * pad_ht) * eff_ch;
    int used_scratch = filter_sum_size + aligned_filter_size;
    if (total_input_bytes + used_scratch > L1D_BUDGET) {
        int budget_for_input = L1D_BUDGET - used_scratch;
        int min_input_rows = filter_ht;
        if (min_input_rows * tile_input_row_bytes <= budget_for_input) {
            tile_T = (budget_for_input - filter_ht * tile_input_row_bytes)
                     / (stride_ht * tile_input_row_bytes) + 1;
            if (tile_T < 1) tile_T = 1;
            if (tile_T > out_ht) tile_T = out_ht;
        }
    }

    /* Process tiles */
    const int8_t *use_filter = need_ch_pad ? aligned_filter : filter_data;
    data_dims_t eff_filter_dims = {filter_wd, filter_ht, eff_ch, 0};

    for (int32_t tile_y = 0; tile_y < out_ht; tile_y += tile_T) {
        int32_t actual_T = min(tile_T, out_ht - tile_y);
        /* input row span feeding this tile (may be negative / past the end:
         * those rows are synthesized as padding below) */
        int32_t in_row_start = tile_y * stride_ht - pad_ht;
        int32_t in_row_end = (tile_y + actual_T - 1) * stride_ht + filter_ht - 1;
        int32_t tile_input_ht = in_row_end - in_row_start + 1;

        /* Copy/pad input rows into tile buffer, with channel padding if needed */
        int8_t pad_val = (int8_t)(-input_offset);
        int8_t *dst = tile_buf;
        for (int32_t row = in_row_start; row <= in_row_end; row++) {
            if (row < 0 || row >= input_ht) {
                /* fully-padded row (top/bottom padding) */
                memset(dst, pad_val, padded_input_wd * eff_ch);
            } else {
                /* For each pixel in padded row */
                int8_t *row_dst = dst;
                /* Left padding */
                for (int px = 0; px < pad_wd; px++) {
                    memset(row_dst, pad_val, eff_ch);
                    row_dst += eff_ch;
                }
                /* Valid pixels - with optional channel padding */
                const int8_t *row_src = input_data + row * input_wd * in_ch;
                if (need_ch_pad) {
                    for (int px = 0; px < input_wd; px++) {
                        memcpy(row_dst, row_src, in_ch);
                        if (eff_ch > in_ch) {
                            memset(row_dst + in_ch, pad_val, eff_ch - in_ch);
                        }
                        row_src += in_ch;
                        row_dst += eff_ch;
                    }
                } else {
                    memcpy(row_dst, row_src, input_wd * in_ch);
                    row_dst += input_wd * in_ch;
                }
                /* Right padding */
                for (int px = 0; px < pad_wd; px++) {
                    memset(row_dst, pad_val, eff_ch);
                    row_dst += eff_ch;
                }
            }
            dst += padded_input_wd * eff_ch;
        }

        /* Sub-problem with pad=0, effective channels.
         * filter_sum is passed as the sub-call's scratch: esp_nn_conv_s8_padded
         * recomputes its own filter_sum into that same region, which is safe
         * because this function does not read filter_sum afterwards. */
        data_dims_t tile_input_dims = {padded_input_wd, tile_input_ht, eff_ch, 0};
        data_dims_t tile_output_dims = {out_wd, actual_T, out_ch, 0};
        conv_params_t tile_conv_params = *conv_params;
        tile_conv_params.padding.width = 0;
        tile_conv_params.padding.height = 0;

        esp_nn_conv_s8_padded(&tile_input_dims, tile_buf, &eff_filter_dims,
                              use_filter, bias, &tile_output_dims,
                              out_data + tile_y * out_wd * out_ch,
                              &tile_conv_params, quant_data, filter_sum);
    }
}

int esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t
*filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; int new_channels = (in_ch + 7) & ~7; int input_scratch = input_wd * input_ht * in_ch; int filter_scratch = filter_wd * filter_ht * in_ch * out_ch; int align_buf_size = 32; /* extra buffer for alignment */ if ((filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0) && (stride_wd == 1 && stride_ht == 1)) { if (in_ch < 16) { /* Channel-padding path: filter_sum + padded_filter + padded_input */ int filter_sum_sz = out_ch * 4; int padded_filter_sz = 16 * out_ch; int padded_input_sz = 32; /* 16 bytes + alignment */ return filter_sum_sz + padded_filter_sz + padded_input_sz + align_buf_size; } int transpose_buf_size = 2 * (8 * new_channels); if (input_wd * input_ht < 8) { transpose_buf_size = 0; } if (in_ch % 8) { input_scratch = input_wd * input_ht * new_channels; } else { input_scratch = 0; } filter_scratch = new_channels * out_ch; return input_scratch + filter_scratch + transpose_buf_size + align_buf_size; } else { new_channels = (in_ch + 15) & ~15; int offset_acc_scratch = out_ch * 4; if (pad_wd == 0 && pad_ht == 0 && filter_wd * in_ch >= 16) { /* Direct no-pad path: no input scratch needed */ input_scratch = 0; filter_scratch = filter_wd * filter_ht * new_channels * out_ch; return input_scratch + filter_scratch + align_buf_size + offset_acc_scratch; } /* Im2col path: scratch = filter_sum + im2col_buf */ if (filter_wd * filter_ht * in_ch >= 16) { int window_len = filter_wd * filter_ht * in_ch; int 
im2col_scratch = window_len; /* one window buffer */ return offset_acc_scratch + im2col_scratch + align_buf_size; } if (pad_wd == 0 && pad_ht == 0) { /* Very small window (< 16 elements total): tiled path */ int eff_ch = ((16 + filter_wd - 1) / filter_wd + 15) & ~15; int filt_aligned = filter_wd * filter_ht * eff_ch * out_ch; int tile_input = input_wd * input_ht * eff_ch; return offset_acc_scratch + filt_aligned + tile_input + align_buf_size; } /* Padded case: check if tiling is beneficial */ int padded_input_wd = input_wd + 2 * pad_wd; int full_input_size = padded_input_wd * (input_ht + 2 * pad_ht) * in_ch; if (full_input_size + offset_acc_scratch > L1D_BUDGET) { /* Tiled path: compute tile input size */ int eff_ch = in_ch; int filt_aligned = 0; if (filter_wd * in_ch < 16) { eff_ch = ((16 + filter_wd - 1) / filter_wd + 15) & ~15; filt_aligned = filter_wd * filter_ht * eff_ch * out_ch; } int tile_row_bytes = padded_input_wd * eff_ch; int budget_for_input = L1D_BUDGET - offset_acc_scratch - filt_aligned; int tile_T = 1; if (budget_for_input > 0 && filter_ht * tile_row_bytes <= budget_for_input) { tile_T = (budget_for_input - filter_ht * tile_row_bytes) / (stride_ht * tile_row_bytes) + 1; if (tile_T > (int)(output_dims->height)) tile_T = output_dims->height; } int tile_input_rows = (tile_T - 1) * stride_ht + filter_ht + 2 * pad_ht; input_scratch = tile_input_rows * tile_row_bytes; filter_scratch = filt_aligned; } else { /* Monolithic padded path */ input_scratch = full_input_size; filter_scratch = filter_wd * filter_ht * new_channels * out_ch; } return input_scratch + filter_scratch + align_buf_size + offset_acc_scratch; } return align_buf_size; } void esp_nn_set_conv_scratch_buf_esp32p4(void *buf) { // We are going to use the vector extensions asm volatile ( "csrsi 0x7f2, 0b01 \n\t" // enable `esp` vector extension "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" : : : "x29" ); scratch_buffer = (int16_t *) buf; } void esp_nn_conv_s8_esp32p4(const data_dims_t 
*input_dims, const int8_t *input, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { if (scratch_buffer == NULL) { printf("esp_nn_conv error! scratch_buffer not set!\n"); return; } const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; if (filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { esp_nn_conv_s8_1x1(input_dims, input, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else if (pad_wd == 0 && pad_ht == 0 && filter_wd * input_dims->channels >= 16) { /* No-pad, channels large enough for PIE: use direct padded path */ esp_nn_conv_s8_padded(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else if (filter_wd * filter_ht * input_dims->channels >= 16) { /* Small in_ch but window_len >= 16: use im2col for zero-waste PIE. * Also handles padded cases naturally. 
*/ esp_nn_conv_s8_im2col(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else if (pad_wd != 0 || pad_ht != 0) { /* Padded case with very small window: use tiled path */ esp_nn_conv_s8_tiled(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); } else { /* Tiny output: fall back to generic opt */ esp_nn_conv_s8_opt(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); } } ================================================ FILE: src/convolution/esp_nn_conv_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /** * Optimizations strategies used: * Below optimizations are capable of any size of input/filter: * * 1. For filter wdxht = 1x1 (Refer esp_nn_conv_s8_mult8_1x1_esp32s3 function) * - For this specific version, the strategy we employ: * > This particular filter has only the channel * dimension and we have `out_ch` number of such filters. * > We take 8 input lines at a time and transpose those. * > Keep loading and multiplying filter values one by one, * to produce 8 outputs in parallel * * 2. General version: (Refer esp_nn_conv_s8_filter_aligned_input_padded_esp32s3) * - For all other cases: * > Consider `filter_wd * in_ch` as a single row. These many values can * be continuosly loaded from inputs as well. * > multiply accumulate into a single filter output. 
 *   > To speed things up further, we pre-calculate
 *     the (filter * input_offset + bias) term earlier and add it at the end of the filter pass
 *
 * About the (filter * input_offset + bias) accumulate term:
 *   > The conv operation before requantization is as follows:
 *       for i in filter_size:
 *           conv_out += (input + input_offset) * filter;
 *       conv_out += bias
 *
 *   > Since input_offset is a constant, this term can be precalculated as:
 *       for i in filter_size:
 *           acc_term += input_offset * filter[i];
 *       acc_term += bias
 *     OR
 *       for i in filter_size:
 *           acc_term += filter[i];   // accumulate filter values
 *       acc_term = acc_term * input_offset + bias
 *
 *
 * In both of the above versions we align the filter if needed, pad the input with
 * -input_offset if needed, and extend the channels to make them a multiple
 * of 8/16, as each function needs
 *
 *  3. Im2col version: (for small in_ch where filter_wd * in_ch < 16)
 *     - Inspired by ESP32-P4 im2col approach.
 *     - Instead of padding channels (wastes 81% of SIMD lanes for in_ch=3),
 *       flatten the entire filter window into one contiguous vector:
 *          window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27)
 *     - For each output pixel: copy the input window into a scratch buffer,
 *       then use ACCX dot product on the full window. No wasted MACs.
*/ #include #include #include #include #include /* 3x3 optimized path — im2col per pixel, iterate OC with input in cache */ extern int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht, int in_channels); extern void esp_nn_conv_s8_3x3_opt(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const uint16_t stride_wd, const uint16_t stride_ht, const int8_t *filter_data, const int32_t *bias, int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *scratch); /* ANSI C reference conv for comparison */ extern void esp_nn_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data); /* 1x1 conv — correct SIMD implementation */ extern int esp_nn_conv_s8_1x1_scratch_size(int out_channels); extern void esp_nn_conv_s8_1x1(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const int8_t *filter_data, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *scratch); /* Debug heap checks — enable to find buffer overruns */ #if CONFIG_IDF_CMAKE #include "esp_heap_caps.h" #define CONV_HEAP_CHECK(tag) do { \ if (!heap_caps_check_integrity_all(false)) { \ printf("CONV HEAP CORRUPT: %s\n", tag); \ } \ } while(0) #else #define CONV_HEAP_CHECK(tag) #endif static int16_t *scratch_buffer = NULL; extern void esp_nn_conv_s8_mult8_1x1_esp32s3( const int8_t *input_data, const uint16_t input_wd, const uint16_t 
input_ht, const uint16_t in_channels, const int32_t input_offset, const int8_t *filter_aligned, const int32_t *bias, int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *buffer /* scratch buffer */); extern void esp_nn_conv_s8_filter_aligned_input_padded_esp32s3( const int8_t *input_data, const uint16_t input_wd, const uint16_t input_ht, const uint16_t in_channels, const int32_t input_offset, const uint16_t stride_wd, const uint16_t stride_ht, const int8_t *filter_data, const uint16_t filter_wd, const uint16_t filter_ht, const int32_t *bias, int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max, void *scratch_buffer); /* Use shared dot product from common — see esp_nn_dot_s8_esp32s3.S */ /** * Im2col convolution for small in_ch (filter_wd * in_ch < 16). * * Instead of padding channels to 16 (wasting 81% MACs for in_ch=3), * flatten the entire filter window into one contiguous vector: * window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27) * * For each output pixel: copy the input window into a contiguous scratch * buffer, then use ACCX dot product. No wasted MACs. 
* * Scratch layout: [filter_sum[out_ch] | im2col_buf[window_len_aligned]] */ __attribute__ ((noinline)) static void esp_nn_conv_s8_im2col_s3( const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data, void *scratch) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const int32_t window_len = filter_wd * filter_ht * in_ch; /* Align to 16 for SIMD: zero-padded tail doesn't affect dot product */ const int32_t window_len_aligned = (window_len + 15) & ~15; const int8_t pad_val = (int8_t)(-input_offset); /* Scratch layout (16-byte aligned): * [filter_sum: out_ch * 4] * [aligned_filter: out_ch * window_len_aligned] -- zero-padded copy * [im2col_buf: window_len_aligned] */ int32_t *filter_sum = (int32_t *)scratch; int8_t *aligned_filter = (int8_t *)((uintptr_t)((int8_t *)scratch + out_ch * sizeof(int32_t) + 15) & ~15); int8_t *im2col_buf = (int8_t *)((uintptr_t)(aligned_filter + out_ch * window_len_aligned + 15) & ~15); /* Pre-compute filter_sum * input_offset AND copy filter with zero-padded tail */ const int8_t *fptr = filter_data; 
int8_t *af_ptr = aligned_filter; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t sum = 0; for (int32_t fi = 0; fi < window_len; fi++) { sum += fptr[fi]; } filter_sum[oc] = sum * input_offset; /* Copy filter + zero-pad tail for safe SIMD reads */ memcpy(af_ptr, fptr, window_len); memset(af_ptr + window_len, 0, window_len_aligned - window_len); fptr += window_len; af_ptr += window_len_aligned; } /* Zero the tail of im2col buffer once (for aligned SIMD reads) */ memset(im2col_buf + window_len, 0, window_len_aligned - window_len); /* Compute safe interior region where no bounds checking needed. * Interior: all filter taps fall within valid input. */ const int32_t row_bytes = filter_wd * in_ch; int32_t safe_y_start = (pad_ht + stride_ht - 1) / stride_ht; int32_t safe_y_end = (input_ht - filter_ht + pad_ht) / stride_ht + 1; int32_t safe_x_start = (pad_wd + stride_wd - 1) / stride_wd; int32_t safe_x_end = (input_wd - filter_wd + pad_wd) / stride_wd + 1; if (safe_y_start > out_ht) safe_y_start = out_ht; if (safe_y_end > out_ht) safe_y_end = out_ht; if (safe_y_end < safe_y_start) safe_y_end = safe_y_start; if (safe_x_start > out_wd) safe_x_start = out_wd; if (safe_x_end > out_wd) safe_x_end = out_wd; if (safe_x_end < safe_x_start) safe_x_end = safe_x_start; /* Process each output pixel */ int8_t *out_ptr = out_data; for (int32_t out_y = 0; out_y < out_ht; out_y++) { const int32_t base_y = out_y * stride_ht - pad_ht; int is_safe_y = (out_y >= safe_y_start && out_y < safe_y_end); for (int32_t out_x = 0; out_x < out_wd; out_x++) { const int32_t base_x = out_x * stride_wd - pad_wd; /* Copy input window into contiguous im2col buffer */ int8_t *buf = im2col_buf; if (is_safe_y && out_x >= safe_x_start && out_x < safe_x_end) { /* FAST PATH: interior pixel — no bounds checking needed. * All filter taps guaranteed to be within valid input. 
*/ for (int32_t fy = 0; fy < filter_ht; fy++) { const int8_t *src = input_data + ((base_y + fy) * input_wd + base_x) * in_ch; memcpy(buf, src, row_bytes); buf += row_bytes; } } else { /* SLOW PATH: edge pixel — per-element bounds checking */ for (int32_t fy = 0; fy < filter_ht; fy++) { int32_t in_y = base_y + fy; if (in_y >= 0 && in_y < input_ht) { for (int32_t fx = 0; fx < filter_wd; fx++) { int32_t in_x = base_x + fx; if (in_x >= 0 && in_x < input_wd) { const int8_t *src = input_data + (in_y * input_wd + in_x) * in_ch; memcpy(buf, src, in_ch); } else { memset(buf, pad_val, in_ch); } buf += in_ch; } } else { memset(buf, pad_val, row_bytes); buf += row_bytes; } } } /* Dot product against each output channel's filter (aligned copy) */ const int32_t *out_mult_ptr = quant_data->mult; const int32_t *out_shift_ptr = quant_data->shift; const int8_t *filter_ptr = aligned_filter; for (int32_t oc = 0; oc < out_ch; oc++) { int32_t conv_out = esp_nn_dot_s8_aligned_esp32s3(im2col_buf, filter_ptr, window_len_aligned); conv_out += filter_sum[oc]; if (bias) conv_out += bias[oc]; conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_ptr++ = (int8_t) conv_out; filter_ptr += window_len_aligned; } } } } int esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_ch = input_dims->channels; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_ch = output_dims->channels; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = 
conv_params->stride.height; int new_channels = (in_ch + 7) & ~7; int input_scratch = input_wd * input_ht * in_ch; int filter_scratch = filter_wd * filter_ht * in_ch * out_ch; int align_buf_size = 64; /* alignment (16) + assembly pre/post access margin (48) */ if ((filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0) && (stride_wd == 1 && stride_ht == 1)) { int transpose_buf_size = 2 * (8 * new_channels); if (input_wd * input_ht < 8) { transpose_buf_size = 0; } if (in_ch % 8) { input_scratch = input_wd * input_ht * new_channels; } else { input_scratch = 0; } filter_scratch = new_channels * out_ch; return input_scratch + filter_scratch + transpose_buf_size + align_buf_size; } else { int32_t filter_row_size = filter_wd * in_ch; int32_t window_len = filter_wd * filter_ht * in_ch; /* Im2col path: filter_wd * in_ch < 16 but window_len >= 16 */ if (filter_row_size < 16 && window_len >= 16) { int32_t window_len_aligned = (window_len + 15) & ~15; /* filter_sum + aligned_filter_copy + im2col_buf + alignment padding */ int im2col_scratch = out_ch * 4 + 16 + out_ch * window_len_aligned + 16 + window_len_aligned; return im2col_scratch + align_buf_size; } new_channels = (in_ch + 15) & ~15; if (pad_wd == 0 && pad_ht == 0) { input_scratch = 0; } else { input_scratch = (input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht) * in_ch; } filter_scratch = filter_wd * filter_ht * new_channels * out_ch; // Account for filter alignment padding (worst case) int32_t aligned_filter_row_size = ((filter_row_size + 15) / 16) * 16; int filter_alignment_scratch = aligned_filter_row_size * filter_ht * out_ch; // Account for right/bottom padding even when pad_wd=0, pad_ht=0 int pad_right = max(0, (output_dims->width * stride_wd + filter_wd - 1) - input_wd); int pad_bottom = max(0, (output_dims->height * stride_ht + filter_ht - 1) - input_ht); int boundary_padding_scratch = 0; if (pad_right > 0 || pad_bottom > 0) { boundary_padding_scratch = (input_wd + pad_right) * (input_ht + pad_bottom) * 
in_ch; } int offset_acc_scratch = out_ch * 4; return input_scratch + filter_scratch + filter_alignment_scratch + boundary_padding_scratch + align_buf_size + offset_acc_scratch; } return align_buf_size; } void esp_nn_set_conv_scratch_buf_esp32s3(void *buf) { scratch_buffer = (int16_t *) buf; } void esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */ if (channels != filter_dims->channels) { esp_nn_conv_s8_ansi(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } int filter_size = filter_wd * filter_ht * channels * out_channels; /* 1x1 stride-1 conv */ if (filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0 && stride_wd == 1 && stride_ht == 1) { if (channels % 8 == 0) { /* Full asm path — requires mult8 channels + 8-byte 
aligned filter */ esp_nn_conv_s8_mult8_1x1_esp32s3(input, input_wd, input_ht, channels, input_offset, filter_data, bias, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_buffer); } else { /* Fallback: handles any alignment + any channel count */ esp_nn_conv_s8_1x1(input, input_wd, input_ht, channels, input_offset, filter_data, bias, out_data, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_buffer); } return; } if (scratch_buffer == NULL) { printf("esp_nn_conv error! scratch_buffer not set!\n"); return; } { int32_t filter_row_size = filter_wd * channels; int32_t window_len = filter_wd * filter_ht * channels; /* 3x3 optimized path: im2col per pixel, iterate OC with input in cache. * TODO: fix inline asm priming + performance regression before enabling. * Avoids the 128× input reload of the general aligned asm. */ #if 0 if (esp_nn_conv_s8_3x3_can_use(filter_wd, filter_ht, channels) && pad_wd == 0 && pad_ht == 0) { esp_nn_conv_s8_3x3_opt(input, input_wd, input_ht, channels, input_offset, stride_wd, stride_ht, filter_data, bias, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, (void *)scratch_buffer); return; } #endif /* Im2col path: small in_ch where per-row SIMD is wasteful, * but entire window is large enough for SIMD dot product. * E.g., 3x3 conv with in_ch=3: row=9 (<16), window=27 (>=16). */ if (filter_row_size < 16 && window_len >= 16) { esp_nn_conv_s8_im2col_s3(input_dims, input, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data, scratch_buffer); return; } // align the `filter width * channels` to 16 bytes. 
Do zero padding for the same int32_t filter_alignment_padding = 16 - (filter_row_size & 15); int8_t *filter_data_aligned = (int8_t *) filter_data; int8_t *input_padded = (int8_t *) input; int8_t *scratch_data = (int8_t *) scratch_buffer; int new_input_wd = input_wd, new_input_ht = input_ht; if (filter_alignment_padding != 16) { // pad filter_data int32_t new_row_size = filter_wd * channels + filter_alignment_padding; filter_data_aligned = scratch_data; int8_t *row_ptr = filter_data_aligned; const int8_t *filter_data_ptr = filter_data; for (int32_t ch_idx = 0; ch_idx < out_channels; ch_idx++) { for (int32_t row_idx = 0; row_idx < filter_ht; row_idx++) { memcpy(row_ptr, filter_data_ptr, filter_row_size); memset(row_ptr + filter_row_size, 0, new_row_size - filter_row_size); filter_data_ptr += filter_row_size; row_ptr += new_row_size; } } scratch_data += new_row_size * filter_ht * out_channels; filter_row_size = new_row_size; } else if ((int) filter_data & 15) { filter_data_aligned = scratch_data; memcpy(filter_data_aligned, filter_data, filter_size); scratch_data += filter_size; } // Calculate if right/bottom padding is needed even when pad_wd=0, pad_ht=0 // This happens when the filter extends beyond input boundaries at the edges // Formula matches depthwise convolution: (out_wd * stride_wd + filter_wd - 1) - input_wd int32_t pad_right = max(0, (out_wd * stride_wd + filter_wd - 1) - input_wd); int32_t pad_bottom = max(0, (out_ht * stride_ht + filter_ht - 1) - input_ht); // Apply padding if explicitly requested (pad_wd/pad_ht) OR if needed for boundary handling if (pad_wd != 0 || pad_ht != 0) { // Full padding (top, bottom, left, right) when pad_wd/pad_ht are set input_padded = (int8_t *) scratch_data; esp_nn_aligned_s8_pad_with_value(input, input_padded, input_wd, input_ht, channels, -input_offset, pad_wd, pad_ht); new_input_wd = input_wd + 2 * pad_wd; new_input_ht = input_ht + 2 * pad_ht; scratch_data += new_input_wd * new_input_ht * channels; } else if (pad_right > 
0 || pad_bottom > 0) { // Only right/bottom padding needed for boundary handling (like depthwise conv) input_padded = (int8_t *) scratch_data; esp_nn_aligned_s8_pad_end_with_value(input, input_padded, input_wd, input_ht, channels, -input_offset, (uint16_t)pad_right, (uint16_t)pad_bottom); new_input_wd = input_wd + pad_right; new_input_ht = input_ht + pad_bottom; scratch_data += new_input_wd * new_input_ht * channels; } int filter_total = filter_wd * filter_ht * channels * out_channels; if (input_offset != 0 && filter_total > 16384) { int32_t *corrections = (int32_t *)scratch_data; int32_t filter_ch_size = filter_wd * filter_ht * channels; const int8_t *f_src = filter_data; // use ORIGINAL (not aligned) filter for sum for (int ch = 0; ch < out_channels; ch++) { int32_t filter_sum = 0; for (int i = 0; i < filter_ch_size; i++) { filter_sum += f_src[i]; } corrections[ch] = filter_sum * input_offset; if (bias) { corrections[ch] += bias[ch]; } f_src += filter_ch_size; } // Pass input_offset=0 to assembly so it skips its pre-computation. // Pass scratch_data as "bias" pointer — the assembly's bias-copy loop // will read from scratch and write to scratch (identity, no-op). 
esp_nn_conv_s8_filter_aligned_input_padded_esp32s3( input_padded, new_input_wd, new_input_ht, channels, 0, stride_wd, stride_ht, filter_data_aligned, filter_wd, filter_ht, (const int32_t *)scratch_data, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_data); CONV_HEAP_CHECK("general: after asm (precomp)"); } else { esp_nn_conv_s8_filter_aligned_input_padded_esp32s3( input_padded, new_input_wd, new_input_ht, channels, input_offset, stride_wd, stride_ht, filter_data_aligned, filter_wd, filter_ht, bias, out_data, out_wd, out_ht, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max, scratch_data); CONV_HEAP_CHECK("general: after asm (normal)"); } } } ================================================ FILE: src/convolution/esp_nn_conv_opt.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include #include #include int esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const conv_params_t *conv_params) { return 0; } void esp_nn_set_conv_scratch_buf_opt(const void *buf) { } __attribute__ ((noinline)) static void esp_nn_conv_s8_1x1(const data_dims_t *input_dims, const int8_t *input_data, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; for (int32_t in_row = 0; in_row < out_ht * stride_ht; in_row += stride_ht) { for (int32_t in_col = 0; in_col < out_wd * stride_wd; in_col += stride_wd) { const int32_t *out_mult = quant_data->mult; const int32_t *out_shift = quant_data->shift; const int8_t *filter_ptr = filter_data; const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels; int32_t out_ch_idx = 0; for (; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0; const int8_t *input_ptr = input_base_ptr; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { conv_out 
+= (*input_ptr++ + input_offset) * *filter_ptr++; } if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } /** * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dialation width = 1 */ void esp_nn_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; if (filter_wd == 1 && filter_ht == 1) { esp_nn_conv_s8_1x1(input_dims, input_data, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t in_channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const uint16_t out_channels = output_dims->channels; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */ if (in_channels != filter_dims->channels) { esp_nn_conv_s8_ansi(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } int32_t out_ch_idx, out_y, out_x, filter_y_idx, 
filter_x_idx; for (out_y = 0; out_y < out_ht; out_y++) { for (out_x = 0; out_x < out_wd; out_x++) { const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) { int32_t conv_out = 0; const int32_t base_y = stride_ht * out_y - pad_ht; const int32_t base_x = stride_wd * out_x - pad_wd; const int32_t filter_y_start = max(0, -base_y); const int32_t filter_x_start = max(0, -base_x); const int32_t filter_y_end = min(filter_ht, input_ht - base_y); const int32_t filter_x_end = min(filter_wd, input_wd - base_x); for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t in_row = base_y + filter_y_idx; const int32_t in_col = base_x + filter_x_idx; const int8_t *input_ptr = input_data + (in_row * input_wd + in_col) * in_channels; const int8_t *filter_ptr = filter_data + out_ch_idx * in_channels * filter_ht * filter_wd + (filter_y_idx * filter_wd + filter_x_idx) * in_channels; int32_t in_ch_idx = 0; for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } for (; in_ch_idx < in_channels; in_ch_idx ++) { conv_out += (*input_ptr++ + input_offset) * *filter_ptr++; } } } if (bias) { conv_out += bias[out_ch_idx]; } conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++); conv_out += out_offset; conv_out = max(conv_out, activation_min); conv_out = min(conv_out, activation_max); *out_data++ = (int8_t) conv_out; } } } } ================================================ FILE: src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // 
// Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .literal_position .literal .nudge_val, 1073741824 # Program Unit: esp_nn_conv_s16_mult4_1x1_esp32s3 .type esp_nn_conv_s16_mult4_1x1_esp32s3, @function .align 4 .global esp_nn_conv_s16_mult4_1x1_esp32s3 esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xa62 # scratch_buf = 0 # to_add = 32 # gra_spill_temp_139 = 36 # gra_spill_temp_140 = 40 # gra_spill_temp_141 = 44 # gra_spill_temp_155 = 48 # gra_spill_temp_156 = 52 # gra_spill_temp_144 = 56 # gra_spill_temp_145 = 60 # gra_spill_temp_146 = 64 # gra_spill_temp_147 = 68 # gra_spill_temp_148 = 72 # gra_spill_temp_149 = 76 # gra_spill_temp_150 = 80 # gra_spill_temp_151 = 84 # gra_spill_temp_152 = 88 # gra_spill_temp_153 = 92 # lgra_spill_temp_165 = 96 # lgra_spill_temp_166 = 100 # lgra_spill_temp_167 = 104 # lgra_spill_temp_168 = 108 # gra_spill_temp_158 = 112 # gra_spill_temp_159 = 116 # gra_spill_temp_160 = 120 // registers: // a2: int16_t *input_data // a3: uint16_t input_wd // a4: uint16_t input_ht // a5: uint16_t in_channels // a6: int16_t *filter_data // a7: int32_t *bias // on stack: // 160: int8_t *out_data // 164: uint16_t out_wd // 168: uint16_t out_ht // 172: uint16_t out_channels // 176: int32_t out_offset // 180: int32_t *out_shift // 184: int32_t *out_mult // 188: int32_t activation_min // 192: int32_t activation_max // 196: *buffer /* scratch buffer */ entry a1,160 # s32i.n a2,a1,40 # [0] gra_spill_temp_140 s32i a6,a1,68 # [1] gra_spill_temp_147 s32i a7,a1,116 # [2] 
gra_spill_temp_159 mul16u a3,a3,a4 # [3] addi a10,a1,112 # [4] addmi a11,a1,176 # [5] addmi a8,a1,176 # [6] addmi a9,a1,176 # [7] addi.n a9,a9,12 # [8] addi a8,a8,16 # [9] ee.vldbc.32 q5,a11 # [10] id:188 out_offset ee.vldbc.32 q7,a8 # [12] id:270 activation_max ee.vldbc.32 q6,a9 # [13] id:269 activation_min blti a3,4,.Lt_3_6402 # [14] .LBB3_esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xa90 l32i a13,a1,160 # [0] id:280 out_data+0x0 srai a8,a5,2 # [1] addi a10,a3,-3 # [2] addi a9,a5,-3 # [3] movi.n a12,0 # [4] slli a11,a5,2 # [5] slli a15,a5,1 # [6] l16ui a14,a1,172 # [7] id:271 out_channels+0x0 s32i.n a15,a1,36 # [9] gra_spill_temp_139 s32i.n a11,a1,56 # [10] gra_spill_temp_144 s32i a12,a1,84 # [11] gra_spill_temp_151 s32i a9,a1,52 # [12] gra_spill_temp_156 s32i.n a10,a1,60 # [13] gra_spill_temp_145 s32i a8,a1,88 # [14] gra_spill_temp_152 movi.n a10,0 # [15] l32i a8,a1,196 # [16] id:281 buffer+0x0 slli a11,a11,1 # [19] l32i a15,a1,184 # [20] id:192 out_mult+0x0 s32i a11,a1,64 # [22] gra_spill_temp_146 s32i a8,a1,112 # [25] gra_spill_temp_158 s32i a10,a1,92 # [26] gra_spill_temp_153 movi.n a8,0 # [27] s32i a10,a1,80 # [31] gra_spill_temp_150 s32i a8,a1,76 # [32] gra_spill_temp_149 slli a8,a14,1 # [34] addx2 a9,a14,a14 # [35] s32i a9,a1,72 # [36] gra_spill_temp_148 s32i.n a8,a1,44 # [37] gra_spill_temp_141 addx4 a14,a14,a15 # [38] s32i a14,a1,48 # [39] gra_spill_temp_155 j .Lt_3_6914 # [40] .Lt_3_8194: # 0xb00 # Part of loop body line 305, head labeled .Lt_3_6914 l32i.n a12,a1,60 # [0] gra_spill_temp_145 l32i.n a9,a1,56 # [1] gra_spill_temp_144 l32i a8,a1,76 # [2] gra_spill_temp_149 l32i a15,a1,64 # [3] gra_spill_temp_146 l32i a11,a1,72 # [4] gra_spill_temp_148 l32i a14,a1,84 # [5] gra_spill_temp_151 add.n a13,a13,a11 # [6] l32i a11,a1,80 # [7] gra_spill_temp_150 add.n a14,a14,a15 # [8] add.n a8,a8,a9 # [9] s32i a8,a1,76 # [10] gra_spill_temp_149 s32i a14,a1,84 # [11] gra_spill_temp_151 addi.n a11,a11,4 # [12] s32i a11,a1,80 # [13] gra_spill_temp_150 bge a11,a12,.Lt_3_6402 
# [14] .Lt_3_6914: # 0xb27 l32i a12,a1,52 # [0] gra_spill_temp_156 l32i a4,a1,112 # [1] gra_spill_temp_158 blti a12,1,.Lt_3_7170 # [2] .LBB6_esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xb30 l32i a3,a1,88 # [0] gra_spill_temp_152 l32i.n a5,a1,40 # [1] gra_spill_temp_140 l32i a2,a1,84 # [3] gra_spill_temp_151 add.n a2,a2,a5 # [7] l32i.n a5,a1,36 # [9] gra_spill_temp_139 // load and transose 4 lines of input 4xchannels, loopgtz a3,.transpose_loop_end mov.n a3,a2 # [0*II+0] ee.vld.l.64.xp q0,a3,a5 # [0*II+2] id:282 ee.vld.l.64.xp q1,a3,a5 # [0*II+3] id:283 ee.vld.l.64.xp q2,a3,a5 # [0*II+4] id:284 ee.vld.l.64.xp q3,a3,a5 # [0*II+5] id:285 ee.vzip.16 q0,q1 # [0*II+6] ee.vzip.16 q2,q3 # [0*II+7] ee.vzip.32 q0,q2 # [0*II+8] ee.vst.128.ip q0,a4,16 # [0*II+9] id:286 ee.vst.128.ip q2,a4,16 # [0*II+10] id:287 addi.n a2,a2,8 # [0*II+1] .transpose_loop_end: .Lt_3_7170: # 0xb7c l32i a2,a1,68 # [0] gra_spill_temp_147 l32i a9,a1,116 # [1] gra_spill_temp_159 l16ui a8,a1,172 # [2] out_channels s32i a9,a1,120 # [3] gra_spill_temp_160 beqz.n a8,.Lt_3_8194 # [4] l32i a9,a1,180 # [0] out_shift l32i a11,a1,184 # [1] out_mult l32i a15,a1,72 # [2] gra_spill_temp_148 l32i.n a14,a1,44 # [3] gra_spill_temp_141 add.n a15,a15,a13 # [4] add.n a14,a14,a13 # [5] j .Lt_3_8706 # [6] .Lt_3_10754: # 0xb9a movi.n a3,0 # [0] .Lt_3_10498: # 0xb9c // esp_nn_multiply_by_quantized_mult_esp32s3 ee.zero.q q0 # [0] l32i a5,a1,92 # [1] gra_spill_temp_153 s32i a2,a1,96 # [2] lgra_spill_temp_165 s32i a11,a1,104 # [3] lgra_spill_temp_167 s32i a13,a1,108 # [4] lgra_spill_temp_168 s32i a9,a1,100 # [5] lgra_spill_temp_166 movi.n a13,0 # [6] max a12,a12,a13 # [7] wsr.sar a12 # [8] ee.vsl.32 q1,q1 # [9] ssai 31 # [10] ee.movi.32.a q1,a7,0 # [11] ee.movi.32.a q1,a8,1 # [12] ee.movi.32.a q1,a6,3 # [13] ee.movi.32.a q1,a9,2 # [14] mulsh a12,a4,a9 # [15] mulsh a11,a4,a6 # [16] mulsh a2,a4,a8 # [17] mulsh a13,a7,a4 # [18] mull a8,a4,a8 # [19] mull a7,a7,a4 # [20] mull a6,a4,a6 # [24] add.n a11,a5,a11 # [21] add.n a12,a5,a12 # 
[22] add.n a2,a5,a2 # [23] add.n a5,a5,a13 # [25] l32r a13,.nudge_val mull a9,a4,a9 # [27] add.n a6,a13,a6 # [28] add.n a9,a13,a9 # [29] add.n a10,a13,a7 # [30] add.n a8,a13,a8 # [32] saltu a7,a10,a13 # [33] add.n a7,a7,a5 # [34] saltu a5,a8,a13 # [35] add.n a5,a5,a2 # [36] src a5,a5,a8 # [37] saltu a2,a9,a13 # [38] add.n a2,a2,a12 # [40] saltu a13,a6,a13 # [41] addi.n a12,a3,-1 # [42] src a2,a2,a9 # [43] ee.movi.32.q q3,a5,1 # [51] ee.movi.32.q q3,a2,2 # [54] add.n a13,a13,a11 # [44] addi a9,a1,32 # [45] to_add movi.n a11,1 # [46] src a7,a7,a10 # [47] src a13,a13,a6 # [48] ee.movi.32.q q3,a7,0 # [50] ee.movi.32.q q3,a13,3 # [57] addi a8,a1,112 # [49] l32i a7,a1,48 # [52] gra_spill_temp_155 l16ui a5,a1,172 # [53] out_channels ssl a12 # [55] sll a11,a11 # [56] wsr.sar a3 # [58] ee.vcmp.lt.s32 q0,q3,q0 # [59] l32i a13,a1,108 # [60] lgra_spill_temp_168 s32i.n a11,a1,32 # [61] to_add ee.vldbc.32 q1,a9 # [62] id:317 to_add add.n a5,a5,a13 # [63] l32i a9,a1,100 # [64] lgra_spill_temp_166 ee.vadds.s32 q1,q1,q0 # [65] addi.n a9,a9,4 # [66] ee.vadds.s32 q1,q3,q1 # [67] ee.vsr.32 q1,q1 # [69] # add offset, apply activation and store ee.vadds.s32 q1,q1,q5 # [70] ee.vmin.s32 q1,q1,q7 # [72] ee.vmax.s32 q1,q1,q6 # [73] ee.vst.128.ip q1,a1,0 # [74] id:320 l8ui a6,a1,0 # [75] scratch_buf s8i a6,a13,0 # [76] addi.n a13,a13,1 # [77] l8ui a2,a1,4 # [78] scratch_buf+4 s8i a2,a5,0 # [79] l8ui a12,a1,8 # [80] scratch_buf+8 l32i a2,a1,96 # [81] lgra_spill_temp_165 s8i a12,a14,0 # [82] addi.n a14,a14,1 # [83] l8ui a11,a1,12 # [84] scratch_buf+12 s8i a11,a15,0 # [85] l32i a11,a1,104 # [86] lgra_spill_temp_167 addi.n a15,a15,1 # [87] addi.n a11,a11,4 # [88] sub a7,a11,a7 # [89] beqz a7,.Lt_3_8194 # [90] .Lt_3_8706: # 0xc97 ee.zero.qacc # [0] l32i a8,a1,52 # [1] gra_spill_temp_156 l32i a3,a1,112 # [2] gra_spill_temp_158 blti a8,1,.Lt_3_8962 # [3] l32i a4,a1,88 # [0] gra_spill_temp_152 loopgtz a4,.LBB53_esp_nn_conv_s16_mult4_1x1_esp32s3 # [2] ee.vld.l.64.ip q0,a2,8 # [0*II+0] id:289 
ee.vld.l.64.ip q1,a3,8 # [0*II+1] id:290 ee.vld.l.64.ip q2,a3,8 # [0*II+2] id:291 ee.vsmulas.s16.qacc q1,q0,0 # [0*II+3] ee.vld.l.64.ip q3,a3,8 # [0*II+4] id:292 ee.vsmulas.s16.qacc q2,q0,1 # [0*II+5] ee.vld.l.64.ip q4,a3,8 # [0*II+6] id:293 ee.vsmulas.s16.qacc q3,q0,2 # [0*II+7] ee.vsmulas.s16.qacc q4,q0,3 # [0*II+8] .LBB53_esp_nn_conv_s16_mult4_1x1_esp32s3: # 0xcc4 .Lt_3_8962: # 0xcc4 // extract data: mov a10,a1 ee.st.qacc_l.l.128.ip a10,16 # [0] id:298 ee.st.qacc_l.h.32.ip a10,-16 # [1] id:299 l8ui a12,a1,16 # [2] scratch_buf+16 l8ui a8,a1,6 # [3] scratch_buf+6 s8i a8,a1,3 # [4] scratch_buf+3 s8i a12,a1,7 # [5] scratch_buf+7 l8ui a8,a1,15 # [6] scratch_buf+15 l8ui a12,a1,5 # [7] scratch_buf+5 s8i a12,a1,2 # [8] scratch_buf+2 s8i a8,a1,6 # [9] scratch_buf+6 l16ui a12,a1,10 # [10] scratch_buf+10 movi.n a8,16 # [11] ee.srcmb.s16.qacc q2,a8,0 # [12] s16i a12,a1,4 # [13] scratch_buf+4 ee.vld.l.64.ip q1,a10,0 # [14] id:309 l32i a12,a1,116 # [15] gra_spill_temp_159, bias ee.vzip.16 q1,q2 # [16] beqz.n a12,.Lt_3_9986 # [17] // skip bias // add bias: l32i a8,a1,120 # [0] gra_spill_temp_160 ee.vldbc.32.ip q0,a8,4 # [2] id:311 s32i a8,a1,120 # [3] gra_spill_temp_160 ee.vadds.s32 q1,q1,q0 # [4] .Lt_3_9986: # 0xd04 l32i.n a12,a9,0 # [0] id:313 l32i.n a4,a11,0 # [1] id:312 bgei a12,1,.Lt_3_10754 # [2] neg a3,a12 # [0] j .Lt_3_10498 # [1] .Lt_3_6402: # 0xd11 retw.n # [0] .size esp_nn_conv_s16_mult4_1x1_esp32s3, . - esp_nn_conv_s16_mult4_1x1_esp32s3 ================================================ FILE: src/convolution/esp_nn_conv_s16_mult8_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .literal_position .literal .LC10_28_153, -2147483648 .literal .LC11_28_154, -1073741823 .literal .LC12_28_155, 2147483647 .literal .LC13_28_156, 1073741824 # Program Unit: esp_nn_conv_s16_mult8_esp32s3 .type esp_nn_conv_s16_mult8_esp32s3, @function .align 4 .global esp_nn_conv_s16_mult8_esp32s3 esp_nn_conv_s16_mult8_esp32s3: # 0x6e2 # qacc_scratch = 0 # gra_spill_temp_96 = 48 # gra_spill_temp_97 = 52 # gra_spill_temp_98 = 56 # gra_spill_temp_99 = 60 # gra_spill_temp_100 = 64 # gra_spill_temp_101 = 68 # gra_spill_temp_102 = 72 # gra_spill_temp_103 = 76 # gra_spill_temp_104 = 80 # gra_spill_temp_105 = 84 # gra_spill_temp_106 = 88 # gra_spill_temp_107 = 92 # gra_spill_temp_108 = 96 # gra_spill_temp_109 = 100 # gra_spill_temp_110 = 104 # gra_spill_temp_111 = 108 # gra_spill_temp_112 = 112 # gra_spill_temp_113 = 116 # gra_spill_temp_114 = 120 # gra_spill_temp_115 = 124 # gra_spill_temp_116 = 128 # gra_spill_temp_117 = 132 # gra_spill_temp_118 = 136 # gra_spill_temp_119 = 140 # gra_spill_temp_120 = 144 # gra_spill_temp_121 = 148 # gra_spill_temp_122 = 152 # gra_spill_temp_123 = 156 # gra_spill_temp_124 = 160 # gra_spill_temp_125 = 164 # gra_spill_temp_126 = 168 # gra_spill_temp_127 = 172 # gra_spill_temp_128 = 176 # gra_spill_temp_129 = 180 # gra_spill_temp_130 = 184 # gra_spill_temp_131 = 188 # gra_spill_temp_132 = 192 # gra_spill_temp_133 = 196 # gra_spill_temp_134 = 200 # gra_spill_temp_135 = 204 # gra_spill_temp_136 = 208 # gra_spill_temp_137 = 212 // registers: // a2: const int16_t *input_data // a3: const uint16_t input_wd // a4: const uint16_t 
input_ht // a5: const uint16_t in_channels // a6: const uint16_t pad_wd // a7: const uint16_t pad_ht // on stack: // const uint16_t stride_wd // const uint16_t stride_ht // const int16_t *filter_data // const uint16_t filter_wd // const uint16_t filter_ht // const int32_t *bias // int8_t *out_data // const uint16_t out_wd // const uint16_t out_ht // const uint16_t out_channels // const int32_t out_offset // const int32_t *out_shift // const int32_t *out_mult // const int32_t activation_min // const int32_t activation_max entry a1,256 # s32i a2,a1,176 # [0] gra_spill_temp_128 s32i a3,a1,192 # [1] gra_spill_temp_132 s32i.n a6,a1,60 # [2] gra_spill_temp_99 l16ui a8,a1,288 # [3] id:282 out_ht+0x0 s32i a8,a1,68 # [4] gra_spill_temp_101 beqz.n a8,.Lt_2_11778 # [5] s32i a7,a1,76 # [0] gra_spill_temp_103 s32i a1,a1,156 # [1] gra_spill_temp_123 l16ui a8,a1,272 # [2] id:285 filter_ht+0x0 neg a11,a7 # [3] movi.n a12,0 # [4] neg a14,a6 # [5] l16ui a15,a1,268 # [6] id:286 filter_wd+0x0 l16ui a9,a1,292 # [7] id:283 out_channels+0x0 l32i a10,a1,304 # [8] id:284 out_mult+0x0 s32i a10,a1,88 # [9] gra_spill_temp_106 s32i a9,a1,96 # [10] gra_spill_temp_108 s32i a15,a1,196 # [11] gra_spill_temp_133 s32i.n a14,a1,48 # [12] gra_spill_temp_96 s32i a12,a1,72 # [13] gra_spill_temp_102 s32i a11,a1,80 # [14] gra_spill_temp_104 s32i.n a8,a1,52 # [15] gra_spill_temp_97 sub a13,a3,a14 # [16] mul16u a8,a5,a8 # [17] s32i.n a13,a1,56 # [18] gra_spill_temp_98 sub a11,a4,a11 # [19] l32i a12,a1,276 # [20] id:292 bias+0x0 s32i a12,a1,152 # [21] gra_spill_temp_122 s32i a11,a1,84 # [22] gra_spill_temp_105 l32i a14,a1,308 # [23] id:290 activation_min+0x0 l32i a13,a1,312 # [24] id:291 activation_max+0x0 s32i a13,a1,144 # [25] gra_spill_temp_120 mull a15,a15,a8 # [26] addx4 a9,a9,a10 # [27] s32i a14,a1,140 # [28] gra_spill_temp_119 l32i a11,a1,300 # [29] id:293 out_shift+0x0 s32i a11,a1,92 # [30] gra_spill_temp_107 slli a14,a5,1 # [31] s32i a9,a1,124 # [32] gra_spill_temp_115 s32i a15,a1,128 # [33] 
gra_spill_temp_116 l32i a8,a1,280 # [34] id:288 out_data+0x0 movi.n a10,0 # [35] s32i a10,a1,160 # [36] gra_spill_temp_124 s32i a8,a1,132 # [37] gra_spill_temp_117 l32i a15,a1,296 # [38] id:289 out_offset+0x0 l32i a9,a1,264 # [39] id:287 filter_data+0x0 s32i a9,a1,180 # [40] gra_spill_temp_129 s32i a15,a1,136 # [41] gra_spill_temp_118 l16ui a8,a1,284 # [42] id:296 out_wd+0x0 l16ui a10,a1,256 # [43] id:294 stride_wd+0x0 s32i a10,a1,100 # [44] gra_spill_temp_109 s32i a8,a1,104 # [45] gra_spill_temp_110 addi.n a15,a5,-1 # [46] l16ui a9,a1,260 # [47] id:295 stride_ht+0x0 s32i a9,a1,64 # [48] gra_spill_temp_100 srai a15,a15,3 # [49] j .Lt_2_12290 # [50] .Lt_2_12546: # 0x788 l32i a8,a1,68 # [0] gra_spill_temp_101 l32i a12,a1,80 # [1] gra_spill_temp_104 l32i a11,a1,84 # [2] gra_spill_temp_105 l32i a10,a1,64 # [3] gra_spill_temp_100 l32i a13,a1,72 # [4] gra_spill_temp_102 l32i a9,a1,76 # [5] gra_spill_temp_103 addi.n a13,a13,1 # [6] s32i a13,a1,72 # [7] gra_spill_temp_102 sub a9,a9,a10 # [8] sub a11,a11,a10 # [9] add.n a12,a12,a10 # [10] s32i a12,a1,80 # [11] gra_spill_temp_104 s32i a11,a1,84 # [12] gra_spill_temp_105 s32i a9,a1,76 # [13] gra_spill_temp_103 sub a13,a13,a8 # [14] beqz a13,.Lt_2_11778 # [15] .Lt_2_12290: # 0x7b6 // width loop l32i a13,a1,104 # [0] gra_spill_temp_110 beqz.n a13,.Lt_2_12546 # [2] l32i a8,a1,192 # [0] gra_spill_temp_132 l32i a9,a1,80 # [1] gra_spill_temp_104 movi.n a11,0 # [2] l32i a10,a1,76 # [3] gra_spill_temp_103 l32i.n a12,a1,60 # [4] gra_spill_temp_99 l32i.n a13,a1,56 # [5] gra_spill_temp_98 s32i a13,a1,116 # [6] gra_spill_temp_113 s32i a12,a1,112 # [7] gra_spill_temp_112 max a10,a10,a11 # [8] s32i a10,a1,148 # [9] gra_spill_temp_121 add.n a9,a9,a10 # [10] l32i.n a11,a1,48 # [11] gra_spill_temp_96 s32i a11,a1,184 # [12] gra_spill_temp_130 mull a8,a8,a9 # [13] l32i a10,a1,84 # [14] gra_spill_temp_105 s32i a8,a1,120 # [15] gra_spill_temp_114 l32i.n a9,a1,52 # [16] gra_spill_temp_97 movi.n a8,0 # [17] s32i a8,a1,108 # [18] gra_spill_temp_111 
min a9,a9,a10 # [19] s32i a9,a1,204 # [20] gra_spill_temp_135 j .Lt_2_13058 # [21] .Lt_2_13314: # 0x7f6 # Part of loop body line 186, head labeled .Lt_2_13058 l32i a13,a1,104 # [0] gra_spill_temp_110 l32i a11,a1,112 # [1] gra_spill_temp_112 l32i a10,a1,184 # [2] gra_spill_temp_130 l32i a9,a1,100 # [3] gra_spill_temp_109 l32i a12,a1,108 # [4] gra_spill_temp_111 l32i a8,a1,116 # [5] gra_spill_temp_113 addi.n a12,a12,1 # [6] s32i a12,a1,108 # [7] gra_spill_temp_111 sub a8,a8,a9 # [8] add.n a10,a10,a9 # [9] sub a11,a11,a9 # [10] s32i a11,a1,112 # [11] gra_spill_temp_112 s32i a10,a1,184 # [12] gra_spill_temp_130 s32i a8,a1,116 # [13] gra_spill_temp_113 beq a12,a13,.Lt_2_12546 # [14] .Lt_2_13058: # 0x821 // channel loop l32i a12,a1,96 # [0] gra_spill_temp_108 beqz.n a12,.Lt_2_13314 # [2] movi.n a11,0 # [0] l32i a10,a1,112 # [1] gra_spill_temp_112 l32i a13,a1,92 # [2] gra_spill_temp_107 l32i a8,a1,152 # [3] gra_spill_temp_122 movi.n a9,0 # [4] l32i a12,a1,88 # [5] gra_spill_temp_106 s32i a12,a1,168 # [6] gra_spill_temp_126 s32i a9,a1,188 # [7] gra_spill_temp_131 s32i a8,a1,164 # [8] gra_spill_temp_125 s32i a13,a1,172 # [9] gra_spill_temp_127 l32i a8,a1,116 # [10] gra_spill_temp_113 l32i a13,a1,196 # [11] gra_spill_temp_133 max a10,a10,a11 # [12] s32i a10,a1,208 # [13] gra_spill_temp_136 min a13,a13,a8 # [14] s32i a13,a1,200 # [15] gra_spill_temp_134 j .Lt_2_13826 # [16] .Lt_2_14082: # 0x857 // extract data l32i a4,a1,156 # [0] gra_spill_temp_123 ee.st.qacc_l.l.128.ip a4,16 # [2] id:303 ee.st.qacc_l.h.32.ip a4,0 # [3] id:304 l8ui a9,a1,15 # [4] qacc_scratch+15 l16ui a8,a1,10 # [5] qacc_scratch+10 l8ui a12,a1,16 # [6] qacc_scratch+16 l8ui a11,a1,6 # [7] qacc_scratch+6 l8ui a10,a1,5 # [8] qacc_scratch+5 s8i a10,a1,2 # [9] qacc_scratch+2 s8i a11,a1,3 # [10] qacc_scratch+3 s8i a12,a1,7 # [11] qacc_scratch+7 s16i a8,a1,4 # [12] qacc_scratch+4 s8i a9,a1,6 # [13] qacc_scratch+6 ee.st.qacc_h.l.128.ip a4,16 # [14] id:314 ee.st.qacc_h.h.32.ip a4,-32 # [15] id:315 l8ui a13,a1,32 # 
[16] qacc_scratch+32 l8ui a9,a1,21 # [17] qacc_scratch+21 l8ui a12,a1,31 # [18] qacc_scratch+31 l16ui a11,a1,26 # [19] qacc_scratch+26 l8ui a10,a1,22 # [20] qacc_scratch+22 l16ui a8,a1,16 # [21] qacc_scratch+16 s16i a8,a1,8 # [22] qacc_scratch+8 s8i a10,a1,11 # [23] qacc_scratch+11 s16i a11,a1,12 # [24] qacc_scratch+12 s8i a12,a1,14 # [25] qacc_scratch+14 s8i a9,a1,10 # [26] qacc_scratch+10 s8i a13,a1,15 # [27] qacc_scratch+15 l32i a9,a1,152 # [28] gra_spill_temp_122, bias movi.n a13,16 # [29] ee.srcmb.s16.qacc q1,a13,0 # [30] ee.vld.128.ip q0,a4,0 # [31] id:327 s32i a4,a1,156 # [32] gra_spill_temp_123 ee.vzip.16 q0,q1 # [33] ee.vadds.s32 q0,q0,q1 # [34] ee.movi.32.a q0,a12,3 # [35] ee.movi.32.a q0,a11,2 # [36] ee.movi.32.a q0,a10,0 # [37] add.n a11,a11,a12 # [38] ee.movi.32.a q0,a12,1 # [39] add.n a10,a10,a12 # [40] add.n a10,a10,a11 # [41] beqz.n a9,.Lt_2_17154 # [42] // skip bias l32i a13,a1,164 # [0] gra_spill_temp_125 l32i.n a13,a13,0 # [2] id:329 add.n a10,a10,a13 # [4] .Lt_2_17154: # 0x8d7 # 259 conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]); l32i a11,a1,172 # [0] gra_spill_temp_127 l32i a4,a1,168 # [1] gra_spill_temp_126 l32i.n a11,a11,0 # [2] id:331 l32i.n a4,a4,0 # [3] id:330 blti a11,1,.LBB26_esp_nn_conv_s16_mult8_esp32s3 # [4] movi.n a13,0 # [0] j .Lt_2_17666 # [1] .LBB26_esp_nn_conv_s16_mult8_esp32s3: # 0xa4e neg a13,a11 # [0] .Lt_2_17666: # 0x8e6 movi.n a12,0 # [0] max a12,a11,a12 # [1] movi.n a11,0 # [2] ssl a12 # [3] sll a10,a10 # [4] bne a10,a4,.Lt_2_20994 # [5] l32r a9,.LC10_28_153 # [0] movi.n a8,1 # [1] sub a9,a10,a9 # [2] moveqz a11,a8,a9 # [3] .Lt_2_20994: # 0x901 extui a8,a4,31,1 # [0] extui a12,a10,31,1 # [1] xor a12,a12,a8 # [2] extui a12,a12,0,8 # [3] beqz.n a12,.Lt_2_18434 # [4] movi.n a12,-1 # [0] l32r a9,.LC11_28_154 # [1] j .Lt_2_18178 # [2] .Lt_2_18434: # 0xa54 movi.n a12,0 # [0] l32r a9,.LC13_28_156 # [1] .Lt_2_18178: # 0x914 ssai 31 # [0] l32r a8,.LC12_28_155 # [1] mulsh a6,a4,a10 
# [2] mull a4,a4,a10 # [3] add.n a6,a6,a12 # [4] add.n a7,a4,a9 # [5] saltu a4,a7,a4 # [6] add.n a4,a4,a6 # [7] srai a6,a4,31 # [8] and a6,a6,a8 # [9] add.n a7,a6,a7 # [10] srai a3,a6,31 # [11] add.n a3,a3,a4 # [12] saltu a6,a7,a6 # [13] add.n a6,a6,a3 # [14] src a6,a6,a7 # [15] extui a3,a11,0,8 # [16] movi.n a7,1 # [17] ssr a13 # [18] movnez a6,a8,a3 # [19] sra a8,a6 # [20] addi.n a3,a8,1 # [21] ssl a13 # [22] sll a7,a7 # [23] extui a4,a8,31,1 # [24] addi.n a7,a7,-1 # [25] and a6,a6,a7 # [26] srai a7,a7,1 # [27] add.n a4,a4,a7 # [28] l32i a7,a1,164 # [29] gra_spill_temp_125 salt a4,a4,a6 # [30] movnez a8,a3,a4 # [31] l32i a6,a1,172 # [32] gra_spill_temp_127 l32i a4,a1,132 # [33] gra_spill_temp_117 l32i a3,a1,160 # [34] gra_spill_temp_124 addi.n a7,a7,4 # [35] s32i a7,a1,164 # [36] gra_spill_temp_125 addi.n a6,a6,4 # [37] s32i a6,a1,172 # [38] gra_spill_temp_127 l32i a7,a1,136 # [39] gra_spill_temp_118 l32i a6,a1,140 # [40] gra_spill_temp_119 add.n a4,a3,a4 # [41] add.n a7,a7,a8 # [42] addi.n a3,a3,1 # [43] l32i a8,a1,128 # [44] gra_spill_temp_116 max a6,a6,a7 # [45] s32i a3,a1,160 # [46] gra_spill_temp_124 l32i a7,a1,188 # [47] gra_spill_temp_131 l32i a3,a1,144 # [48] gra_spill_temp_120 add.n a7,a7,a8 # [49] min a3,a3,a6 # [50] s8i a3,a4,0 # [51] id:332 s32i a7,a1,188 # [52] gra_spill_temp_131 l32i a4,a1,168 # [53] gra_spill_temp_126 l32i a6,a1,124 # [54] gra_spill_temp_115 addi.n a4,a4,4 # [55] s32i a4,a1,168 # [56] gra_spill_temp_126 sub a4,a4,a6 # [57] beqz a4,.Lt_2_13314 # [58] .Lt_2_13826: # 0x9b4 ee.zero.qacc # [0] l32i a9,a1,204 # [1] gra_spill_temp_135 l32i a8,a1,148 # [2] gra_spill_temp_121 s32i a8,a1,212 # [3] gra_spill_temp_137 bge a8,a9,.Lt_2_14082 # [4] .LBB12_esp_nn_conv_s16_mult8_esp32s3: # 0x9c3 # Part of loop body line 187, head labeled .Lt_2_13826 l32i a8,a1,196 # [0] gra_spill_temp_133 l32i a7,a1,212 # [1] gra_spill_temp_137 l32i a13,a1,200 # [2] gra_spill_temp_134 mull a7,a7,a8 # [3] l32i a6,a1,120 # [4] gra_spill_temp_114 add.n a13,a7,a13 # 
[5] j .Lt_2_14594 # [6] .Lt_2_14850: # 0x9d7 # Part of loop body line 201, head labeled .Lt_2_14594 l32i a9,a1,204 # [0] gra_spill_temp_135 l32i a10,a1,212 # [1] gra_spill_temp_137 l32i a12,a1,192 # [2] gra_spill_temp_132 l32i a11,a1,196 # [3] gra_spill_temp_133 add.n a6,a6,a12 # [4] add.n a7,a7,a11 # [5] add.n a13,a13,a11 # [6] addi.n a10,a10,1 # [7] s32i a10,a1,212 # [8] gra_spill_temp_137 sub a9,a9,a10 # [9] beqz a9,.Lt_2_14082 # [10] .Lt_2_14594: # 0x9f4 l32i a9,a1,200 # [0] gra_spill_temp_134 l32i a8,a1,208 # [1] gra_spill_temp_136 bge a8,a9,.Lt_2_14850 # [3] l32i a11,a1,176 # [0] gra_spill_temp_128 l32i a10,a1,184 # [1] gra_spill_temp_130 add.n a12,a7,a8 # [2] add.n a10,a10,a8 # [3] add.n a10,a6,a10 # [4] mull a10,a5,a10 # [5] mull a8,a12,a5 # [6] addx2 a10,a10,a11 # [7] l32i a11,a1,188 # [8] gra_spill_temp_131 add.n a11,a11,a8 # [10] l32i a8,a1,180 # [11] gra_spill_temp_129 mov.n a2,a10 # [12] addx2 a11,a11,a8 # [13] movi.n a8,8 # [14] mov.n a3,a11 # [15] j .Lt_2_15362 # [16] .LBB18_esp_nn_conv_s16_mult8_esp32s3: # 0xa26 loopgtz a15,.LBB54_esp_nn_conv_s16_mult8_esp32s3 # [0] ee.vmulas.s16.qacc.ld.ip q0,a2,16,q0,q1 # [0*II+0] id:300 ee.vld.128.ip q1,a3,16 # [0*II+1] id:301 .LBB54_esp_nn_conv_s16_mult8_esp32s3: # 0xa30 .Lt_2_15618: # 0xa30 ee.vmulas.s16.qacc q0,q1 # [0] movi.n a8,8 # [1] add.n a10,a10,a14 # [2] add.n a11,a11,a14 # [3] mov.n a3,a11 # [4] mov.n a2,a10 # [5] beq a12,a13,.Lt_2_14850 # [6] .Lt_2_15362: # 0xa40 ee.vld.128.ip q1,a3,16 # [0] id:299 ee.vld.128.ip q0,a2,16 # [1] id:298 addi.n a12,a12,1 # [2] bltu a8,a5,.LBB18_esp_nn_conv_s16_mult8_esp32s3 # [3] j .Lt_2_15618 # [0] .Lt_2_11778: # 0xa5c retw.n # [0] .size esp_nn_conv_s16_mult8_esp32s3, . 
- esp_nn_conv_s16_mult8_esp32s3

================================================
FILE: src/convolution/esp_nn_conv_s8_1x1_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * 1x1 convolution for ESP32-S3 using transpose + parallel MAC.
 * Processes 8 spatial positions simultaneously via QACC lanes.
 */

/* NOTE(review): the header names on the four include lines below were lost
 * during extraction — restore the originals (the code uses memcpy/uintptr_t/
 * fixed-width ints, so <string.h>/<stdint.h> are presumably among them). */
#include
#include
#include
#include

/**
 * Scratch size (bytes) needed by esp_nn_conv_s8_1x1().
 *
 * @param out_channels  unused here; kept for interface symmetry with the
 *                      other esp_nn_*_scratch_size() helpers.
 * @return fixed size: one 128-byte transpose chunk plus 64 bytes of slack
 *         for 16-byte alignment of the scratch pointer.
 */
int esp_nn_conv_s8_1x1_scratch_size(int out_channels)
{
    /* Transpose buffer: 8 channels × 8 positions × 2 bytes = 128 bytes per chunk.
     * Multiple chunks processed sequentially, so 128 is enough. */
    return 128 + 64; /* transpose + alignment */
}

/*
 * Transpose 8 spatial positions × 8 channels from int8 to int16 with offset.
 * C fallback for when input address is not 8-byte aligned.
 *
 * Output layout is channel-major: out_buf[ch * 8 + pos], matching the SIMD
 * variant below so the MAC stage can consume either.
 */
static inline void transpose_8x8_s16_c(const int8_t *input, int stride,
                                       int32_t input_offset, int16_t *out_buf)
{
    for (int ch = 0; ch < 8; ch++) {
        for (int pos = 0; pos < 8; pos++) {
            /* int8 + offset always fits in int16 (range ±383 at most) */
            out_buf[ch * 8 + pos] = (int16_t)(input[pos * stride + ch] + input_offset);
        }
    }
}

/*
 * SIMD transpose: 8 positions × 8 channels → channel-major int16 with offset.
 * Uses vzip.8/16/32 chain (same as original .S transpose, verified correct).
 *
 * Input: 8 consecutive spatial positions, each `stride` bytes apart.
 * Input address MUST be 8-byte aligned.
 * Output: int16 buffer [ch0: pos0..pos7, ch1: pos0..pos7, ...] (16-byte aligned)
 *
 * NOTE(review): the asm writes q0-q7 but the EE q-registers cannot be named
 * in the clobber list; callers must not hold live q-register state — TODO
 * confirm against the rest of this file's asm usage.
 */
static inline void transpose_8x8_s16_simd(const int8_t *input, int stride,
                                          int16_t offset16, int16_t *out_buf)
{
    const int8_t *p = input;
    int16_t *out = out_buf;
    int16_t *off_ptr = &offset16;
    __asm__ volatile(
        /* Load input_offset broadcast to all 8 int16 lanes */
        "ee.vldbc.16 q5, %[off]\n"
        /* Zero register for sign extension comparisons */
        "ee.zero.q q7\n"
        /* Load 8 positions × 8 channels into q0-q3 using paired l/h loads.
         * Each vld.l.64.xp loads 8 bytes (1 position) into low half, advances by stride.
         * Each vld.h.64.xp loads 8 bytes into high half, advances by stride.
         * Result: q0=[pos0|pos2], q1=[pos1|pos3], q2=[pos4|pos6], q3=[pos5|pos7] */
        "ee.vld.l.64.xp q0, %[p], %[s]\n"
        "ee.vld.l.64.xp q1, %[p], %[s]\n"
        "ee.vld.h.64.xp q0, %[p], %[s]\n"
        "ee.vld.h.64.xp q1, %[p], %[s]\n"
        "ee.vld.l.64.xp q2, %[p], %[s]\n"
        "ee.vzip.8 q0, q1\n"
        "ee.vld.l.64.xp q3, %[p], %[s]\n"
        "ee.vld.h.64.xp q2, %[p], %[s]\n"
        "ee.vld.h.64.ip q3, %[p], 0\n"
        "ee.vzip.16 q0, q1\n"
        "ee.vzip.8 q2, q3\n"
        "ee.vzip.16 q2, q3\n"
        "ee.vzip.32 q0, q2\n"
        /* First 4 channels: sign-extend q0→(q0,q6), q2→(q2,q4), add offset, store.
         * vcmp.lt.s8 against zero yields 0xFF for negative bytes, so the
         * following vzip.8 interleaves the correct high byte per lane. */
        "ee.vcmp.lt.s8 q4, q2, q7\n"
        "ee.vzip.8 q2, q4\n"
        "ee.vcmp.lt.s8 q6, q0, q7\n"
        "ee.vzip.8 q0, q6\n"
        "ee.vadds.s16 q0, q0, q5\n"
        "ee.vst.128.ip q0, %[out], 16\n"
        "ee.vadds.s16 q6, q6, q5\n"
        "ee.vst.128.ip q6, %[out], 16\n"
        "ee.vadds.s16 q2, q2, q5\n"
        "ee.vst.128.ip q2, %[out], 16\n"
        "ee.vadds.s16 q4, q4, q5\n"
        "ee.vst.128.ip q4, %[out], 16\n"
        /* Last 4 channels: sign-extend q1→(q1,q6), q3→(q3,q4), add offset, store */
        "ee.vzip.32 q1, q3\n"
        "ee.vcmp.lt.s8 q4, q3, q7\n"
        "ee.vzip.8 q3, q4\n"
        "ee.vcmp.lt.s8 q6, q1, q7\n"
        "ee.vzip.8 q1, q6\n"
        "ee.vadds.s16 q1, q1, q5\n"
        "ee.vst.128.ip q1, %[out], 16\n"
        "ee.vadds.s16 q6, q6, q5\n"
        "ee.vst.128.ip q6, %[out], 16\n"
        "ee.vadds.s16 q3, q3, q5\n"
        "ee.vst.128.ip q3, %[out], 16\n"
        "ee.vadds.s16 q4, q4, q5\n"
        "ee.vst.128.ip q4, %[out], 16\n"
        : [p] "+r" (p), [out] "+r" (out), [off] "+r" (off_ptr)
        : [s] "r" (stride)
        : "memory"
    );
}

/*
 * MAC 8 filter channels against 8 positions using QACC.
 * data_buf: [ch0: 8 int16, ch1: 8 int16, ...] = 128 bytes, 16-byte aligned
 * filter: 8 int8 values, sign-extended to int16 internally
 * Accumulates into QACC lanes 0-7 (must be zeroed before first call per oc)
 *
 * NOTE: filter pointer may not be 8-byte aligned, so we copy to an aligned
 * local buffer before using ee.vld.l.64.ip (which ignores unaligned address bits).
 */
static inline void mac_8pos_8ch_simd(const int16_t *data_buf, const int8_t *filter)
{
    /* Copy filter to aligned buffer — ee.vld.l.64.ip requires 8-byte alignment */
    int8_t __attribute__((aligned(16))) f_aligned[16];
    memcpy(f_aligned, filter, 8);

    const int16_t *dp = data_buf;
    const int8_t *fp = f_aligned;
    __asm__ volatile(
        /* Sign-extend filter: load 8 int8 → 8 int16 in q7 */
        "ee.zero.q q5\n"
        "ee.vld.l.64.ip q7, %[f], 0\n"
        /* Pre-load first two data chunks during sign extension */
        "ee.vld.128.ip q0, %[d], 16\n"
        "ee.vld.128.ip q1, %[d], 16\n"
        /* vcmp.lt.s8 against zero → 0xFF for negative bytes; vzip.8
         * interleaves those as the high bytes, completing the widening */
        "ee.vcmp.lt.s8 q6, q7, q5\n"
        "ee.vzip.8 q7, q6\n"
        /* Pipelined: MAC current + load next in one instruction.
         * The trailing immediate selects the filter lane (channel index). */
        "ee.vsmulas.s16.qacc.ld.incp q2, %[d], q0, q7, 0\n"
        "ee.vsmulas.s16.qacc.ld.incp q3, %[d], q1, q7, 1\n"
        "ee.vsmulas.s16.qacc.ld.incp q0, %[d], q2, q7, 2\n"
        "ee.vsmulas.s16.qacc.ld.incp q1, %[d], q3, q7, 3\n"
        "ee.vsmulas.s16.qacc.ld.incp q2, %[d], q0, q7, 4\n"
        "ee.vsmulas.s16.qacc.ld.incp q3, %[d], q1, q7, 5\n"
        /* Last two: plain MAC, no more data to load */
        "ee.vsmulas.s16.qacc q2, q7, 6\n"
        "ee.vsmulas.s16.qacc q3, q7, 7\n"
        : [d] "+r" (dp), [f] "+r" (fp)
        :
        : "memory"
    );
}

/*
 * 1x1 s8 convolution entry point: per batch of 8 spatial positions,
 * transpose the input once and reuse it across all output channels.
 * Interface mirrors the other esp_nn conv kernels in this directory.
 */
void esp_nn_conv_s8_1x1(const int8_t *input,
                        const uint16_t input_wd,
                        const uint16_t input_ht,
                        const uint16_t in_channels,
                        const int32_t input_offset,
                        const int8_t *filter_data,
                        const int32_t *bias,
                        int8_t *out_data,
                        const uint16_t out_channels,
                        const int32_t out_offset,
                        const int32_t *out_shift,
                        const int32_t *out_mult,
                        const int32_t activation_min,
                        const int32_t activation_max,
                        void *scratch)
{
    const int size = input_wd * input_ht;  /* number of spatial positions */
    const int ch8 = in_channels / 8;       /* full 8-channel groups */

    /* SIMD transpose requires 8-byte aligned input; check once */
    const int use_simd_transpose = (in_channels % 8 == 0) &&
                                   (((uintptr_t)input & 7) == 0);
    const int16_t offset16 = (int16_t)input_offset;

    /* Use scratch buffer for transpose data — holds ALL channel groups at once.
     * Layout: [cg0: 8 int16 × 8 pos, cg1: 8 int16 × 8 pos, ...] = ch8 × 128 bytes.
* Aligned to 16 bytes for SIMD loads. */ int16_t *tbuf = (int16_t *)((uintptr_t)((int8_t *)scratch + 15) & ~15); int pos = 0; for (; pos + 7 < size; pos += 8) { const int8_t *in_base = input + pos * in_channels; /* Transpose ALL channel groups ONCE per position batch. * This is the key optimization — reuse transposed data across all out_channels. */ for (int cg = 0; cg < ch8; cg++) { int16_t *cg_buf = tbuf + cg * 64; /* 64 int16 per channel group */ if (use_simd_transpose) { transpose_8x8_s16_simd(in_base + cg * 8, in_channels, offset16, cg_buf); } else { transpose_8x8_s16_c(in_base + cg * 8, in_channels, input_offset, cg_buf); } } __asm__ volatile("" ::: "memory"); for (int oc = 0; oc < out_channels; oc++) { const int8_t *filt = filter_data + oc * in_channels; /* MAC across all channel groups using pre-transposed data */ __asm__ volatile("ee.zero.qacc"); for (int cg = 0; cg < ch8; cg++) { mac_8pos_8ch_simd(tbuf + cg * 64, filt + cg * 8); } /* Extract QACC → 8 int32 values */ int32_t qacc[8]; { int8_t __attribute__((aligned(16))) qraw[24]; int8_t *qp = qraw; __asm__ volatile( "ee.st.qacc_l.l.128.ip %[p], 16\n" "ee.st.qacc_l.h.32.ip %[p], -16\n" : [p] "+r" (qp) : : "memory" ); qacc[0] = *(int32_t *)(qraw + 0); qacc[1] = *(int32_t *)(qraw + 5); qacc[2] = *(int32_t *)(qraw + 10); qacc[3] = *(int32_t *)(qraw + 15); qp = qraw; __asm__ volatile( "ee.st.qacc_h.l.128.ip %[p], 16\n" "ee.st.qacc_h.h.32.ip %[p], -16\n" : [p] "+r" (qp) : : "memory" ); qacc[4] = *(int32_t *)(qraw + 0); qacc[5] = *(int32_t *)(qraw + 5); qacc[6] = *(int32_t *)(qraw + 10); qacc[7] = *(int32_t *)(qraw + 15); } /* Remainder channels (scalar) */ for (int c = ch8 * 8; c < in_channels; c++) { int16_t f = (int16_t)filt[c]; for (int p = 0; p < 8; p++) { qacc[p] += ((int32_t)in_base[p * in_channels + c] + input_offset) * f; } } /* Bias + requant + store for 8 positions */ for (int p = 0; p < 8; p++) { int32_t acc = qacc[p]; if (bias) acc += bias[oc]; acc = esp_nn_multiply_by_quantized_mult(acc, 
out_mult[oc], out_shift[oc]); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[(pos + p) * out_channels + oc] = (int8_t)acc; } } } /* Leftover positions (< 8 remaining) */ for (; pos < size; pos++) { const int8_t *in_ptr = input + pos * in_channels; for (int oc = 0; oc < out_channels; oc++) { const int8_t *filt = filter_data + oc * in_channels; int32_t acc = 0; int c = 0; for (; c + 2 < in_channels; c += 3) { acc += ((int32_t)in_ptr[c] + input_offset) * (int32_t)filt[c]; acc += ((int32_t)in_ptr[c + 1] + input_offset) * (int32_t)filt[c + 1]; acc += ((int32_t)in_ptr[c + 2] + input_offset) * (int32_t)filt[c + 2]; } for (; c < in_channels; c++) { acc += ((int32_t)in_ptr[c] + input_offset) * (int32_t)filt[c]; } if (bias) acc += bias[oc]; acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[pos * out_channels + oc] = (int8_t)acc; } } } ================================================ FILE: src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * Optimized 3x3 convolution for ESP32-S3. * * Key optimization vs the general aligned asm: * The general asm reloads input for each output channel (128× per pixel). * This version pre-loads the 3x3 input window into scratch (9 rows × in_ch bytes), * then iterates output channels with the input in L1 cache. * * For Conv[11] (26×26×128→12×12×128, 3×3 s2): * - Input window: 3 × 3 × 128 = 1,152 bytes (fits in L1) * - Filter per OC: 3 × 3 × 128 = 1,152 bytes * - Total for all 128 OC: 147,456 bytes (cycles through L1) * - Input loaded once vs 128× in the general asm */ #include #include #include #include /* * Check if a conv can use the optimized 3x3 path. 
* Requirements:
 * - filter_wd == 3 && filter_ht == 3
 * - in_channels >= 16 (SIMD worth it)
 * - in_channels % 16 == 0 (aligned for ee.vld.128)
 */
int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht, int in_channels)
{
    /* Pure predicate, no side effects: true only for 3x3 kernels whose
     * channel count is a non-zero multiple of 16, so every im2col row can
     * be consumed with 128-bit (16-byte) vector loads. */
    return (filter_wd == 3 && filter_ht == 3 && in_channels >= 16 && (in_channels % 16) == 0);
}

/*
 * Scratch size (bytes) needed by esp_nn_conv_s8_3x3_opt():
 * - im2col buffer: 3 x 3 x in_channels bytes (one input window)
 * - corrections:   out_channels x 4 bytes (bias + filter_sum * input_offset)
 * - +32 slack: covers the two 16-byte round-ups done inside the kernel
 *   (16-byte alignment of the scratch base plus padding of the window
 *   length up to the next multiple of 16).
 */
int esp_nn_conv_s8_3x3_scratch_size(int in_channels, int out_channels)
{
    int im2col = 9 * in_channels;       /* 3x3 input window */
    int corrections = out_channels * 4; /* bias + filter_sum * offset */
    return im2col + corrections + 32;   /* + alignment */
}

/*
 * 3x3 convolution: im2col per output pixel, then one ACCX dot product
 * (ee.vmulas.s8.accx) per output channel over the 3 x 3 x in_ch window.
 *
 * Contract as visible in this code:
 * - No padding handling: rows in_y..in_y+2 and columns in_x..in_x+2 are
 *   read unconditionally, so the caller must guarantee the whole window
 *   lies inside the input.  input_ht is accepted but never read here —
 *   presumably kept for signature symmetry; verify against callers.
 * - scratch must be at least esp_nn_conv_s8_3x3_scratch_size() bytes.
 * - Output is written oc-fastest (NHWC) via the advancing out_data pointer.
 */
void esp_nn_conv_s8_3x3_opt(const int8_t *input,
                            const uint16_t input_wd,
                            const uint16_t input_ht,
                            const uint16_t in_channels,
                            const int32_t input_offset,
                            const uint16_t stride_wd,
                            const uint16_t stride_ht,
                            const int8_t *filter_data,
                            const int32_t *bias,
                            int8_t *out_data,
                            const uint16_t out_wd,
                            const uint16_t out_ht,
                            const uint16_t out_channels,
                            const int32_t out_offset,
                            const int32_t *out_shift,
                            const int32_t *out_mult,
                            const int32_t activation_min,
                            const int32_t activation_max,
                            void *scratch)
{
    const int window_len = 9 * in_channels;                 /* 3x3 window in bytes */
    const int window_len_aligned = (window_len + 15) & ~15; /* padded for 16-byte SIMD loads */

    /* Scratch layout: [im2col_buf | corrections], im2col_buf 16-byte aligned. */
    int8_t *im2col_buf = (int8_t *)((uintptr_t)((int8_t *)scratch + 15) & ~15);
    int32_t *corrections = (int32_t *)(im2col_buf + window_len_aligned);

    /* Pre-compute per-output-channel corrections: filter_sum * input_offset
     * (the offset term factored out of the dot product) plus bias if present. */
    const int8_t *f_ptr = filter_data;
    for (int oc = 0; oc < out_channels; oc++) {
        int32_t filter_sum = 0;
        for (int i = 0; i < window_len; i++) {
            filter_sum += f_ptr[i];
        }
        corrections[oc] = filter_sum * input_offset;
        if (bias) corrections[oc] += bias[oc];
        f_ptr += window_len;
    }

    /* Zero-pad the tail of im2col buffer so the padded bytes contribute
     * nothing to the aligned SIMD dot product below. */
    memset(im2col_buf + window_len, 0, window_len_aligned - window_len);

    const int in_row_stride = input_wd * in_channels;

    for (int out_y = 0; out_y < out_ht; out_y++) {
        for (int out_x = 0; out_x < out_wd; out_x++) {
            /* Phase 1: Build im2col for this output pixel (one-time per pixel).
             * Copies 3 rows of 3*in_channels contiguous bytes each. */
            const int in_y = out_y * stride_ht;
            const int in_x = out_x * stride_wd;
            int8_t *dst = im2col_buf;
            for (int fy = 0; fy < 3; fy++) {
                const int8_t *src = input + (in_y + fy) * in_row_stride + in_x * in_channels;
                memcpy(dst, src, 3 * in_channels);
                dst += 3 * in_channels;
            }

            /* Phase 2: Dot product against each output channel's filter.
             * The im2col window stays hot in cache across all out_channels. */
            const int8_t *filter_ptr = filter_data;
            for (int oc = 0; oc < out_channels; oc++) {
                /* ACCX dot product: im2col_buf . filter_ptr */
                int32_t acc = 0;
                const int8_t *a = im2col_buf;  /* input window (aligned) */
                const int8_t *b = filter_ptr;  /* filter (possibly unaligned: USAR path) */
                int remaining = window_len_aligned;
                __asm__ volatile("ee.zero.accx");
                /* Primed unaligned load for input.
                 * NOTE(review): only q0 is primed here, yet the first loop
                 * iteration below consumes q1 (filter) and q2 (shifted input)
                 * in ee.vmulas — the .S variant of this kernel additionally
                 * loads q4/q1 and does ee.src.q.qup before entering the loop.
                 * Verify q1/q2 are not used uninitialized on the first pass. */
                __asm__ volatile(
                    "ee.ld.128.usar.ip q0, %[a], 16\n"
                    : [a] "+r" (a) : : "memory"
                );
                /* Main loop: two 16-byte MACs per iteration; the ee.orq pair
                 * rotates q4/q2 into q0/q2 for the next iteration's qup chain. */
                while (remaining >= 32) {
                    __asm__ volatile(
                        "ee.vld.128.ip q4, %[a], 16\n"
                        "ee.vmulas.s8.accx.ld.ip.qup q3, %[b], 16, q2, q1, q0, q4\n"
                        "ee.vld.128.ip q2, %[a], 16\n"
                        "ee.vmulas.s8.accx.ld.ip.qup q1, %[b], 16, q0, q3, q4, q2\n"
                        "ee.orq q0, q2, q2\n"
                        "ee.orq q2, q4, q4\n"
                        : [a] "+r" (a), [b] "+r" (b) : : "memory"
                    );
                    remaining -= 32;
                }
                /* At most one trailing 16-byte chunk (window padded to 16). */
                if (remaining >= 16) {
                    __asm__ volatile(
                        "ee.vmulas.s8.accx.ld.ip q4, %[a], 16, q2, q1\n"
                        "ee.src.q.ld.ip q1, %[b], 16, q0, q4\n"
                        "ee.orq q2, q0, q0\n"
                        : [a] "+r" (a), [b] "+r" (b) : : "memory"
                    );
                    remaining -= 16;
                }
                /* Flush the last MAC and read the 40-bit ACCX accumulator
                 * (saturating shift-right by 0) into acc; 'remaining' is
                 * reused as the required zero shift-amount register. */
                __asm__ volatile(
                    "ee.vmulas.s8.accx q2, q1\n"
                    "movi.n %[tmp], 0\n"
                    "ee.srs.accx %[acc], %[tmp], 0\n"
                    : [acc] "=r" (acc), [tmp] "=r" (remaining)
                    : : "memory"
                );

                /* Bias/offset correction, requantize, saturate, store. */
                acc += corrections[oc];
                acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]);
                acc += out_offset;
                acc = max(acc, activation_min);
                acc = min(acc, activation_max);
                *out_data++ = (int8_t)acc;
                filter_ptr += window_len;
            }
        }
    }
}
================================================ FILE: src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S ================================================
//
// SPDX-FileCopyrightText: 2023-2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//
//
// Constraints used by this function are:
// 1. pad_wd and pad_ht is 0. For versions needing padding we do this
//    explicitly
// 2. All the filter rows are aligned to 16 bytes boundary. To make sure
//    this is indeed the case, for filter rows (filter_wd * channels) not
//    multiple of 16, we add zeros to fill it till 16 boundary.
//
// The optimized kernel assumes this and skips filter row with following
// size: ((filter_wd * input_ch) + 15) & ~15.

    .text
    .literal_position
    .literal .LC1, 1073741824

# Program Unit: esp_nn_conv_s8_filter_aligned_input_padded_esp32s3
    .type esp_nn_conv_s8_filter_aligned_input_padded_esp32s3, @function
    .align 4
    .global esp_nn_conv_s8_filter_aligned_input_padded_esp32s3

// registers:
// a2: const int16_t *input_data
// a3: const uint16_t input_wd
// a4: const uint16_t input_ht
// a5: const uint16_t in_ch
// a6: const uint16_t input_offset
// a7: const uint16_t stride_wd
// on stack:
// const uint16_t stride_ht    : 80
// const int8_t *filter_data   : 84
// const uint16_t filter_wd    : 88
// const uint16_t filter_ht    : 92
// const int32_t *bias         : 96
// int8_t *out_data            : 100
// const uint16_t out_wd       : 104
// const uint16_t out_ht       : 108
// const uint16_t out_channels : 112
// const int32_t out_offset    : 116
// const int32_t *out_shift    : 120
// const int32_t *out_mult     : 124
// const int32_t activation_min: 128
// const int32_t activation_max: 132
// void *scratch_buffer: 136

esp_nn_conv_s8_filter_aligned_input_padded_esp32s3:
    entry sp, 80
    s32i.n a2, sp, 40 # input_data
    mov a11, a6 # input_offset
    l16ui a2, sp, 88 # filter_wd
    l32i a8, sp, 100 # out_data
    l16ui a6, sp, 80 #
stride_ht
    mov.n a15, a5
    mull a4, a2, a15 # filter_row_sz
    s32i.n a8, sp, 24 # out_data_ptr
    movi.n a9, 0
    s32i.n a9, sp, 36 # out_y
    // Round the filter row size up to the next 16-byte boundary; a12 then
    // holds the aligned row stride the kernel skips per filter row.
    addi.n a4, a4, 15 # to round the size up
    srli a2, a4, 4 # (filter_row_sz) >> 4
    slli a12, a2, 4 # ((filter_row_sz) >> 4) << 4
    mull a4, a6, a3 # stride_ht * input_wd
    mull a5, a3, a15 # input_wd * in_ch
    l32i.n a10, sp, 112 # out_ch
    mull a9, a7, a15 # stride_wd * in_ch
    mull a4, a4, a15 # (stride_ht * input_wd) * in_ch
    slli a3, a10, 2 # out_ch * 4
    s32i.n a3, sp, 32 # out_ch * 4
    s32i.n a5, sp, 12 # input_wd * in_ch
    s32i.n a9, sp, 52 # stride_wd * in_ch
    s32i a4, sp, 56 # (stride_ht * input_wd) * in_ch
    l32i.n a3, sp, 92 # filter_ht
    l32i a13, sp, 136 # scratch_buf
    l32i a5, sp, 84 # filter_data
    mull a4, a12, a3 # (filter_wd * filter_ht * in_ch)
    // Halved: the accumulation loop below consumes 2 filter bytes/iteration.
    srai a4, a4, 1
    addx4 a10, a10, a13 # scratch_buf + 4 * out_ch
    l32i a3, sp, 96

    // Skip filter sum accumulation if input_offset is 0 (common in TFLite)
    // In that case, correction = just bias (pre-filled by C wrapper)
    beqz a11, .L_skip_acc_loop

    // accumulate filter values per channel into scratch buffer
.L_acc_out_channel_loop:
    movi.n a9, 0 // acc
    loop a4, .L_acc_filter_size_loop
    l8ui a14, a5, 0
    l8ui a7, a5, 1
    addi.n a5, a5, 2
    sext a14, a14, 7
    sext a7, a7, 7
    add a9, a9, a14
    add a9, a9, a7
.L_acc_filter_size_loop:
    // multiply by offset, add bias and store the acc value per channel
    mull a9, a9, a11
    beqz.n a3, .L_skip_bias
    l32i a8, a3, 0
    addi a3, a3, 4
    // this will remain 0 if bias not present
    add a9, a9, a8
.L_skip_bias:
    s32i a9, a13, 0
    addi.n a13, a13, 4
    blt a13, a10, .L_acc_out_channel_loop
    j .L_acc_done

.L_skip_acc_loop:
    // input_offset == 0: correction = bias only
    // Fill scratch_buf with bias values
    beqz.n a3, .L_skip_acc_zero_bias
.L_copy_bias_loop:
    l32i a8, a3, 0
    s32i a8, a13, 0
    addi a3, a3, 4
    addi.n a13, a13, 4
    blt a13, a10, .L_copy_bias_loop
    j .L_acc_done

.L_skip_acc_zero_bias:
    // No bias either: zero the scratch buffer
.L_zero_scratch_loop:
    movi.n a8, 0
    s32i a8, a13, 0
    addi.n a13, a13, 4
    blt a13, a10, .L_zero_scratch_loop

.L_acc_done:
    movi.n a4, 0 # 0

// Main spatial loops: out_y (height), out_x (width), then per output
// channel an ACCX dot product over filter_ht aligned filter rows.
.L_height_loop:
    l32i.n a8, sp, 40 # in_row_ptr
    movi.n a9, 0
    l32i.n a10, sp, 104 # out_wd
    s32i.n a8, sp, 28 # input_ptr
    s32i.n a9, sp, 44 # out_x
.L_width_loop:
    movi.n a9, 0
    l32i a5, sp, 84 # filter_data
    s32i.n a9, sp, 20
    l32i a3, sp, 136 # scratch_buf
.L_out_ch_loop:
    movi.n a6, 0
    l32i.n a9, sp, 28 # input_ptr
    mov.n a10, a6
.L_filter_ht_loop:
    add.n a8, a5, a12
    mov.n a13, a9
    ee.zero.accx
    // Prime the unaligned-input pipeline: q0/q4 + USAR, filter row in q1.
    ee.ld.128.usar.ip q0, a13, 16
    ee.vld.128.ip q4, a13, 16
    ee.vld.128.ip q1, a5, 16
    sub a15, a8, a5 // row_len - 16
    extui a14, a15, 4, 1 // if multiple of 16 and not 32
    srai a15, a15, 5 // multiples of 32
    ee.src.q.qup q2, q0, q4
    beqz a15, .L_vector_32_loop_end
    loop a15, .L_vector_32_loop_end
    ee.vld.128.ip q4, a13, 16
    ee.vmulas.s8.accx.ld.ip.qup q3, a5, 16, q2, q1, q0, q4
    ee.vld.128.ip q2, a13, 16
    ee.vmulas.s8.accx.ld.ip.qup q1, a5, 16, q0, q3, q4, q2
    ee.orq q0, q2, q2
    ee.orq q2, q4, q4
.L_vector_32_loop_end:
    beqz a14, .L_vector_loop_end
    ee.vmulas.s8.accx.ld.ip q4, a13, 16, q2, q1
    ee.src.q.ld.ip q1, a5, 16, q0, q4
    ee.orq q2, q0, q0
.L_vector_loop_end:
    ee.vmulas.s8.accx q2, q1
    addi a13, a13, -16 // since we incremented by 16 too much
    movi a15, 0
    ee.srs.accx a14, a15, 0
    mov.n a5, a8
    add.n a6, a6, a14
.L7:
    l32i.n a8, sp, 12 # input_wd * in_ch
    l32i.n a2, sp, 92 # filter_ht
    addi.n a10, a10, 1 # filter_y_idx
    add.n a9, a9, a8
    blt a10, a2, .L_filter_ht_loop
.L9:
    l32i a7, a3, 0 # load input_offset acc
    addi a3, a3, 4 # increment offset acc ptr
    l32i.n a8, sp, 20
    add.n a6, a6, a7 # add input_offset accumulation

// Requantize: doubling-high multiply with rounding nudge (.LC1 = 1<<30),
// then rounding divide by power of two — inline form of
// esp_nn_multiply_by_quantized_mult.
.L_multiply_by_quant_mult:
    l32i a10, sp, 120
    l32i a9, sp, 124
    add.n a2, a10, a8
    l32i.n a2, a2, 0
    add.n a7, a9, a8
    l32i.n a7, a7, 0
    max a8, a2, a4
    ssl a8
    sll a6, a6
    mull a9, a6, a7
    l32r a10, .LC1
    sub a2, a8, a2
    add.n a8, a9, a10
    mulsh a6, a6, a7
    movi.n a7, 1
    bltu a8, a9, .L13
    movi.n a7, 0
.L13:
    add.n a6, a7, a6
    slli a6, a6, 1
    extui a8, a8, 31, 1
    or a6, a6, a8
    beqz.n a2, .L_skip_div_by_pow_of_2
    addi.n a7, a2, -1
    movi.n a9, 1
    extui a8, a6, 31, 1
    ssl a7
    sll a7, a9
    sub a7, a7, a8
    add.n a6, a7, a6
    ssr a2
    sra a6, a6
.L_skip_div_by_pow_of_2:
    // Add out_offset, clamp to activation range, store one int8 output.
    l32i a10, sp, 116
    l32i a8, sp, 128
    add.n a2, a10, a6
    l32i a9, sp, 132
    l32i.n a10, sp, 24 # out_data_ptr
    max a2, a2, a8
    min a2, a2, a9
    s8i a2, a10, 0
    l32i.n a2, sp, 20
    addi.n a10, a10, 1
    addi.n a2, a2, 4
    l32i.n a6, sp, 32
    s32i.n a2, sp, 20
    s32i.n a10, sp, 24 # out_data_ptr
    bne a6, a2, .L_out_ch_loop
.L4:
    l32i.n a5, sp, 44 # out_x
    l32i.n a6, sp, 28 # input_ptr (was stored by height loop)
    l32i.n a8, sp, 52 # stride_wd * in_ch
    addi.n a5, a5, 1
    add.n a6, a6, a8 # input_ptr + stride_wd * in_ch
    l32i.n a9, sp, 104 # out_wd
    s32i.n a5, sp, 44 # out_x
    s32i.n a6, sp, 28 # input_ptr
    bne a9, a5, .L_width_loop
    l32i.n a10, sp, 36 # out_y
    l32i.n a2, sp, 40 # in_row_ptr
    l32i a5, sp, 56 # (stride_ht * input_wd) * in_ch
    l32i.n a6, sp, 108 # out_ht
    addi.n a10, a10, 1
    add.n a2, a2, a5 # in_row_ptr
    s32i.n a10, sp, 36 # out_y
    s32i.n a2, sp, 40 # in_row_ptr
    blt a10, a6, .L_height_loop
    // end outer (height) loop
    retw.n
    .size esp_nn_conv_s8_filter_aligned_input_padded_esp32s3, .-esp_nn_conv_s8_filter_aligned_input_padded_esp32s3
================================================ FILE: src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
    .text
    .literal_position
    .literal .nudge_val, 1073741824

# Program Unit: esp_nn_conv_s8_mult8_1x1_esp32s3
#
# Requirements:
#  - in_channels must be a multiple of 8
#  - filter_data must be 8-byte aligned (ee.vld.l.64.ip ignores lower 3 address bits)
#  - input_data must be 8-byte aligned (ee.vld.l/h.64.xp same alignment requirement)
#  - buffer (scratch) must be 16-byte aligned
#
# If filter is not aligned, use esp_nn_conv_s8_1x1() (C+inline asm) as fallback.
#
    .type esp_nn_conv_s8_mult8_1x1_esp32s3, @function
    .align 4
    .global esp_nn_conv_s8_mult8_1x1_esp32s3

esp_nn_conv_s8_mult8_1x1_esp32s3: # 0xdbc
# Stack-frame map (offsets from a1):
# scratch_buf = 0 // to store qacc regs need 36 bytes
# gra_spill_temp_164 = 36, channel itr, (in_channels - 1) >> 3
# gra_spill_temp_165 = 40, i_out
# gra_spill_temp_166 = 44, in_channels
# gra_spill_temp_167 = 48, in_channels/8 - 1
# gra_spill_temp_168 = 52, in_channels-7
# gra_spill_temp_169 = 56, input
# gra_spill_temp_170 = 60, filter_data
# gra_spill_temp_171 = 64, input_offset
# gra_spill_temp_172 = 68, input_ptr
# gra_spill_temp_173 = 72, bias
# gra_spill_temp_174 = 76, in_channels*8
# gra_spill_temp_175 = 80, size-7
# gra_spill_temp_176 = 84, size
// registers:
// a2: int8_t *input_data
// a3: uint16_t input_wd
// a4: uint16_t input_ht
// a5: uint16_t in_channels
// a6: int32_t input_offset
// a7: int16_t *filter_data
// on stack:
// int32_t *bias            // 160
// int8_t *out_data         // 164
// uint16_t out_wd          // 168
// uint16_t out_ht          // 172
// uint16_t out_channels    // 176
// int32_t out_offset       // 180
// int32_t *out_shift       // 184
// int32_t *out_mult        // 188
// int32_t activation_min   // 192
// int32_t activation_max   // 196
// void *buffer // tmp buf  // 200
    entry a1,160 #
    s32i a5,a1,44 # [0] gra_spill_temp_166, in_channels
    s32i a6,a1,64 # [2] id:619 input_offset+0x0
    s32i a7,a1,60 # [1] gra_spill_temp_170, filter_data
    mul16u a8,a3,a4 # [3] size = input_wd * input_ht;
    s32i a2,a1,56 # [0] gra_spill_temp_169, input
    l32i a4,a1,164 # [1] id:624 out_data+0x0
    mov.n a3,a1 # [52] scratch_buf
    s32i a8,a1,84 # [4] gra_spill_temp_176, size
    blti a8,8,.prepare_leftover # [5] // process remaining lines one by one
    addi a9,a8,-7 # [32]
    s32i a9,a1,80 # [33] gra_spill_temp_175, size-7
    s32i a2,a1,68 # [2] gra_spill_temp_172 , input_ptr
    srai a15,a5,3 # [7] `in_ch/8` loop_cnt
    movi.n a11,0 # [10]
    s32i a11,a1,40 # [11] gra_spill_temp_165
    addi a15,a15,-1 # [17] `in_ch/8` loop_cnt - 1
    s32i a15,a1,48 # [18] gra_spill_temp_167
    slli a9,a5,3 # [19] in_channels*8
    s32i a9,a1,76 # [20] gra_spill_temp_174
    addi a15,a5,-7 # [31]
    s32i a15,a1,52 # [34] gra_spill_temp_168

.outer_loop: // for (; i_out < size - 7; i_out += 8) {
    l32i a10,a1,200 # [1] gra_spill_temp_165, buffer
    l32i.n a11,a1,44 # [1] gra_spill_temp_166, input_channels
    l32i.n a8,a1,68 # [2] gra_spill_temp_172, input_ptr
    srai a9,a11,3 # [7] `in_ch/8` loop_cnt for transpose loop
    ee.zero.q q7 # [0]
    addi a12,a1,64 # [6]
    ee.vldbc.16 q5,a12 # [0*II+16] id:638 input_offset
    // load and transpose 8 lines of input 8xchannels,
    // add input offset and store 16 bit data to tmp buffer
    loopgtz a9,.transpose_loop_end # [10]
    mov.n a9,a8
    ee.vld.l.64.xp q0,a9,a11
    ee.vld.l.64.xp q1,a9,a11
    ee.vld.h.64.xp q0,a9,a11
    ee.vld.h.64.xp q1,a9,a11
    ee.vld.l.64.xp q2,a9,a11
    ee.vzip.8 q0,q1
    ee.vld.l.64.xp q3,a9,a11
    ee.vld.h.64.xp q2,a9,a11
    ee.vld.h.64.ip q3,a9,0
    ee.vzip.16 q0,q1
    ee.vzip.8 q2,q3
    ee.vzip.16 q2,q3
    ee.vzip.32 q0,q2
    // Sign-extend int8 lanes to int16 via a signed compare against zero
    // (mask of 0xFF for negatives) interleaved with ee.vzip.8.
    ee.vcmp.lt.s8 q4,q2,q7
    ee.vzip.8 q2,q4
    ee.vcmp.lt.s8 q6,q0,q7
    ee.vzip.8 q0,q6
    ee.vadds.s16 q0,q0,q5
    ee.vadds.s16.st.incp q0,a10,q6,q6,q5
    ee.vadds.s16.st.incp q6,a10,q2,q2,q5
    ee.vadds.s16.st.incp q2,a10,q4,q4,q5
    ee.vst.128.ip q4,a10,16
    ee.vzip.32 q1,q3
    ee.vcmp.lt.s8 q4,q3,q7
    ee.vzip.8 q3,q4
    ee.vcmp.lt.s8 q6,q1,q7
    ee.vzip.8 q1,q6
    ee.vadds.s16 q1,q1,q5
    ee.vadds.s16.st.incp q1,a10,q6,q6,q5
    ee.vadds.s16.st.incp q6,a10,q3,q3,q5
    ee.vadds.s16.st.incp q3,a10,q4,q4,q5
    ee.vst.128.ip q4,a10,16
    addi.n a8,a8,8
.transpose_loop_end: # 0xeeb
# 468 uint32_t bias_ptr = (uint32_t) bias;
# 469 uint32_t filter_ptr = (uint32_t) (filter_data);
# 470 const int32_t *out_mult_ptr = out_mult;
# 471 const int32_t *out_shift_ptr = out_shift;
    l32i a6,a1,184 # [0] out_shift
    l32i a2,a1,188 # [1] out_mult
    l32i a5,a1,60 # [2] gra_spill_temp_170, filter
    l32i a9,a1,160 # [3] gra_spill_temp_170, bias
# 472 for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {
    l16ui a8,a1,176 # [5] id:620 out_channels+0x0
    s32i a9,a1,72 # [5] gra_spill_temp_173
    blti a8,1,.outer_ch_loop_end
    movi.n a7,0

.out_ch_loop: # 0xf3e
    l32i a8,a1,200 # [4] gra_spill_temp_165, buffer_ptr
    ee.zero.qacc # [3]
    ee.zero.q q5 #
    l32i a10,a1,52 # [1] gra_spill_temp_168, in_channels-7
    l32i a9,a1,48 # [1] gra_spill_temp_167, in_channels/8 - 1
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q7,a5,16
    ee.ld.128.usar.ip q6,a5,0
    addi a5,a5,-8 # net advance = 8
    ee.src.q q7,q7,q6
    ee.vld.128.ip q0,a8,16
    ee.vld.128.ip q1,a8,16
    ee.vcmp.lt.s8 q6,q7,q5
    ee.vzip.8 q7,q6
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,0
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,1
    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,2
    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,3
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,4
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,5
    blti a10,8,.inner_loop_end # [16]
    loopgtz a9,.inner_loop_end # [3]
    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,6 # [0*II+0] id:657
    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,7 # [0*II+1] id:658
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q7,a5,16
    ee.ld.128.usar.ip q6,a5,0
    addi a5,a5,-8
    ee.src.q q7,q7,q6
    ee.vcmp.lt.s8 q6,q7,q5
    ee.vzip.8 q7,q6
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,0 # [0*II+4] id:660
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,1 # [0*II+5] id:661
    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,2 # [0*II+6] id:662
    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,3 # [0*II+7] id:663
    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,4 # [0*II+8] id:664
    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,5 # [0*II+9] id:665
.inner_loop_end: # 0xfaf
    ee.vsmulas.s16.qacc q2,q7,6 # [2]
    ee.vsmulas.s16.qacc q3,q7,7 # [3]
    # store qacc registers and re-arrange data for low 16 bits
    # (QACC holds packed 20-bit accumulators, hence the 5-byte strides)
    ee.st.qacc_l.l.128.ip a3,16 # [5] id:668
    ee.st.qacc_l.h.32.ip a3,-16 # [6] id:669
    l32i.n a10, a1, 0
    l32i.n a11, a1, 5
    l32i.n a12, a1, 10
    l32i.n a13, a1, 15
    ee.movi.32.q q0, a10, 0
    ee.movi.32.q q0, a11, 1
    ee.movi.32.q q0, a12, 2
    ee.movi.32.q q0, a13, 3
    ee.st.qacc_h.l.128.ip a3,16 # [5] id:668
    ee.st.qacc_h.h.32.ip a3,-16 # [6] id:669
    l32i.n a10, a1, 0
    l32i.n a11, a1, 5
    l32i.n a12, a1, 10
    l32i.n a13, a1, 15
    ee.movi.32.q q4, a10, 0
    ee.movi.32.q q4, a11, 1
    ee.movi.32.q q4, a12, 2
    ee.movi.32.q q4, a13, 3
    l32i a9,a1,160 # [17] gra_spill_temp_170, bias
    l32i a10,a1,72 # [0] gra_spill_temp_173, bias_ptr
    # add bias
    beqz.n a9,.no_bias
    ee.vldbc.32.ip q6,a10,4
    s32i a10,a1,72 # [3] gra_spill_temp_173, bias_ptr
    ee.vadds.s32 q0,q0,q6 # [4]
    ee.vadds.s32 q4,q4,q6 # [5]
.no_bias: # 0x102e
    l32i.n a11,a6,0 # [1] id:696
    l32i.n a10,a2,0 # [3] id:695
    .global esp_nn_multiply_by_quantized_mult_asm_esp32s3
    call8 esp_nn_multiply_by_quantized_mult_asm_esp32s3 # [4] esp_nn_multiply_by_quantized_mult_asm_esp32s3
    l32i.n a10,a2,0 # [0] id:697, mult
    l32i.n a11,a6,0 # [2] id:698, shift
    mv.qr q5,q0
    mv.qr q0,q4
    call8 esp_nn_multiply_by_quantized_mult_asm_esp32s3 # [5] esp_nn_multiply_by_quantized_mult_asm_esp32s3
    addi.n a6,a6,4 # out_shift_ptr++
    addi.n a2,a2,4 # out_mult_ptr++
    addi a9,a1,180 # [7]
    addi a10,a1,192 # [5]
    addi a8,a1,196 # [6]
    # load broadcast, activation and out_offset
    ee.vldbc.32 q4,a9 # [14] id:699 out_offset
    ee.vldbc.32 q2,a10 # [11] id:700 activation_min
    ee.vldbc.32 q3,a8 # [12] id:701 activation_max
    # add offset
    ee.vadds.s32 q1,q0,q4 # [17]
    ee.vadds.s32 q0,q5,q4 # [22]
    # activation
    ee.vmin.s32 q1,q1,q3 # [19]
    ee.vmax.s32 q1,q1,q2 # [21]
    ee.vmin.s32 q0,q0,q3 # [23]
    ee.vmax.s32 q0,q0,q2 # [24]
    l16ui a9,a1,176 # [33] out_channels
    # unzip and store
    ee.vunzip.16 q0,q1 # [25]
    ee.vst.128.ip q0,a3,0 # [26] id:702, scratch_buf
    # a4 = out_data, out_channels = a1+176
    # Scatter the 8 results for this out_ch across 8 consecutive pixels
    # (stride = out_channels bytes per pixel in NHWC layout).
    l8ui a14,a1,0 # [27]
    l8ui a11,a1,2 # [30] scratch_buf+2
    add a10,a4,a9
    s8i a14,a4,0 # [28], out_data
    s8i a11,a10,0 # [31], out_data + out_channels
    l8ui a14,a1,4 # [32] scratch_buf+4
    l8ui a11,a1,6 # [37] scratch_buf+6
    add a12,a10,a9
    add a10,a12,a9
    s8i a14,a12,0 # [28]
    s8i a11,a10,0 # [31]
    l8ui a14,a1,8 # [41] scratch_buf+8
    l8ui a11,a1,10 # [47] scratch_buf+10
    add a12,a10,a9
    add a10,a12,a9
    s8i a14,a12,0 # [28]
    s8i a11,a10,0 # [31]
    l8ui a14,a1,12 # [51] scratch_buf+12
    l8ui a11,a1,14 # [55] scratch_buf+14
    add a12,a10,a9
    add a10,a12,a9
    s8i a14,a12,0 # [28]
    s8i a11,a10,0 # [31]
    addi.n a4,a4,1 # [29] out_data++;
    addi.n a7,a7,1
    bne a7,a9,.out_ch_loop
.outer_ch_loop_end:
    subx8 a11,a9,a9 # (7 * out_channels);
    l32i a10,a1,76 # [1] gra_spill_temp_174, in_channels * 8
    l32i a15,a1,40 # [4] gra_spill_temp_165
    l32i a9,a1,68 # [2] gra_spill_temp_172
    l32i a8,a1,80 # [0] gra_spill_temp_175, size-7
    add.n a4,a4,a11 # [5] out_data += (7 * out_channels);
    addi.n a15,a15,8
    s32i a15,a1,40 # [7] gra_spill_temp_165
    add.n a9,a9,a10 # [8]
    s32i a9,a1,68 # [9] gra_spill_temp_172
    blt a15,a8,.outer_loop # [10]
    # check if leftover
    l32i a15,a1,40
    l32i a13,a1,84 # [1] gra_spill_temp_176, size
    l32i a8,a1,44 # [0] gra_spill_temp_166, in_channels
    bge a15, a13, .return_function # no leftover

// This block below processes one input channel line at a time.
.process_leftover:
    l32i a15,a1,40 # [1] gra_spill_temp_165, i_out
    l32i a14,a1,56 # [2] gra_spill_temp_169, input
    mull a15,a15,a8 # [3] in_channels * i_out
    addi.n a8,a8,-1 # [4] in_channels - 1
    add.n a14,a14,a15 # [5] input_ptr = in_channels * i_out + input
    srai a8,a8,3 # [6] iterations, (in_channels - 1) >> 3
    s32i a8,a1,36 # [7] gra_spill_temp_164, iterations
    s32i a14,a1,68 # [8] gra_spill_temp_172, in_channels * i_out + input
    addi a12,a1,64
    ee.vldbc.16 q4,a12 # [8] id:716 input_offset
.leftover_outer_loop:
    l32i a15,a1,184 # [0] out_shift
    l32i a2,a1,188 # [1] out_mult
    l32i a8,a1,60 # [3] gra_spill_temp_170, filter_data
    l32i a5,a1,160 # [0] gra_spill_temp_170, bias
    movi.n a11,0 # [2]
.leftover_out_ch_loop:
    ee.zero.qacc # [0]
    ee.zero.q q3 # [1]
    l32i.n a9,a1,68 # [4] gra_spill_temp_172, input_ptr
    l32i a10,a1,36 # [1] gra_spill_temp_164, iterations, (in_channels - 1) >> 3
    ee.vld.l.64.ip q0,a9,8 # [7] id:717, input
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q1,a8,16
    ee.ld.128.usar.ip q7,a8,0
    addi a8,a8,-8
    ee.src.q q1,q1,q7
    ee.vcmp.lt.s8 q6,q0,q3
    ee.vcmp.lt.s8 q7,q1,q3
    ee.vzip.8 q0,q6
    ee.vzip.8 q1,q7
    ee.vadds.s16 q0,q0,q4 # [11] id:718, add offset
    loopgtz a10,.leftover_inner_loop_end # [3]
    ee.vmulas.s16.qacc q0,q1 # mula(q0,q1)
    ee.vld.l.64.ip q0,a9,8 # load 8 input values
    # USAR-based filter load for unaligned access
    ee.ld.128.usar.ip q1,a8,16
    ee.ld.128.usar.ip q7,a8,0
    addi a8,a8,-8
    ee.src.q q1,q1,q7
    ee.vcmp.lt.s8 q2,q0,q3 # sign
    ee.vcmp.lt.s8 q7,q1,q3
    ee.vzip.8 q0,q2 # 16 bit input
    ee.vzip.8 q1,q7 # 16 bit filter
    ee.vadds.s16 q0,q0,q4 # add offset
.leftover_inner_loop_end: # 0x1262
    # re-arrange data from qacc in 32 bit q registers
    ee.vmulas.s16.qacc q0,q1 # [3]
    ee.st.qacc_l.l.128.ip a3,16 # [5] id:722
    ee.st.qacc_l.h.32.ip a3,0 # [6] id:723
    l8ui a10,a1,5 # [11] scratch_buf+5
    l8ui a12,a1,6 # [10] scratch_buf+6
    l16ui a14,a1,10 # [8] scratch_buf+10
    l8ui a9,a1,15 # [7] scratch_buf+15
    l8ui a13,a1,16 # [9] scratch_buf+16
    s8i a10,a1,2 # [12] scratch_buf+2
    s8i a12,a1,3 # [13] scratch_buf+3
    s16i a14,a1,4 # [15] scratch_buf+4
    s8i a9,a1,6 # [16] scratch_buf+6
    s8i a13,a1,7 # [14] scratch_buf+7
    ee.st.qacc_h.l.128.ip a3,16 # [17] id:724
    ee.st.qacc_h.h.32.ip a3,-32 # [18] id:725
    l16ui a13,a1,16 # [30] scratch_buf+16
    l8ui a14,a1,21 # [23] scratch_buf+21
    l8ui a9,a1,22 # [22] scratch_buf+22
    l16ui a10,a1,26 # [21] scratch_buf+26
    s16i a13,a1,8 # [31] scratch_buf+8
    l8ui a12,a1,31 # [20] scratch_buf+31
    l8ui a13,a1,32 # [19] scratch_buf+32
    s8i a14,a1,10 # [24] scratch_buf+10
    s8i a9,a1,11 # [25] scratch_buf+11
    s16i a10,a1,12 # [26] scratch_buf+12
    s8i a12,a1,14 # [27] scratch_buf+14
    s8i a13,a1,15 # [28] scratch_buf+15
    movi.n a12,16
    # get data now
    ee.vld.128.ip q0,a3,0
    ee.srcmb.s16.qacc q1,a12,0
    ee.vzip.16 q0,q1
    ee.vadds.s32 q0,q0,q1
    ee.movi.32.a q0,a10,3
    ee.movi.32.a q0,a9,2
    ee.movi.32.a q0,a14,0
    add a9,a9,a10
    ee.movi.32.a q0,a10,1
    add a14,a14,a10
    add a14,a14,a9 # a14 contains conv_out
    l32i a9,a1,160 # [43] gra_spill_temp_170, bias ptr
    l32i.n a6,a15,0 # [44] id:730, shift
    beqz.n a9,.leftover_multiply_by_quant_mult # [45]
    # load and add bias
    l32i.n a9,a5,0
    add.n a14,a14,a9
.leftover_multiply_by_quant_mult: # 0x12e7
    # Scalar requantize: doubling-high multiply with rounding nudge
    # (.nudge_val = 1 << 30) followed by rounding right shift.
    l32i.n a9,a2,0 # [0] id:729, mult
    movi.n a10,0 # [1]
    max a10,a6,a10 # [2] left_shift
    ssl a10 # [3]
    sll a14,a14 # [4] (value << left_shift)
    sub a7,a10,a6 # right_shift
    l32r a13,.nudge_val
    mulsh a12,a9,a14
    mull a14,a9,a14
    ssai 31
    addi.n a2,a2,4 # [0] mult
    addi.n a15,a15,4 # [1] shift
    addi.n a5,a5,4 # [2] bias
    addi.n a11,a11,1 # [3]
    add a13,a14,a13 # low part
    saltu a14,a13,a14
    add a9,a12,a14 # high part
    src a12,a9,a13
    blti a7,1,.leftover_skip_div_by2
    addi.n a14,a7,-1
    ssl a14
    movi.n a10,1
    sll a10,a10 # 1 << (exponent - 1)
    extui a14,a12,31,1
    ssr a7
    sub a10,a10,a14 # 1 << (exponent - 1) - (val < 0)
    add a12,a12,a10 # val += to_add
    sra a12,a12
.leftover_skip_div_by2:
    l32i a10,a1,180 # [26] id:733 out_offset+0x0
    l32i a9,a1,192 # [29] id:732 activation_min+0x0
    l16ui a13,a1,176 # [5] id:620 out_channels+0x0
    l32i a14,a1,196 # [31] id:731 activation_max+0x0
    // add offset, apply activation and store
    add.n a10,a10,a12
    max a9,a9,a10
    min a14,a14,a9
    s8i a14,a4,0
    addi.n a4,a4,1
    bne a11,a13,.leftover_out_ch_loop
    l32i a15,a1,44 # [0] gra_spill_temp_166, in_channels
    l32i a14,a1,68 # [1] gra_spill_temp_172, input_ptr
    l32i a13,a1,40 # [2] gra_spill_temp_165, i_out
    l32i a12,a1,84 # [3] gra_spill_temp_176, size
    addi.n a13,a13,1 # [4]
    s32i a13,a1,40 # [5] gra_spill_temp_165, i_out
    add a14,a14,a15 # [7] input_ptr += in_channels
    s32i a14,a1,68 # [8] gra_spill_temp_172, input_ptr
    blt a13,a12,.leftover_outer_loop
.return_function:
    retw.n # [9]
.prepare_leftover:
    l32i a8,a1,44 # [0] gra_spill_temp_166, in_channels
    movi.n a15,0
    s32i a15,a1,40 # [7] gra_spill_temp_165, i_out
    j .process_leftover
    .size esp_nn_conv_s8_mult8_1x1_esp32s3, . - esp_nn_conv_s8_mult8_1x1_esp32s3
================================================ FILE: src/convolution/esp_nn_depthwise_conv_ansi.c ================================================
// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include #include int esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params) { return 0; } void esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf) { } void esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const uint16_t ch_mult = conv_params->ch_mult; int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop for (int ch_mult_idx = 0; ch_mult_idx < ch_mult; ch_mult_idx++) { int32_t result = 0; const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult; /* Select filter so as the point doesn't lie outside block */ int 
filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index]; result += input_val * filter_val; } } if (bias) { result += bias[out_ch_idx]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = result; } } } } } ================================================ FILE: src/convolution/esp_nn_depthwise_conv_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include /* Note: esp_nn_requant_2x_esp32p4.S exists but inline ESP_NN_REQUANT_2X macro * from common_functions.h is used instead (avoids function call overhead). 
*/ /* External fallback */ void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data); int esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params) { return 0; } void esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf) { (void) buf; } /* PIE-optimized ch_mult=1, channels>=16 path using QACC per-lane MAC. * Pre-computes filter_sum[ch] = sum of filter[ch] across all filter positions. * For non-edge output positions: result[ch] = QACC_MAC + filter_sum[ch] * input_offset * For edge positions: falls back to scalar with input_offset applied directly. */ __attribute__ ((noinline)) static void depthwise_conv_s8_ch1_pie(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = 
conv_params->activation.max; /* Enable PIE */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 0b10 \n\t" "esp.movx.w.cfg x29 \n\t" ::: "x29" ); /* Set up activation min/max vectors for PIE clamp */ { int8_t act_min_val = (int8_t) activation_min; int8_t act_max_val = (int8_t) activation_max; asm volatile ( "mv x30, %0 \n\t" "esp.vldbc.8.ip q4, x30, 0 \n\t" "mv x30, %1 \n\t" "esp.vldbc.8.ip q5, x30, 0 \n\t" :: "r"(&act_min_val), "r"(&act_max_val) : "x30" ); } /* Pre-compute combined offset: filter_sum * input_offset + bias per channel. * This fuses two additions per channel into one pre-computed value. * Constant for the entire layer - computed once. */ int32_t combined_offset_buf[256]; /* support up to 256 channels on stack */ int32_t *combined_offset = NULL; if (channels <= 256) { combined_offset = combined_offset_buf; for (int ch = 0; ch < channels; ch++) { int32_t s = 0; if (input_offset != 0) { for (int fy = 0; fy < filter_ht; fy++) { for (int fx = 0; fx < filter_wd; fx++) { s += filter_data[(fy * filter_wd + fx) * channels + ch]; } } s *= input_offset; } combined_offset[ch] = s + (bias ? bias[ch] : 0); } } int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { const int16_t base_x = (out_x * stride_wd) - pad_wd; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); /* Check if this is a non-edge position (full filter window) */ int is_full_window = (filter_y_start == 0 && filter_x_start == 0 && filter_y_end == filter_ht && filter_x_end == filter_wd); /* Process 16 channels at a time using QACC. * Inline helper macro for QACC MAC across filter window. 
*/ #define QACC_MAC_WINDOW(ch_off) do { \ asm volatile ("esp.zero.qacc \n\t"); \ for (int _fy = filter_y_start; _fy < filter_y_end; _fy++) { \ const int32_t _iy = base_y + _fy; \ const int8_t *_ip = input_data + (_iy * input_wd + base_x + filter_x_start) * channels + (ch_off); \ const int8_t *_fp = filter_data + (_fy * filter_wd + filter_x_start) * channels + (ch_off); \ int _fc = filter_x_end - filter_x_start; \ asm volatile ( \ "mv x30, %[ip] \n\t" \ "mv x31, %[fp] \n\t" \ "mv s7, %[cnt] \n\t" \ "1: \n\t" \ "esp.vld.128.ip q0, x30, 0 \n\t" \ "esp.vld.128.ip q1, x31, 0 \n\t" \ "esp.vmulas.s8.qacc q0, q1 \n\t" \ "add x30, x30, %[stride] \n\t" \ "add x31, x31, %[stride] \n\t" \ "addi s7, s7, -1 \n\t" \ "bnez s7, 1b \n\t" \ : \ : [ip] "r"(_ip), [fp] "r"(_fp), \ [cnt] "r"(_fc), [stride] "r"((int32_t)channels) \ : "x30", "x31", "s7" \ ); \ } \ } while(0) #define QACC_EXTRACT(dst) do { \ asm volatile ( \ "mv x30, %0 \n\t" \ "esp.st.qacc.l.l.128.ip x30, 16 \n\t" \ "esp.st.qacc.l.h.128.ip x30, 16 \n\t" \ "esp.st.qacc.h.l.128.ip x30, 16 \n\t" \ "esp.st.qacc.h.h.128.ip x30, 0 \n\t" \ :: "r"(dst) \ : "x30", "memory" \ ); \ } while(0) int ch_idx = 0; /* Process 16-channel blocks, then partial block if remainder >= 8 */ while (ch_idx < channels) { int block_ch = (ch_idx + 16 <= channels) ? 16 : (channels - ch_idx >= 8) ? 
(channels - ch_idx) : 0; if (block_ch == 0) break; /* remaining < 8, handle scalar below */ QACC_MAC_WINDOW(ch_idx); /* Extract per-lane results (only first block_ch are valid) */ int32_t result[16] __attribute__((aligned(16))); QACC_EXTRACT(result); /* Add fused offset (filter_sum * input_offset + bias) + requantize */ if (combined_offset) { if (is_full_window) { for (int k = 0; k < block_ch; k++) { result[k] += combined_offset[ch_idx + k]; } } else { for (int k = 0; k < block_ch; k++) { int32_t fsum = 0; if (input_offset != 0) { for (int fy = filter_y_start; fy < filter_y_end; fy++) { for (int fx = filter_x_start; fx < filter_x_end; fx++) { fsum += filter_data[(fy * filter_wd + fx) * channels + ch_idx + k]; } } fsum *= input_offset; } result[k] += fsum + (bias ? bias[ch_idx + k] : 0); } } } /* Per-channel requantize */ { const int32_t *mp = out_mult + ch_idx; const int32_t *sp = out_shift + ch_idx; int rq_count = block_ch & ~1; /* round down to even for 2-wide */ for (int k = 0; k < rq_count; k += 2) { int32_t r0 = result[k]; int32_t r1 = result[k+1]; int32_t m0 = mp[k], s0 = sp[k]; int32_t m1 = mp[k+1], s1 = sp[k+1]; /* 2-wide interleaved requant via inline asm macro. * Macro handles left_shift internally - do NOT pre-shift. 
*/ int32_t h0, h1; ESP_NN_REQUANT_2X(r0, r1, m0, m1, s0, s1, h0, h1); h0 += out_offset; h1 += out_offset; out_data[out_idx++] = (int8_t)max(activation_min, min(h0, activation_max)); out_data[out_idx++] = (int8_t)max(activation_min, min(h1, activation_max)); } /* Handle odd remaining channel in block */ if (block_ch & 1) { int k = rq_count; int32_t r = result[k]; r = esp_nn_requantize(r, mp[k], sp[k]); r += out_offset; out_data[out_idx++] = (int8_t)max(activation_min, min(r, activation_max)); } } ch_idx += block_ch; } /* Remaining channels < 8: scalar */ for (; ch_idx < channels; ch_idx++) { int32_t result = 0; for (int fy = filter_y_start; fy < filter_y_end; fy++) { const int32_t idx_y = base_y + fy; for (int fx = filter_x_start; fx < filter_x_end; fx++) { const int32_t idx_x = base_x + fx; result += (input_data[(idx_y * input_wd + idx_x) * channels + ch_idx] + input_offset) * filter_data[(fy * filter_wd + fx) * channels + ch_idx]; } } if (bias) result += bias[ch_idx]; result = esp_nn_requantize(result, out_mult[ch_idx], out_shift[ch_idx]); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = (int8_t) result; } } } } void esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t ch_mult = conv_params->ch_mult; const uint16_t channels = input_dims->channels; if (ch_mult == 1 && channels >= 8) { depthwise_conv_s8_ch1_pie(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } /* Fall back to generic optimized */ esp_nn_depthwise_conv_s8_opt(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); } ================================================ FILE: 
src/convolution/esp_nn_depthwise_conv_opt.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include int esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims, const data_dims_t *filter_dims, const data_dims_t *output_dims, const dw_conv_params_t *conv_params) { return 0; } void esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf) { } /* common channel multiplier == 1 case */ __attribute__ ((noinline)) static void esp_nn_depthwise_conv_s8_ch_mult_1(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = 
output_dims->height; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; /* Select filter so as the point doesn't lie outside block */ int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); int ch_idx = 0; for (; ch_idx < channels - 3; ch_idx += 4) {//channel_loop int32_t result0 = 0; int32_t result1 = 0; int32_t result2 = 0; int32_t result3 = 0; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx; int32_t input_val0 = input_data[input_index + 0] + input_offset; int32_t input_val1 = input_data[input_index + 1] + input_offset; int32_t input_val2 = input_data[input_index + 2] + input_offset; int32_t input_val3 = input_data[input_index + 3] + input_offset; int32_t filter_val0 = filter_data[filter_index + 0]; int32_t filter_val1 = filter_data[filter_index + 1]; int32_t filter_val2 = filter_data[filter_index + 2]; int32_t filter_val3 = filter_data[filter_index + 3]; result0 += input_val0 * filter_val0; result1 += input_val1 * filter_val1; result2 += input_val2 * filter_val2; result3 += input_val3 * filter_val3; } } if (bias) { result0 += bias[ch_idx + 0]; result1 += bias[ch_idx + 1]; 
result2 += bias[ch_idx + 2]; result3 += bias[ch_idx + 3]; } result0 = esp_nn_requantize(result0, *out_mult++, *out_shift++); result1 = esp_nn_requantize(result1, *out_mult++, *out_shift++); result2 = esp_nn_requantize(result2, *out_mult++, *out_shift++); result3 = esp_nn_requantize(result3, *out_mult++, *out_shift++); result0 += out_offset; result1 += out_offset; result2 += out_offset; result3 += out_offset; result0 = max(result0, activation_min); result1 = max(result1, activation_min); result2 = max(result2, activation_min); result3 = max(result3, activation_min); result0 = min(result0, activation_max); result1 = min(result1, activation_max); result2 = min(result2, activation_max); result3 = min(result3, activation_max); out_data[out_idx++] = result0; out_data[out_idx++] = result1; out_data[out_idx++] = result2; out_data[out_idx++] = result3; } for (; ch_idx < channels; ch_idx++) {//channel_loop int32_t result = 0; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index]; result += input_val * filter_val; } } if (bias) { result += bias[ch_idx]; } result = esp_nn_requantize(result, *out_mult++, *out_shift++); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = result; } } } } void esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const 
quant_data_t *quant_data) { const uint16_t ch_mult = conv_params->ch_mult; if (ch_mult == 1) { esp_nn_depthwise_conv_s8_ch_mult_1(input_dims, input_data, filter_dims, filter_data, bias, output_dims, out_data, conv_params, quant_data); return; } const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; int out_idx = 0; for (int out_y = 0; out_y < out_ht; out_y++) { //height loop const int16_t base_y = (out_y * stride_ht) - pad_ht; for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop const int16_t base_x = (out_x * stride_wd) - pad_wd; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; /* Select filter so as the point doesn't lie outside block */ int filter_y_start = max(0, -base_y); int filter_x_start = max(0, -base_x); int filter_y_end = min(filter_ht, input_ht - base_y); int filter_x_end = min(filter_wd, input_wd - base_x); for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop int ch_mult_idx = 0; for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) { int32_t result0 = 0; int32_t result1 = 0; int32_t result2 = 0; int32_t result3 = 0; const int out_ch_idx = ch_idx * ch_mult + ch_mult_idx; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; 
for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val0 = filter_data[filter_index + 0]; int32_t filter_val1 = filter_data[filter_index + 1]; int32_t filter_val2 = filter_data[filter_index + 2]; int32_t filter_val3 = filter_data[filter_index + 3]; result0 += input_val * filter_val0; result1 += input_val * filter_val1; result2 += input_val * filter_val2; result3 += input_val * filter_val3; } } if (bias) { result0 += bias[out_ch_idx + 0]; result1 += bias[out_ch_idx + 1]; result2 += bias[out_ch_idx + 2]; result3 += bias[out_ch_idx + 3]; } result0 = esp_nn_requantize(result0, *out_mult++, *out_shift++); result1 = esp_nn_requantize(result1, *out_mult++, *out_shift++); result2 = esp_nn_requantize(result2, *out_mult++, *out_shift++); result3 = esp_nn_requantize(result3, *out_mult++, *out_shift++); result0 += out_offset; result1 += out_offset; result2 += out_offset; result3 += out_offset; result0 = max(result0, activation_min); result1 = max(result1, activation_min); result2 = max(result2, activation_min); result3 = max(result3, activation_min); result0 = min(result0, activation_max); result1 = min(result1, activation_max); result2 = min(result2, activation_max); result3 = min(result3, activation_max); out_data[out_idx++] = result0; out_data[out_idx++] = result1; out_data[out_idx++] = result2; out_data[out_idx++] = result3; } for (; ch_mult_idx < ch_mult; ch_mult_idx++) { int32_t result = 0; const int out_ch_idx = ch_idx * ch_mult + ch_mult_idx; for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) { const int32_t idx_y = base_y + filter_y_idx; for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; 
filter_x_idx++) { const int32_t idx_x = base_x + filter_x_idx; int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx; int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx; int32_t input_val = input_data[input_index] + input_offset; int32_t filter_val = filter_data[filter_index]; result += input_val * filter_val; } } if (bias) { result += bias[out_ch_idx]; } result = esp_nn_requantize(result, *out_mult++, *out_shift++); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_idx++] = result; } } } } } ================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
.text .literal_position # Program Unit: esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3 .type esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3, @function .align 4 .global esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3 esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3: # 0x776 # qacc_scratch = 0 # gra_spill_temp_35 = 48 # gra_spill_temp_36 = 52 # gra_spill_temp_37 = 56 # gra_spill_temp_38 = 60 # gra_spill_temp_39 = 64 # gra_spill_temp_40 = 68 # gra_spill_temp_41 = 72 # gra_spill_temp_42 = 76 # gra_spill_temp_43 = 80 # gra_spill_temp_44 = 84 # gra_spill_temp_45 = 88 # gra_spill_temp_46 = 92 # gra_spill_temp_47 = 96 # gra_spill_temp_48 = 100 # gra_spill_temp_49 = 104 # gra_spill_temp_50 = 108 # gra_spill_temp_51 = 112 # gra_spill_temp_52 = 116 # gra_spill_temp_53 = 120 # gra_spill_temp_54 = 124 # gra_spill_temp_55 = 128 # gra_spill_temp_56 = 132 # gra_spill_temp_57 = 136 # gra_spill_temp_58 = 140 # gra_spill_temp_59 = 144 # gra_spill_temp_60 = 148 # gra_spill_temp_61 = 152 # gra_spill_temp_62 = 156 # gra_spill_temp_63 = 160 # gra_spill_temp_64 = 164 # gra_spill_temp_65 = 168 # gra_spill_temp_66 = 176 # gra_spill_temp_67 = 192 # gra_spill_temp_68 = 208 # gra_spill_temp_69 = 224 # gra_spill_temp_70 = 240 // registers: // a2: const int16_t *input_data // a3: const uint16_t input_wd // a4: const uint16_t input_ht // a5: const uint16_t channels // a6: const uint16_t pad_wd // a7: const uint16_t pad_ht // on stack // const uint16_t stride_wd // const uint16_t stride_ht // const int16_t *filter_data // const int32_t *bias // int8_t *out_data // const uint16_t out_wd // const uint16_t out_ht // const int32_t out_offset // const int32_t *out_shift // const int32_t *out_mult // const int32_t activation_min // const int32_t activation_max entry a1,288 # s32i a2,a1,104 # [0] gra_spill_temp_49 s32i a3,a1,112 # [1] gra_spill_temp_51 s32i a5,a1,116 # [2] gra_spill_temp_52 s32i.n a6,a1,56 # [3] gra_spill_temp_37 addi a14,a1,112 # [4] addmi a11,a1,256 # [5] addmi a13,a1,256 # [6] addmi a15,a1,256 # [7] 
l32i a9,a1,304 # [8] id:251 out_data+0x0 l16ui a8,a1,312 # [9] id:252 out_ht+0x0 s32i a8,a1,64 # [10] gra_spill_temp_39 s32i a9,a1,156 # [11] gra_spill_temp_62 addi a15,a15,60 # [12] addi a13,a13,72 # [13] addi a11,a11,76 # [14] ee.vldbc.32 q0,a11 # [15] id:250 activation_max ee.vldbc.32 q1,a13 # [16] id:249 activation_min ee.vldbc.32 q2,a15 # [17] id:248 out_offset st.qr q2,a14,80 # [18] gra_spill_temp_67-112 st.qr q1,a14,96 # [19] gra_spill_temp_68-112 st.qr q0,a14,112 # [20] gra_spill_temp_69-112 beqz.n a8,.Lt_5_7426 # [21] .LBB3_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x7b9 s32i a1,a1,160 # [0] gra_spill_temp_63 s32i a7,a1,72 # [1] gra_spill_temp_41 mul16u a6,a3,a5 # [2] l32i a14,a1,296 # [3] id:254 filter_data+0x0 l32i a15,a1,300 # [4] id:253 bias+0x0 l16ui a9,a1,308 # [5] id:259 out_wd+0x0 l16ui a13,a1,288 # [6] id:255 stride_wd+0x0 neg a8,a7 # [7] l16ui a10,a1,292 # [8] id:258 stride_ht+0x0 l32i a11,a1,324 # [9] id:257 out_mult+0x0 l32i a12,a1,320 # [10] id:256 out_shift+0x0 s32i a12,a1,84 # [11] gra_spill_temp_44 s32i a11,a1,88 # [12] gra_spill_temp_45 s32i.n a10,a1,60 # [13] gra_spill_temp_38 s32i a8,a1,124 # [14] gra_spill_temp_54 s32i a13,a1,80 # [15] gra_spill_temp_43 s32i a9,a1,92 # [16] gra_spill_temp_46 s32i a15,a1,140 # [17] gra_spill_temp_58 s32i a14,a1,108 # [18] gra_spill_temp_50 slli a6,a6,1 # [19] movi.n a14,16 # [20] extui a15,a15,0,4 # [21] addi a9,a5,-7 # [22] movi.n a13,0 # [23] sub a8,a4,a8 # [24] addx2 a7,a5,a5 # [25] slli a7,a7,1 # [26] slli a4,a5,1 # [27] s32i a13,a1,68 # [28] gra_spill_temp_40 s32i a9,a1,144 # [29] gra_spill_temp_59 s32i a15,a1,132 # [30] gra_spill_temp_56 l32i.n a9,a1,56 # [31] gra_spill_temp_37 s32i a8,a1,76 # [32] gra_spill_temp_42 neg a9,a9 # [33] s32i.n a9,a1,48 # [34] gra_spill_temp_35 sub a8,a3,a9 # [35] s32i.n a8,a1,52 # [36] gra_spill_temp_36 .Lt_5_7938: # 0x822 l32i a10,a1,92 # [0] gra_spill_temp_46 beqz.n a10,.Lt_5_8194 # [2] .LBB6_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x827 l32i.n a5,a1,52 # [0] 
gra_spill_temp_36 l32i a11,a1,76 # [1] gra_spill_temp_42 movi.n a13,0 # [2] l32i a12,a1,72 # [3] gra_spill_temp_41 movi.n a15,0 # [4] l32i.n a8,a1,48 # [5] gra_spill_temp_35 l32i.n a9,a1,56 # [6] gra_spill_temp_37 s32i a9,a1,100 # [7] gra_spill_temp_48 s32i a8,a1,128 # [8] gra_spill_temp_55 s32i a15,a1,96 # [9] gra_spill_temp_47 max a12,a12,a13 # [10] s32i a12,a1,152 # [11] gra_spill_temp_61 movi.n a13,3 # [12] min a11,a11,a13 # [13] s32i a11,a1,136 # [14] gra_spill_temp_57 sub a11,a11,a12 # [15] s32i a11,a1,120 # [16] gra_spill_temp_53 .Lt_5_8706: # 0x854 l32i a2,a1,84 # [0] gra_spill_temp_44 l32i a10,a1,144 # [1] gra_spill_temp_59 l32i a11,a1,140 # [2] gra_spill_temp_58 l32i a12,a1,88 # [3] gra_spill_temp_45 s32i a12,a1,168 # [4] gra_spill_temp_65 s32i a11,a1,148 # [5] gra_spill_temp_60 blti a10,1,.Lt_5_8962 # [6] movi.n a8,0 # [0] movi.n a13,0 # [1] l32i a3,a1,100 # [2] gra_spill_temp_48 s32i a13,a1,164 # [3] gra_spill_temp_64 max a3,a3,a8 # [4] .Lt_5_9474: # 0x876 l32i a10,a1,136 # [0] gra_spill_temp_57 l32i a9,a1,152 # [1] gra_spill_temp_61 ee.zero.qacc # [2] bge a9,a10,.Lt_5_9730 # [3] .LBB12_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x882 l32i a12,a1,128 # [0] gra_spill_temp_55 l32i a15,a1,112 # [1] gra_spill_temp_51 l32i a10,a1,116 # [2] gra_spill_temp_52 l32i a13,a1,124 # [3] gra_spill_temp_54 mull a11,a9,a10 # [4] add.n a13,a13,a9 # [5] mull a13,a13,a15 # [6] addx2 a11,a11,a11 # [7] l32i a9,a1,164 # [8] gra_spill_temp_64 add.n a12,a12,a13 # [9] mull a10,a10,a12 # [10] add.n a11,a9,a11 # [11] l32i a12,a1,108 # [12] gra_spill_temp_50 add.n a9,a9,a10 # [13] l32i a10,a1,104 # [14] gra_spill_temp_49 addx2 a11,a11,a12 # [15] l32i a12,a1,120 # [16] gra_spill_temp_53 addx2 a9,a9,a10 # [17] loopgtz a12,.LBB32_esp_nn_depthwise_conv_s16_mult1_3x3 # [18] mov.n a13,a9 # [0] mov.n a12,a11 # [1] mov.n a9,a11 # [2] mov.n a11,a13 # [3] beqz.n a3,.Lt_5_10498 # [4] if (filter_x_start) add.n a11,a4,a13 # [0] add.n a9,a4,a12 # [1] .Lt_5_10498: # 0x8c5 ee.vld.128.xp q0,a11,a4 # 
[0] id:261 ee.vld.128.xp q1,a9,a4 # [1] id:262 bnez.n a3,.Lt_5_11010 # [2] if (filter_x_start) ee.vmulas.s16.qacc q0,q1 # [0] ee.vld.128.xp q0,a11,a4 # [1] id:264 ee.vld.128.xp q1,a9,a4 # [2] id:265 .Lt_5_11010: # 0x8d6 ee.vmulas.s16.qacc q0,q1 # [0] ee.vld.128.xp q0,a11,a4 # [1] id:267 ee.vld.128.xp q1,a9,a4 # [2] id:268 add.n a9,a6,a13 # [3] blti a5,3,.Lt_5_11522 # [4] if (filter_x_end) ee.vmulas.s16.qacc q0,q1 # [0] .Lt_5_11522: # 0x8e7 add.n a11,a7,a12 # [0] .LBB32_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x8eb .Lt_5_9730: # 0x8eb // extract data l32i a9,a1,160 # [0] gra_spill_temp_63 ee.st.qacc_l.l.128.ip a9,16 # [2] id:270 ee.st.qacc_l.h.32.ip a9,0 # [3] id:271 l8ui a11,a1,15 # [4] qacc_scratch+15 l16ui a10,a1,10 # [5] qacc_scratch+10 l8ui a15,a1,16 # [6] qacc_scratch+16 l8ui a13,a1,6 # [7] qacc_scratch+6 l8ui a12,a1,5 # [8] qacc_scratch+5 s8i a12,a1,2 # [9] qacc_scratch+2 s8i a13,a1,3 # [10] qacc_scratch+3 s8i a15,a1,7 # [11] qacc_scratch+7 s16i a10,a1,4 # [12] qacc_scratch+4 s8i a11,a1,6 # [13] qacc_scratch+6 ee.st.qacc_h.l.128.ip a9,16 # [14] id:281 ee.st.qacc_h.h.32.ip a9,-32 # [15] id:282 ee.srcmb.s16.qacc q1,a14,0 # [16] l8ui a15,a1,31 # [17] qacc_scratch+31 l8ui a8,a1,32 # [18] qacc_scratch+32 l16ui a13,a1,26 # [19] qacc_scratch+26 l8ui a12,a1,22 # [20] qacc_scratch+22 l8ui a11,a1,21 # [21] qacc_scratch+21 l16ui a10,a1,16 # [22] qacc_scratch+16 s16i a10,a1,8 # [23] qacc_scratch+8 s8i a11,a1,10 # [24] qacc_scratch+10 s8i a12,a1,11 # [25] qacc_scratch+11 s16i a13,a1,12 # [26] qacc_scratch+12 s8i a8,a1,15 # [27] qacc_scratch+15 s8i a15,a1,14 # [28] qacc_scratch+14 l32i a8,a1,140 # [29] gra_spill_temp_58 , bias ee.vld.128.ip q0,a9,0 # [30] id:294 s32i a9,a1,160 # [31] gra_spill_temp_63 ee.vzip.16 q0,q1 # [32] beqz.n a8,.Lt_5_12290 # [33] // skip bias addi a8,a1,112 # [0] l32i a10,a1,132 # [1] gra_spill_temp_56 l32i a9,a1,148 # [2] gra_spill_temp_60 wur.sar_byte a10 # [3] ee.vld.128.ip q4,a9,16 # [4] id:297 ee.vld.128.ip q7,a9,16 # [5] id:298 ee.vld.128.ip 
q5,a9,0 # [6] id:299 s32i a9,a1,148 # [7] gra_spill_temp_60 ee.src.q.qup q6,q4,q7 # [8] ee.vadds.s32 q0,q0,q6 # [9] ee.src.q.qup q3,q4,q5 # [10] ee.vadds.s32 q1,q1,q3 # [11] st.qr q1,a8,64 # [12] gra_spill_temp_66-112 .Lt_5_12290: # 0x974 addi a11,a1,112 # [0] # 287 q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr); l32i a10,a1,168 # [1] gra_spill_temp_65 st.qr q1,a11,64 # [2] gra_spill_temp_66-112 mov.n a11,a2 # [3] call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [4] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # 288 out_mult_ptr += 4; # 289 out_shift_ptr += 4; # 290 # 291 q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr); l32i a10,a1,168 # [0] gra_spill_temp_65 addmi a12,a1,256 # [1] addi a11,a1,112 # [2] st.qr q0,a12,-16 # [3] gra_spill_temp_70-256 ld.qr q0,a11,64 # [4] gra_spill_temp_66-112 addi a10,a10,16 # [5] addi a11,a2,16 # [6] call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [7] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 .LBB25_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x99a # Part of loop body line 216, head labeled .Lt_5_9474 movi.n a14,16 # [0] # 292 out_mult_ptr += 4; # 293 out_shift_ptr += 4; addi a2,a2,32 # [1] l32i a15,a1,144 # [2] gra_spill_temp_59 l32i a9,a1,156 # [3] gra_spill_temp_62 l32i a8,a1,168 # [4] gra_spill_temp_65 addmi a12,a1,256 # [5] addi a13,a1,112 # [6] ld.qr q3,a13,112 # [7] gra_spill_temp_69-112 ld.qr q1,a13,80 # [8] gra_spill_temp_67-112 ld.qr q2,a12,-16 # [9] gra_spill_temp_70-256 addi a8,a8,32 # [10] s32i a8,a1,168 # [11] gra_spill_temp_65 ee.vadds.s32 q2,q2,q1 # [12] ee.vadds.s32 q1,q0,q1 # [13] ee.vmin.s32 q0,q2,q3 # [14] ee.vmin.s32 q1,q1,q3 # [15] ld.qr q2,a13,96 # [16] gra_spill_temp_68-112 l32i a13,a1,164 # [17] gra_spill_temp_64 ee.vmax.s32 q1,q1,q2 # [18] ee.vmax.s32 q0,q0,q2 # [19] addi.n a13,a13,8 # [20] s32i a13,a1,164 # [21] gra_spill_temp_64 ee.vunzip.16 q0,q1 # [22] ee.vunzip.8 q0,q1 # [23] ee.vst.l.64.ip q0,a9,8 # [24] id:302 
s32i a9,a1,156 # [25] gra_spill_temp_62 blt a13,a15,.Lt_5_9474 # [26] .Lt_5_8962: # 0x9e9 # Part of loop body line 203, head labeled .Lt_5_8706 l32i a8,a1,92 # [0] gra_spill_temp_46 l32i a11,a1,100 # [1] gra_spill_temp_48 l32i a10,a1,128 # [2] gra_spill_temp_55 l32i a9,a1,80 # [3] gra_spill_temp_43 l32i a15,a1,96 # [4] gra_spill_temp_47 sub a5,a5,a9 # [5] addi.n a15,a15,1 # [6] s32i a15,a1,96 # [7] gra_spill_temp_47 add.n a10,a10,a9 # [8] sub a11,a11,a9 # [9] s32i a11,a1,100 # [10] gra_spill_temp_48 s32i a10,a1,128 # [11] gra_spill_temp_55 sub a15,a15,a8 # [12] bnez a15,.Lt_5_8706 # [13] .Lt_5_8194: # 0xa11 # Part of loop body line 201, head labeled .Lt_5_7938 l32i a13,a1,64 # [0] gra_spill_temp_39 l32i a10,a1,72 # [1] gra_spill_temp_41 l32i a9,a1,124 # [2] gra_spill_temp_54 l32i.n a8,a1,60 # [3] gra_spill_temp_38 l32i a12,a1,68 # [4] gra_spill_temp_40 l32i a15,a1,76 # [5] gra_spill_temp_42 addi.n a12,a12,1 # [6] s32i a12,a1,68 # [7] gra_spill_temp_40 sub a15,a15,a8 # [8] add.n a9,a9,a8 # [9] sub a10,a10,a8 # [10] s32i a10,a1,72 # [11] gra_spill_temp_41 s32i a9,a1,124 # [12] gra_spill_temp_54 s32i a15,a1,76 # [13] gra_spill_temp_42 sub a12,a12,a13 # [14] bnez a12,.Lt_5_7938 # [15] .Lt_5_7426: # 0xa3e retw.n # [0] .size esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3 ================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S ================================================ // Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

	.text
	.literal_position

	# Program Unit: esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3
	.type	esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3, @function
	.align	4
	.global	esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3

// Depthwise convolution specialization: int16 data path, channel multiplier 1,
// 3x3 filter, no padding (per the function name — TODO confirm against the C
// reference in esp_nn_depthwise_conv_ansi.c).
//
// NOTE(review): this is compiler-scheduled windowed-ABI assembly (entry/retw.n).
// The stack frame layout below (qacc_scratch + gra_spill_temp_* slots) is relied
// on by exact byte offsets throughout; do not reorder instructions or change
// frame offsets in isolation.
esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3:  # 0xa42
	# qacc_scratch = 0
	# gra_spill_temp_71 = 48
	# gra_spill_temp_72 = 52
	# gra_spill_temp_73 = 56
	# gra_spill_temp_74 = 60
	# gra_spill_temp_75 = 64
	# gra_spill_temp_76 = 68
	# gra_spill_temp_77 = 72
	# gra_spill_temp_78 = 76
	# gra_spill_temp_79 = 80
	# gra_spill_temp_80 = 84
	# gra_spill_temp_81 = 88
	# gra_spill_temp_82 = 92
	# gra_spill_temp_83 = 96
	# gra_spill_temp_84 = 100
	# gra_spill_temp_85 = 104
	# gra_spill_temp_86 = 108
	# gra_spill_temp_87 = 112
	# gra_spill_temp_88 = 116
	# gra_spill_temp_89 = 120
	# gra_spill_temp_90 = 124
	# gra_spill_temp_91 = 128
	# gra_spill_temp_92 = 132
	# gra_spill_temp_93 = 136
	# gra_spill_temp_94 = 140
	# gra_spill_temp_95 = 144
	# gra_spill_temp_96 = 160
	# gra_spill_temp_97 = 176
	# gra_spill_temp_98 = 192
	# gra_spill_temp_99 = 208
	# gra_spill_temp_100 = 224
	# gra_spill_temp_101 = 240
	# gra_spill_temp_102 = 244
	# gra_spill_temp_103 = 248

	// registers:
	// a2: const int16_t *input_data
	// a3: const uint16_t input_wd
	// a4: const uint16_t input_ht
	// a5: const uint16_t channels
	// a6: const uint16_t stride_wd
	// a7: const uint16_t stride_ht

	// on stack:
	// const int16_t *filter_data
	// const int32_t *bias
	// int8_t *out_data
	// const uint16_t out_wd
	// const uint16_t out_ht
	// const int32_t out_offset
	// const int32_t *out_shift
	// const int32_t *out_mult
	// const int32_t activation_min
	// const int32_t activation_max

	entry	a1,288 #

	// Spill incoming register args; broadcast out_offset / activation_min /
	// activation_max into q-register spill slots for reuse in the store path.
	s32i	a2,a1,120	# [0] gra_spill_temp_89
	s32i.n	a3,a1,48	# [1] gra_spill_temp_71
	s32i	a5,a1,76	# [2] gra_spill_temp_78
	s32i	a6,a1,84	# [3] gra_spill_temp_80
	s32i.n	a7,a1,60	# [4] gra_spill_temp_74
	l32i	a12,a1,296	# [5] id:241 out_data+0x0
	addi	a14,a1,112	# [6]
	addmi	a10,a1,256	# [7]
	addmi	a13,a1,256	# [8]
	addmi	a15,a1,256	# [9]

	// height loop
	l16ui	a8,a1,304	# [10] id:242 out_ht+0x0
	s32i.n	a8,a1,56	# [11] gra_spill_temp_73
	addi	a15,a15,52	# [12]
	addi	a13,a13,64	# [13]
	addi	a10,a10,68	# [14]
	ee.vldbc.32	q0,a10	# [15] id:240 activation_max
	ee.vldbc.32	q1,a13	# [16] id:239 activation_min
	ee.vldbc.32	q2,a15	# [17] id:238 out_offset
	st.qr	q2,a14,64	# [18] gra_spill_temp_97-112
	st.qr	q1,a14,80	# [19] gra_spill_temp_98-112
	st.qr	q0,a14,96	# [20] gra_spill_temp_99-112
	beqz.n	a8,.Lt_6_6914	# [21]		// out_ht == 0 -> nothing to do

.LBB3_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xa83
	// One-time loop-invariant setup: row strides in bytes, loop counters,
	// channels-7 bound for the 8-channel vector loop.
	s32i	a1,a1,144	# [0] gra_spill_temp_95
	mul16u	a7,a3,a5	# [1]
	s32i	a4,a1,72	# [2] gra_spill_temp_77
	addi	a9,a5,-7	# [3]
	l16ui	a11,a1,300	# [4] id:247 out_wd+0x0
	l32i	a10,a1,292	# [5] id:243 bias+0x0
	l32i	a15,a1,288	# [6] id:244 filter_data+0x0
	l32i	a13,a1,316	# [7] id:246 out_mult+0x0
	l32i	a14,a1,312	# [8] id:245 out_shift+0x0
	s32i	a14,a1,88	# [9] gra_spill_temp_81
	s32i	a13,a1,92	# [10] gra_spill_temp_82
	s32i	a15,a1,124	# [11] gra_spill_temp_90
	s32i	a10,a1,116	# [12] gra_spill_temp_88
	s32i	a11,a1,96	# [13] gra_spill_temp_83
	s32i	a9,a1,136	# [14] gra_spill_temp_93
	addx2	a4,a5,a5	# [15]		// a4 = 3 * channels
	slli	a4,a4,1	# [16]		// ... in bytes (int16)
	slli	a7,a7,1	# [17]		// input row stride in bytes
	l32i.n	a9,a1,60	# [18] gra_spill_temp_74
	movi.n	a11,0	# [19]
	extui	a10,a10,0,4	# [20]		// low 4 bits of bias ptr, for sar_byte alignment
	movi.n	a15,0	# [21]
	slli	a5,a5,1	# [22]		// channel stride in bytes
	s32i	a15,a1,68	# [23] gra_spill_temp_76
	s32i	a10,a1,112	# [24] gra_spill_temp_87
	s32i	a11,a1,64	# [25] gra_spill_temp_75
	mul16u	a8,a3,a9	# [26]
	movi.n	a11,0	# [27]
	s32i	a11,a1,80	# [28] gra_spill_temp_79
	s32i.n	a8,a1,52	# [29] gra_spill_temp_72

.Lt_6_7426: # 0xad8
	// width_loop
	l32i	a8,a1,96	# [0] gra_spill_temp_83
	beqz.n	a8,.Lt_6_7682	# [2]		// out_wd == 0 -> skip row body

	// Clamp the filter-row count to min(remaining input height, 3).
	movi.n	a11,3	# [0]
	l32i	a10,a1,72	# [1] gra_spill_temp_77
	movi.n	a9,0	# [2]
	movi.n	a13,0	# [3]
	l32i.n	a14,a1,48	# [4] gra_spill_temp_71
	s32i	a14,a1,108	# [5] gra_spill_temp_86
	s32i	a13,a1,104	# [6] gra_spill_temp_85
	s32i	a9,a1,100	# [7] gra_spill_temp_84
	min	a10,a10,a11	# [8]
	s32i	a10,a1,128	# [9] gra_spill_temp_91

.Lt_6_8194: # 0xaf7
	l32i	a2,a1,88	# [0] gra_spill_temp_81
	l32i	a6,a1,92	# [1] gra_spill_temp_82
	l32i	a8,a1,116	# [2] gra_spill_temp_88

	// channel loop
	l32i	a15,a1,136	# [3] gra_spill_temp_93
	s32i	a8,a1,140	# [4] gra_spill_temp_94
	blti	a15,1,.Lt_6_8450	# [5]		// fewer than 8 channels -> skip vector loop

	movi.n	a11,0	# [0]
	movi.n	a10,0	# [1]
	l32i	a9,a1,76	# [2] gra_spill_temp_78
	l32i	a14,a1,80	# [3] gra_spill_temp_79
	movi.n	a8,3	# [4]
	l32i	a3,a1,108	# [5] gra_spill_temp_86
	l32i	a13,a1,104	# [6] gra_spill_temp_85
	min	a3,a3,a8	# [7]		// a3 = min(remaining width, 3) filter columns
	add.n	a13,a13,a14	# [8]
	mull	a9,a9,a13	# [9]
	s32i	a9,a1,132	# [10] gra_spill_temp_92

.Lt_6_8962: # 0xb26
	// Accumulate min(rows,3) x min(cols,3) taps for 8 channels into QACC.
	ee.zero.qacc	# [0]
	l32i	a9,a1,132	# [1] gra_spill_temp_92
	l32i	a13,a1,120	# [2] gra_spill_temp_89
	add.n	a9,a9,a10	# [3]
	addx2	a9,a9,a13	# [4]		// input pointer for this channel group
	l32i	a13,a1,124	# [5] gra_spill_temp_90
	l32i	a14,a1,128	# [6] gra_spill_temp_91
	add.n	a13,a11,a13	# [7]		// filter pointer for this channel group
	loopgtz	a14,.LBB30_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [8]	// zero-overhead filter-row loop

.Lt_6_9730: # 0xb3f
	# Loop body line 360, nesting depth: 4, estimated iterations: 100
	// Load first input/filter column; branch on column count (1, 2 or 3).
	mov.n	a14,a13	# [0]
	mov.n	a15,a9	# [1]
	ee.vld.128.xp	q0,a15,a5	# [2] id:249
	ee.vld.128.xp	q1,a14,a5	# [3] id:250
	add.n	a9,a9,a7	# [4]		// advance input by one row
	beqi	a3,2,.LBB15_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [5]

.Lt_6_9986: # 0xb4e
	beqi	a3,3,.LBB17_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [0]

.Lt_6_10498: # 0xb51
	add.n	a13,a13,a4	# [0]		// advance filter by one row (3*channels)
	ee.vmulas.s16.qacc	q0,q1	# [1]	// last column multiply-accumulate

.LBB30_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xb58
	// extract data
	// NOTE(review): QACC is spilled to qacc_scratch and its bytes are repacked
	// with the l8ui/s8i/l16ui/s16i shuffle below before ee.srcmb.s16.qacc
	// converts lanes to s16; the exact layout follows the PIE accumulator
	// format — confirm against the ESP32-S3 TRM before touching offsets.
	l32i	a15,a1,144	# [0] gra_spill_temp_95
	ee.st.qacc_l.l.128.ip	a15,16	# [2] id:258
	ee.st.qacc_l.h.32.ip	a15,0	# [3] id:259
	l8ui	a14,a1,15	# [4] qacc_scratch+15
	l8ui	a13,a1,16	# [5] qacc_scratch+16
	l8ui	a8,a1,5	# [6] qacc_scratch+5
	l8ui	a9,a1,6	# [7] qacc_scratch+6
	s8i	a9,a1,3	# [8] qacc_scratch+3
	s8i	a8,a1,2	# [9] qacc_scratch+2
	s8i	a13,a1,7	# [10] qacc_scratch+7
	s8i	a14,a1,6	# [11] qacc_scratch+6
	l16ui	a13,a1,10	# [12] qacc_scratch+10
	s16i	a13,a1,4	# [13] qacc_scratch+4
	ee.st.qacc_h.l.128.ip	a15,16	# [14] id:269
	ee.st.qacc_h.h.32.ip	a15,-32	# [15] id:270
	l8ui	a9,a1,32	# [16] qacc_scratch+32
	l8ui	a13,a1,22	# [17] qacc_scratch+22
	l8ui	a8,a1,31	# [18] qacc_scratch+31
	l16ui	a14,a1,26	# [19] qacc_scratch+26
	s16i	a14,a1,12	# [20] qacc_scratch+12
	s8i	a8,a1,14	# [21] qacc_scratch+14
	s8i	a13,a1,11	# [22] qacc_scratch+11
	s8i	a9,a1,15	# [23] qacc_scratch+15
	l32i	a13,a1,116	# [24] gra_spill_temp_88
	l8ui	a9,a1,21	# [25] qacc_scratch+21
	l16ui	a8,a1,16	# [26] qacc_scratch+16
	movi.n	a14,16	# [27]
	ee.srcmb.s16.qacc	q1,a14,0	# [28]
	s16i	a8,a1,8	# [29] qacc_scratch+8
	s8i	a9,a1,10	# [30] qacc_scratch+10
	ee.vld.128.ip	q0,a15,0	# [31] id:282
	s32i	a15,a1,144	# [32] gra_spill_temp_95
	ee.vzip.16	q0,q1	# [33]		// interleave to two q-regs of s32 accumulators
	bnez.n	a13,.LBB20_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad # [34]	// bias != NULL -> add bias

	// No bias: just preserve caller-saved state and the high accumulator half.
	s32i	a12,a1,240	# [0] gra_spill_temp_101
	s32i	a11,a1,244	# [1] gra_spill_temp_102
	s32i	a10,a1,248	# [2] gra_spill_temp_103
	addi	a14,a1,112	# [3]
	st.qr	q1,a14,48	# [4] gra_spill_temp_96-112
	j	.Lt_6_11266	# [5]

.LBB15_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xbce
	# Part of loop body line 360, head labeled .Lt_6_9730
	// 2-column case: fused multiply-accumulate + load of next column.
	ee.vmulas.s16.qacc.ld.xp	q0,a15,a5,q0,q1	# [0] id:251
	ee.vld.128.xp	q1,a14,a5	# [1] id:252
	bnei	a3,3,.Lt_6_10498	# [2]

.LBB17_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xbd8
	// 3-column case: two more fused MAC+load steps.
	ee.vmulas.s16.qacc.ld.xp	q3,a15,a5,q0,q1	# [0] id:253
	ee.vld.128.xp	q4,a14,a5	# [1] id:254
	ee.vld.128.xp	q1,a14,a5	# [2] id:256
	ee.vmulas.s16.qacc.ld.xp	q0,a15,a5,q3,q4	# [3] id:255
	j	.Lt_6_10498	# [4]

.LBB20_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad: # 0xbe9
	# Part of loop body line 358, head labeled .Lt_6_8962
	// Add bias: unaligned 128-bit loads realigned via sar_byte + ee.src.q.qup.
	s32i	a12,a1,240	# [0] gra_spill_temp_101
	s32i	a11,a1,244	# [1] gra_spill_temp_102
	s32i	a10,a1,248	# [2] gra_spill_temp_103
	addi	a15,a1,112	# [3]
	l32i	a9,a1,112	# [4] gra_spill_temp_87
	l32i	a8,a1,140	# [5] gra_spill_temp_94
	wur.sar_byte	a9	# [6]
	ee.vld.128.ip	q6,a8,16	# [7] id:285
	ee.vld.128.ip	q3,a8,16	# [8] id:286
	ee.vld.128.ip	q7,a8,0	# [9] id:287
	s32i	a8,a1,140	# [10] gra_spill_temp_94
	ee.src.q.qup	q2,q6,q3	# [11]
	ee.vadds.s32	q0,q0,q2	# [12]
	ee.src.q.qup	q5,q6,q7	# [13]
	ee.vadds.s32	q1,q1,q5	# [14]
	st.qr	q1,a15,48	# [15] gra_spill_temp_96-112

.Lt_6_11266: # 0xc19
	# 423 q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
	// Requantize both 4-lane halves via helper (a10 = out_mult, a11 = out_shift).
	mov.n	a10,a6	# [0]
	mov.n	a11,a2	# [1]
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [2] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	addi	a11,a1,112	# [0]
	addi	a10,a6,16	# [1]
	st.qr	q0,a11,112	# [2] gra_spill_temp_100-112
	ld.qr	q0,a11,48	# [3] gra_spill_temp_96-112
	addi	a11,a2,16	# [4]
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [5] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	// out_mult_ptr += 8, out_shift_ptr += 8 (two groups of 4 done above);
	// then add out_offset, clamp to [activation_min, activation_max],
	// narrow s32 -> s8 and store 8 output bytes.
	addi	a6,a6,32	# [0]
	addi	a2,a2,32	# [1]
	l32i	a13,a1,136	# [2] gra_spill_temp_93
	l32i	a12,a1,240	# [3] gra_spill_temp_101
	l32i	a10,a1,248	# [4] gra_spill_temp_103
	l32i	a11,a1,244	# [5] gra_spill_temp_102
	addi	a9,a1,112	# [6]
	ld.qr	q6,a9,80	# [7] gra_spill_temp_98-112
	ld.qr	q7,a9,96	# [8] gra_spill_temp_99-112
	ld.qr	q5,a9,64	# [9] gra_spill_temp_97-112
	ld.qr	q4,a9,112	# [10] gra_spill_temp_100-112
	addi	a11,a11,16	# [11]
	addi.n	a10,a10,8	# [12]
	ee.vadds.s32	q4,q4,q5	# [13]
	ee.vadds.s32	q5,q0,q5	# [14]
	ee.vmin.s32	q4,q4,q7	# [15]
	ee.vmax.s32	q4,q4,q6	# [16]
	ee.vmin.s32	q5,q5,q7	# [17]
	ee.vmax.s32	q5,q5,q6	# [18]
	ee.vunzip.16	q4,q5	# [19]
	ee.vunzip.8	q4,q5	# [20]
	ee.vst.l.64.ip	q4,a12,8	# [21] id:290
	blt	a10,a13,.Lt_6_8962	# [22]	// next 8-channel group

.Lt_6_8450: # 0xc76
	# Part of loop body line 348, head labeled .Lt_6_8194
	// End of width iteration: bump out_x, advance base input column.
	l32i	a11,a1,96	# [0] gra_spill_temp_83
	l32i	a15,a1,104	# [1] gra_spill_temp_85
	l32i	a14,a1,84	# [2] gra_spill_temp_80
	l32i	a10,a1,100	# [3] gra_spill_temp_84
	l32i	a13,a1,108	# [4] gra_spill_temp_86
	addi.n	a10,a10,1	# [5]
	s32i	a10,a1,100	# [6] gra_spill_temp_84
	sub	a13,a13,a14	# [7]
	add.n	a15,a15,a14	# [8]
	s32i	a15,a1,104	# [9] gra_spill_temp_85
	s32i	a13,a1,108	# [10] gra_spill_temp_86
	sub	a10,a10,a11	# [11]
	bnez	a10,.Lt_6_8194	# [12]

.Lt_6_7682: # 0xc9b
	// End of height iteration: bump out_y, advance base input row.
	l32i.n	a9,a1,56	# [0] gra_spill_temp_73
	l32i	a15,a1,64	# [1] gra_spill_temp_75
	l32i.n	a14,a1,52	# [2] gra_spill_temp_72
	l32i	a13,a1,80	# [3] gra_spill_temp_79
	l32i.n	a11,a1,60	# [4] gra_spill_temp_74
	l32i	a8,a1,68	# [5] gra_spill_temp_76
	l32i	a10,a1,72	# [6] gra_spill_temp_77
	addi.n	a8,a8,1	# [7]
	s32i	a8,a1,68	# [8] gra_spill_temp_76
	sub	a10,a10,a11	# [9]
	add.n	a13,a13,a14	# [10]
	add.n	a15,a15,a11	# [11]
	s32i	a15,a1,64	# [12] gra_spill_temp_75
	s32i	a13,a1,80	# [13] gra_spill_temp_79
	s32i	a10,a1,72	# [14] gra_spill_temp_77
	sub	a8,a8,a9	# [15]
	bnez	a8,.Lt_6_7426	# [16]

.Lt_6_6914: # 0xcc8
	retw.n	# [0]

	.size	esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3

================================================
FILE: src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
	.text
	.literal_position

	# Program Unit: esp_nn_depthwise_conv_s16_mult1_esp32s3
	.type	esp_nn_depthwise_conv_s16_mult1_esp32s3, @function
	.align	4
	.global	esp_nn_depthwise_conv_s16_mult1_esp32s3

// Depthwise convolution: int16 data path, channel multiplier 1, generic
// filter size and padding (per the function name — TODO confirm against the
// C reference in esp_nn_depthwise_conv_ansi.c).
//
// Stack-frame arg offsets (entry a1,160): stride_wd@160, stride_ht@164,
// filter_data@168, filter_wd@172, filter_ht@176, bias@180, out_data@184,
// out_wd@188, out_ht@192, out_offset@196, out_shift@200, out_mult@204,
// activation_min@208, activation_max@212 — derived from the loads below.
//
// NOTE(review): compiler-scheduled windowed-ABI assembly; spill-slot offsets
// are hard-coded throughout — do not reorder instructions in isolation.
esp_nn_depthwise_conv_s16_mult1_esp32s3: # 0x4c8
	# scratch_buf = 0
	# gra_spill_temp_2 = 48
	# gra_spill_temp_22 = 52
	# gra_spill_temp_4 = 56
	# gra_spill_temp_23 = 60
	# gra_spill_temp_24 = 64
	# gra_spill_temp_7 = 68
	# gra_spill_temp_26 = 72
	# gra_spill_temp_27 = 76
	# gra_spill_temp_28 = 80
	# gra_spill_temp_29 = 84
	# gra_spill_temp_12 = 88
	# gra_spill_temp_13 = 92
	# gra_spill_temp_14 = 96
	# gra_spill_temp_15 = 100
	# gra_spill_temp_21 = 104
	# gra_spill_temp_17 = 108
	# gra_spill_temp_18 = 112
	# gra_spill_temp_20 = 116
	# gra_spill_temp_30 = 0
	# gra_spill_temp_34 = 16

	// in registers:
	// a2: *input_data
	// a3: input_wd
	// a4: input_ht
	// a5: channels
	// a6: pad_wd
	// a7: pad_ht

	// on stack:
	// stride_wd
	// stride_ht
	// *filter_data
	// filter_wd
	// filter_ht
	// *bias
	// *out_data
	// out_wd
	// out_ht
	// out_offset
	// *out_shift
	// *out_mult
	// activation_min
	// activation_max

	entry	a1,160 #

	l32i	a9,a1,184	# [7] id:237 out_data+0x0
	l16ui	a8,a1,192	# [8] id:238 out_ht+0x0
	s32i	a2,a1,52	# [0] gra_spill_temp_22
	s32i.n	a4,a1,56	# [1] gra_spill_temp_4
	s32i	a5,a1,60	# [2] gra_spill_temp_23
	s32i	a9,a1,112	# [10] gra_spill_temp_18
	beqz.n	a8,.Lt_4_7170	# [20]		// out_ht == 0 -> nothing to do

.LBB3_esp_nn_depthwise_conv_s16_mult1: # 0x508
	// Loop-invariant setup: negated pads (sign-extended to 16 bits),
	// channel byte-stride, channels-7 bound for the 8-channel vector loop.
	l16ui	a4,a1,172	# [0] id:240 filter_wd+0x0
	neg	a13,a7	# [2]
	neg	a12,a6	# [3]
	sext	a12,a12,15	# [16]
	sext	a13,a13,15	# [17]
	s32i	a13,a1,92	# [18] gra_spill_temp_13
	s32i.n	a12,a1,48	# [19] gra_spill_temp_2
	movi.n	a8,0	# [20]
	slli	a9,a5,1	# [21]
	addi	a10,a5,-7	# [22]
	s32i	a10,a1,100	# [23] gra_spill_temp_15
	s32i	a9,a1,64	# [24] gra_spill_temp_24
	s32i	a8,a1,68	# [25] gra_spill_temp_7
	j	.Lt_4_7682	# [30]

.Lt_4_7938: # 0x561
	// Height-loop increment: out_y++, base_y += stride_ht.
	l32i	a15,a1,192	# [0] out_ht
	l32i.n	a9,a1,164	# [1] stride_ht
	l32i	a14,a1,68	# [2] gra_spill_temp_7
	l32i	a8,a1,92	# [3] gra_spill_temp_13
	addi.n	a14,a14,1	# [4]
	s32i	a14,a1,68	# [5] gra_spill_temp_7
	add.n	a9,a8,a9	# [6]
	sub	a14,a14,a15	# [7]
	sext	a8,a9,15	# [8]
	s32i	a8,a1,92	# [9] gra_spill_temp_13
	beqz	a14,.Lt_4_7170	# [10]

.Lt_4_7682: # 0x57f
	# Loop body line 59, nesting depth: 1, estimated iterations: 100
	# 60 const int16_t base_y = (out_y * stride_ht) - pad_ht;
	# 61 for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
	l32i	a10,a1,188	# [0] out_width
	beqz.n	a10,.Lt_4_7938	# [2]

.LBB6_esp_nn_depthwise_conv_s16_mult1: # 0x584
	# Part of loop body line 59, head labeled .Lt_4_7682
	// Clip the filter window vertically against the input:
	// a7 = min(filter_ht, input_ht - base_y), start row = max(-base_y, 0).
	movi.n	a14,0	# [0]
	l32i.n	a7,a1,176	# [1] filter_ht
	l32i	a13,a1,92	# [2] gra_spill_temp_13
	l32i.n	a8,a1,56	# [3] gra_spill_temp_4
	movi.n	a11,0	# [4]
	l32i.n	a12,a1,48	# [5] gra_spill_temp_2
	s32i	a12,a1,84	# [6] gra_spill_temp_29
	s32i	a11,a1,88	# [7] gra_spill_temp_12
	sub	a8,a8,a13	# [8]
	min	a7,a7,a8	# [9]
	neg	a13,a13	# [10]
	max	a13,a13,a14	# [11]
	s32i	a13,a1,96	# [12] gra_spill_temp_14
	j	.Lt_4_8450	# [13]

.Lt_4_8706: # 0x5a9
	# Part of loop body line 61, head labeled .Lt_4_8450
	// Width-loop increment: out_x++, base_x += stride_wd.
	l32i	a10,a1,188	# [0] out_width
	l32i	a12,a1,160	# [1] stride_wd
	l32i	a9,a1,88	# [2] gra_spill_temp_12
	l32i	a11,a1,84	# [3] gra_spill_temp_29
	addi.n	a9,a9,1	# [4]
	s32i	a9,a1,88	# [5] gra_spill_temp_12
	add.n	a12,a11,a12	# [6]
	sext	a11,a12,15	# [7]
	s32i	a11,a1,84	# [8] gra_spill_temp_29
	beq	a9,a10,.Lt_4_7938	# [9]

.Lt_4_8450: # 0x5c5
	# Loop body line 61, nesting depth: 2, estimated iterations: 100
	# 69 uint32_t bias_ptr = (uint32_t) bias;
	# 70 const int32_t *out_mult_ptr = out_mult;
	# 71 const int32_t *out_shift_ptr = out_shift;
	# 72
	# 73 for (int ch_idx = 0; ch_idx < channels - 7; ch_idx += 8) {//channel_loop
	l32i	a13,a1,100	# [0] gra_spill_temp_15
	l32i	a14,a1,180	# [1] bias
	l32i	a15,a1,204	# [2] out_mult
	l32i	a8,a1,200	# [3] out_shift
	s32i	a8,a1,104	# [4] gra_spill_temp_21
	s32i	a15,a1,116	# [5] gra_spill_temp_20
	s32i	a14,a1,108	# [6] gra_spill_temp_17
	blti	a13,1,.Lt_4_8706	# [7]		// fewer than 8 channels -> skip vector loop

.LBB9_esp_nn_depthwise_conv_s16_mult1: # 0x5dd
	# Part of loop body line 61, head labeled .Lt_4_8450
	// Clip the filter window horizontally against the input:
	// a6 = max(-base_x, 0), a5 = min(filter_wd, input_wd - base_x).
	movi.n	a2,0	# [0]
	l32i	a5,a1,84	# [1] gra_spill_temp_29
	movi.n	a8,0	# [2]
	neg	a6,a5	# [3]
	max	a6,a6,a8	# [4]
	sub	a5,a3,a5	# [5]
	min	a5,a4,a5	# [6]
	sub	a9,a5,a6	# [7]
	s32i	a9,a1,72	# [8] gra_spill_temp_26
	j	.Lt_4_9218	# [9]

.Lt_4_9474: # 0x5f9
	// extract data
	// NOTE(review): QACC is spilled to scratch_buf and its bytes repacked
	// before ee.srcmb.s16.qacc; layout follows the PIE accumulator format —
	// confirm against the ESP32-S3 TRM before touching offsets.
	mov	a11,a1
	ee.st.qacc_l.l.128.ip	a11,16	# [2] id:252
	ee.st.qacc_l.h.32.ip	a11,0	# [3] id:253
	l8ui	a12,a1,15	# [4] scratch_buf+15
	l16ui	a10,a1,10	# [5] scratch_buf+10
	l8ui	a13,a1,5	# [6] scratch_buf+5
	l8ui	a14,a1,6	# [7] scratch_buf+6
	l8ui	a15,a1,16	# [8] scratch_buf+16
	s8i	a13,a1,2	# [11] scratch_buf+2
	s8i	a14,a1,3	# [10] scratch_buf+3
	s8i	a15,a1,7	# [9] scratch_buf+7
	s16i	a10,a1,4	# [12] scratch_buf+4
	s8i	a12,a1,6	# [13] scratch_buf+6
	movi.n	a10,16	# [14]
	ee.st.qacc_h.l.128.ip	a11,16	# [15] id:263
	ee.st.qacc_h.h.32.ip	a11,-32	# [16] id:264
	ee.srcmb.s16.qacc	q1,a10,0	# [17]
	l8ui	a8,a1,31	# [18] scratch_buf+31
	l8ui	a9,a1,32	# [19] scratch_buf+32
	l16ui	a12,a1,16	# [20] scratch_buf+16
	l8ui	a13,a1,21	# [21] scratch_buf+21
	l8ui	a14,a1,22	# [22] scratch_buf+22
	l16ui	a15,a1,26	# [23] scratch_buf+26
	s8i	a13,a1,10	# [26] scratch_buf+10
	s8i	a14,a1,11	# [25] scratch_buf+11
	s16i	a15,a1,12	# [24] scratch_buf+12
	s16i	a12,a1,8	# [27] scratch_buf+8
	s8i	a9,a1,15	# [28] scratch_buf+15
	s8i	a8,a1,14	# [29] scratch_buf+14
	l32i	a9,a1,180	# [30] bias
	ee.vld.128.ip	q0,a11,0	# [31] id:164
	ee.vzip.16	q0,q1	# [33]
	beqz.n	a9,.Lt_4_11522	# [34]	// skip bias

	// add bias (unaligned 128-bit loads realigned via sar_byte + ee.src.q)
	l32i	a9,a1,108	# [0] gra_spill_temp_17
	addi	a8,a1,112	# [1]
	extui	a10,a9,0,4	# [2]
	wur.sar_byte	a10	# [3]
	ee.vld.128.ip	q4,a9,16	# [4] id:279
	ee.vld.128.ip	q7,a9,16	# [5] id:168
	ee.vld.128.ip	q5,a9,0	# [6] id:281
	s32i	a9,a1,108	# [7] gra_spill_temp_17
	ee.src.q	q4,q4,q7	# [8]
	ee.src.q	q7,q7,q5	# [10]
	ee.vadds.s32	q0,q0,q4	# [9]
	ee.vadds.s32	q1,q1,q7	# [11]
	st.qr	q1,a1,0	# [12] gra_spill_temp_30-112

.Lt_4_11522: # 0x684
	// apply quantisation: esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
	l32i	a10,a1,116	# [1] gra_spill_temp_20
	l32i	a11,a1,104	# [3] gra_spill_temp_21
	st.qr	q1,a1,0	# [2] gra_spill_temp_30-112
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [4] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	l32i	a10,a1,116	# [2] gra_spill_temp_20
	l32i	a11,a1,104	# [0] gra_spill_temp_21
	st.qr	q0,a1,16	# [3] gra_spill_temp_34-112
	ld.qr	q0,a1,0	# [4] gra_spill_temp_30-112
	addi	a10,a10,16	# [5] // out_mult_ptr += 4
	addi	a11,a11,16	# [6] // out_shift_ptr += 4
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [7] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	// add offset, apply activation and store
	l32i	a13,a1,100	# [0] gra_spill_temp_15
	addi.n	a2,a2,8	# [1]
	l32i	a8,a1,112	# [2] gra_spill_temp_18
	l32i	a15,a1,116	# [3] gra_spill_temp_20
	l32i	a14,a1,104	# [4] gra_spill_temp_21
	addi	a12,a1,212
	ee.vldbc.32	q3,a12	# [14] id:236 activation_max
	addi	a12,a1,196
	ee.vldbc.32	q1,a12	# [16] id:234 out_offset
	addi	a12,a1,208
	ld.qr	q2,a1,16	# [8] gra_spill_temp_34-112
	addi	a14,a14,32	# [9]
	addi	a15,a15,32	# [10]
	s32i	a15,a1,116	# [11] gra_spill_temp_20
	ee.vadds.s32	q2,q2,q1	# [12]
	s32i	a14,a1,104	# [13] gra_spill_temp_21
	ee.vadds.s32	q1,q0,q1	# [14]
	ee.vmin.s32	q0,q2,q3	# [15]
	ee.vldbc.32	q2,a12	# [16] activation_min (a12 = a1+208; original "out_offset" comment was wrong)
	ee.vmin.s32	q1,q1,q3	# [17]
	ee.vmax.s32	q1,q1,q2	# [18]
	ee.vmax.s32	q0,q0,q2	# [19]
	ee.vunzip.16	q0,q1	# [20]
	ee.vunzip.8	q0,q1	# [21]
	ee.vst.l.64.ip	q0,a8,8	# [22] id:172
	s32i	a8,a1,112	# [23] gra_spill_temp_18
	bge	a2,a13,.Lt_4_8706	# [24]	// all 8-channel groups done

.Lt_4_9218: # 0x6f5
	ee.zero.qacc	# [0]
	l32i	a13,a1,96	# [1] gra_spill_temp_14
	s32i	a13,a1,80	# [2] gra_spill_temp_28
	bge	a13,a7,.Lt_4_9474	# [3]	// window fully clipped -> accumulator stays zero

.LBB12_esp_nn_depthwise_conv_s16_mult1: # 0x701
	// channel_loop
	mull	a15,a13,a4	# [0]
	l32i	a14,a1,92	# [1] gra_spill_temp_13
	add.n	a8,a15,a5	# [2]
	add.n	a14,a14,a13	# [3]
	mull	a14,a3,a14	# [4]
	s32i	a8,a1,76	# [5] gra_spill_temp_27
	bge	a6,a5,.Lt_4_10242	# [6]	// row fully clipped horizontally

.LBB15_esp_nn_depthwise_conv_s16_mult1: # 0x714
	// Compute input/filter pointers for this filter row, then run the
	// zero-overhead inner loop over the clipped filter columns.
	l32i	a12,a1,64	# [0] gra_spill_temp_24
	l32i	a9,a1,168	# [1] filter_data
	l32i	a10,a1,60	# [2] gra_spill_temp_23
	l32i	a11,a1,84	# [3] gra_spill_temp_29
	add.n	a8,a15,a6	# [4]
	add.n	a11,a11,a6	# [5]
	mull	a8,a8,a10	# [6]
	add.n	a11,a14,a11	# [7]
	mull	a10,a10,a11	# [8]
	add.n	a8,a2,a8	# [9]
	l32i	a11,a1,52	# [10] gra_spill_temp_22
	addx2	a8,a8,a9	# [11]
	add.n	a10,a2,a10	# [12]
	l32i	a9,a1,72	# [13] gra_spill_temp_26
	addx2	a10,a10,a11	# [14]
	loopgtz	a9,.LBB41_esp_nn_depthwise_conv_s16_mult1 # [15]
	// innermost loop
	ee.vld.128.xp	q0,a10,a12	# [0*II+3] id:249
	ee.vld.128.xp	q1,a8,a12	# [0*II+4] id:250
	ee.vmulas.s16.qacc	q0,q1	# [0*II+6]

.LBB41_esp_nn_depthwise_conv_s16_mult1: # 0x750
.Lt_4_10242: # 0x750
	// Next filter row; fall back into the row loop while rows remain.
	add.n	a14,a14,a3	# [0]
	add.n	a15,a15,a4	# [1]
	l32i	a9,a1,80	# [2] gra_spill_temp_28
	l32i	a10,a1,76	# [3] gra_spill_temp_27
	addi.n	a9,a9,1	# [4]
	add.n	a10,a10,a4	# [5]
	s32i	a10,a1,76	# [6] gra_spill_temp_27
	s32i	a9,a1,80	# [7] gra_spill_temp_28
	sub	a9,a7,a9	# [8]
	beqz	a9,.Lt_4_9474	# [9]
	blt	a6,a5,.LBB15_esp_nn_depthwise_conv_s16_mult1 # [0]
	j	.Lt_4_10242	# [0]

.Lt_4_7170: # 0x770
	retw.n	# [0]

	.size	esp_nn_depthwise_conv_s16_mult1_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_esp32s3

================================================
FILE: src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S
================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
	.text
	.literal_position

	# Program Unit: esp_nn_depthwise_conv_s16_mult4_esp32s3
	.type	esp_nn_depthwise_conv_s16_mult4_esp32s3, @function
	.align	4
	.global	esp_nn_depthwise_conv_s16_mult4_esp32s3

// Depthwise convolution: int16 data path, channel multiplier 4 (per the
// function name — TODO confirm against the C reference in
// esp_nn_depthwise_conv_ansi.c). Unlike the mult1 variants, each iteration
// of the inner output loop produces 4 s8 outputs (one input channel times
// ch_mult=4), stored with four s8i byte stores.
//
// NOTE(review): compiler-scheduled windowed-ABI assembly; spill-slot offsets
// are hard-coded throughout — do not reorder instructions in isolation.
esp_nn_depthwise_conv_s16_mult4_esp32s3: # 0x17c8
	# qacc_scratch = 0
	# gra_spill_temp_220 = 32
	# gra_spill_temp_221 = 36
	# gra_spill_temp_222 = 40
	# gra_spill_temp_223 = 44
	# gra_spill_temp_224 = 48
	# gra_spill_temp_225 = 52
	# gra_spill_temp_226 = 56
	# gra_spill_temp_227 = 60
	# gra_spill_temp_228 = 64
	# gra_spill_temp_229 = 68
	# gra_spill_temp_230 = 72
	# gra_spill_temp_231 = 76
	# gra_spill_temp_232 = 80
	# gra_spill_temp_233 = 84
	# gra_spill_temp_234 = 88
	# gra_spill_temp_235 = 92
	# gra_spill_temp_236 = 96
	# gra_spill_temp_237 = 100
	# gra_spill_temp_238 = 104
	# gra_spill_temp_239 = 108
	# gra_spill_temp_240 = 112
	# gra_spill_temp_241 = 116
	# gra_spill_temp_242 = 120
	# gra_spill_temp_243 = 124
	# gra_spill_temp_244 = 128
	# gra_spill_temp_245 = 132
	# gra_spill_temp_246 = 136
	# gra_spill_temp_247 = 140
	# gra_spill_temp_248 = 144
	# gra_spill_temp_249 = 148
	# gra_spill_temp_250 = 152
	# gra_spill_temp_251 = 156
	# gra_spill_temp_252 = 160
	# gra_spill_temp_253 = 164
	# gra_spill_temp_254 = 168
	# gra_spill_temp_255 = 172
	# gra_spill_temp_256 = 176
	# gra_spill_temp_257 = 192
	# gra_spill_temp_258 = 208
	# gra_spill_temp_259 = 224
	# gra_spill_temp_260 = 240

	// registers:
	// a2: const int16_t *input_data
	// a3: const uint16_t input_wd
	// a4: const uint16_t input_ht
	// a5: const uint16_t channels
	// a6: const uint16_t pad_wd
	// a7: const uint16_t pad_ht

	// on stack:
	// const uint16_t stride_wd
	// const uint16_t stride_ht
	// const uint16_t ch_mult
	// const int16_t *filter_data
	// const uint16_t filter_wd
	// const uint16_t filter_ht
	// const int32_t *bias
	// int8_t *out_data
	// const uint16_t out_wd
	// const uint16_t out_ht
	// const int32_t out_offset
	// const int32_t *out_shift
	// const int32_t *out_mult
	// const int32_t activation_min
	// const int32_t activation_max

	entry	a1,288 #

	// Spill incoming register args; broadcast out_offset / activation_max /
	// activation_min into q-register spill slots for reuse in the store path.
	s32i	a2,a1,136	# [0] gra_spill_temp_246
	s32i.n	a4,a1,40	# [1] gra_spill_temp_222
	s32i	a5,a1,164	# [2] gra_spill_temp_253
	addi	a12,a1,112	# [3]
	addmi	a10,a1,256	# [4]
	addmi	a11,a1,256	# [5]
	addmi	a13,a1,256	# [6]
	l16ui	a8,a1,324	# [7] id:216 out_ht+0x0
	s32i.n	a8,a1,48	# [8] gra_spill_temp_224
	addi	a13,a13,72	# [9]
	addi	a11,a11,88	# [10]
	addi	a10,a10,84	# [11]
	ee.vldbc.32	q0,a10	# [12] id:215 activation_min
	ee.vldbc.32	q1,a11	# [13] id:214 activation_max
	ee.vldbc.32	q2,a13	# [14] id:213 out_offset
	st.qr	q2,a12,80	# [15] gra_spill_temp_257-112
	st.qr	q1,a12,96	# [16] gra_spill_temp_258-112
	st.qr	q0,a12,112	# [17] gra_spill_temp_259-112
	beqz.n	a8,.Lt_10_8450	# [18]	// out_ht == 0 -> nothing to do

	// One-time loop-invariant setup: negated pads (sign-extended),
	// channel/filter strides, ch_mult-3 bound for the 4-wide vector loop.
	s32i	a1,a1,112	# [0] gra_spill_temp_240
	neg	a15,a6	# [1]
	neg	a4,a7	# [2]
	addmi	a8,a1,256	# [3]
	movi.n	a9,0	# [4]
	movi.n	a11,0	# [5]
	slli	a14,a5,1	# [6]
	l16ui	a13,a1,296	# [7] id:217 ch_mult+0x0
	l16ui	a10,a1,308	# [8] id:227 filter_ht+0x0
	s32i.n	a10,a1,36	# [9] gra_spill_temp_221
	s32i	a13,a1,76	# [10] gra_spill_temp_231
	s32i	a14,a1,148	# [11] gra_spill_temp_249
	s32i.n	a11,a1,52	# [12] gra_spill_temp_225
	s32i	a9,a1,116	# [13] gra_spill_temp_241
	st.qr	q4,a8,-16	# [14] gra_spill_temp_260-256
	sext	a4,a4,15	# [15]
	sext	a15,a15,15	# [16]
	s32i.n	a15,a1,32	# [17] gra_spill_temp_220
	mul16u	a12,a5,a13	# [18]		// channels * ch_mult = output depth
	s32i	a4,a1,92	# [19] gra_spill_temp_235
	l16ui	a8,a1,320	# [20] id:229 out_wd+0x0
	l16ui	a9,a1,292	# [21] id:228 stride_ht+0x0
	l32i	a11,a1,336	# [22] id:226 out_mult+0x0
	s32i	a11,a1,64	# [23] gra_spill_temp_228
	s32i.n	a9,a1,44	# [24] gra_spill_temp_223
	s32i	a8,a1,68	# [25] gra_spill_temp_229
	l32i	a4,a1,300	# [26] id:218 filter_data+0x0
	s32i	a12,a1,140	# [27] gra_spill_temp_247
	l32i	a15,a1,316	# [28] id:219 out_data+0x0
	s32i	a15,a1,96	# [29] gra_spill_temp_236
	slli	a12,a12,1	# [30]
	s32i	a4,a1,152	# [31] gra_spill_temp_250
	addi	a14,a13,-3	# [32]
	l16ui	a4,a1,304	# [33] id:223 filter_wd+0x0
	s32i	a14,a1,108	# [34] gra_spill_temp_239
	s32i	a12,a1,144	# [35] gra_spill_temp_248
	slli	a13,a13,2	# [36]
	s32i	a13,a1,80	# [37] gra_spill_temp_232
	l32i	a12,a1,332	# [38] id:225 out_shift+0x0
	l32i	a14,a1,312	# [39] id:222 bias+0x0
	s32i	a14,a1,104	# [40] gra_spill_temp_238
	s32i.n	a12,a1,60	# [41] gra_spill_temp_227
	l16ui	a13,a1,288	# [42] id:224 stride_wd+0x0
	s32i.n	a13,a1,56	# [43] gra_spill_temp_226
	j	.Lt_10_8962	# [44]

.Lt_10_9218: # 0x1880
	// Height-loop increment: out_y++, base_y += stride_ht.
	l32i.n	a9,a1,48	# [0] gra_spill_temp_224
	l32i.n	a11,a1,44	# [1] gra_spill_temp_223
	l32i.n	a8,a1,52	# [2] gra_spill_temp_225
	l32i	a10,a1,92	# [3] gra_spill_temp_235
	addi.n	a8,a8,1	# [4]
	s32i.n	a8,a1,52	# [5] gra_spill_temp_225
	add.n	a11,a10,a11	# [6]
	sub	a8,a8,a9	# [7]
	sext	a10,a11,15	# [8]
	s32i	a10,a1,92	# [9] gra_spill_temp_235
	beqz	a8,.Lt_10_8450	# [10]

.Lt_10_8962: # 0x189b
	# Loop body line 1223, nesting depth: 1, estimated iterations: 100
	#1224 const int16_t base_y = (out_y * stride_ht) - pad_ht;
	#1225 for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
	l32i	a12,a1,68	# [0] gra_spill_temp_229
	beqz.n	a12,.Lt_10_9218	# [2]

.LBB6_esp_nn_depthwise_conv_s16_mult4: # 0x18a0
	// Clip the filter window vertically against the input:
	// a7 = min(filter_ht, input_ht - base_y), start row = max(-base_y, 0).
	l32i.n	a7,a1,36	# [0] gra_spill_temp_221
	movi.n	a11,0	# [1]
	l32i.n	a8,a1,40	# [2] gra_spill_temp_222
	l32i	a9,a1,92	# [3] gra_spill_temp_235
	movi.n	a13,0	# [4]
	l32i.n	a14,a1,32	# [5] gra_spill_temp_220
	s32i	a14,a1,160	# [6] gra_spill_temp_252
	s32i	a13,a1,72	# [7] gra_spill_temp_230
	neg	a10,a9	# [8]
	sub	a8,a8,a9	# [9]
	max	a10,a10,a11	# [10]
	s32i	a10,a1,100	# [11] gra_spill_temp_237
	min	a7,a7,a8	# [12]
	j	.Lt_10_9730	# [13]

.Lt_10_9986: # 0x18c5
	// Width-loop increment: out_x++, base_x += stride_wd.
	l32i	a13,a1,68	# [0] gra_spill_temp_229
	l32i.n	a15,a1,56	# [1] gra_spill_temp_226
	l32i	a12,a1,72	# [2] gra_spill_temp_230
	l32i	a14,a1,160	# [3] gra_spill_temp_252
	addi.n	a12,a12,1	# [4]
	s32i	a12,a1,72	# [5] gra_spill_temp_230
	add.n	a15,a14,a15	# [6]
	sext	a14,a15,15	# [7]
	s32i	a14,a1,160	# [8] gra_spill_temp_252
	beq	a12,a13,.Lt_10_9218	# [9]

.Lt_10_9730: # 0x18e0
	l32i	a8,a1,164	# [0] gra_spill_temp_253
	l32i	a9,a1,64	# [1] gra_spill_temp_228
	l32i.n	a10,a1,60	# [2] gra_spill_temp_227
	s32i	a10,a1,132	# [3] gra_spill_temp_245
	s32i	a9,a1,128	# [4] gra_spill_temp_244
	beqz.n	a8,.Lt_10_9986	# [5]		// channels == 0 -> next out_x

	// Clip the filter window horizontally against the input:
	// a6 = max(-base_x, 0), a5 = min(filter_wd, input_wd - base_x).
	movi.n	a8,0	# [0]
	l32i	a5,a1,160	# [1] gra_spill_temp_252
	movi.n	a12,0	# [2]
	movi.n	a13,0	# [3]
	movi.n	a14,0	# [4]
	s32i	a14,a1,84	# [5] gra_spill_temp_233
	s32i	a13,a1,88	# [6] gra_spill_temp_234
	s32i	a12,a1,176	# [7] gra_spill_temp_256
	neg	a6,a5	# [8]
	max	a6,a6,a8	# [9]
	sub	a5,a3,a5	# [10]
	min	a5,a4,a5	# [11]
	sub	a11,a5,a6	# [12]
	s32i	a11,a1,156	# [13] gra_spill_temp_251
	j	.Lt_10_10498	# [14]

.Lt_10_10754: # 0x1919
	// Channel-loop increment: next input channel, advance filter/out bases.
	l32i	a10,a1,164	# [0] gra_spill_temp_253
	l32i	a14,a1,76	# [1] gra_spill_temp_231
	l32i	a13,a1,84	# [2] gra_spill_temp_233
	l32i	a12,a1,80	# [3] gra_spill_temp_232
	l32i	a9,a1,176	# [4] gra_spill_temp_256
	l32i	a11,a1,88	# [5] gra_spill_temp_234
	addi.n	a9,a9,1	# [6]
	s32i	a9,a1,176	# [7] gra_spill_temp_256
	add.n	a11,a11,a12	# [8]
	add.n	a13,a13,a14	# [9]
	s32i	a13,a1,84	# [10] gra_spill_temp_233
	s32i	a11,a1,88	# [11] gra_spill_temp_234
	beq	a9,a10,.Lt_10_9986	# [12]

.Lt_10_10498: # 0x193d
	l32i	a15,a1,108	# [0] gra_spill_temp_239
	blti	a15,1,.Lt_10_10754	# [2]	// ch_mult < 4 -> skip vector loop

	l32i	a2,a1,84	# [0] gra_spill_temp_233
	l32i	a10,a1,104	# [1] gra_spill_temp_238
	l32i	a9,a1,88	# [2] gra_spill_temp_234
	movi.n	a8,0	# [3]
	s32i	a8,a1,120	# [4] gra_spill_temp_242
	add.n	a9,a9,a10	# [5]
	s32i	a9,a1,124	# [6] gra_spill_temp_243
	j	.Lt_10_11266	# [7]

.Lt_10_11522: # 0x1959
	// Extract 4 lanes from QACC via the scratch area.
	// NOTE(review): byte repack before ee.srcmb.s16.qacc follows the PIE
	// accumulator format — confirm against the ESP32-S3 TRM before touching
	// offsets.
	addmi	a12,a1,256	# [0]
	l32i	a14,a1,112	# [1] gra_spill_temp_240
	movi.n	a13,16	# [2]
	ee.st.qacc_l.l.128.ip	a14,16	# [3] id:234
	ee.st.qacc_l.h.32.ip	a14,-16	# [4] id:235
	ee.srcmb.s16.qacc	q5,a13,0	# [5]
	l16ui	a15,a1,10	# [6] qacc_scratch+10
	l8ui	a8,a1,15	# [7] qacc_scratch+15
	l8ui	a9,a1,5	# [8] qacc_scratch+5
	l8ui	a11,a1,16	# [9] qacc_scratch+16
	l8ui	a10,a1,6	# [10] qacc_scratch+6
	s8i	a10,a1,3	# [11] qacc_scratch+3
	s8i	a11,a1,7	# [12] qacc_scratch+7
	s8i	a9,a1,2	# [13] qacc_scratch+2
	l32i	a11,a1,104	# [14] gra_spill_temp_238
	s8i	a8,a1,6	# [15] qacc_scratch+6
	s16i	a15,a1,4	# [16] qacc_scratch+4
	ee.vld.l.64.ip	q0,a14,0	# [17] id:245
	s32i	a14,a1,112	# [18] gra_spill_temp_240
	ee.vzip.16	q0,q5	# [19]
	st.qr	q5,a12,-16	# [20] gra_spill_temp_260-256
	beqz.n	a11,.Lt_10_13570	# [21]	// skip_bias

	// add bias (unaligned 128-bit loads realigned via sar_byte + ee.src.q.qup)
	l32i	a13,a1,124	# [0] gra_spill_temp_243
	extui	a12,a13,0,4	# [2]
	ee.vld.128.ip	q7,a13,16	# [3] id:248
	ee.vld.128.ip	q1,a13,0	# [4] id:249
	wur.sar_byte	a12	# [5]
	ee.src.q.qup	q6,q7,q1	# [6]
	ee.vadds.s32	q0,q0,q6	# [7]

.Lt_10_13570: # 0x19ae
	#1287 q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
	l32i	a10,a1,128	# [0] gra_spill_temp_244
	l32i	a11,a1,132	# [1] gra_spill_temp_245
	call8	esp_nn_multiply_by_quantized_mult_ver1_esp32s3	# [2] esp_nn_multiply_by_quantized_mult_ver1_esp32s3

	// Add out_offset, clamp to [activation_min, activation_max], then move
	// the 4 s32 lanes to scalars and store them as 4 s8 output bytes.
	addi.n	a2,a2,4	# [0]
	l32i	a13,a1,96	# [1] gra_spill_temp_236
	l32i	a11,a1,128	# [2] gra_spill_temp_244
	l32i	a10,a1,132	# [3] gra_spill_temp_245
	addi	a8,a1,112	# [4]
	ld.qr	q1,a8,96	# [5] gra_spill_temp_258-112
	ld.qr	q2,a8,80	# [6] gra_spill_temp_257-112
	addi	a10,a10,16	# [7]
	addi	a11,a11,16	# [8]
	s32i	a11,a1,128	# [9] gra_spill_temp_244
	ee.vadds.s32	q0,q0,q2	# [10]
	s32i	a10,a1,132	# [11] gra_spill_temp_245
	ee.vmin.s32	q0,q0,q1	# [12]
	ld.qr	q1,a8,112	# [13] gra_spill_temp_259-112
	l32i	a8,a1,116	# [14] gra_spill_temp_241
	ee.vmax.s32	q0,q0,q1	# [15]
	ee.movi.32.a	q0,a14,2	# [16]
	ee.movi.32.a	q0,a15,1	# [17]
	ee.movi.32.a	q0,a9,0	# [18]
	add.n	a13,a8,a13	# [19]
	ee.movi.32.a	q0,a12,3	# [20]
	addi.n	a8,a8,4	# [21]
	s8i	a12,a13,3	# [22] id:254
	s32i	a8,a1,116	# [23] gra_spill_temp_241
	s8i	a9,a13,0	# [24] id:251
	s8i	a15,a13,1	# [25] id:252
	s8i	a14,a13,2	# [26] id:253
	l32i	a15,a1,108	# [27] gra_spill_temp_239
	l32i	a14,a1,120	# [28] gra_spill_temp_242
	l32i	a9,a1,124	# [29] gra_spill_temp_243
	addi.n	a14,a14,4	# [30]
	addi	a9,a9,16	# [31]
	s32i	a9,a1,124	# [32] gra_spill_temp_243
	s32i	a14,a1,120	# [33] gra_spill_temp_242
	bge	a14,a15,.Lt_10_10754	# [34]	// all ch_mult groups of 4 done

.Lt_10_11266: # 0x1a1c
	# Loop body line 1230, nesting depth: 4, estimated iterations: 100
	ee.zero.qacc	# [0]
	l32i	a9,a1,100	# [1] gra_spill_temp_237
	s32i	a9,a1,172	# [2] gra_spill_temp_255
	bge	a9,a7,.Lt_10_11522	# [3]	// window fully clipped -> accumulator stays zero

	mull	a15,a9,a4	# [0]
	l32i	a14,a1,92	# [1] gra_spill_temp_235
	add.n	a11,a15,a5	# [2]
	add.n	a14,a14,a9	# [3]
	mull	a14,a3,a14	# [4]
	s32i	a11,a1,168	# [5] gra_spill_temp_254
	bge	a6,a5,.Lt_10_12290	# [6]	// row fully clipped horizontally

.LBB18_esp_nn_depthwise_conv_s16_mult4: # 0x1a3b
	// Compute input/filter pointers for this filter row, then run the
	// zero-overhead inner loop: broadcast one input sample (ee.vldbc.16)
	// against 4 filter values per tap.
	l32i	a10,a1,176	# [0] gra_spill_temp_256
	l32i	a11,a1,164	# [1] gra_spill_temp_253
	l32i	a12,a1,160	# [2] gra_spill_temp_252
	add.n	a9,a15,a6	# [3]
	l32i	a8,a1,140	# [4] gra_spill_temp_247
	addmi	a13,a1,256	# [5]
	ld.qr	q1,a13,-16	# [6] gra_spill_temp_260-256
	mull	a8,a8,a9	# [7]
	add.n	a12,a12,a6	# [8]
	l32i	a9,a1,152	# [9] gra_spill_temp_250
	add.n	a12,a14,a12	# [10]
	mull	a11,a11,a12	# [11]
	add.n	a8,a2,a8	# [12]
	l32i	a12,a1,148	# [13] gra_spill_temp_249
	addx2	a8,a8,a9	# [14]
	add.n	a10,a10,a11	# [15]
	l32i	a11,a1,136	# [16] gra_spill_temp_246
	l32i	a9,a1,156	# [17] gra_spill_temp_251
	addx2	a10,a10,a11	# [18]
	l32i	a11,a1,144	# [19] gra_spill_temp_248
	loopgtz	a9,.LBB45_esp_nn_depthwise_conv_s16_mult4 # [20]
	mov.n	a9,a8	# [0*II+0]
	ee.vldbc.16	q0,a10	# [0*II+1] id:232
	add.n	a10,a10,a12	# [0*II+2]
	ee.vld.l.64.ip	q1,a9,0	# [0*II+3] id:231
	add.n	a8,a8,a11	# [0*II+4]
	ee.vmulas.s16.qacc	q0,q1	# [0*II+5]

.LBB45_esp_nn_depthwise_conv_s16_mult4: # 0x1a84
	addmi	a10,a1,256	# [0]
	st.qr	q1,a10,-16	# [1] gra_spill_temp_260-256

.Lt_10_12290: # 0x1a8a
	// Next filter row; fall back into the row loop while rows remain.
	add.n	a14,a14,a3	# [0]
	add.n	a15,a15,a4	# [1]
	l32i	a11,a1,172	# [2] gra_spill_temp_255
	l32i	a12,a1,168	# [3] gra_spill_temp_254
	addi.n	a11,a11,1	# [4]
	add.n	a12,a12,a4	# [5]
	s32i	a12,a1,168	# [6] gra_spill_temp_254
	s32i	a11,a1,172	# [7] gra_spill_temp_255
	sub	a11,a7,a11	# [8]
	beqz	a11,.Lt_10_11522	# [9]
	blt	a6,a5,.LBB18_esp_nn_depthwise_conv_s16_mult4 # [0]
	j	.Lt_10_12290	# [0]

.Lt_10_8450: # 0x1aaa
	retw.n	# [0]

	.size	esp_nn_depthwise_conv_s16_mult4_esp32s3, .
- esp_nn_depthwise_conv_s16_mult4_esp32s3
================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

    .text
    .literal_position

// -----------------------------------------------------------------------------
// esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3
//
// Depthwise convolution over 16-bit (widened s8) input data, specialised for a
// 3x3 filter window (the filter-height clamp below uses the constant 3), and
// producing the channel-multiplier dimension 8 output channels per iteration
// (inner loop runs "for ch_mult_idx < ch_mult - 7; += 8" — see the #942
// pseudo-source line). Only full groups of 8 are produced here; presumably the
// caller guarantees ch_mult is a multiple of 8 — confirm against the C
// dispatcher in esp_nn_depthwise_conv_s8_esp32s3.c.
//
// This is compiler-generated, hand-kept assembly:
//   * "# [n]" trailing comments are instruction-slot/scheduling annotations.
//   * "# NNN ..." comments are the interleaved original C source lines.
//   * "gra_spill_temp_*" names map stack-frame offsets (listed below) to the
//     register-allocator spill slots used throughout.
// Per-group accumulation is done in the QACC accumulator (ee.vmulas.s16.qacc),
// then spilled to the on-stack qacc_scratch area and byte-repacked to extract
// the eight 32-bit sums, requantized via
// esp_nn_multiply_by_quantized_mult_ver1_esp32s3, offset/clamped and stored as
// eight s8 outputs.
// -----------------------------------------------------------------------------

# Program Unit: esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3
    .type esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3, @function
    .align 4
    .global esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3

esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3: # 0x11b3
        // Stack-frame layout: scratch area for QACC extraction at a1+0,
        // register-spill slots from a1+48 upward.
        # qacc_scratch = 0
        # gra_spill_temp_142 = 48
        # gra_spill_temp_143 = 52
        # gra_spill_temp_144 = 56
        # gra_spill_temp_145 = 60
        # gra_spill_temp_146 = 64
        # gra_spill_temp_147 = 68
        # gra_spill_temp_148 = 72
        # gra_spill_temp_149 = 76
        # gra_spill_temp_150 = 80
        # gra_spill_temp_151 = 84
        # gra_spill_temp_152 = 88
        # gra_spill_temp_153 = 92
        # gra_spill_temp_154 = 96
        # gra_spill_temp_155 = 100
        # gra_spill_temp_156 = 104
        # gra_spill_temp_157 = 108
        # gra_spill_temp_158 = 112
        # gra_spill_temp_159 = 116
        # gra_spill_temp_160 = 120
        # gra_spill_temp_161 = 124
        # gra_spill_temp_162 = 128
        # gra_spill_temp_163 = 132
        # gra_spill_temp_164 = 136
        # gra_spill_temp_165 = 140
        # gra_spill_temp_166 = 144
        # gra_spill_temp_167 = 148
        # gra_spill_temp_168 = 152
        # gra_spill_temp_169 = 156
        # gra_spill_temp_170 = 160
        # gra_spill_temp_171 = 164
        # gra_spill_temp_172 = 168
        # gra_spill_temp_173 = 172
        # gra_spill_temp_174 = 176
        # gra_spill_temp_175 = 180
        # gra_spill_temp_176 = 184
        # gra_spill_temp_177 = 188
        # gra_spill_temp_178 = 192
        # gra_spill_temp_179 = 208
        # gra_spill_temp_180 = 224
        # gra_spill_temp_181 = 240
        # gra_spill_temp_182 = 256

    // registers:
    // a2: const int16_t *input_data
    // a3: const uint16_t input_wd
    // a4: const uint16_t input_ht
    // a5: const uint16_t channels
    // a6: const uint16_t pad_wd
    // a7: const uint16_t pad_ht
    // (remaining arguments are loaded from the caller's stack, at a1+304..)
    // const uint16_t stride_wd
    // const uint16_t stride_ht
    // const uint16_t ch_mult
    // const int16_t *filter_data
    // const int32_t *bias
    // int8_t *out_data
    // const uint16_t out_wd
    // const uint16_t out_ht
    // const int32_t out_offset
    // const int32_t *out_shift
    // const int32_t *out_mult
    // const int32_t activation_min
    // const int32_t activation_max

    entry a1,304 #

    // Prologue: spill register args, broadcast the scalar quantization
    // parameters (activation_min/max, out_offset) into q-registers and park
    // them in spill slots for the store path.
    s32i a2,a1,116 # [0] gra_spill_temp_159
    s32i a3,a1,120 # [1] gra_spill_temp_160
    s32i a5,a1,144 # [2] gra_spill_temp_166
    s32i.n a6,a1,60 # [3] gra_spill_temp_145
    addmi a9,a1,256 # [4]
    addi a12,a1,112 # [5]
    addmi a10,a1,256 # [6]
    addmi a11,a1,256 # [7]
    addmi a13,a1,256 # [8]
    // height loop
    l16ui a8,a1,332 # [9] id:261 out_ht+0x0
    l32i a14,a1,324 # [10] id:257 out_data+0x0
    s32i a14,a1,176 # [11] gra_spill_temp_174
    s32i a8,a1,68 # [12] gra_spill_temp_147
    addi a13,a13,80 # [13]
    addi a11,a11,96 # [14]
    addi a10,a10,92 # [15]
    ee.vldbc.32 q0,a10 # [16] id:260 activation_min
    ee.vldbc.32 q1,a11 # [17] id:259 activation_max
    ee.vldbc.32 q2,a13 # [18] id:258 out_offset
    st.qr q2,a12,96 # [19] gra_spill_temp_179-112
    st.qr q1,a12,112 # [20] gra_spill_temp_180-112
    st.qr q0,a9,-16 # [21] gra_spill_temp_181-256
    beqz.n a8,.Lt_8_8194 # [22]           // out_ht == 0 -> nothing to do

.LBB3_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x11f9
    // One-time loop-invariant setup: strides in elements/bytes, negated pads,
    // ch_mult-7 as the unrolled-loop bound, etc.
    s32i a1,a1,180 # [0] gra_spill_temp_175
    mul16u a6,a3,a5 # [1]
    s32i a7,a1,76 # [2] gra_spill_temp_149
    l32i a9,a1,316 # [3] id:264 filter_data+0x0
    l32i a15,a1,320 # [4] id:262 bias+0x0
    l16ui a10,a1,312 # [5] id:263 ch_mult+0x0
    slli a11,a5,1 # [6]
    l16ui a12,a1,308 # [7] id:268 stride_ht+0x0
    l32i a13,a1,344 # [8] id:267 out_mult+0x0
    l32i a14,a1,340 # [9] id:266 out_shift+0x0
    s32i a14,a1,88 # [10] gra_spill_temp_152
    s32i a13,a1,92 # [11] gra_spill_temp_153
    s32i a12,a1,64 # [12] gra_spill_temp_146
    s32i a11,a1,124 # [13] gra_spill_temp_161
    s32i a10,a1,108 # [14] gra_spill_temp_157
    s32i a15,a1,160 # [15] gra_spill_temp_170
    s32i a9,a1,128 # [16] gra_spill_temp_162
    neg a7,a7 # [17]
    slli a6,a6,1 # [18]
    s32i a7,a1,136 # [19] gra_spill_temp_164
    movi.n a9,0 # [20]
    extui a15,a15,0,4 # [21]              // bias alignment (low 4 bits) for ee.src.q.qup
    s32i a15,a1,152 # [22] gra_spill_temp_168
    s32i a9,a1,72 # [23] gra_spill_temp_148
    sub a7,a4,a7 # [24]
    l32i.n a9,a1,60 # [25] gra_spill_temp_145
    s32i a7,a1,80 # [26] gra_spill_temp_150
    l16ui a4,a1,328 # [27] id:269 out_wd+0x0
    s32i a4,a1,96 # [28] gra_spill_temp_154
    l16ui a7,a1,304 # [29] id:265 stride_wd+0x0
    s32i a7,a1,84 # [30] gra_spill_temp_151
    mul16u a4,a5,a10 # [31]
    neg a9,a9 # [32]
    s32i.n a9,a1,52 # [33] gra_spill_temp_143
    sub a8,a3,a9 # [34]
    addi a10,a10,-7 # [35]                // ch_mult - 7: bound of the 8-wide loop
    s32i a10,a1,164 # [36] gra_spill_temp_171
    s32i.n a8,a1,56 # [37] gra_spill_temp_144
    addx2 a7,a4,a4 # [38]
    slli a7,a7,1 # [39]
    j .Lt_8_8706 # [40]

.Lt_8_8962: # 0x1270
    // height-loop tail: advance out_y and the y-dependent running terms
    # Part of loop body line 933, head labeled .Lt_8_8706
    l32i a10,a1,68 # [0] gra_spill_temp_147
    l32i a14,a1,76 # [1] gra_spill_temp_149
    l32i a13,a1,136 # [2] gra_spill_temp_164
    l32i a12,a1,64 # [3] gra_spill_temp_146
    l32i a9,a1,72 # [4] gra_spill_temp_148
    l32i a11,a1,80 # [5] gra_spill_temp_150
    addi.n a9,a9,1 # [6]
    s32i a9,a1,72 # [7] gra_spill_temp_148
    sub a11,a11,a12 # [8]
    add.n a13,a13,a12 # [9]
    sub a14,a14,a12 # [10]
    s32i a14,a1,76 # [11] gra_spill_temp_149
    s32i a13,a1,136 # [12] gra_spill_temp_164
    s32i a11,a1,80 # [13] gra_spill_temp_150
    sub a9,a9,a10 # [14]
    beqz a9,.Lt_8_8194 # [15]

.Lt_8_8706: # 0x129e
    # Loop body line 933, nesting depth: 1, estimated iterations: 100
    # 934  const int32_t base_y = (out_y * stride_ht) - pad_ht;
    # 935  for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
    l32i a15,a1,96 # [0] gra_spill_temp_154
    beqz.n a15,.Lt_8_8962 # [2]           // out_wd == 0 -> next row

.LBB6_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x12a3
    // width-loop preamble: clamp the 3x3 filter window against the top/bottom
    // image edges (movi 3 below is the fixed filter height).
    # Part of loop body line 933, head labeled .Lt_8_8706
    l32i.n a3,a1,56 # [0] gra_spill_temp_144
    l32i a8,a1,80 # [1] gra_spill_temp_150
    movi.n a10,0 # [2]
    l32i a9,a1,76 # [3] gra_spill_temp_149
    movi.n a11,0 # [4]
    l32i.n a12,a1,52 # [5] gra_spill_temp_143
    l32i.n a13,a1,60 # [6] gra_spill_temp_145
    s32i a13,a1,104 # [7] gra_spill_temp_156
    s32i a12,a1,140 # [8] gra_spill_temp_165
    s32i a11,a1,100 # [9] gra_spill_temp_155
    max a9,a9,a10 # [10]                  // filter_y_start = max(0, -base_y)
    movi.n a10,3 # [11]
    s32i a9,a1,172 # [12] gra_spill_temp_173
    min a8,a8,a10 # [13]                  // filter_y_end = min(3, input_ht - base_y)
    s32i a8,a1,156 # [14] gra_spill_temp_169
    sub a8,a8,a9 # [15]
    s32i a8,a1,132 # [16] gra_spill_temp_163
    j .Lt_8_9474 # [17]

.Lt_8_9730: # 0x12d3
    // width-loop tail: advance out_x and the x-dependent running terms
    # Part of loop body line 935, head labeled .Lt_8_9474
    l32i a15,a1,96 # [0] gra_spill_temp_154
    l32i a10,a1,104 # [1] gra_spill_temp_156
    l32i a9,a1,140 # [2] gra_spill_temp_165
    l32i a8,a1,84 # [3] gra_spill_temp_151
    l32i a14,a1,100 # [4] gra_spill_temp_155
    sub a3,a3,a8 # [5]
    addi.n a14,a14,1 # [6]
    s32i a14,a1,100 # [7] gra_spill_temp_155
    add.n a9,a9,a8 # [8]
    sub a10,a10,a8 # [9]
    s32i a10,a1,104 # [10] gra_spill_temp_156
    s32i a9,a1,140 # [11] gra_spill_temp_165
    beq a14,a15,.Lt_8_8962 # [12]

.Lt_8_9474: # 0x12f8
    # 936  const int32_t base_x = (out_x * stride_wd) - pad_wd;
    # 937  const int32_t *out_mult_ptr = out_mult;
    # 938  const int32_t *out_shift_ptr = out_shift;
    l32i a2,a1,88 # [0] gra_spill_temp_152
    l32i a10,a1,92 # [1] gra_spill_temp_153
    # 939  uint32_t bias_ptr = (uint32_t) (bias);
    l32i a12,a1,160 # [2] gra_spill_temp_170
    # 940
    # 941  for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
    l32i a11,a1,144 # [3] gra_spill_temp_166
    s32i a12,a1,168 # [4] gra_spill_temp_172
    beqz.n a11,.Lt_8_9730 # [5]

.LBB9_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x1309
    // channel-loop preamble: clamp the window against the left image edge
    # Part of loop body line 935, head labeled .Lt_8_9474
    movi.n a8,0 # [0]
    l32i a5,a1,104 # [1] gra_spill_temp_156
    movi.n a13,0 # [2]
    movi.n a9,0 # [3]
    s32i a9,a1,112 # [4] gra_spill_temp_158
    s32i a13,a1,148 # [5] gra_spill_temp_167
    max a5,a5,a8 # [6]
    j .Lt_8_10242 # [7]

.Lt_8_10498: # 0x131e
    // channel-loop tail: next ch_idx, advance output-channel base index
    # Part of loop body line 941, head labeled .Lt_8_10242
    l32i a12,a1,144 # [0] gra_spill_temp_166
    l32i a14,a1,108 # [1] gra_spill_temp_157
    l32i a11,a1,148 # [2] gra_spill_temp_167
    l32i a13,a1,112 # [3] gra_spill_temp_158
    addi.n a11,a11,1 # [4]
    s32i a11,a1,148 # [5] gra_spill_temp_167
    add.n a13,a13,a14 # [6]
    s32i a13,a1,112 # [7] gra_spill_temp_158
    beq a11,a12,.Lt_8_9730 # [8]

.Lt_8_10242: # 0x1337
    # 942  for (int ch_mult_idx = 0; ch_mult_idx < ch_mult - 7; ch_mult_idx += 8) {
    l32i a15,a1,164 # [0] gra_spill_temp_171
    blti a15,1,.Lt_8_10498 # [2]          // ch_mult < 8 -> nothing for this kernel
    movi.n a8,0 # [0]
    l32i a9,a1,112 # [1] gra_spill_temp_158
    s32i a9,a1,188 # [2] gra_spill_temp_177
    s32i a8,a1,184 # [3] gra_spill_temp_176
    j .Lt_8_11010 # [4]

.LBB23_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x134b
    // bias path: load 8 consecutive int32 bias values through the unaligned
    // load + ee.src.q.qup funnel and add them to the two 4x32 accumulators
    // (q0 = lanes 0..3, q1 = lanes 4..7).
    s32i.n a10,a1,48 # [0] gra_spill_temp_142
    addi a11,a1,112 # [1]
    l32i a13,a1,152 # [2] gra_spill_temp_168
    l32i a12,a1,168 # [3] gra_spill_temp_172
    wur.sar_byte a13 # [4]
    ee.vld.128.ip q4,a12,16 # [5] id:307
    ee.vld.128.ip q7,a12,16 # [6] id:308
    ee.vld.128.ip q5,a12,0 # [7] id:309
    s32i a12,a1,168 # [8] gra_spill_temp_172
    ee.src.q.qup q6,q4,q7 # [9]
    ee.vadds.s32 q0,q0,q6 # [10]
    ee.src.q.qup q3,q4,q5 # [11]
    ee.vadds.s32 q1,q1,q3 # [12]
    st.qr q1,a11,80 # [13] gra_spill_temp_178-112

.Lt_8_13314: # 0x1374
    // requantize both halves (4 channels per call)
    #1025  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
    l32i.n a10,a1,48 # [0] gra_spill_temp_142
    mov.n a11,a2 # [1]
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    #1026  out_mult_ptr += 4;
    #1027  out_shift_ptr += 4;
    #1028
    #1029  q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);
    l32i.n a10,a1,48 # [0] gra_spill_temp_142
    addmi a12,a1,256 # [1]
    addi a11,a1,112 # [2]
    st.qr q0,a12,0 # [3] gra_spill_temp_182-256
    ld.qr q0,a11,80 # [4] gra_spill_temp_178-112
    addi a10,a10,16 # [5]
    addi a11,a2,16 # [6]
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    # Part of loop body line 942, head labeled .Lt_8_11010
    #1030  out_mult_ptr += 4;
    #1031  out_shift_ptr += 4;
    // post-requant: add out_offset, clamp to [activation_min, activation_max],
    // narrow 2x(4x32) -> 8x8 via vunzip and store 8 output bytes.
    addi a2,a2,32 # [0]
    l32i a14,a1,164 # [1] gra_spill_temp_171
    l32i a8,a1,176 # [2] gra_spill_temp_174
    l32i a15,a1,188 # [3] gra_spill_temp_177
    l32i a13,a1,184 # [4] gra_spill_temp_176
    l32i.n a10,a1,48 # [5] gra_spill_temp_142
    addmi a11,a1,256 # [6]
    addi a12,a1,112 # [7]
    ld.qr q3,a12,112 # [8] gra_spill_temp_180-112
    ld.qr q1,a12,96 # [9] gra_spill_temp_179-112
    ld.qr q2,a11,0 # [10] gra_spill_temp_182-256
    addi a10,a10,32 # [11]
    addi.n a13,a13,8 # [12]
    addi.n a15,a15,8 # [13]
    s32i a15,a1,188 # [14] gra_spill_temp_177
    ee.vadds.s32 q2,q2,q1 # [15]
    s32i a13,a1,184 # [16] gra_spill_temp_176
    ee.vadds.s32 q1,q0,q1 # [17]
    ee.vmin.s32 q0,q2,q3 # [18]
    ld.qr q2,a11,-16 # [19] gra_spill_temp_181-256
    ee.vmin.s32 q1,q1,q3 # [20]
    ee.vmax.s32 q1,q1,q2 # [21]
    ee.vmax.s32 q0,q0,q2 # [22]
    ee.vunzip.16 q0,q1 # [23]
    ee.vunzip.8 q0,q1 # [24]
    ee.vst.l.64.ip q0,a8,8 # [25] id:312
    s32i a8,a1,176 # [26] gra_spill_temp_174
    bge a13,a14,.Lt_8_10498 # [27]

.Lt_8_11010: # 0x13e3
    // accumulation for the next group of 8: zero QACC, then MAC over the
    // edge-clamped 3x3 window (rows handled by the loopgtz body below).
    # Loop body line 942, nesting depth: 4, estimated iterations: 100
    l32i a14,a1,156 # [0] gra_spill_temp_169
    l32i a13,a1,172 # [1] gra_spill_temp_173
    ee.zero.qacc # [2]
    bge a13,a14,.Lt_8_11266 # [3]

.LBB15_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x13ef
    // compute input/filter base pointers for this (out_y, out_x, ch_idx, group)
    # Part of loop body line 942, head labeled .Lt_8_11010
    l32i a12,a1,124 # [0] gra_spill_temp_161
    l32i a8,a1,140 # [1] gra_spill_temp_165
    l32i a11,a1,120 # [2] gra_spill_temp_160
    l32i a14,a1,188 # [3] gra_spill_temp_177
    l32i a9,a1,136 # [4] gra_spill_temp_164
    mull a15,a4,a13 # [5]
    add.n a9,a9,a13 # [6]
    addx2 a15,a15,a15 # [7]
    l32i a13,a1,148 # [8] gra_spill_temp_167
    add.n a14,a14,a15 # [9]
    mull a9,a9,a11 # [10]
    l32i a15,a1,144 # [11] gra_spill_temp_166
    add.n a8,a8,a9 # [12]
    mull a15,a15,a8 # [13]
    l32i a8,a1,128 # [14] gra_spill_temp_162
    add.n a13,a13,a15 # [15]
    l32i a15,a1,116 # [16] gra_spill_temp_159
    addx2 a14,a14,a8 # [17]
    addx2 a13,a13,a15 # [18]
    add.n a11,a12,a13 # [19]
    l32i a15,a1,132 # [20] gra_spill_temp_163
    add.n a12,a12,a11 # [21]
    loopgtz a15,.LBB34_esp_nn_depthwise_conv_s16_mult8_3x3 # [22]  // zero-overhead row loop

.Lt_8_11778: # 0x142e
    // one filter row: up to three taps, each broadcasting the input sample
    // (ee.vldbc.16) against 8 filter values (ee.vld.128) into QACC.
    mov.n a15,a14 # [0]
    mov.n a9,a14 # [1]
    bnez.n a5,.Lt_8_12034 # [2]           // left-edge clip: skip first tap
    ee.vldbc.16 q3,a13 # [0] id:271
    mov.n a9,a14 # [1]
    ee.vld.128.ip q4,a9,0 # [2] id:272
    ee.vmulas.s16.qacc q3,q4 # [4]
.Lt_8_12034: # 0x143f
    ee.vldbc.16 q5,a11 # [0] id:274
    addx2 a9,a4,a9 # [1]
    ee.vld.128.ip q6,a9,0 # [2] id:275
    add.n a13,a13,a6 # [3]
    ee.vmulas.s16.qacc q5,q6 # [4]
    blti a3,3,.Lt_8_12546 # [5]           // right-edge clip: skip third tap
    ee.vldbc.16 q7,a12 # [0] id:277
    addx2 a14,a4,a9 # [1]
    ee.vld.128.ip q0,a14,0 # [2] id:278
    ee.vmulas.s16.qacc q7,q0 # [4]
.Lt_8_12546: # 0x145c
    # Part of loop body line 953, head labeled .Lt_8_11778
    add.n a11,a11,a6 # [0]
    add.n a12,a12,a6 # [1]
    add.n a14,a7,a15 # [2]
.LBB34_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x1464

.Lt_8_11266: # 0x1464
    // Spill QACC (low then high half) to the on-stack scratch area and
    // byte-repack it so the packed accumulator lanes line up as contiguous
    // values, then shift/saturate (ee.srcmb.s16.qacc) and reload as 8x16 ->
    // zipped to 2x(4x32) in q0/q1. Lane layout repack follows the
    // EE.ST.QACC_*/EE.SRCMB.S16.QACC packed-accumulator format.
    l32i a8,a1,180 # [0] gra_spill_temp_175
    ee.st.qacc_l.l.128.ip a8,16 # [2] id:280
    ee.st.qacc_l.h.32.ip a8,0 # [3] id:281
    l16ui a9,a1,10 # [4] qacc_scratch+10
    l8ui a11,a1,15 # [5] qacc_scratch+15
    l8ui a12,a1,5 # [6] qacc_scratch+5
    l8ui a13,a1,6 # [7] qacc_scratch+6
    l8ui a14,a1,16 # [8] qacc_scratch+16
    s8i a14,a1,7 # [9] qacc_scratch+7
    s8i a13,a1,3 # [10] qacc_scratch+3
    s8i a12,a1,2 # [11] qacc_scratch+2
    s8i a11,a1,6 # [12] qacc_scratch+6
    s16i a9,a1,4 # [13] qacc_scratch+4
    ee.st.qacc_h.l.128.ip a8,16 # [14] id:291
    ee.st.qacc_h.h.32.ip a8,-32 # [15] id:292
    l16ui a9,a1,16 # [16] qacc_scratch+16
    l8ui a15,a1,32 # [17] qacc_scratch+32
    l8ui a12,a1,22 # [18] qacc_scratch+22
    l8ui a11,a1,21 # [19] qacc_scratch+21
    l8ui a14,a1,31 # [20] qacc_scratch+31
    l16ui a13,a1,26 # [21] qacc_scratch+26
    s16i a13,a1,12 # [22] qacc_scratch+12
    s8i a14,a1,14 # [23] qacc_scratch+14
    s8i a11,a1,10 # [24] qacc_scratch+10
    s8i a12,a1,11 # [25] qacc_scratch+11
    s8i a15,a1,15 # [26] qacc_scratch+15
    s16i a9,a1,8 # [27] qacc_scratch+8
    l32i a15,a1,160 # [28] gra_spill_temp_170
    movi.n a9,16 # [29]
    ee.srcmb.s16.qacc q1,a9,0 # [30]
    ee.vld.128.ip q0,a8,0 # [31] id:304
    s32i a8,a1,180 # [32] gra_spill_temp_175
    ee.vzip.16 q0,q1 # [33]
    bnez.n a15,.LBB23_esp_nn_depthwise_conv_s16_mult8_3x3 # [34]  // bias != NULL
    // no-bias path: stash the high half and go straight to requantization
    s32i.n a10,a1,48 # [0] gra_spill_temp_142
    addi a15,a1,112 # [1]
    st.qr q1,a15,80 # [2] gra_spill_temp_178-112
    j .Lt_8_13314 # [3]

.Lt_8_8194: # 0x14d3
    retw.n # [0]

    .size esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3, . - esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3

================================================ FILE: src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
    .text
    .literal_position

// -----------------------------------------------------------------------------
// esp_nn_depthwise_conv_s16_mult8_esp32s3
//
// Depthwise convolution over 16-bit (widened s8) input data for an arbitrary
// filter_wd x filter_ht window, producing the channel-multiplier dimension 8
// output channels per iteration (inner loop runs
// "for ch_mult_idx < ch_mult - 7; += 8" — see the #1091 pseudo-source line).
// Only full groups of 8 are produced here; presumably the caller guarantees
// ch_mult is a multiple of 8 — confirm against the C dispatcher in
// esp_nn_depthwise_conv_s8_esp32s3.c.
//
// Same conventions as the 3x3-specialised sibling in this directory:
//   * "# [n]" trailing comments are instruction-slot/scheduling annotations.
//   * "# NNN ..." comments are the interleaved original C source lines.
//   * "gra_spill_temp_*" names map stack-frame offsets (listed below) to
//     register-allocator spill slots.
// Accumulation happens in QACC (ee.vmulas.s16.qacc), is spilled to the
// on-stack qacc_scratch area and byte-repacked, requantized via
// esp_nn_multiply_by_quantized_mult_ver1_esp32s3, offset/clamped and stored
// as eight s8 outputs per group.
// -----------------------------------------------------------------------------

# Program Unit: esp_nn_depthwise_conv_s16_mult8_esp32s3
    .type esp_nn_depthwise_conv_s16_mult8_esp32s3, @function
    .align 4
    .global esp_nn_depthwise_conv_s16_mult8_esp32s3

esp_nn_depthwise_conv_s16_mult8_esp32s3: # 0x14d7
        // Stack-frame layout: QACC extraction scratch at a1+0, spill slots above.
        # qacc_scratch = 0
        # gra_spill_temp_183 = 48
        # gra_spill_temp_184 = 52
        # gra_spill_temp_185 = 56
        # gra_spill_temp_186 = 60
        # gra_spill_temp_187 = 64
        # gra_spill_temp_188 = 68
        # gra_spill_temp_189 = 72
        # gra_spill_temp_190 = 76
        # gra_spill_temp_191 = 80
        # gra_spill_temp_192 = 84
        # gra_spill_temp_193 = 88
        # gra_spill_temp_194 = 92
        # gra_spill_temp_195 = 96
        # gra_spill_temp_196 = 100
        # gra_spill_temp_197 = 104
        # gra_spill_temp_198 = 108
        # gra_spill_temp_199 = 112
        # gra_spill_temp_200 = 116
        # gra_spill_temp_201 = 120
        # gra_spill_temp_202 = 124
        # gra_spill_temp_203 = 128
        # gra_spill_temp_204 = 132
        # gra_spill_temp_205 = 136
        # gra_spill_temp_206 = 140
        # gra_spill_temp_207 = 144
        # gra_spill_temp_208 = 148
        # gra_spill_temp_209 = 152
        # gra_spill_temp_210 = 156
        # gra_spill_temp_211 = 160
        # gra_spill_temp_212 = 164
        # gra_spill_temp_213 = 168
        # gra_spill_temp_214 = 172
        # gra_spill_temp_215 = 176
        # gra_spill_temp_216 = 180
        # gra_spill_temp_217 = 184
        # gra_spill_temp_218 = 192
        # gra_spill_temp_219 = 208

    // registers:
    // a2: const int16_t *input_data
    // a3: const uint16_t input_wd
    // a4: const uint16_t input_ht
    // a5: const uint16_t channels
    // a6: const uint16_t pad_wd
    // a7: const uint16_t pad_ht
    // on stack:
    // const uint16_t stride_wd
    // const uint16_t stride_ht
    // const uint16_t ch_mult
    // const int16_t *filter_data
    // const uint16_t filter_wd
    // const uint16_t filter_ht
    // const int32_t *bias
    // int8_t *out_data
    // const uint16_t out_wd
    // const uint16_t out_ht
    // const int32_t out_offset
    // const int32_t *out_shift
    // const int32_t *out_mult
    // const int32_t activation_min
    // const int32_t activation_max

    entry a1,256 #

    // Prologue: spill register args; bail out immediately when out_ht == 0.
    s32i a2,a1,144 # [0] gra_spill_temp_207
    s32i.n a4,a1,56 # [1] gra_spill_temp_185
    s32i a5,a1,172 # [2] gra_spill_temp_214
    l32i a9,a1,284 # [3] id:241 out_data+0x0
    l16ui a8,a1,292 # [4] id:242 out_ht+0x0
    s32i a8,a1,64 # [5] gra_spill_temp_187
    s32i a9,a1,124 # [6] gra_spill_temp_202
    beqz.n a8,.Lt_9_8450 # [7]
    // One-time loop-invariant setup: fetch the stack arguments, precompute
    // strides, negated pads (sign-extended to 16 bits) and ch_mult-7 as the
    // bound of the 8-wide inner loop.
    s32i a1,a1,128 # [0] gra_spill_temp_203
    neg a13,a7 # [1]
    movi.n a4,0 # [2]
    neg a12,a6 # [3]
    l32i a9,a1,280 # [4] id:243 bias+0x0
    slli a11,a5,1 # [5]
    l16ui a10,a1,264 # [6] id:244 ch_mult+0x0
    l32i a14,a1,268 # [7] id:245 filter_data+0x0
    s32i a14,a1,160 # [8] gra_spill_temp_211
    s32i a10,a1,92 # [9] gra_spill_temp_194
    s32i a11,a1,156 # [10] gra_spill_temp_210
    s32i a9,a1,112 # [11] gra_spill_temp_199
    sext a12,a12,15 # [12]
    s32i a4,a1,68 # [13] gra_spill_temp_188
    sext a13,a13,15 # [14]
    l16ui a4,a1,272 # [15] id:246 filter_wd+0x0
    s32i a13,a1,100 # [16] gra_spill_temp_196
    s32i.n a12,a1,48 # [17] gra_spill_temp_183
    mul16u a8,a5,a10 # [18]
    extui a9,a9,0,4 # [19]                // bias alignment (low 4 bits) for ee.src.q.qup
    l32i a11,a1,304 # [20] id:249 out_mult+0x0
    s32i a11,a1,80 # [21] gra_spill_temp_191
    s32i a9,a1,104 # [22] gra_spill_temp_197
    s32i a8,a1,148 # [23] gra_spill_temp_208
    addi a10,a10,-7 # [24]                // ch_mult - 7: bound of the 8-wide loop
    l32i a12,a1,300 # [25] id:248 out_shift+0x0
    l16ui a13,a1,256 # [26] id:247 stride_wd+0x0
    s32i a13,a1,72 # [27] gra_spill_temp_189
    s32i a12,a1,76 # [28] gra_spill_temp_190
    s32i a10,a1,116 # [29] gra_spill_temp_200
    slli a8,a8,1 # [30]
    l16ui a9,a1,260 # [31] id:251 stride_ht+0x0
    s32i.n a9,a1,60 # [32] gra_spill_temp_186
    s32i a8,a1,152 # [33] gra_spill_temp_209
    l16ui a10,a1,276 # [34] id:250 filter_ht+0x0
    s32i.n a10,a1,52 # [35] gra_spill_temp_184
    l16ui a8,a1,288 # [36] id:252 out_wd+0x0
    s32i a8,a1,84 # [37] gra_spill_temp_192
    j .Lt_9_8962 # [38]

.Lt_9_9218: # 0x1561
    // height-loop tail: advance out_y and the y-dependent base offset
    # Part of loop body line 1083, head labeled .Lt_9_8962
    l32i a15,a1,64 # [0] gra_spill_temp_187
    l32i.n a9,a1,60 # [1] gra_spill_temp_186
    l32i a14,a1,68 # [2] gra_spill_temp_188
    l32i a8,a1,100 # [3] gra_spill_temp_196
    addi.n a14,a14,1 # [4]
    s32i a14,a1,68 # [5] gra_spill_temp_188
    add.n a9,a8,a9 # [6]
    sub a14,a14,a15 # [7]
    sext a8,a9,15 # [8]
    s32i a8,a1,100 # [9] gra_spill_temp_196
    beqz a14,.Lt_9_8450 # [10]

.Lt_9_8962: # 0x157f
    l32i a10,a1,84 # [0] gra_spill_temp_192
    beqz.n a10,.Lt_9_9218 # [2]           // out_wd == 0 -> next row
    // width-loop preamble: clamp the filter window vertically
    // (filter_y_start = max(0, -base_y), filter_y_end = min(filter_ht, ...)).
    l32i.n a7,a1,52 # [0] gra_spill_temp_184
    movi.n a11,0 # [1]
    l32i.n a8,a1,56 # [2] gra_spill_temp_185
    l32i a9,a1,100 # [3] gra_spill_temp_196
    l32i.n a12,a1,48 # [4] gra_spill_temp_183
    s32i a12,a1,168 # [5] gra_spill_temp_213
    neg a10,a9 # [6]
    sub a8,a8,a9 # [7]
    max a10,a10,a11 # [8]
    s32i a10,a1,108 # [9] gra_spill_temp_198
    min a7,a7,a8 # [10]
    movi.n a11,0 # [11]
    s32i a11,a1,88 # [12] gra_spill_temp_193
    j .Lt_9_9730 # [13]

.Lt_9_9986: # 0x15a9
    // width-loop tail: advance out_x and the x base (kept as int16 via sext)
    # Part of loop body line 1085, head labeled .Lt_9_9730
    l32i a13,a1,84 # [0] gra_spill_temp_192
    l32i a15,a1,72 # [1] gra_spill_temp_189
    l32i a12,a1,88 # [2] gra_spill_temp_193
    l32i a14,a1,168 # [3] gra_spill_temp_213
    addi.n a12,a12,1 # [4]
    s32i a12,a1,88 # [5] gra_spill_temp_193
    add.n a15,a14,a15 # [6]
    sext a14,a15,15 # [7]
    s32i a14,a1,168 # [8] gra_spill_temp_213
    beq a12,a13,.Lt_9_9218 # [9]

.Lt_9_9730: # 0x15c5
    # Loop body line 1085, nesting depth: 2, estimated iterations: 100
    #1086  const int16_t base_x = (out_x * stride_wd) - pad_wd;
    #1087  const int32_t *out_mult_ptr = out_mult;
    #1088  const int32_t *out_shift_ptr = out_shift;
    #1089  uint32_t bias_ptr = (uint32_t) (bias);
    #1090  for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
    l32i a8,a1,172 # [0] gra_spill_temp_214
    l32i a9,a1,80 # [1] gra_spill_temp_191
    l32i a10,a1,76 # [2] gra_spill_temp_190
    l32i a11,a1,112 # [3] gra_spill_temp_199
    s32i a11,a1,120 # [4] gra_spill_temp_201
    s32i a10,a1,140 # [5] gra_spill_temp_206
    s32i a9,a1,136 # [6] gra_spill_temp_205
    beqz.n a8,.Lt_9_9986 # [7]

.LBB9_esp_nn_depthwise_conv_s16_mult8: # 0x15dc
    // channel-loop preamble: clamp the filter window horizontally
    # Part of loop body line 1085, head labeled .Lt_9_9730
    movi.n a8,0 # [0]
    l32i a5,a1,168 # [1] gra_spill_temp_213
    movi.n a13,0 # [2]
    movi.n a14,0 # [3]
    s32i a14,a1,96 # [4] gra_spill_temp_195
    s32i a13,a1,184 # [5] gra_spill_temp_217
    neg a6,a5 # [6]
    max a6,a6,a8 # [7]                    // filter_x_start = max(0, -base_x)
    sub a5,a3,a5 # [8]
    min a5,a4,a5 # [9]                    // filter_x_end = min(filter_wd, input_wd - base_x)
    sub a12,a5,a6 # [10]
    s32i a12,a1,164 # [11] gra_spill_temp_212
    j .Lt_9_10498 # [12]

.Lt_9_10754: # 0x1600
    // channel-loop tail: next ch_idx, advance output-channel base index
    # Part of loop body line 1090, head labeled .Lt_9_10498
    l32i a10,a1,172 # [0] gra_spill_temp_214
    l32i a12,a1,92 # [1] gra_spill_temp_194
    l32i a9,a1,184 # [2] gra_spill_temp_217
    l32i a11,a1,96 # [3] gra_spill_temp_195
    addi.n a9,a9,1 # [4]
    s32i a9,a1,184 # [5] gra_spill_temp_217
    add.n a11,a11,a12 # [6]
    s32i a11,a1,96 # [7] gra_spill_temp_195
    beq a9,a10,.Lt_9_9986 # [8]

.Lt_9_10498: # 0x1619
    # Loop body line 1090, nesting depth: 3, estimated iterations: 100
    #1091  for (int ch_mult_idx = 0; ch_mult_idx < ch_mult - 7; ch_mult_idx += 8) {
    l32i a13,a1,116 # [0] gra_spill_temp_200
    blti a13,1,.Lt_9_10754 # [2]          // ch_mult < 8 -> nothing for this kernel

.LBB12_esp_nn_depthwise_conv_s16_mult8: # 0x161f
    # Part of loop body line 1090, head labeled .Lt_9_10498
    l32i a2,a1,96 # [0] gra_spill_temp_195
    movi.n a14,0 # [1]
    s32i a14,a1,132 # [2] gra_spill_temp_204
    j .Lt_9_11266 # [3]

.Lt_9_11522: # 0x162a
    // Spill QACC (low then high half) to the on-stack scratch area and
    // byte-repack the packed accumulator lanes, then shift/saturate with
    // ee.srcmb.s16.qacc and reload -> q0/q1 hold 2x(4x32) sums after vzip.
    // Lane layout repack follows the EE.ST.QACC_*/EE.SRCMB.S16.QACC format.
    l32i a9,a1,128 # [0] gra_spill_temp_203
    ee.st.qacc_l.l.128.ip a9,16 # [2] id:257
    ee.st.qacc_l.h.32.ip a9,0 # [3] id:258
    l8ui a10,a1,15 # [4] qacc_scratch+15
    l16ui a8,a1,10 # [5] qacc_scratch+10
    l8ui a13,a1,16 # [6] qacc_scratch+16
    l8ui a12,a1,6 # [7] qacc_scratch+6
    l8ui a11,a1,5 # [8] qacc_scratch+5
    s8i a11,a1,2 # [9] qacc_scratch+2
    s8i a12,a1,3 # [10] qacc_scratch+3
    s8i a13,a1,7 # [11] qacc_scratch+7
    s16i a8,a1,4 # [12] qacc_scratch+4
    s8i a10,a1,6 # [13] qacc_scratch+6
    movi.n a8,16 # [14]
    ee.st.qacc_h.l.128.ip a9,16 # [15] id:268
    ee.st.qacc_h.h.32.ip a9,-32 # [16] id:269
    ee.srcmb.s16.qacc q1,a8,0 # [17]
    l16ui a13,a1,26 # [18] qacc_scratch+26
    l8ui a15,a1,32 # [19] qacc_scratch+32
    l8ui a12,a1,22 # [20] qacc_scratch+22
    l8ui a11,a1,21 # [21] qacc_scratch+21
    l16ui a10,a1,16 # [22] qacc_scratch+16
    l8ui a14,a1,31 # [23] qacc_scratch+31
    s8i a14,a1,14 # [24] qacc_scratch+14
    s16i a10,a1,8 # [25] qacc_scratch+8
    s8i a11,a1,10 # [26] qacc_scratch+10
    s8i a12,a1,11 # [27] qacc_scratch+11
    s8i a15,a1,15 # [28] qacc_scratch+15
    s16i a13,a1,12 # [29] qacc_scratch+12
    #1138  EE_VZIP_16(q0, q1); /* 4x32 */
    #1139
    #1140  if (bias) {
    l32i a15,a1,112 # [30] gra_spill_temp_199
    ee.vld.128.ip q0,a9,0 # [31] id:281
    s32i a9,a1,128 # [32] gra_spill_temp_203
    ee.vzip.16 q0,q1 # [33]
    beqz.n a15,.Lt_9_13570 # [34]         // bias == NULL -> skip bias add

.LBB23_esp_nn_depthwise_conv_s16_mult8: # 0x168e
    // bias path: load 8 consecutive int32 bias values through the unaligned
    // load + ee.src.q.qup funnel and add to the two 4x32 accumulators.
    # Part of loop body line 1091, head labeled .Lt_9_11266
    addi a14,a1,112 # [0]
    l32i a8,a1,104 # [1] gra_spill_temp_197
    l32i a15,a1,120 # [2] gra_spill_temp_201
    wur.sar_byte a8 # [3]
    ee.vld.128.ip q3,a15,16 # [4] id:284
    ee.vld.128.ip q6,a15,16 # [5] id:285
    ee.vld.128.ip q4,a15,0 # [6] id:286
    s32i a15,a1,120 # [7] gra_spill_temp_201
    ee.src.q.qup q5,q3,q6 # [8]
    ee.vadds.s32 q0,q0,q5 # [9]
    ee.src.q.qup q2,q3,q4 # [10]
    ee.vadds.s32 q1,q1,q2 # [11]
    st.qr q1,a14,96 # [12] gra_spill_temp_219-112

.Lt_9_13570: # 0x16b5
    // requantize both halves (4 channels per call)
    #1158  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);
    l32i a10,a1,136 # [0] gra_spill_temp_205
    l32i a11,a1,140 # [1] gra_spill_temp_206
    addi a9,a1,112 # [2]
    st.qr q1,a9,96 # [3] gra_spill_temp_219-112
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    #1159  out_mult_ptr += 4;
    #1160  out_shift_ptr += 4;
    #1161
    #1162  q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);
    l32i a11,a1,140 # [0] gra_spill_temp_206
    addi a12,a1,112 # [1]
    l32i a10,a1,136 # [2] gra_spill_temp_205
    st.qr q0,a12,80 # [3] gra_spill_temp_218-112
    ld.qr q0,a12,96 # [4] gra_spill_temp_219-112
    addi a10,a10,16 # [5]
    addi a11,a11,16 # [6]
    call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3
    // post-requant: add out_offset, clamp to [activation_min, activation_max],
    // narrow 2x(4x32) -> 8x8 via vunzip and store 8 output bytes.
    addi.n a2,a2,8 # [0]
    l32i a14,a1,116 # [1] gra_spill_temp_200
    l32i a15,a1,124 # [2] gra_spill_temp_202
    l32i a13,a1,132 # [3] gra_spill_temp_204
    l32i a10,a1,140 # [4] gra_spill_temp_206
    l32i a11,a1,136 # [5] gra_spill_temp_205
    addmi a9,a1,256 # [6]
    addi a8,a1,112 # [7]
    ld.qr q7,a8,80 # [8] gra_spill_temp_218-112
    addi a9,a9,56 # [9]
    ee.vldbc.32 q2,a9 # [10] id:290 activation_max
    addi a11,a11,32 # [11]
    addi a10,a10,32 # [12]
    addi.n a13,a13,8 # [13]
    s32i a13,a1,132 # [14] gra_spill_temp_204
    s32i a10,a1,140 # [15] gra_spill_temp_206
    s32i a11,a1,136 # [16] gra_spill_temp_205
    addmi a10,a1,256 # [17]
    addmi a11,a1,256 # [18]
    addi a11,a11,52 # [19]
    addi a10,a10,40 # [20]
    ee.vldbc.32 q3,a10 # [21] id:289 out_offset
    ee.vldbc.32 q1,a11 # [22] id:291 activation_min
    ee.vadds.s32 q0,q0,q3 # [23]
    ee.vadds.s32 q7,q7,q3 # [24]
    ee.vmin.s32 q7,q7,q2 # [25]
    ee.vmin.s32 q0,q0,q2 # [26]
    ee.vmax.s32 q0,q0,q1 # [27]
    ee.vmax.s32 q7,q7,q1 # [28]
    ee.vunzip.16 q7,q0 # [29]
    ee.vunzip.8 q7,q0 # [30]
    ee.vst.l.64.ip q7,a15,8 # [31] id:292
    s32i a15,a1,124 # [32] gra_spill_temp_202
    bge a13,a14,.Lt_9_10754 # [33]

.Lt_9_11266: # 0x1740
    // accumulation for the next group of 8: zero QACC, then MAC over the
    // edge-clamped filter window (rows via the outer checks, columns via the
    // zero-overhead loopgtz body below).
    ee.zero.qacc # [0]
    l32i a12,a1,108 # [1] gra_spill_temp_198
    s32i a12,a1,180 # [2] gra_spill_temp_216
    bge a12,a7,.Lt_9_11522 # [3]          // empty window -> extract straight away
    mull a15,a12,a4 # [0]
    l32i a14,a1,100 # [1] gra_spill_temp_196
    add.n a8,a15,a5 # [2]
    add.n a14,a14,a12 # [3]
    mull a14,a3,a14 # [4]
    s32i a8,a1,176 # [5] gra_spill_temp_215
    bge a6,a5,.Lt_9_12290 # [6]

.LBB18_esp_nn_depthwise_conv_s16_mult8: # 0x175f
    // compute input/filter pointers for this row, then MAC filter_x_end -
    // filter_x_start taps: broadcast input sample (ee.vldbc.16) against 8
    // filter values (ee.vld.128) into QACC per iteration.
    # Part of loop body line 1091, head labeled .Lt_9_11266
    l32i a10,a1,184 # [0] gra_spill_temp_217
    l32i a11,a1,172 # [1] gra_spill_temp_214
    l32i a12,a1,168 # [2] gra_spill_temp_213
    l32i a8,a1,148 # [3] gra_spill_temp_208
    add.n a9,a15,a6 # [4]
    mull a8,a8,a9 # [5]
    add.n a12,a12,a6 # [6]
    l32i a9,a1,160 # [7] gra_spill_temp_211
    add.n a12,a14,a12 # [8]
    mull a11,a11,a12 # [9]
    add.n a8,a2,a8 # [10]
    l32i a12,a1,156 # [11] gra_spill_temp_210
    addx2 a8,a8,a9 # [12]
    add.n a10,a10,a11 # [13]
    l32i a11,a1,144 # [14] gra_spill_temp_207
    l32i a9,a1,164 # [15] gra_spill_temp_212
    addx2 a10,a10,a11 # [16]
    l32i a11,a1,152 # [17] gra_spill_temp_209
    loopgtz a9,.LBB45_esp_nn_depthwise_conv_s16_mult8 # [18]  // zero-overhead column loop
    mov.n a9,a8 # [0*II+0]
    ee.vldbc.16 q0,a10 # [0*II+1] id:255
    ee.vld.128.ip q1,a9,0 # [0*II+2] id:254
    add.n a10,a10,a12 # [0*II+3]
    add.n a8,a8,a11 # [0*II+4]
    ee.vmulas.s16.qacc q0,q1 # [0*II+5]
.LBB45_esp_nn_depthwise_conv_s16_mult8: # 0x17a2

.Lt_9_12290: # 0x17a2
    // row-loop tail: advance to the next filter row
    add.n a14,a14,a3 # [0]
    add.n a15,a15,a4 # [1]
    l32i a10,a1,180 # [2] gra_spill_temp_216
    l32i a11,a1,176 # [3] gra_spill_temp_215
    addi.n a10,a10,1 # [4]
    add.n a11,a11,a4 # [5]
    s32i a11,a1,176 # [6] gra_spill_temp_215
    s32i a10,a1,180 # [7] gra_spill_temp_216
    sub a10,a7,a10 # [8]
    beqz a10,.Lt_9_11522 # [9]
.Lt_9_12034: # 0x17bc
    blt a6,a5,.LBB18_esp_nn_depthwise_conv_s16_mult8 # [0]
    j .Lt_9_12290 # [0]

.Lt_9_8450: # 0x17c2
    retw.n # [0]

    .size esp_nn_depthwise_conv_s16_mult8_esp32s3, . - esp_nn_depthwise_conv_s16_mult8_esp32s3

================================================ FILE: src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c ================================================
/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/* NOTE(review): the three include targets below were lost in extraction
 * (angle-bracket names stripped) — restore the original header names from
 * the repository before compiling. */
#include
#include
#include

/* Scratch area (s8 -> s16 widened input staging) shared by the dispatch
 * wrappers in this file; set elsewhere in this translation unit. */
static int16_t *scratch_buffer = NULL;

extern void esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(const int16_t *input_data,
                                                        const uint16_t input_wd,
                                                        const uint16_t input_ht,
                                                        const uint16_t channels,
                                                        const uint16_t pad_wd,
                                                        const uint16_t pad_ht,
                                                        const uint16_t stride_wd,
                                                        const uint16_t stride_ht,
                                                        const uint16_t ch_mult,
                                                        const int16_t *filter_data,
                                                        const int32_t *bias,
                                                        int8_t *out_data,
                                                        const uint16_t out_wd,
                                                        const uint16_t out_ht,
                                                        const int32_t out_offset,
                                                        const int32_t *out_shift,
                                                        const int32_t *out_mult,
                                                        const int32_t activation_min,
                                                        const int32_t activation_max);

extern void esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(const int8_t *input_data,
                                                              const uint16_t input_wd,
                                                              const uint16_t input_ht,
                                                              const uint16_t channels,
                                                              const int32_t input_offset,
                                                              const uint16_t stride_wd,
                                                              const uint16_t stride_ht,
                                                              const int8_t *filter_data,
                                                              const int32_t *bias,
                                                              int8_t *out_data,
                                                              const uint16_t out_wd,
                                                              const uint16_t out_ht,
                                                              const int32_t out_offset,
                                                              const int32_t *out_shift,
                                                              const int32_t *out_mult,
                                                              const int32_t activation_min,
                                                              const int32_t activation_max);

extern void esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3(const int16_t *input_data,
                                                               const uint16_t
input_wd, const uint16_t input_ht,
                                                               const uint16_t channels,
                                                               const uint16_t stride_wd, const uint16_t stride_ht,
                                                               const int16_t *filter_data, const int32_t *bias,
                                                               int8_t *out_data,
                                                               const uint16_t out_wd, const uint16_t out_ht,
                                                               const int32_t out_offset,
                                                               const int32_t *out_shift, const int32_t *out_mult,
                                                               const int32_t activation_min,
                                                               const int32_t activation_max);

/* s16 assembly kernel: ch_mult multiple of 8, generic filter size, explicit padding args. */
extern void esp_nn_depthwise_conv_s16_mult8_esp32s3(const int16_t *input_data,
                                                    const uint16_t input_wd, const uint16_t input_ht,
                                                    const uint16_t channels,
                                                    const uint16_t pad_wd, const uint16_t pad_ht,
                                                    const uint16_t stride_wd, const uint16_t stride_ht,
                                                    const uint16_t ch_mult,
                                                    const int16_t *filter_data,
                                                    const uint16_t filter_wd, const uint16_t filter_ht,
                                                    const int32_t *bias,
                                                    int8_t *out_data,
                                                    const uint16_t out_wd, const uint16_t out_ht,
                                                    const int32_t out_offset,
                                                    const int32_t *out_shift, const int32_t *out_mult,
                                                    const int32_t activation_min,
                                                    const int32_t activation_max);

/* s16 assembly kernel: ch_mult multiple of 4, generic filter size. */
extern void esp_nn_depthwise_conv_s16_mult4_esp32s3(const int16_t *input_data,
                                                    const uint16_t input_wd, const uint16_t input_ht,
                                                    const uint16_t channels,
                                                    const uint16_t pad_wd, const uint16_t pad_ht,
                                                    const uint16_t stride_wd, const uint16_t stride_ht,
                                                    const uint16_t ch_mult,
                                                    const int16_t *filter_data,
                                                    const uint16_t filter_wd, const uint16_t filter_ht,
                                                    const int32_t *bias,
                                                    int8_t *out_data,
                                                    const uint16_t out_wd, const uint16_t out_ht,
                                                    const int32_t out_offset,
                                                    const int32_t *out_shift, const int32_t *out_mult,
                                                    const int32_t activation_min,
                                                    const int32_t activation_max);

/* s16 assembly kernel: ch_mult == 1, fixed 3x3 filter, with padding support. */
extern void esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(const int16_t *input_data,
                                                        const uint16_t input_wd, const uint16_t input_ht,
                                                        const uint16_t channels,
                                                        const uint16_t pad_wd, const uint16_t pad_ht,
                                                        const uint16_t stride_wd, const uint16_t stride_ht,
                                                        const int16_t *filter_data,
                                                        const int32_t *bias,
                                                        int8_t *out_data,
                                                        const uint16_t out_wd, const uint16_t out_ht,
                                                        const int32_t out_offset,
                                                        const int32_t *out_shift, const int32_t *out_mult,
                                                        const int32_t activation_min,
                                                        const int32_t activation_max);

/* s16 assembly kernel: ch_mult == 1, generic filter size. */
extern void esp_nn_depthwise_conv_s16_mult1_esp32s3(const int16_t *input_data,
                                                    const uint16_t input_wd, const uint16_t input_ht,
                                                    const uint16_t channels,
                                                    const uint16_t pad_wd, const uint16_t pad_ht,
                                                    const uint16_t stride_wd, const uint16_t stride_ht,
                                                    const int16_t *filter_data,
                                                    const uint16_t filter_wd, const uint16_t filter_ht,
                                                    const int32_t *bias,
                                                    int8_t *out_data,
                                                    const uint16_t out_wd, const uint16_t out_ht,
                                                    const int32_t out_offset,
                                                    const int32_t *out_shift, const int32_t *out_mult,
                                                    const int32_t activation_min,
                                                    const int32_t activation_max);

/* Widen s8 buffer to s16 (assembly helper). */
extern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size);

/* Widen s8 buffer to s16 while adding `offset` to every element (assembly helper). */
extern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst,
                                                         const int size, const int32_t offset);

/**
 * Reference C depthwise convolution, manually unrolled by 4 over the channel
 * multiplier (ch_mult). Per-output-channel quantization: each out_ch_idx has
 * its own out_mult/out_shift pair. Output layout is NHWC with
 * out_channels == channels * ch_mult.
 *
 * NOTE(review): appears unused by the dispatcher below (it falls back to
 * esp_nn_depthwise_conv_s8_opt instead) — kept for reference; confirm before removal.
 */
static void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data,
                                              const uint16_t input_wd,
                                              const uint16_t input_ht,
                                              const uint16_t channels,
                                              const int32_t input_offset,
                                              const uint16_t pad_wd,
                                              const uint16_t pad_ht,
                                              const uint16_t stride_wd,
                                              const uint16_t stride_ht,
                                              const uint16_t ch_mult,
                                              const int8_t *filter_data,
                                              const uint16_t filter_wd,
                                              const uint16_t filter_ht,
                                              const int32_t *bias,
                                              int8_t *out_data,
                                              const uint16_t out_wd,
                                              const uint16_t out_ht,
                                              const int32_t out_offset,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t activation_min,
                                              const int32_t activation_max)
{
    int out_idx = 0;
    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop
        const int16_t base_y = (out_y * stride_ht) - pad_ht;
        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
            const int16_t base_x = (out_x * stride_wd) - pad_wd;
            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
                int ch_mult_idx = 0;
                /* Main unrolled loop: 4 output channels per iteration.
                 * (ch_mult - 3 promotes to int, so ch_mult < 4 skips this loop safely.) */
                for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) {
                    int32_t result0 = 0, result1 = 0, result2 = 0, result3 = 0;
                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;

                    /* Select filter so as the point doesn't lie outside block */
                    int filter_y_start = max(0, -base_y);
                    int filter_x_start = max(0, -base_x);
                    int filter_y_end = min(filter_ht, input_ht - base_y);
                    int filter_x_end = min(filter_wd, input_wd - base_x);

                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
                        const int32_t idx_y = base_y + filter_y_idx;
                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
                            const int32_t idx_x = base_x + filter_x_idx;
                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
                            /* filter layout: [filter_ht][filter_wd][channels * ch_mult] */
                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx)
                                                   * (channels * ch_mult) + out_ch_idx;
                            int32_t input_val = input_data[input_index] + input_offset;
                            int32_t filter_val0 = filter_data[filter_index + 0];
                            int32_t filter_val1 = filter_data[filter_index + 1];
                            int32_t filter_val2 = filter_data[filter_index + 2];
                            int32_t filter_val3 = filter_data[filter_index + 3];
                            result0 += input_val * filter_val0;
                            result1 += input_val * filter_val1;
                            result2 += input_val * filter_val2;
                            result3 += input_val * filter_val3;
                        }
                    }
                    if (bias) {
                        result0 += bias[out_ch_idx + 0];
                        result1 += bias[out_ch_idx + 1];
                        result2 += bias[out_ch_idx + 2];
                        result3 += bias[out_ch_idx + 3];
                    }
                    /* Per-channel requantization, then output offset and activation clamp. */
                    result0 = esp_nn_multiply_by_quantized_mult(result0, out_mult[out_ch_idx + 0], out_shift[out_ch_idx + 0]);
                    result1 = esp_nn_multiply_by_quantized_mult(result1, out_mult[out_ch_idx + 1], out_shift[out_ch_idx + 1]);
                    result2 = esp_nn_multiply_by_quantized_mult(result2, out_mult[out_ch_idx + 2], out_shift[out_ch_idx + 2]);
                    result3 = esp_nn_multiply_by_quantized_mult(result3, out_mult[out_ch_idx + 3], out_shift[out_ch_idx + 3]);

                    result0 += out_offset;
                    result1 += out_offset;
                    result2 += out_offset;
                    result3 += out_offset;

                    result0 = max(result0, activation_min);
                    result1 = max(result1, activation_min);
                    result2 = max(result2, activation_min);
                    result3 = max(result3, activation_min);
                    result0 = min(result0, activation_max);
                    result1 = min(result1, activation_max);
                    result2 = min(result2, activation_max);
                    result3 = min(result3, activation_max);

                    out_data[out_idx++] = result0;
                    out_data[out_idx++] = result1;
                    out_data[out_idx++] = result2;
                    out_data[out_idx++] = result3;
                }

                /* left-over */
                for (; ch_mult_idx < ch_mult; ch_mult_idx++) {
                    int32_t result = 0;
                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;

                    /* Select filter so as the point doesn't lie outside block */
                    int filter_y_start = max(0, -base_y);
                    int filter_x_start = max(0, -base_x);
                    int filter_y_end = min(filter_ht, input_ht - base_y);
                    int filter_x_end = min(filter_wd, input_wd - base_x);

                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
                        const int32_t idx_y = base_y + filter_y_idx;
                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
                            const int32_t idx_x = base_x + filter_x_idx;
                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx)
                                                   * (channels * ch_mult) + out_ch_idx;
                            int32_t input_val = input_data[input_index] + input_offset;
                            int32_t filter_val = filter_data[filter_index];
                            result += input_val * filter_val;
                        }
                    }
                    if (bias) {
                        result += bias[out_ch_idx];
                    }
                    result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]);
                    result += out_offset;
                    result = max(result, activation_min);
                    result = min(result, activation_max);
                    out_data[out_idx++] = result;
                }
            }
        }
    }
}

/**
 * Reference C depthwise convolution specialized for ch_mult == 1
 * (output channel count equals input channel count).
 */
void esp_nn_depthwise_conv_s8_ch_mult1(const int8_t *input_data,
                                       const uint16_t input_wd,
                                       const uint16_t input_ht,
                                       const uint16_t channels,
                                       const int32_t input_offset,
                                       const uint16_t pad_wd,
                                       const uint16_t pad_ht,
                                       const uint16_t stride_wd,
                                       const uint16_t stride_ht,
                                       const int8_t *filter_data,
                                       const uint16_t filter_wd,
                                       const uint16_t filter_ht,
                                       const int32_t *bias,
                                       int8_t *out_data,
                                       const uint16_t out_wd,
                                       const uint16_t out_ht,
                                       const int32_t out_offset,
                                       const int32_t *out_shift,
                                       const int32_t *out_mult,
                                       const int32_t activation_min,
                                       const int32_t activation_max)
{
    int out_idx = 0;
    for (int out_y = 0; out_y < out_ht; out_y++)
{ //height loop
        const int16_t base_y = (out_y * stride_ht) - pad_ht;
        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop
            const int16_t base_x = (out_x * stride_wd) - pad_wd;
            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop
                int32_t result = 0;

                /* Select filter so as the point doesn't lie outside block */
                int filter_y_start = max(0, -base_y);
                int filter_x_start = max(0, -base_x);
                int filter_y_end = min(filter_ht, input_ht - base_y);
                int filter_x_end = min(filter_wd, input_wd - base_x);

                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {
                    const int32_t idx_y = base_y + filter_y_idx;
                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {
                        const int32_t idx_x = base_x + filter_x_idx;
                        int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;
                        /* ch_mult == 1, so filter and input share channel indexing */
                        int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * channels + ch_idx;
                        int32_t input_val = input_data[input_index] + input_offset;
                        int32_t filter_val = filter_data[filter_index];
                        result += input_val * filter_val;
                    }
                }
                if (bias) {
                    result += bias[ch_idx];
                }
                /* Per-channel requantize, add output zero-point, clamp to activation range. */
                result = esp_nn_multiply_by_quantized_mult(result, out_mult[ch_idx], out_shift[ch_idx]);
                result += out_offset;
                result = max(result, activation_min);
                result = min(result, activation_max);
                out_data[out_idx++] = result;
            }
        }
    }
}

/**
 * Returns the scratch buffer size (bytes) required by
 * esp_nn_depthwise_conv_s8_esp32s3() for the given problem dimensions.
 *
 * The branch structure here mirrors the dispatch logic of the conv function:
 * each case sizes exactly the buffers (widened filter/input copies, padded
 * layouts, tile strips, padded quant arrays) that the corresponding path uses.
 * Keep the two functions in sync when changing either.
 */
int esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims,
                                                   const data_dims_t *filter_dims,
                                                   const data_dims_t *output_dims,
                                                   const dw_conv_params_t *conv_params)
{
    const uint16_t input_wd = input_dims->width;
    const uint16_t input_ht = input_dims->height;
    const uint16_t channels = input_dims->channels;
    const uint16_t filter_wd = filter_dims->width;
    const uint16_t filter_ht = filter_dims->height;
    const uint16_t ch_mult = conv_params->ch_mult;
    const uint16_t out_wd = output_dims->width;
    const uint16_t out_ht = output_dims->height;
    const uint16_t pad_wd = conv_params->padding.width;
    const uint16_t pad_ht = conv_params->padding.height;
    const uint16_t stride_wd = conv_params->stride.width;
    const uint16_t stride_ht = conv_params->stride.height;
    int filter_size = filter_wd * filter_ht * channels * ch_mult;
    int pad_width = 0, pad_height = 0;

    if ((ch_mult == 1) && (channels % 8 == 0)) {
        if (filter_wd == 3 && filter_ht == 3) {
            if (channels % 16 == 0) {
                if (pad_wd || pad_ht) {
                    pad_width = pad_wd * 2;
                    pad_height = pad_ht * 2;
                } else {
                    /* implicit right/bottom padding needed to cover the last window */
                    pad_width = (out_wd * stride_wd + filter_wd - 1) - input_wd;
                    pad_height = (out_ht * stride_ht + filter_ht - 1) - input_ht;
                }
                if (pad_width || pad_height) {
                    int full_input = (input_wd + pad_width) * (input_ht + pad_height) * channels;
                    if (full_input <= 40 * 1024) {
                        return filter_size + full_input + 16;
                    } else {
                        /* Tiled: only need filter + strip buffer (filter_ht rows) */
                        int strip = (input_wd + pad_width) * filter_ht * channels;
                        return filter_size + strip + 16;
                    }
                } else {
                    return filter_size + 16;
                }
            } else if (channels >= 12) {
                /* ch % 8 == 0, not % 16, ch >= 12: pad channels to 16, s8 path + compaction */
                int new_ch = (channels + 15) & ~15;
                int new_filter_size = 9 * new_ch;
                int total_pad_wd = pad_wd * 2 + max(0, (out_wd * stride_wd + 2) - input_wd);
                int total_pad_ht = pad_ht * 2 + max(0, (out_ht * stride_ht + 2) - input_ht);
                int new_input_size = (input_wd + total_pad_wd) * (input_ht + total_pad_ht) * new_ch;
                int out_buf_size = out_wd * out_ht * new_ch;
                return new_filter_size + new_input_size + out_buf_size + 64;
            } else {
                /* ch=8: s16 path is more efficient (no channel padding overhead) */
                int input_s = input_wd * input_ht * channels;
                return 2 * (filter_size + input_s) + 32;
            }
        } else {
            int input_size = input_wd * input_ht * channels;
            int total_s16 = 2 * (filter_size + input_size);
            if (total_s16 <= 48 * 1024) {
                return total_s16 + 32;
            } else {
                /* Tiled: only need filter_s16 + tile buffer (filter_ht rows of input s16) */
                int tile_rows = filter_ht;
                int tile_s16 = 2 * input_wd * tile_rows * channels;
                return 2 * filter_size + tile_s16 + 32;
            }
        }
    } else if ((ch_mult == 1) && (channels > 3)) {
        // ch_mult=1, channels>3 case: pad channels to multiple of 8 for mult1
        int padded_channels = (channels + 7) & ~7;
        int padded_input_size = input_wd * input_ht * padded_channels;
        int padded_filter_size = filter_wd * filter_ht * padded_channels;

        // Calculate actual memory layout with 16-byte alignments (matching usage)
        size_t filter_bytes = padded_filter_size * sizeof(int16_t);
        size_t input_start = (filter_bytes + 15) & ~15;
        size_t input_bytes = padded_input_size * sizeof(int16_t);
        size_t out_start = (input_start + input_bytes + 15) & ~15;
        size_t out_bytes = out_wd * out_ht * padded_channels * sizeof(int8_t);
        size_t bias_start = (out_start + out_bytes + 15) & ~15;
        size_t bias_bytes = padded_channels * sizeof(int32_t);
        size_t shift_bytes = padded_channels * sizeof(int32_t);
        size_t mult_bytes = padded_channels * sizeof(int32_t);
        size_t total_size = bias_start + bias_bytes + shift_bytes + mult_bytes;
        return total_size + 16; // 16 for margin
    } else if (ch_mult % 4 == 0) {
        int input_size = input_wd * input_ht * channels;
        return 2 * (filter_size + input_size) + 32; // 32 for alignment
    }
    // Default fallback
    return 32;
}

/* Stash the caller-provided scratch buffer for subsequent conv calls. */
void esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf)
{
    scratch_buffer = (int16_t *) buf;
}

/**
 * ESP32-S3 optimized depthwise convolution implementation.
* * This function dispatches to various optimized implementations based on: * - Channel multiplier (ch_mult) * - Number of channels * - Filter dimensions * - Padding requirements * * For cases that don't have direct optimized implementations, the function * uses data padding techniques to leverage existing optimized functions: * - ch_mult % 4 != 0: Pad ch_mult to next multiple of 4, use mult4 functions * - ch_mult == 1, channels % 8 != 0: Fallback to C implementation for correctness * * Assumption 1: i/p channels == o/p channels * Assumption 2: Pointers are valid * Assumption 3: dilation width = 1 */ #include "esp_nn_generic_opt.h" void esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims, const int8_t *input_data, const data_dims_t *filter_dims, const int8_t *filter_data, const int32_t *bias, const data_dims_t *output_dims, int8_t *out_data, const dw_conv_params_t *conv_params, const quant_data_t *quant_data) { const uint16_t input_wd = input_dims->width; const uint16_t input_ht = input_dims->height; const uint16_t channels = input_dims->channels; const int32_t input_offset = conv_params->in_offset; const int32_t out_offset = conv_params->out_offset; const uint16_t pad_wd = conv_params->padding.width; const uint16_t pad_ht = conv_params->padding.height; const uint16_t stride_wd = conv_params->stride.width; const uint16_t stride_ht = conv_params->stride.height; const uint16_t filter_wd = filter_dims->width; const uint16_t filter_ht = filter_dims->height; const uint16_t out_wd = output_dims->width; const uint16_t out_ht = output_dims->height; const int32_t *out_shift = quant_data->shift; const int32_t *out_mult = quant_data->mult; const int32_t activation_min = conv_params->activation.min; const int32_t activation_max = conv_params->activation.max; const uint16_t ch_mult = conv_params->ch_mult; int filter_size = filter_wd * filter_ht * channels * ch_mult; int align_len = 16 - (filter_size & 15); int input_size = input_wd * input_ht * channels; int16_t 
*filter_data16 = scratch_buffer; int16_t *input_data16 = scratch_buffer + filter_size + align_len; if (scratch_buffer == NULL) { printf("esp_nn_depthwise_conv error! scratch_buffer not set!\n"); return; } if ((ch_mult == 1) && (channels % 8 == 0)) { if ((filter_wd == 3) && (filter_ht == 3)) { if ((channels % 16 == 0) && (pad_wd == 1) && (pad_ht == 1)) { /* process in 8 bits with s8 padded assembly */ int8_t *filter_aligned = (int8_t *) scratch_buffer; int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; memcpy(filter_aligned, filter_data, filter_size); int padded_input_size = (input_wd + 2*pad_wd) * (input_ht + 2*pad_ht) * channels; if (padded_input_size <= 40 * 1024) { /* Small enough — full padding, single assembly call */ esp_nn_aligned_s8_pad_with_value(input_data, input_padded, input_wd, input_ht, channels, -input_offset, pad_wd, pad_ht); esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + 2 * pad_wd, input_ht + 2 * pad_ht, channels, input_offset, stride_wd, stride_ht, filter_aligned, bias, out_data, out_wd, out_ht, out_offset, out_shift, out_mult, activation_min, activation_max); } else { /* Large input: row-tiled processing to reduce cache pressure. * Pad and process a strip of output rows at a time. 
*/ int padded_wd = input_wd + 2 * pad_wd; int8_t pad_val = (int8_t)(-input_offset); for (int out_y = 0; out_y < out_ht; out_y++) { int in_y_start = out_y * stride_ht; /* in padded coords (pad_ht already accounted) */ /* Pad filter_ht rows of input into scratch */ int8_t *tile = input_padded; for (int fy = 0; fy < filter_ht; fy++) { int src_y = in_y_start + fy - pad_ht; /* original input row */ if (src_y < 0 || src_y >= input_ht) { /* Padding row */ memset(tile, pad_val, padded_wd * channels); } else { /* Left pad */ memset(tile, pad_val, pad_wd * channels); /* Copy input row */ memcpy(tile + pad_wd * channels, input_data + src_y * input_wd * channels, input_wd * channels); /* Right pad */ memset(tile + (pad_wd + input_wd) * channels, pad_val, pad_wd * channels); } tile += padded_wd * channels; } /* Process one output row */ esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3( input_padded, padded_wd, filter_ht, channels, input_offset, stride_wd, 1, filter_aligned, bias, out_data + out_y * out_wd * channels, out_wd, 1, out_offset, out_shift, out_mult, activation_min, activation_max); } } } else if ((channels % 16 == 0) && (pad_wd == 0) && (pad_ht == 0)) { /* process in 8 bits */ int8_t *filter_aligned = (int8_t *) scratch_buffer; int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len; // check if we need to pad additionally int pad_right = (out_wd * stride_wd + filter_wd - 1) - input_wd; int pad_bottom = (out_ht * stride_ht + filter_ht - 1) - input_ht; if (pad_right || pad_bottom) { // pad right and bottom esp_nn_aligned_s8_pad_end_with_value(input_data, input_padded, input_wd, input_ht, channels, -input_offset, pad_right, pad_bottom); } else { input_padded = (int8_t *) input_data; } memcpy(filter_aligned, filter_data, filter_size); esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + pad_right, input_ht + pad_bottom, channels, input_offset, stride_wd, stride_ht, filter_aligned, bias, out_data, out_wd, out_ht, out_offset, 
out_shift, out_mult, activation_min, activation_max); } else if (channels >= 12) { /* channels % 8 == 0, not % 16, channels >= 12: pad to 16 is worthwhile * (overhead <= 33%). For ch=8, padding to 16 doubles data — use s16 instead */ int new_ch = (channels + 15) & ~15; int8_t pad_val = (int8_t)(-input_offset); /* Pad filter: 3x3 x new_ch */ int new_filter_size = 9 * new_ch; int8_t *filter_padded = (int8_t *) scratch_buffer; memset(filter_padded, 0, new_filter_size); for (int f = 0; f < 9; f++) { memcpy(filter_padded + f * new_ch, filter_data + f * channels, channels); } /* Pad input: (input_wd + 2*pad) x (input_ht + 2*pad) x new_ch */ int new_input_wd = input_wd + 2 * pad_wd; int new_input_ht = input_ht + 2 * pad_ht; int pad_right = max(0, (out_wd * stride_wd + 3 - 1) - (input_wd + 2 * pad_wd)); int pad_bottom = max(0, (out_ht * stride_ht + 3 - 1) - (input_ht + 2 * pad_ht)); new_input_wd += pad_right; new_input_ht += pad_bottom; int8_t *input_padded = filter_padded + new_filter_size + 16; int padded_input_total = new_input_wd * new_input_ht * new_ch; /* Fill entire padded input with pad_val first */ memset(input_padded, pad_val, padded_input_total); /* Copy actual input data into correct positions */ for (int y = 0; y < input_ht; y++) { for (int x = 0; x < input_wd; x++) { int dst_y = y + pad_ht; int dst_x = x + pad_wd; memcpy(input_padded + (dst_y * new_input_wd + dst_x) * new_ch, input_data + (y * input_wd + x) * channels, channels); } } /* Padded output buffer */ int8_t *out_padded = input_padded + padded_input_total; /* Pad quant arrays */ int32_t shift_pad[new_ch], mult_pad[new_ch], bias_pad[new_ch]; memcpy(shift_pad, out_shift, channels * sizeof(int32_t)); memcpy(mult_pad, out_mult, channels * sizeof(int32_t)); memset(shift_pad + channels, 0, (new_ch - channels) * sizeof(int32_t)); memset(mult_pad + channels, 0, (new_ch - channels) * sizeof(int32_t)); if (bias) { memcpy(bias_pad, bias, channels * sizeof(int32_t)); memset(bias_pad + channels, 0, (new_ch - 
channels) * sizeof(int32_t)); } esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3( input_padded, new_input_wd, new_input_ht, new_ch, input_offset, stride_wd, stride_ht, filter_padded, bias ? bias_pad : NULL, out_padded, out_wd, out_ht, out_offset, shift_pad, mult_pad, activation_min, activation_max); /* Compact output: strip padding channels */ for (int pos = 0; pos < out_wd * out_ht; pos++) { memcpy(out_data + pos * channels, out_padded + pos * new_ch, channels); } } else { /* ch < 12 (e.g., ch=8), 3x3: use s16 mult1 3x3 path */ esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(input_data16, input_wd, input_ht, channels, pad_wd, pad_ht, stride_wd, stride_ht, filter_data16, bias, out_data, out_wd, out_ht, out_offset, out_shift, out_mult, activation_min, activation_max); } } else { // all other ch_mult == 1, channels % 8 == 0 /* Tiled s16 processing: convert filter once, process input in row strips * to keep working set within DCache (64KB) */ esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size); /* Check if full conversion fits comfortably in cache */ int total_s16_size = 2 * (filter_size + input_size); if (total_s16_size <= 48 * 1024) { /* Small enough — full conversion is fine */ esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset); esp_nn_depthwise_conv_s16_mult1_esp32s3(input_data16, input_wd, input_ht, channels, pad_wd, pad_ht, stride_wd, stride_ht, filter_data16, filter_wd, filter_ht, bias, out_data, out_wd, out_ht, out_offset, out_shift, out_mult, activation_min, activation_max); } else { /* Large input: process in row tiles to reduce cache pressure. * Convert only the input rows needed for each output row strip. 
*/ int16_t *tile_buf = input_data16; /* reuse scratch for tile */ for (int out_row = 0; out_row < out_ht; out_row++) { int in_row_start = out_row * stride_ht - pad_ht; int in_row_end = in_row_start + filter_ht; /* Fill tile: pad rows that are outside input bounds */ int16_t *dst = tile_buf; for (int r = in_row_start; r < in_row_end; r++) { if (r < 0 || r >= input_ht) { /* Padding row: fill with input_offset */ for (int i = 0; i < input_wd * channels; i++) { dst[i] = (int16_t)input_offset; } } else { /* Valid row: convert s8 to s16 with offset */ const int8_t *src = input_data + r * input_wd * channels; for (int i = 0; i < input_wd * channels; i++) { dst[i] = (int16_t)src[i] + (int16_t)input_offset; } } dst += input_wd * channels; } /* Process one output row */ esp_nn_depthwise_conv_s16_mult1_esp32s3(tile_buf, input_wd, filter_ht, channels, pad_wd, 0, stride_wd, 1, filter_data16, filter_wd, filter_ht, bias, out_data + out_row * out_wd * channels, out_wd, 1, out_offset, out_shift, out_mult, activation_min, activation_max); } } } } else if ((ch_mult == 1) && (channels > 3)) { // For ch_mult=1, pad channels to multiple of 8 for optimized mult1 function int padded_channels = (channels + 7) & ~7; // Round up to multiple of 8 int padded_input_size = input_wd * input_ht * padded_channels; int padded_filter_size = filter_wd * filter_ht * padded_channels; // Use scratch buffer for padded data (ensure 16-byte alignment for SIMD) int16_t *padded_filter_data16 = (int16_t*)scratch_buffer; size_t input_start = (size_t)(padded_filter_data16 + padded_filter_size); int16_t *padded_input_data16 = (int16_t*)((input_start + 15) & ~15); size_t out_start = (size_t)(padded_input_data16 + padded_input_size); int8_t *padded_out_data = (int8_t*)((out_start + 15) & ~15); // Create padded parameter arrays size_t bias_start = (size_t)(padded_out_data + out_wd * out_ht * padded_channels); int32_t *padded_bias = (int32_t*)((bias_start + 15) & ~15); int32_t *padded_shift = padded_bias + 
padded_channels;
        int32_t *padded_mult = padded_shift + padded_channels;

        // Initialize padded parameters - copy valid values, set padded ones to safe defaults
        memset(padded_bias, 0, padded_channels * sizeof(int32_t));
        memset(padded_shift, 0, padded_channels * sizeof(int32_t));
        memset(padded_mult, 0, padded_channels * sizeof(int32_t));
        if (bias) {
            memcpy(padded_bias, bias, channels * sizeof(int32_t));
        }
        if (out_shift) {
            memcpy(padded_shift, out_shift, channels * sizeof(int32_t));
        }
        if (out_mult) {
            memcpy(padded_mult, out_mult, channels * sizeof(int32_t));
        }

        // Convert filter data to padded layout (zero out extra channels)
        memset(padded_filter_data16, 0, padded_filter_size * sizeof(int16_t));
        for (int c = 0; c < channels; c++) {
            for (int fy = 0; fy < filter_ht; fy++) {
                for (int fx = 0; fx < filter_wd; fx++) {
                    int orig_idx = (fy * filter_wd + fx) * channels + c;
                    int padded_idx = (fy * filter_wd + fx) * padded_channels + c;
                    padded_filter_data16[padded_idx] = (int16_t) filter_data[orig_idx];
                }
            }
        }

        // Convert input data to padded layout (zero out extra channels, apply offset)
        memset(padded_input_data16, 0, padded_input_size * sizeof(int16_t));
        for (int h = 0; h < input_ht; h++) {
            for (int w = 0; w < input_wd; w++) {
                for (int c = 0; c < channels; c++) {
                    int orig_idx = (h * input_wd + w) * channels + c;
                    int padded_idx = (h * input_wd + w) * padded_channels + c;
                    padded_input_data16[padded_idx] = (int16_t) input_data[orig_idx] + input_offset;
                }
            }
        }

        // Call mult1 with padded data
        esp_nn_depthwise_conv_s16_mult1_esp32s3(padded_input_data16, input_wd, input_ht,
                                                padded_channels, pad_wd, pad_ht,
                                                stride_wd, stride_ht,
                                                padded_filter_data16, filter_wd, filter_ht,
                                                padded_bias, padded_out_data, out_wd, out_ht,
                                                out_offset, padded_shift, padded_mult,
                                                activation_min, activation_max);

        // Copy back only valid channels
        for (int h = 0; h < out_ht; h++) {
            for (int w = 0; w < out_wd; w++) {
                for (int c = 0; c < channels; c++) {
                    int out_idx = (h * out_wd + w) * channels + c;
                    int padded_idx = (h * out_wd + w) * padded_channels + c;
                    out_data[out_idx] = padded_out_data[padded_idx];
                }
            }
        }
    } else if (ch_mult % 8 == 0) {
        // Channel multiplier is optimized multiple - use direct s16 functions
        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);
        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);
        if (filter_wd == 3 && filter_ht == 3) {
            // dedicated 3x3 kernel
            esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(input_data16, input_wd, input_ht,
                                                        channels, pad_wd, pad_ht,
                                                        stride_wd, stride_ht, ch_mult,
                                                        filter_data16, bias,
                                                        out_data, out_wd, out_ht,
                                                        out_offset, out_shift, out_mult,
                                                        activation_min, activation_max);
        } else {
            // generic filter size kernel
            esp_nn_depthwise_conv_s16_mult8_esp32s3(input_data16, input_wd, input_ht,
                                                    channels, pad_wd, pad_ht,
                                                    stride_wd, stride_ht, ch_mult,
                                                    filter_data16, filter_wd, filter_ht,
                                                    bias, out_data, out_wd, out_ht,
                                                    out_offset, out_shift, out_mult,
                                                    activation_min, activation_max);
        }
    } else if (ch_mult % 4 == 0) {
        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);
        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);
        esp_nn_depthwise_conv_s16_mult4_esp32s3(input_data16, input_wd, input_ht,
                                                channels, pad_wd, pad_ht,
                                                stride_wd, stride_ht, ch_mult,
                                                filter_data16, filter_wd, filter_ht,
                                                bias, out_data, out_wd, out_ht,
                                                out_offset, out_shift, out_mult,
                                                activation_min, activation_max);
    } else {
        // No aligned fast path applies: fall back to the generic optimized C kernel
        esp_nn_depthwise_conv_s8_opt(input_dims, input_data, filter_dims, filter_data,
                                     bias, output_dims, out_data, conv_params, quant_data);
    }
}

================================================ FILE: src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S ================================================
// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .literal_position // processes multiple of 16 channels // already padded version. no additional padding needed // simply keep sliding filter window by stride_size # Program Unit: esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3 .type esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3, @function .align 4 .global esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3 esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3: # 0xccc # qacc_scratch = 0 # gra_spill_temp_103 = 40 // stride_wd*channels # gra_spill_temp_104 = 44 // bias_align # gra_spill_temp_107 = 48 // input_offset # gra_spill_temp_105 = 52 // out_mult_ptr # gra_spill_temp_106 = 56 // out_shift_ptr # gra_spill_temp_108 = 60 // ch_idx # gra_spill_temp_109 = 64 // out_ch # gra_spill_temp_110 = 68 // bias_ptr # gra_spill_temp_111 = 72 // 2 * (input_wd * channels) # gra_spill_temp_112 = 76 // input_data # gra_spill_temp_118 = 96 # gra_spill_temp_119 = 100 # gra_spill_temp_120 = 104 # gra_spill_temp_121 = 108 # gra_spill_temp_113 = 112 // input_wd * channels # gra_spill_temp_114 = 116 // input_wd # gra_spill_temp_130 = 120 # gra_spill_temp_141 = 0 # gra_spill_temp_120 = 16 # gra_spill_temp_137 = 80 // offset+bias factor # gra_spill_temp_134 = 128 //256-128 # gra_spill_temp_135 = 144 //256-112 # gra_spill_temp_133 = 160 //256-96 # gra_spill_temp_132 = 176 //256-80 // registers: // a2: input_data // a3: input_wd // a4: input_ht // a5: channels // a6: input_offset // a7: stride_wd // on stack: // 320: stride_ht // 324: filter_data // 328: *bias // 332: *out_data // 336: out_wd // 340: out_ht // 344: out_offset 
// 348: *out_shift // 352: *out_mult // 356: activation_min // 360: activation_max entry a1,320 # mul16u a7,a7,a5 s32i a3,a1,116 # [0] gra_spill_temp_114, input_wd s32i a6,a1,48 # [1] gra_spill_temp_107, input_offset s32i a7,a1,40 # gra_spill_temp_103, stride_wd*channels addi a8,a5,-15 # [2] s32i a2,a1,76 # [3] gra_spill_temp_112, input_data l32i a9,a1,328 # [4] id:664 bias+0x0 mov.n a2,a5 # [5] s32i a8,a1,64 # [7] gra_spill_temp_109 s32i a9,a1,68 # [8] gra_spill_temp_110, bias_ptr blti a8,1,.Lt_7_4610 # [9] l32i a12,a1,348 # [4] id:666 out_shift+0x0 mul16u a15,a3,a5 # [1] movi.n a9,0 # [13] s32i a12,a1,56 # [9] gra_spill_temp_106 // out_shift_ptr s32i a9,a1,60 # [14] gra_spill_temp_108, ch_idx s32i a15,a1,112 # [12] gra_spill_temp_113, input_wd*channels l32i a9,a1,352 # [24] id:665 out_mult+0x0 slli a15,a15,1 # [15] s32i a15,a1,72 # [23] gra_spill_temp_111, 2 * (input_wd * channels) s32i a9,a1,52 # [25] gra_spill_temp_105, out_mult_ptr // outer most out_ch loop .Lt_7_5122: # 0xd57 l32i a13,a1,324 # [1] filter_data l32i a6,a1,60 # [2] gra_spill_temp_108, ch_idx l32i a9,a1,48 # [0] gra_spill_temp_107, input_offset ee.zero.q q2 # [3] add.n a13,a6,a13 # [4] s32i a13,a1,108 # [5] gra_spill_temp_121 // multiply accumulate filter points ee.vld.128.xp q1,a13,a2 # [6] id:673 ee.vld.128.xp q3,a13,a2 # [7] id:674 ee.vcmp.lt.s8 q0,q1,q2 # [8] ee.vcmp.lt.s8 q4,q3,q2 # [9] ee.vzip.8 q1,q0 # [10] ee.vzip.8 q3,q4 # [11] ee.vadds.s16 q0,q0,q4 # [12] ee.vld.128.xp q4,a13,a2 # [13] id:675 ee.vadds.s16 q1,q1,q3 # [14] ee.vcmp.lt.s8 q3,q4,q2 # [15] ee.vzip.8 q4,q3 # [16] ee.vadds.s16 q1,q1,q4 # [17] ee.vld.128.xp q4,a13,a2 # [18] id:676 ee.vadds.s16 q0,q0,q3 # [19] ee.vcmp.lt.s8 q3,q4,q2 # [20] ee.vzip.8 q4,q3 # [21] ee.vadds.s16 q0,q0,q3 # [22] ee.vld.128.xp q3,a13,a2 # [23] id:677 ee.vadds.s16 q1,q1,q4 # [24] ee.vcmp.lt.s8 q4,q3,q2 # [25] ee.vzip.8 q3,q4 # [26] ee.vadds.s16 q1,q1,q3 # [27] ee.vld.128.xp q3,a13,a2 # [28] id:678 ee.vadds.s16 q0,q0,q4 # [29] ee.vcmp.lt.s8 q4,q3,q2 # 
[30] ee.vzip.8 q3,q4 # [31] ee.vadds.s16 q0,q0,q4 # [32] ee.vld.128.xp q4,a13,a2 # [33] id:679 ee.vadds.s16 q1,q1,q3 # [34] ee.vcmp.lt.s8 q3,q4,q2 # [35] ee.vzip.8 q4,q3 # [36] ee.vadds.s16 q1,q1,q4 # [37] ee.vld.128.xp q4,a13,a2 # [38] id:680 ee.vadds.s16 q0,q0,q3 # [39] ee.vcmp.lt.s8 q3,q4,q2 # [40] ee.vzip.8 q4,q3 # [41] ee.vadds.s16 q0,q0,q3 # [42] ee.vld.128.xp q3,a13,a2 # [44] id:681 ee.vadds.s16 q1,q1,q4 # [43] ee.vcmp.lt.s8 q2,q3,q2 # [47] ee.vzip.8 q3,q2 # [48] ee.vadds.s16 q0,q0,q2 # [49] ee.vadds.s16 q1,q1,q3 # [50] ee.movi.32.a q1,a15,1 # [51] ee.movi.32.a q1,a8,3 # [52] ee.movi.32.a q0,a10,3 # [54] ee.movi.32.a q0,a13,1 # [55] srai a11,a10,16 # [56] srai a12,a8,16 # [57] mull a12,a9,a12 # [58] mull a11,a9,a11 # [59] sext a8,a8,15 # [328] sext a10,a10,15 # [61] srai a14,a13,16 # [62] mull a14,a9,a14 # [63] mull a10,a9,a10 # [64] mull a8,a9,a8 # [65] sext a13,a13,15 # [66] mull a13,a9,a13 # [67] ee.movi.32.q q3,a11,3 # [68] ee.movi.32.q q4,a12,3 # [69] ee.movi.32.q q4,a8,2 # [70] ee.movi.32.q q3,a10,2 # [71] ee.movi.32.a q1,a11,2 # [72] srai a12,a11,16 # [74] srai a8,a15,16 # [75] mull a8,a9,a8 # [76] mull a12,a9,a12 # [77] sext a15,a15,15 # [78] sext a11,a11,15 # [79] mull a11,a9,a11 # [80] mull a15,a9,a15 # [81] ee.movi.32.q q4,a12,1 # [82] ee.movi.32.q q1,a8,3 # [83] ee.movi.32.q q1,a15,2 # [84] ee.movi.32.q q4,a11,0 # [85] ee.movi.32.a q0,a15,2 # [86] ee.movi.32.q q0,a14,3 # [88] ee.movi.32.q q0,a13,2 # [91] srai a8,a15,16 # [89] mull a8,a9,a8 # [90] sext a15,a15,15 # [92] mull a15,a9,a15 # [93] # 526 MUL_IN_OFFSET_EXPAND(q_sum2, 0, q_sum2, 0); ee.movi.32.a q0,a11,0 # [94] srai a13,a11,16 # [95] ee.movi.32.q q3,a8,1 # [96] ee.movi.32.q q3,a15,0 # [100] sext a11,a11,15 # [97] mull a13,a9,a13 # [98] l32i a8,a1,332 # [99] ee.movi.32.a q1,a10,0 # [103] ee.movi.32.q q0,a13,1 # [100] srai a12,a10,16 # [105] sext a10,a10,15 # [106] mull a12,a9,a12 # [107] mull a10,a9,a10 # [108] mull a9,a9,a11 # [109] ee.movi.32.q q1,a12,1 # [110] ee.movi.32.q q1,a10,0 # 
[111] l32i a11,a1,328 // load bias add.n a6,a6,a8 # [102] ee.movi.32.q q0,a9,0 # [113] beqz.n a11,.Lt_7_5378 # [114] // add bias l32i a8,a1,68 # [0] gra_spill_temp_110, bias_ptr extui a11,a11,0,4 # [2] // bias_align wur.sar_byte a11 # [4] ee.vld.128.ip q5,a8,16 # [5] id:683 ee.vld.128.ip q6,a8,16 # [6] id:684 ee.vld.128.ip q7,a8,16 # [7] id:685 addmi a10,a1,256 # [2] ee.src.q.ld.ip q2,a8,16,q5,q6 # [9] ee.vadds.s32 q1,q1,q5 # [12] ee.src.q.ld.ip q5,a8,0,q6,q7 # [13] s32i a8,a1,68 # [11] gra_spill_temp_110, bias_ptr ee.vadds.s32 q4,q4,q6 # [18] ee.src.q q7,q7,q2 # [9] ee.src.q q2,q2,q5 # [13] ee.vadds.s32 q0,q0,q7 # [12] ee.vadds.s32 q3,q3,q2 # [12] .Lt_7_5378: # 0xeef // store offset+bias factor (q1,q4,q0,q3) st.qr q4,a10,-112 # [17] gra_spill_temp_135-256 st.qr q3,a10,-128 # [21] gra_spill_temp_134-256 st.qr q1,a10,-96 # [7] gra_spill_temp_133-256 st.qr q0,a10,-80 # [8] gra_spill_temp_132-256 // prepare height loop movi.n a15,0 # [1] movi.n a8,0 # [2] movi.n a9,0 # [3] s32i a9,a1,100 # [4] gra_spill_temp_119 s32i a8,a1,104 # [5] gra_spill_temp_120 s32i a15,a1,96 # [6] gra_spill_temp_118 // height loop .Lt_7_6402: # 0xf0c l32i a4,a1,104 # [2] gra_spill_temp_120 // out_y * (input_wd * stride_ht) * channels) l32i a8,a1,100 # [3] gra_spill_temp_119 // initialised to 0 before height loop l32i a5,a1,76 # [1] gra_spill_temp_112, input_data l32i a3,a1,60 # [0] gra_spill_temp_108, ch_idx l32i a7,a1,112 # [1] gra_spill_temp_113, input_wd*channels l32i a10,a1,336 # [0] out_wd add.n a4,a4,a5 # [4] // input_data + (out_y * stride_ht) * input_wd * channels mov.n a5,a8 # [5] // index add.n a3,a3,a4 # [6] // input_row0 l32i a4,a1,72 # [9] gra_spill_temp_111, 2 * (input_wd * channels) add.n a7,a7,a3 # [7] // input_row1 = (input_wd * channels) add.n a8,a8,a10 # [8] s32i a8,a1,120 # [10] gra_spill_temp_130 add.n a4,a4,a3 # [11] // input_row2 // width loop .Lt_7_7170: # 0xf32 l32i a9,a1,108 # [3] gra_spill_temp_121, filter_ptr ee.zero.qacc # [2] mov.n a12,a3 # [4] mov.n a11,a7 # [1] 
mov.n a10,a4 # [0] ee.vld.128.xp q0,a12,a2 # [5] id:693 ee.vld.128.xp q6,a12,a2 # [6] id:695 ee.vld.128.xp q1,a9,a2 # [7] id:694 ee.vld.128.xp q7,a9,a2 # [8] id:696 ee.vld.128.xp q5,a9,a2 # [9] id:698 ee.vld.128.xp q3,a9,a2 # [10] id:700 ee.vmulas.s8.qacc.ld.xp q4,a12,a2,q0,q1 # [11] id:697 ee.vmulas.s8.qacc.ld.xp q2,a11,a2,q6,q7 # [13] id:699 ee.vld.128.xp q1,a9,a2 # [14] id:702 ee.vmulas.s8.qacc.ld.xp q0,a11,a2,q4,q5 # [15] id:701 ee.vmulas.s8.qacc.ld.xp q6,a11,a2,q2,q3 # [16] id:703 ee.vld.128.xp q7,a9,a2 # [17] id:704 ee.vld.128.xp q3,a9,a2 # [18] id:706 ee.vmulas.s8.qacc.ld.xp q0,a10,a2,q0,q1 # [19] id:705 ee.vmulas.s8.qacc.ld.xp q1,a10,a2,q6,q7 # [20] id:707 ee.vmulas.s8.qacc.ld.xp q4,a10,a2,q0,q3 # [21] id:709 ee.vld.128.xp q6,a9,a2 # [22] id:708 ee.vld.128.xp q5,a9,a2 # [23] id:710 ee.vmulas.s8.qacc q1,q6 # [24] ee.vmulas.s8.qacc q4,q5 # [25] // extract data mov a12,a1 //// scratch ee.st.qacc_l.l.128.ip a12,16 # [27] id:713 ee.st.qacc_l.h.32.ip a12,-16 # [28] id:714 l32i.n a9,a1,8 # [29] qacc_scratch+8 l32i.n a11,a1,4 # [30] qacc_scratch+4 l32i.n a15,a1,0 # [31] qacc_scratch slli a14,a11,24 # [32] sext a8,a15,19 # [33] slli a10,a9,16 # [34] slli a13,a11,4 # [35] extui a9,a9,16,16 # [36] srai a13,a13,12 # [37] extui a15,a15,20,12 # [39] srai a14,a14,12 # [40] srai a10,a10,12 # [41] extui a11,a11,28,4 # [42] or a10,a10,a11 # [43] or a14,a14,a15 # [44] // insert to q0 ee.movi.32.q q0,a8,0 # [38] ee.movi.32.q q0,a14,1 # [45] ee.movi.32.q q0,a13,2 # [48] ee.movi.32.q q0,a10,3 # [49] l32i.n a11,a1,16 # [46] qacc_scratch+16 l32i.n a14,a1,12 # [47] qacc_scratch+12 slli a13,a11,20 # [50] ee.st.qacc_h.l.128.ip a12,16 # [51] id:720 ee.st.qacc_h.h.32.ip a12,-16 # [55] id:721 srai a11,a11,12 # [52] srai a13,a13,12 # [53] slli a8,a14,28 # [54] slli a15,a14,8 # [56] srai a15,a15,12 # [57] srai a8,a8,12 # [59] l32i.n a12,a1,8 # [328] qacc_scratch+8 or a8,a8,a9 # [61] extui a14,a14,24,8 # [62] l32i.n a9,a1,0 # [63] qacc_scratch or a13,a13,a14 # [64] //insert to q3 
ee.movi.32.q q3,a8,0 # [65] ee.movi.32.q q3,a15,1 # [67] ee.movi.32.q q3,a13,2 # [69] ee.movi.32.q q3,a11,3 # [70] l32i.n a14,a1,4 # [66] qacc_scratch+4 sext a10,a9,19 # [68] extui a9,a9,20,12 # [72] slli a13,a12,16 # [73] slli a8,a14,24 # [74] extui a12,a12,16,16 # [75] srai a13,a13,12 # [76] srai a8,a8,12 # [77] slli a15,a14,4 # [78] srai a15,a15,12 # [79] or a8,a8,a9 # [80] extui a14,a14,28,4 # [81] l32i.n a9,a1,12 # [82] qacc_scratch+12 or a13,a13,a14 # [83] // insert to q1 ee.movi.32.q q1,a10,0 # [71] ee.movi.32.q q1,a8,1 # [84] ee.movi.32.q q1,a15,2 # [85] ee.movi.32.q q1,a13,3 # [88] // load in_offset+bias factor addmi a14,a1,256 # [86] ld.qr q7,a14,-128 # [87] gra_spill_temp_134-256 ld.qr q4,a14,-112 # [89] gra_spill_temp_135-256 l32i.n a15,a1,16 # [90] qacc_scratch+16 ld.qr q2,a14,-96 # [91] gra_spill_temp_133-256 slli a11,a9,28 # [92] slli a10,a9,8 # [93] srai a10,a10,12 # [94] srai a11,a11,12 # [95] extui a9,a9,24,8 # [96] or a11,a11,a12 # [97] ee.vadds.s32 q0,q0,q2 # [98] slli a8,a15,20 # [99] ee.vadds.s32 q3,q3,q4 # [100] st.qr q3,a1,80 # [101] gra_spill_temp_137-256 srai a15,a15,12 # [102] ld.qr q2,a14,-80 # [103] gra_spill_temp_132-256 srai a8,a8,12 # [105] or a8,a8,a9 # [108] // insert to q6 ee.movi.32.q q6,a11,0 # [100] ee.movi.32.q q6,a10,1 # [107] ee.movi.32.q q6,a8,2 # [112] ee.movi.32.q q6,a15,3 # [113] ee.vadds.s32 q1,q1,q2 # [110] ee.vadds.s32 q6,q6,q7 # [114] st.qr q1,a1,16 # [111] gra_spill_temp_120 s32i.n a7,a1,32 # [0] // tmp s32i.n a6,a1,36 # [106] // tmp l32i a7,a1,52 # [109] gra_spill_temp_105, out_mult_ptr l32i a6,a1,56 # [106] gra_spill_temp_106, out_shift_ptr addi.n a10,a7,0 addi.n a11,a6,0 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [116] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 mv.qr q5,q0 ld.qr q0,a1,80 # [4] gra_spill_temp_137-256 addi.n a10,a7,16 addi.n a11,a6,16 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [5] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 mv.qr q4,q0 ld.qr q0,a1,16 # [5] 
gra_spill_temp_120 addi.n a10,a7,32 addi.n a11,a6,32 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [6] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 st.qr q0,a1,0 # [3] gra_spill_temp_141 mv.qr q0,q6 addi.n a10,a7,48 addi.n a11,a6,48 call8 esp_nn_multiply_by_quantized_mult_ver1_esp32s3 # [6] esp_nn_multiply_by_quantized_mult_ver1_esp32s3 l32i.n a6,a1,36 # [106] // tmp l32i.n a7,a1,32 # [0] // tmp l32i a15,a1,40 # gra_spill_temp_103, stride_wd * channels l32i a11,a1,120 # [3] gra_spill_temp_130 add.n a3,a3,a15 # [0] add.n a4,a4,a15 # [1] add.n a7,a7,a15 # [2] addi.n a5,a5,1 # [4] // add offset, apply activation and store addmi a13,a1,256 # [8] ld.qr q3,a1,0 # [10] gra_spill_temp_141 mv.qr q2,q5 addi a8,a13,88 # [14] addi a9,a13,100 # [15] addi a15,a13,104 # [13] ee.vldbc.32 q6,a9 # [17] id:723 activation_min ee.vldbc.32 q1,a8 # [18] id:722 out_offset ee.vldbc.32 q7,a15 # [19] id:724 activation_max ee.vadds.s32 q4,q4,q1 # [20] ee.vadds.s32 q2,q2,q1 # [21] ee.vadds.s32 q5,q0,q1 # [22] ee.vadds.s32 q3,q3,q1 # [23] ee.vmin.s32 q3,q3,q7 # [24] ee.vmin.s32 q5,q5,q7 # [25] ee.vmin.s32 q2,q2,q7 # [26] ee.vmin.s32 q4,q4,q7 # [27] ee.vmax.s32 q4,q4,q6 # [28] ee.vmax.s32 q2,q2,q6 # [29] ee.vmax.s32 q5,q5,q6 # [30] ee.vmax.s32 q3,q3,q6 # [31] ee.vunzip.16 q3,q5 # [32] ee.vunzip.16 q2,q4 # [33] ee.vunzip.8 q2,q3 # [34] ee.vst.128.xp q2,a6,a2 # [35] id:725 bne a5,a11,.Lt_7_7170 # [36] .Lt_7_6658: # 0x112f # Part of loop body line 548, head labeled .Lt_7_6402 l32i a15,a1,112 # [3] gra_spill_temp_113, input_wd*channels l32i a10,a1,320 # gra_spill_temp_103 l32i a13,a1,340 # [0] // out_ht l32i a9,a1,116 # [1] gra_spill_temp_114, input_wd l32i a12,a1,96 # [4] gra_spill_temp_118 mull a15,a10,a15 # // (input_wd * stride_ht) * channels l32i a14,a1,104 # [5] gra_spill_temp_120 l32i a8,a1,100 # [2] gra_spill_temp_119 addi.n a12,a12,1 # [6] s32i a12,a1,96 # [7] gra_spill_temp_118 add.n a14,a14,a15 # [8] add.n a8,a8,a9 # [9] s32i a8,a1,100 # [10] gra_spill_temp_119 s32i a14,a1,104 
# [11] gra_spill_temp_120, (input_wd * stride_wd) * channels bne a12,a13,.Lt_7_6402 # [13] // iterate over height loop # Part of loop body line 348, head labeled .Lt_7_5122 l32i a11,a1,56 # [6] gra_spill_temp_106 // out_shift_ptr l32i a15,a1,52 # [2] gra_spill_temp_105, out_mult_ptr l32i a10,a1,60 # [24] gra_spill_temp_108, ch_idx addi a11,a11,64 # [8] addi a15,a15,64 # [13] s32i a11,a1,56 # [23] gra_spill_temp_106 s32i a15,a1,52 # [18] gra_spill_temp_105, out_mult_ptr l32i a11,a1,64 # [25] gra_spill_temp_109 addi a10,a10,16 # [26] s32i a10,a1,60 # [27] gra_spill_temp_108, ch_idx blt a10,a11,.Lt_7_5122 # [28] // iterate over outer most out_ch loop .Lt_7_4610: # 0x11ad retw.n # [0] .size esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3, . - esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3 ================================================ FILE: src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S ================================================ // // SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD // // SPDX-License-Identifier: Apache-2.0 // // // s8 dot product for FC with 2x loop unrolling and QUP for unaligned filter. // Pattern adapted from esp-dsp dspi_dotprod_s8_aes3.S. // Input must be 16-byte aligned. Filter can be unaligned. 
//
.text
.align 4
.type esp_nn_fc_s8_mac16_esp32s3, @function
.align 4
.global esp_nn_fc_s8_mac16_esp32s3

// a2: input_data (16-byte aligned)
// a3: filter_data (may be unaligned)
// a4: row_len_div16 (>= 1)
// Returns: int32_t dot product in a2
//
// NOTE(review): for even row_len_div16 >= 2 the pipelined loop still
// pre-loads one 16-byte chunk past the end of both buffers (the loads are
// discarded, only the MACs count). Caller must guarantee those bytes are
// readable — TODO confirm buffer padding at call sites.
esp_nn_fc_s8_mac16_esp32s3:
    entry a1, 32
    ee.zero.accx
    beqz a4, .Ldone

    // Prime: first unaligned filter load (sets SAR_BYTE)
    ee.ld.128.usar.ip q0, a3, 16            // filter chunk 0

    // Check if we can do 2x unrolled (need >= 2 iterations)
    srai a5, a4, 1                          // a5 = row_len_div16 / 2
    beqz a5, .Lsingle                       // exactly one chunk total

    // Load first input + filter pair for unrolled loop
    ee.vld.128.ip q1, a2, 16                // input[0]
    ee.ld.128.usar.ip q2, a3, 16            // filter chunk 1

    // 2x unrolled main loop: 2 MACs per iteration
    loopgtz a5, .Lloop2_end
    ee.src.q.qup q4, q0, q2                 // align filter[i]
    ee.vld.128.ip q3, a2, 16                // input[i+1]
    ee.vmulas.s8.accx q4, q1                // MAC filter[i] * input[i]
    ee.ld.128.usar.ip q0, a3, 16            // filter chunk[i+2]
    ee.src.q.qup q5, q2, q0                 // align filter[i+1]
    ee.vld.128.ip q1, a2, 16                // input[i+2] (primed for next)
    ee.vmulas.s8.accx q5, q3                // MAC filter[i+1] * input[i+1]
    ee.ld.128.usar.ip q2, a3, 16            // filter chunk[i+3]
.Lloop2_end:

    bbci a4, 0, .Ldone_mac                  // even count: everything consumed

    // Odd remainder after the 2x loop: the final input chunk is ALREADY in
    // q1 and its two straddling filter chunks are in q0/q2 (primed by the
    // last loop iteration). The previous code fell into .Lsingle here and
    // reloaded — consuming data one chunk PAST the remainder and aligning
    // filter chunks k and k+2 instead of k and k+1, i.e. a wrong result for
    // any odd row_len_div16 >= 3. Just align and MAC the primed registers.
    ee.src.q.qup q4, q0, q2                 // align last filter chunk
    ee.vmulas.s8.accx q4, q1                // MAC last chunk
    j .Ldone_mac

.Lsingle:
    // Exactly one 16-byte chunk: load input + second filter chunk, QUP, MAC
    ee.vld.128.ip q1, a2, 16                // input
    ee.ld.128.usar.ip q2, a3, 16            // next filter chunk
    ee.src.q.qup q4, q0, q2                 // align filter
    ee.vmulas.s8.accx q4, q1                // MAC

.Ldone_mac:
.Ldone:
    // 2-cycle gap before ACCX read
    movi.n a3, 0
    nop
    ee.srs.accx a2, a3, 0
    retw.n
    .size esp_nn_fc_s8_mac16_esp32s3, .
- esp_nn_fc_s8_mac16_esp32s3 ================================================ FILE: src/fully_connected/esp_nn_fully_connected_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include #include void esp_nn_fully_connected_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max) { for (int32_t out_c = 0; out_c < out_channels; ++out_c) { int32_t result = 0; for (int32_t data_idx = 0; data_idx < row_len; data_idx++) { int32_t filter_index = row_len * out_c + data_idx; int32_t input_val = input_data[data_idx]; int32_t filter_val = filter_data[filter_index]; result += (filter_val + filter_offset) * (input_val + input_offset); } if (bias) { result += bias[out_c]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_c] = (int8_t) result; } } void esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t 
*out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t* out_shift, const int32_t* out_mult, const int32_t activation_min, const int32_t activation_max) { for (int32_t out_c = 0; out_c < out_channels; ++out_c) { int32_t result = 0; for (int32_t data_idx = 0; data_idx < row_len; data_idx++) { int32_t filter_index = row_len * out_c + data_idx; int32_t input_val = input_data[data_idx]; int32_t filter_val = filter_data[filter_index]; result += (filter_val + filter_offset) * (input_val + input_offset); } if (bias) { result += bias[out_c]; } result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_c], out_shift[out_c]); result += out_offset; result = max(result, activation_min); result = min(result, activation_max); out_data[out_c] = (int8_t) result; } } ================================================ FILE: src/fully_connected/esp_nn_fully_connected_esp32s3.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ /* * FC multi-path dispatcher for ESP32-S3. 
* - Pre-computes offset corrections per channel in C * - Dispatches to s8 MAC assembly (aligned, large row_len) or s16 assembly (fallback) */ #include #include #include #include /* Original s16 assembly (renamed) */ extern void esp_nn_fc_s16_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max); extern void esp_nn_fc_per_ch_s16_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max); /* Shared s8 dot product from common — handles unaligned filter via USAR+QUP */ extern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16); void esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t out_shift, const int32_t out_mult, const int32_t activation_min, const int32_t activation_max) { /* Quick check: s8 fast path only for aligned, row_len%16, no filter_offset */ if (__builtin_expect(filter_offset != 0 || row_len < 16 || ((uintptr_t)input_data & 15), 0)) { /* Fallback to original s16 assembly — tail call, no extra overhead */ esp_nn_fc_s16_esp32s3(input_data, input_offset, row_len, filter_data, filter_offset, bias, out_data, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); return; } { int32_t row_len_div16 = row_len 
>> 4; /* Pre-compute per-channel corrections once */ int32_t corrections[out_channels]; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t corr = 0; if (input_offset != 0) { int32_t filter_sum = 0; for (int i = 0; i < row_len; i++) { filter_sum += f_ptr[i]; } corr = filter_sum * input_offset; } if (bias) { corr += bias[ch]; } corrections[ch] = corr; } int32_t row_len_rem = row_len & 15; int32_t simd_bytes = row_len_div16 << 4; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t acc = esp_nn_dot_s8_unaligned_esp32s3(input_data, f_ptr, row_len_div16); /* Scalar remainder for non-multiple-of-16 row_len */ for (int i = 0; i < row_len_rem; i++) { acc += (int32_t)input_data[simd_bytes + i] * (int32_t)f_ptr[simd_bytes + i]; } acc += corrections[ch]; acc = esp_nn_multiply_by_quantized_mult(acc, out_mult, out_shift); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[ch] = (int8_t)acc; } } } void esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data, const int32_t input_offset, const uint16_t row_len, const int8_t *filter_data, const int32_t filter_offset, const int32_t *bias, int8_t *out_data, const uint16_t out_channels, const int32_t out_offset, const int32_t *out_shift, const int32_t *out_mult, const int32_t activation_min, const int32_t activation_max) { if (__builtin_expect(filter_offset != 0 || row_len < 16 || ((uintptr_t)input_data & 15), 0)) { esp_nn_fc_per_ch_s16_esp32s3(input_data, input_offset, row_len, filter_data, filter_offset, bias, out_data, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); return; } { int32_t row_len_div16 = row_len >> 4; /* Pre-compute per-channel corrections once */ int32_t corrections[out_channels]; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t corr = 0; if (input_offset != 0) { int32_t filter_sum = 
0; for (int i = 0; i < row_len; i++) { filter_sum += f_ptr[i]; } corr = filter_sum * input_offset; } if (bias) { corr += bias[ch]; } corrections[ch] = corr; } int32_t row_len_rem = row_len & 15; int32_t simd_bytes = row_len_div16 << 4; for (int ch = 0; ch < out_channels; ch++) { const int8_t *f_ptr = filter_data + ch * row_len; int32_t acc = esp_nn_dot_s8_unaligned_esp32s3(input_data, f_ptr, row_len_div16); for (int i = 0; i < row_len_rem; i++) { acc += (int32_t)input_data[simd_bytes + i] * (int32_t)f_ptr[simd_bytes + i]; } acc += corrections[ch]; acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[ch], out_shift[ch]); acc += out_offset; acc = max(acc, activation_min); acc = min(acc, activation_max); out_data[ch] = (int8_t)acc; } } } ================================================ FILE: src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S ================================================ // // SPDX-FileCopyrightText: 2025-2026 Espressif Systems (Shanghai) CO LTD // // SPDX-License-Identifier: Apache-2.0 // .text .align 4 .literal_position .literal .LC3_26_101, 1073741824 // nudge (1 << 30) # Program Unit: esp_nn_fc_per_ch_s16_esp32s3 .type esp_nn_fc_per_ch_s16_esp32s3, @function .align 4 .global esp_nn_fc_per_ch_s16_esp32s3 // a2: input_data // a3: input_offset // a4: row_len // a5: filter_data // a6: filter_offset // a7: bias // on stack: out_data // on stack: out_channels // on stack: out_offset // on stack: out_shift // on stack: out_mult // on stack: activation_min // on stack: activation_max esp_nn_fc_per_ch_s16_esp32s3: # 0x4 # qacc_scratch = 0 // 40, filter_offset // 44, input_offset # gra_spill_temp_7 = 48 # gra_spill_temp_2 = 60 # gra_spill_temp_3 = 64 # gra_spill_temp_4 = 68 # gra_spill_temp_5 = 72 # gra_spill_temp_6 = 76 # gra_spill_temp_8 = 80 # gra_spill_temp_9 = 84 entry a1,112 # s32i.n a5,a1,60 # [0] gra_spill_temp_2, filter_data s32i a7,a1,48 # [1] gra_spill_temp_7, bias s32i a6,a1,40 # [2] id:252 filter_offset+0x0 s32i a3,a1,44 # [3] 
id:251 input_offset+0x0 mov.n a13,a2 # [5] mov.n a12,a4 # [6] // out_channel loop l16ui a2,a1,116 # [7] id:255 out_channels+0x0 addi a4,a1,40 # [8] addi a8,a1,44 # [9] ee.vldbc.16 q5,a8 # [10] id:253 input_offset ee.vldbc.16 q6,a4 # [12] id:254 filter_offset beqz.n a2,.Lt_0_7938 # [13] ee.zero.q q7 # [0] srai a11,a12,3 # [2] l32i a8,a1,112 # [6] id:259 out_data+0x0 addi a9,a12,-7 # [7] s32i a9,a1,76 # [8] gra_spill_temp_6 s32i a8,a1,72 # [9] gra_spill_temp_5 s32i a11,a1,64 # [14] gra_spill_temp_3 slli a11,a11,3 # [16] s32i a11,a1,68 # [18] gra_spill_temp_4 movi.n a15,0 # [17] mov.n a14,a7 # [15] mov.n a11,a5 # [31] l32i a10,a1,124 # out_shift l32i a2,a1,128 # out_mult s32i a10,a1,80 # gra_spill_temp_8 s32i a2,a1,84 # gra_spill_temp_9 movi.n a10,0 # [32] mov.n a2,a11 # [33] .Lt_0_8450: # 0x12b l32i a9,a1,76 # [2] gra_spill_temp_6 extui a5,a11,0,3 # [34] ee.zero.accx slli a5,a5,1 # [3] bgei a9,0,.LBB6_esp_nn_fc_per_ch_s16_esp32s3 # [9] mov.n a5,a10 # [6] movi.n a2,0 # [0] j .Lt_0_8706 # [1] .LBB6_esp_nn_fc_per_ch_s16_esp32s3: # 0x147 wur.sar_byte a5 # [5] ee.vld.l.64.ip q4,a2,8 # [4] id:267 l32i a4,a1,64 # [0] gra_spill_temp_3 mov.n a3,a13 # [1] addx8 a5,a4,a10 # [2] ee.vcmp.lt.s8 q2,q4,q7 # [7] ee.vzip.8 q4,q2 # [8] loopgtz a4,.LBB45_esp_nn_fc_per_ch_s16_esp32s3 # [3] ee.vld.l.64.ip q0,a2,8 # [0*II+0] id:268 ee.vld.l.64.ip q1,a3,8 # [0*II+1] id:270 ee.vcmp.lt.s8 q2,q0,q7 # [0*II+2] ee.vcmp.lt.s8 q3,q1,q7 # [0*II+3] ee.vzip.8 q0,q2 # [0*II+4] ee.vzip.8 q1,q3 # [0*II+5] ee.vadds.s16 q1,q1,q5 # [0*II+6] ee.src.q.qup q2,q4,q0 # [0*II+7] ee.vadds.s16 q2,q2,q6 # [0*II+8] ee.vmulas.s16.accx q1,q2 # [0*II+9] .LBB45_esp_nn_fc_per_ch_s16_esp32s3: # 0x170 l32i a2,a1,68 # [0] gra_spill_temp_4 .Lt_0_8706: # 0x173 movi a9, 0 ee.srs.accx a6, a9, 0 bge a2,a12,.Lt_0_9730 # [38] // prepare remaining loop l32i a8,a1,44 # [0] id:251 input_offset+0x0 l32i a7,a1,40 # [1] id:252 filter_offset+0x0 sub a3,a12,a2 # [2] l32i.n a4,a1,60 # [3] gra_spill_temp_2 add.n a2,a2,a13 # [4] add.n 
a4,a4,a5 # [5] loopgtz a3,.LBB60_esp_nn_fc_per_ch_s16_esp32s3 # [6] // remaining c loop l8ui a3,a2,0 # [0*II+0] id:299 l8ui a5,a4,0 # [0*II+1] id:300 sext a3,a3,7 # [0*II+2] sext a5,a5,7 # [0*II+3] add.n a5,a5,a7 # [0*II+5] add.n a3,a3,a8 # [0*II+6] mull a3,a3,a5 # [0*II+7] addi.n a2,a2,1 # [0*II+8] addi.n a4,a4,1 # [0*II+4] add.n a6,a6,a3 # [0*II+9] .LBB60_esp_nn_fc_per_ch_s16_esp32s3: # 0x20f // add bias .Lt_0_9730: # 0x20f l32i a8,a1,48 # [0] gra_spill_temp_7, bias beqz.n a8,.Lt_0_10754 # [2], skip_bias l32i.n a9,a14,0 # [0] id:301 add.n a6,a6,a9 # [2] // apply quantization .Lt_0_10754: # 0x218 movi a4,0 l32i a5,a1,80 # [25] id:256 gra_spill_temp_8, out_shift+0x0 l32i a5,a5,0 max a2,a5,a4 // left_shift sub a5,a2,a5 // right_shift ssl a2 # [3] sll a6,a6 # [5] // x * (1 << left_shift) l32i a4,a1,84 # [2] gra_spill_temp_9 //out_mult l32r a3,.LC3_26_101 # [0] add.n a10,a10,a12 # [0] addi.n a14,a14,4 # [1] l32i a4,a4,0 add.n a11,a11,a12 # [6] // multiply add nudge and pick high32 ssai 31 mulsh a7,a4,a6 # [4] mull a4,a4,a6 # [5] mov.n a2,a11 # [27] add a4,a4,a3 saltu a8,a4,a3 add.n a7,a7,a8 src a3,a7,a4 // divide_by_power_of2_step blti a5,1,.skip_divide_by2 movi.n a8,1 # [28] addi a4,a5,-1 ssl a4 // load left_shift sll a8,a8 // to_add factor ( 1 << (exponent - 1)) extui a6,a3,31,1 # [33] sub a8,a8,a6 // modified to_add factor ( 1 << (exponent - 1) - (val < 0)) add a3,a3,a8 // val + to_add ssr a5 # [29] //load right_shift sra a3,a3 # [31] .skip_divide_by2: l32i a8,a1,120 # [41] out_offset l32i a7,a1,132 # [44] // activation_min l32i a4,a1,136 # [45] // activation_max add.n a8,a8,a3 # [46] // add out_offset l32i a6,a1,72 # [47] gra_spill_temp_5 l32i.n a3,a1,116 # [48] out_channels max a7,a7,a8 # [49] add.n a6,a15,a6 # [50] min a4,a4,a7 # [51] addi.n a15,a15,1 # [52] l32i a7,a1,84 # gra_spill_temp_9 l32i a8,a1,80 # gra_spill_temp_8 s8i a4,a6,0 # store output addi.n a7,a7,4 # increment mult pointer addi.n a8,a8,4 # increment mult pointer s32i a7,a1,84 # gra_spill_temp_9 
s32i a8,a1,80 # gra_spill_temp_8 bne a3,a15,.Lt_0_8450 # [55] .Lt_0_7938: # 0x25c retw.n # [0] .size esp_nn_fc_per_ch_s16_esp32s3, . - esp_nn_fc_per_ch_s16_esp32s3 ================================================ FILE: src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /** * Fully connected layer for s8 using ESP32-P4 PIE SIMD. * * Uses esp.vmulas.s8.xacc.ld.ip for fused 16-wide s8 MAC + load. * Pre-computes filter_sum * input_offset (like conv) so PIE path * works even with non-zero input_offset. * * Inner loop is software-pipelined: * iteration N: MAC(q0,q1) + load_next_input(q0) * load_next_filter(q1) <- hides MAC latency * counter_update <- independent of above */ /* Core dot product: PIE-accelerated when row_len >= 16 */ static inline __attribute__((always_inline)) int32_t fc_dot_s8_pie(const int8_t *input, const int8_t *filter, int32_t row_len) { int32_t result = 0; int32_t idx = 0; if (row_len >= 32) { /* Double-pumped: process 32 elements per iteration * Uses q0/q1 for first pair, q2/q3 for second pair */ asm volatile ( "esp.zero.xacc \n\t" "mv x30, %[in] \n\t" "mv x31, %[flt] \n\t" "li %[idx], 32 \n\t" "addi s7, %[len], -31 \n\t" /* Prime the pipeline: load first 32 bytes */ "esp.vld.128.ip q0, x30, 16 \n\t" "esp.vld.128.ip q2, x30, 16 \n\t" "esp.vld.128.ip q1, x31, 16 \n\t" "esp.vld.128.ip q3, x31, 16 \n\t" "j 2f \n\t" "1: \n\t" /* MAC pair 1 + load next input[0:16] */ "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t" /* Load next filter[0:16] while MAC settles */ "esp.vld.128.ip q1, x31, 16 \n\t" /* MAC pair 2 + load next input[16:32] */ "esp.vmulas.s8.xacc.ld.ip q2, x30, 16, q2, q3 \n\t" /* Load next filter[16:32] - interleaved with counter */ "esp.vld.128.ip q3, x31, 16 \n\t" "addi %[idx], %[idx], 32 \n\t" "2: \n\t" "blt %[idx], s7, 1b \n\t" /* Drain pipeline: final 
two MACs */
                "esp.vmulas.s8.xacc q0, q1 \n\t"
                "esp.vmulas.s8.xacc q2, q3 \n\t"
                /* Handle 16-element remainder if any (idx+16 <= row_len) */
                "addi s7, %[len], -15 \n\t"
                "bge %[idx], s7, 3f \n\t"
                "esp.vld.128.ip q0, x30, 16 \n\t"
                "esp.vld.128.ip q1, x31, 16 \n\t"
                "esp.vmulas.s8.xacc q0, q1 \n\t"
                "addi %[idx], %[idx], 16 \n\t"
                "3: \n\t"
                /* Read back the low word of the XACC accumulator */
                "esp.movx.r.xacc.l x30 \n\t"
                "mv %[res], x30 \n\t"
                : [idx] "+r"(idx), [res] "=r"(result)
                : [in] "r"(input), [flt] "r"(filter), [len] "r"(row_len)
                : "x30", "x31", "s7"
        );
    } else if (row_len >= 16) {
        /* Single-pumped for 16-31 element rows */
        asm volatile (
            "esp.zero.xacc \n\t"
            "mv x30, %[in] \n\t"
            "mv x31, %[flt] \n\t"
            "li %[idx], 16 \n\t"
            "addi s7, %[len], -15 \n\t"
            /* Prime q0/q1 with the first 16 input/filter bytes */
            "esp.vld.128.ip q0, x30, 16 \n\t"
            "esp.vld.128.ip q1, x31, 16 \n\t"
            "j 5f \n\t"
            "4: \n\t"
            /* Fused MAC + next input load; filter loaded separately */
            "esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \n\t"
            "esp.vld.128.ip q1, x31, 16 \n\t"
            "addi %[idx], %[idx], 16 \n\t"
            "5: \n\t"
            "blt %[idx], s7, 4b \n\t"
            /* MAC for the last primed pair */
            "esp.vmulas.s8.xacc q0, q1 \n\t"
            "esp.movx.r.xacc.l x30 \n\t"
            "mv %[res], x30 \n\t"
            : [idx] "+r"(idx), [res] "=r"(result)
            : [in] "r"(input), [flt] "r"(filter), [len] "r"(row_len)
            : "x30", "x31", "s7"
        );
    }
    /* Scalar remainder (fewer than 16 elements left, or row_len < 16) */
    for (; idx < row_len; idx++) {
        result += (int32_t)input[idx] * (int32_t)filter[idx];
    }
    return result;
}

/**
 * Fully-connected (dense) layer: int8 input/filter/output, per-tensor
 * quantization (single out_mult/out_shift for all output channels).
 *
 * For each output channel: dot(input_row, filter_row) + bias, then
 * requantize, add out_offset and clamp to [activation_min, activation_max].
 * When both offsets are zero the PIE SIMD dot product is used; otherwise a
 * scalar path applies input_offset/filter_offset per element.
 */
void esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,
                                       const int32_t input_offset,
                                       const uint16_t row_len,
                                       const int8_t *filter_data,
                                       const int32_t filter_offset,
                                       const int32_t *bias,
                                       int8_t *out_data,
                                       const uint16_t out_channels,
                                       const int32_t out_offset,
                                       const int32_t out_shift,
                                       const int32_t out_mult,
                                       const int32_t activation_min,
                                       const int32_t activation_max)
{
    /* Enable PIE once for all channels */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {
        const int8_t *filter_row = filter_data + (int32_t)row_len * out_c;
        int32_t result;
        if (input_offset == 0 && filter_offset == 0) {
            /* Fast PIE path: pure s8 dot product */
            result = fc_dot_s8_pie(input_data, filter_row, row_len);
        } else {
            /* Scalar path with offsets */
            result = 0;
            for (int32_t i = 0; i < row_len; i++) {
                result += ((int32_t)input_data[i] + input_offset) *
                          ((int32_t)filter_row[i] + filter_offset);
            }
        }
        if (bias) {
            result += bias[out_c];
        }
        result = esp_nn_requantize(result, out_mult, out_shift);
        result += out_offset;
        result = max(result, activation_min);
        result = min(result, activation_max);
        out_data[out_c] = (int8_t) result;
    }
}

/**
 * Fully-connected layer: int8 input/filter/output, per-channel quantization.
 *
 * Identical to esp_nn_fully_connected_s8_esp32p4 except that out_mult and
 * out_shift are arrays indexed by output channel.
 */
void esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,
                                              const int32_t input_offset,
                                              const uint16_t row_len,
                                              const int8_t *filter_data,
                                              const int32_t filter_offset,
                                              const int32_t *bias,
                                              int8_t *out_data,
                                              const uint16_t out_channels,
                                              const int32_t out_offset,
                                              const int32_t *out_shift,
                                              const int32_t *out_mult,
                                              const int32_t activation_min,
                                              const int32_t activation_max)
{
    /* Enable PIE once for all channels */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {
        const int8_t *filter_row = filter_data + (int32_t)row_len * out_c;
        int32_t result;
        if (input_offset == 0 && filter_offset == 0) {
            result = fc_dot_s8_pie(input_data, filter_row, row_len);
        } else {
            result = 0;
            for (int32_t i = 0; i < row_len; i++) {
                result += ((int32_t)input_data[i] + input_offset) *
                          ((int32_t)filter_row[i] + filter_offset);
            }
        }
        if (bias) {
            result += bias[out_c];
        }
        /* Per-channel requantization parameters */
        result = esp_nn_requantize(result, out_mult[out_c], out_shift[out_c]);
        result += out_offset;
        result = max(result, activation_min);
        result = min(result, activation_max);
        out_data[out_c] = (int8_t) result;
    }
}


================================================
FILE: src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

	.text
	.align	4
	.literal_position
	.literal
.LC3_26_101, 1073741824 // nudge (1 << 30)

# Program Unit: esp_nn_fc_s16_esp32s3
	.type	esp_nn_fc_s16_esp32s3, @function
	.align	4
	.global	esp_nn_fc_s16_esp32s3

// a2: input_data
// a3: input_offset
// a4: row_len
// a5: filter_data
// a6: filter_offset
// a7: bias
// on stack: out_data
// on stack: out_channels
// on stack: out_offset
// on stack: out_shift
// on stack: out_mult
// on stack: activation_min
// on stack: activation_max

esp_nn_fc_s16_esp32s3: # 0x4
	# qacc_scratch = 0
	// 40, filter_offset
	// 44, input_offset
	# gra_spill_temp_7 = 48
	# gra_spill_temp_0 = 52
	# gra_spill_temp_1 = 56
	# gra_spill_temp_2 = 60
	# gra_spill_temp_3 = 64
	# gra_spill_temp_4 = 68
	# gra_spill_temp_5 = 72
	# gra_spill_temp_6 = 76

	entry	a1,112 #
	s32i.n	a5,a1,60 # [0] gra_spill_temp_2, filter_data
	s32i	a7,a1,48 # [1] gra_spill_temp_7, bias
	s32i	a6,a1,40 # [2] id:252 filter_offset+0x0
	s32i	a3,a1,44 # [3] id:251 input_offset+0x0
	mov.n	a13,a2 # [5]
	mov.n	a12,a4 # [6]

// out_channel loop
	l16ui	a2,a1,116 # [7] id:255 out_channels+0x0
	addi	a4,a1,40 # [8]
	addi	a8,a1,44 # [9]
	// broadcast input/filter offsets into q5/q6 as 16-bit lanes
	ee.vldbc.16	q5,a8 # [10] id:253 input_offset
	ee.vldbc.16	q6,a4 # [12] id:254 filter_offset
	beqz.n	a2,.Lt_0_7938 # [13]
	ee.zero.q	q7 # [0]
	srai	a11,a12,3 # [2]
	// NOTE(review): a10 is overwritten by the out_shift load at [25]
	// before being used — this out_mult load looks dead (out_mult is
	// re-loaded later in the quantization step).
	l32i	a10,a1,128 # [5] id:257 out_mult+0x0
	l32i	a8,a1,112 # [6] id:259 out_data+0x0
	addi	a9,a12,-7 # [7]
	s32i	a9,a1,76 # [8] gra_spill_temp_6
	s32i	a8,a1,72 # [9] gra_spill_temp_5
	s32i	a11,a1,64 # [14] gra_spill_temp_3
	slli	a11,a11,3 # [16]
	s32i	a11,a1,68 # [18] gra_spill_temp_4
	l32i	a10,a1,124 # [25] id:256 out_shift+0x0
	movi.n	a15,0 # [17]
	mov.n	a14,a7 # [15]
	max	a11,a10,a15 # [29]
	s32i	a11,a1,52 # [30] gra_spill_temp_0 // left_shift
	sub	a10,a11,a10 # // right_shift
	s32i.n	a10,a1,56 # [28] gra_spill_temp_1 // right_shift
	mov.n	a11,a5 # [31]
	movi.n	a10,0 # [32]
	mov.n	a2,a11 # [33]

.Lt_0_8450: # 0x12b
	l32i	a9,a1,76 # [2] gra_spill_temp_6
	extui	a5,a11,0,3 # [34]
	ee.zero.accx
	slli	a5,a5,1 # [3]
	bgei	a9,0,.LBB6_esp_nn_fc_s16_esp32s3 # [9]
	mov.n	a5,a10 # [6]
	movi.n	a2,0 # [0]
	j	.Lt_0_8706 # [1]

.LBB6_esp_nn_fc_s16_esp32s3: # 0x147
	wur.sar_byte	a5 # [5]
	ee.vld.l.64.ip	q4,a2,8 # [4] id:267
	l32i	a4,a1,64 # [0] gra_spill_temp_3
	mov.n	a3,a13 # [1]
	addx8	a5,a4,a10 # [2]
	// sign-extend s8 lanes to s16 by zipping with a compare-against-zero mask
	ee.vcmp.lt.s8	q2,q4,q7 # [7]
	ee.vzip.8	q4,q2 # [8]
	loopgtz	a4,.LBB45_esp_nn_fc_s16_esp32s3 # [3]

	ee.vld.l.64.ip	q0,a2,8 # [0*II+0] id:268
	ee.vld.l.64.ip	q1,a3,8 # [0*II+1] id:270
	ee.vcmp.lt.s8	q2,q0,q7 # [0*II+2]
	ee.vcmp.lt.s8	q3,q1,q7 # [0*II+3]
	ee.vzip.8	q0,q2 # [0*II+4]
	ee.vzip.8	q1,q3 # [0*II+5]
	ee.vadds.s16	q1,q1,q5 # [0*II+6]
	ee.src.q.qup	q2,q4,q0 # [0*II+7]
	ee.vadds.s16	q2,q2,q6 # [0*II+8]
	ee.vmulas.s16.accx	q1,q2 # [0*II+9]
.LBB45_esp_nn_fc_s16_esp32s3: # 0x170
	l32i	a2,a1,68 # [0] gra_spill_temp_4

.Lt_0_8706: # 0x173
	movi	a9, 0
	ee.srs.accx	a6, a9, 0
	bge	a2,a12,.Lt_0_9730 # [38]

// prepare remaining loop
	l32i	a8,a1,44 # [0] id:251 input_offset+0x0
	l32i	a7,a1,40 # [1] id:252 filter_offset+0x0
	sub	a3,a12,a2 # [2]
	l32i.n	a4,a1,60 # [3] gra_spill_temp_2
	add.n	a2,a2,a13 # [4]
	add.n	a4,a4,a5 # [5]
	loopgtz	a3,.LBB60_esp_nn_fc_s16_esp32s3 # [6]

// remaining c loop
	l8ui	a3,a2,0 # [0*II+0] id:299
	l8ui	a5,a4,0 # [0*II+1] id:300
	sext	a3,a3,7 # [0*II+2]
	sext	a5,a5,7 # [0*II+3]
	add.n	a5,a5,a7 # [0*II+5]
	add.n	a3,a3,a8 # [0*II+6]
	mull	a3,a3,a5 # [0*II+7]
	addi.n	a2,a2,1 # [0*II+8]
	addi.n	a4,a4,1 # [0*II+4]
	add.n	a6,a6,a3 # [0*II+9]
.LBB60_esp_nn_fc_s16_esp32s3: # 0x20f

// add bias
.Lt_0_9730: # 0x20f
	l32i	a8,a1,48 # [0] gra_spill_temp_7, bias
	beqz.n	a8,.Lt_0_10754 # [2], skip_bias
	l32i.n	a9,a14,0 # [0] id:301
	add.n	a6,a6,a9 # [2]

// apply quantization
.Lt_0_10754: # 0x218
	l32i	a2,a1,52 # [1] gra_spill_temp_0 // left_shift
	l32i	a5,a1,56 # [2] gra_spill_temp_1 // right_shift
	ssl	a2 # [3]
	sll	a6,a6 # [5] // x * (1 << left_shift)
	l32r	a3,.LC3_26_101 # [0]
	add.n	a10,a10,a12 # [0]
	addi.n	a14,a14,4 # [1]
	l32i	a4,a1,128 # [2] gra_spill_temp_10 //out_mult
	add.n	a11,a11,a12 # [6]

// multiply add nudge and pick high32
	ssai	31
	mulsh	a7,a4,a6 # [4]
	mull	a4,a4,a6 # [5]
	mov.n	a2,a11 # [27]
	add
a4,a4,a3 saltu a8,a4,a3 add.n a7,a7,a8 src a3,a7,a4 // divide_by_power_of2_step blti a5,1,.skip_divide_by2 movi.n a8,1 # [28] addi a4,a5,-1 ssl a4 // load left_shift sll a8,a8 // to_add factor ( 1 << (exponent - 1)) extui a6,a3,31,1 # [33] sub a8,a8,a6 // modified to_add factor ( 1 << (exponent - 1) - (val < 0)) add a3,a3,a8 // val + to_add ssr a5 # [29] //load right_shift sra a3,a3 # [31] .skip_divide_by2: l32i a8,a1,120 # [41] out_offset l32i a7,a1,132 # [44] // activation_min l32i a4,a1,136 # [45] // activation_max add.n a8,a8,a3 # [46] // add out_offset l32i a6,a1,72 # [47] gra_spill_temp_5 l32i.n a3,a1,116 # [48] out_channels max a7,a7,a8 # [49] add.n a6,a15,a6 # [50] min a4,a4,a7 # [51] addi.n a15,a15,1 # [52] s8i a4,a6,0 # [53] id:302 bne a3,a15,.Lt_0_8450 # [55] .Lt_0_7938: # 0x25c retw.n # [0] .size esp_nn_fc_s16_esp32s3, . - esp_nn_fc_s16_esp32s3 ================================================ FILE: src/logistic/esp_nn_logistic_ansi.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include /* * LUT-based int8 logistic (sigmoid) for quantized inference. * * For int8, there are only 256 possible input values. We precompute sigmoid * for all of them during Prepare() and store as a 256-byte LUT. * Eval() then becomes a trivial table lookup — O(1) per element. * * Output quantization is fixed: scale = 1/256, zero_point = -128. * This matches TFLite's convention for int8 logistic output. */ int32_t esp_nn_get_logistic_s8_scratch_size_ansi(void) { return 256; /* LUT: one int8 output per possible int8 input */ } void esp_nn_logistic_s8_prepare_ansi(int8_t *lut, int32_t input_zero_point, float input_scale) { /* Build LUT: for each possible int8 input value (-128..127), * compute sigmoid and quantize to output int8. 
* * Output quant: scale=1/256, zero_point=-128 * So output_int8 = clamp(round(sigmoid * 256) - 128, -128, 127) * Which simplifies to: output_int8 = clamp(round(sigmoid * 256) - 128, -128, 127) */ for (int i = 0; i < 256; i++) { /* Index matches (uint8_t) cast of int8: i=0→int8(0), i=128→int8(-128) */ int8_t input_val = (int8_t)i; float dequant = (input_val - input_zero_point) * input_scale; float sigmoid = 1.0f / (1.0f + expf(-dequant)); /* Quantize to output: scale=1/256, zp=-128 */ int32_t out_q = (int32_t)roundf(sigmoid * 256.0f) - 128; if (out_q < -128) out_q = -128; if (out_q > 127) out_q = 127; lut[i] = (int8_t)out_q; } } void esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output, int32_t size, const int8_t *lut) { for (int i = 0; i < size; i++) { output[i] = lut[(uint8_t)input[i]]; } } ================================================ FILE: src/pooling/esp_nn_avg_pool_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
/* NOTE(review): the header names of the two includes below appear to have
 * been stripped during extraction — verify against the original sources. */
#include
#include

/**
 * Reference (ANSI C) average pooling for int8, NHWC layout.
 *
 * For every output (y, x, channel): averages the input values inside the
 * filter window clipped to the input box, rounds the division away from
 * zero, then clamps to [activation_min, activation_max].
 */
void esp_nn_avg_pool_s8_ansi(const int8_t *input,
                             const uint16_t input_wd,
                             const uint16_t input_ht,
                             int8_t *output,
                             const uint16_t output_wd,
                             const uint16_t output_ht,
                             const uint16_t stride_wd,
                             const uint16_t stride_ht,
                             const uint16_t filter_wd,
                             const uint16_t filter_ht,
                             const uint16_t pad_wd,
                             const uint16_t pad_ht,
                             const int32_t activation_min,
                             const int32_t activation_max,
                             const uint16_t channels)
{
    int32_t base_y = -pad_ht;
    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {
        int32_t base_x = -pad_wd;
        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {
            for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) {
                int32_t result = 0;
                int32_t filter_cnt = 0;
                /* Make sure filter does not cross the input box */
                int32_t filter_y_start = max(0, -base_y);
                int32_t filter_x_start = max(0, -base_x);
                int32_t filter_y_end = min(filter_ht, input_ht - base_y);
                int32_t filter_x_end = min(filter_wd, input_wd - base_x);

                for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) {
                    for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) {
                        int32_t in_x_idx = base_x + filter_x;
                        int32_t in_y_idx = base_y + filter_y;
                        int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx;
                        result += input[input_index];
                        filter_cnt++;
                    }
                }

                /* Rounded average */
                result = result > 0 ? (result + filter_cnt / 2) / filter_cnt
                                    : (result - filter_cnt / 2) / filter_cnt;

                /* Activation function */
                result = max(result, activation_min);
                result = min(result, activation_max);

                int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx;
                output[output_index] = (int8_t) result;
            }
        }
    }
}


================================================
FILE: src/pooling/esp_nn_avg_pool_s8_esp32p4.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/* NOTE(review): header names stripped during extraction — verify. */
#include
#include

/**
 * Average pooling for s8 using ESP32-P4 PIE SIMD.
 *
 * Uses QACC per-lane accumulation: multiply 16 input channels by a
 * vector of 1s, accumulate per-lane across filter window.
 * Extract 16 × int32 sums via esp.st.qacc.{l,h}.{l,h}.128.ip.
 * Then divide, clamp, and store.
 */
void esp_nn_avg_pool_s8_esp32p4(const int8_t *input,
                                const uint16_t input_wd,
                                const uint16_t input_ht,
                                int8_t *output,
                                const uint16_t output_wd,
                                const uint16_t output_ht,
                                const uint16_t stride_wd,
                                const uint16_t stride_ht,
                                const uint16_t filter_wd,
                                const uint16_t filter_ht,
                                const uint16_t pad_wd,
                                const uint16_t pad_ht,
                                const int32_t activation_min,
                                const int32_t activation_max,
                                const uint16_t channels)
{
    /* Enable PIE */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    /* Broadcast 1 into q7 for "multiply by 1" accumulation trick */
    const int8_t one_val = 1;
    asm volatile (
        "mv x30, %0 \n\t"
        "esp.vldbc.8.ip q7, x30, 0 \n\t"
        :: "r"(&one_val) : "x30"
    );

    const int32_t ch_16 = channels >> 4;  /* number of full 16-channel blocks */
    int32_t base_y = -pad_ht;
    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {
        int32_t base_x = -pad_wd;
        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {
            /* Clip the filter window against the input box */
            int32_t filter_y_start = max(0, -base_y);
            int32_t filter_x_start = max(0, -base_x);
            int32_t filter_y_end = min(filter_ht, input_ht - base_y);
            int32_t filter_x_end = min(filter_wd, input_wd - base_x);
            int32_t filter_cnt = (filter_y_end - filter_y_start) * (filter_x_end - filter_x_start);
            int32_t half_cnt = filter_cnt >> 1;  /* rounding term for the average */
            int8_t *out_ptr = output + (out_y * output_wd + out_x) * channels;

            /* Process 16 channels at a time using QACC per-lane accumulation */
            int32_t ch_offset = 0;
            for (int32_t ch_blk = 0; ch_blk < ch_16; ch_blk++, ch_offset += 16) {
                /* Clear per-lane accumulators */
                asm volatile ("esp.zero.qacc \n\t");

                /* Accumulate via QACC with stride-based fx loop */
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    int32_t in_y = base_y + fy;
                    const int8_t *row_ptr = input + (in_y * input_wd + base_x + filter_x_start) * channels + ch_offset;
                    int32_t fx_count = filter_x_end - filter_x_start;
                    asm volatile (
                        "mv x30, %[ptr] \n\t"
                        "mv s7, %[cnt] \n\t"
                        "1: \n\t"
                        "esp.vld.128.ip q0, x30, 0 \n\t"
                        "esp.vmulas.s8.qacc q0, q7 \n\t"
                        "add x30, x30, %[stride] \n\t"
                        "addi s7, s7, -1 \n\t"
                        "bnez s7, 1b \n\t"
                        :
                        : [ptr] "r"(row_ptr), [cnt] "r"(fx_count), [stride] "r"((int32_t)channels)
                        : "x30", "s7"
                    );
                }

                /* Extract 16 per-lane int32 sums from QACC:
                 * qacc has 4 quadrants, each 128 bits = 4 × int32 */
                int32_t sums[16] __attribute__((aligned(16)));
                asm volatile (
                    "mv x30, %0 \n\t"
                    "esp.st.qacc.l.l.128.ip x30, 16 \n\t" /* lanes 0-3 */
                    "esp.st.qacc.l.h.128.ip x30, 16 \n\t" /* lanes 4-7 */
                    "esp.st.qacc.h.l.128.ip x30, 16 \n\t" /* lanes 8-11 */
                    "esp.st.qacc.h.h.128.ip x30, 0 \n\t" /* lanes 12-15 */
                    :: "r"(sums) : "x30", "memory"
                );

                /* Rounded division and activation clamp */
                for (int k = 0; k < 16; k++) {
                    int32_t s = sums[k];
                    int32_t result = s > 0 ? (s + half_cnt) / filter_cnt
                                           : (s - half_cnt) / filter_cnt;
                    result = max(result, activation_min);
                    result = min(result, activation_max);
                    out_ptr[ch_offset + k] = (int8_t) result;
                }
            }

            /* Handle remaining channels scalar */
            for (int32_t ch_idx = ch_offset; ch_idx < channels; ch_idx++) {
                int32_t result = 0;
                int32_t count = 0;
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    for (int32_t fx = filter_x_start; fx < filter_x_end; fx++) {
                        int32_t in_y = base_y + fy;
                        int32_t in_x = base_x + fx;
                        result += input[(in_y * input_wd + in_x) * channels + ch_idx];
                        count++;
                    }
                }
                result = result > 0 ? (result + count / 2) / count
                                    : (result - count / 2) / count;
                result = max(result, activation_min);
                result = min(result, activation_max);
                out_ptr[ch_idx] = (int8_t) result;
            }
        }
    }
}


================================================
FILE: src/pooling/esp_nn_avg_pool_s8_esp32s3.S
================================================
//
// SPDX-FileCopyrightText: 2021-2026 Espressif Systems (Shanghai) CO LTD
//
// SPDX-License-Identifier: Apache-2.0
//

	.text
	.align	4
	.literal_position

# Program Unit: esp_nn_avg_pool_s8_esp32s3_asm
	.type	esp_nn_avg_pool_s8_esp32s3_asm, @function
	.align	4
	.global	esp_nn_avg_pool_s8_esp32s3_asm

// no of channels must be multiple of 4.
// a2: input
// a3: input_wd
// a4: input_ht
// a5: output
// a6: output_wd
// a7: output_ht
// on stack: stride_wd
// on stack: stride_ht
// on stack: filter_wd
// on stack: filter_ht
// on stack: pad_wd
// on stack: pad_ht
// on stack: activation_min
// on stack: activation_max
// on stack: channels

esp_nn_avg_pool_s8_esp32s3_asm: # 0x4
	# activation_min = 0
	# activation_max = 4
	# gra_spill_temp_0 = 8
	# gra_spill_temp_1 = 12
	# gra_spill_temp_2 = 16
	# gra_spill_temp_3 = 20
	# gra_spill_temp_4 = 24
	# gra_spill_temp_5 = 28
	# gra_spill_temp_6 = 32
	# gra_spill_temp_7 = 36
	# gra_spill_temp_8 = 40
	# gra_spill_temp_9 = 44
	# gra_spill_temp_10 = 48
	# gra_spill_temp_11 = 52
	# gra_spill_temp_12 = 56
	# gra_spill_temp_13 = 60
	# gra_spill_temp_14 = 64
	# gra_spill_temp_15 = 68
	# gra_spill_temp_16 = 72
	# gra_spill_temp_17 = 76
	# gra_spill_temp_18 = 80
	# gra_spill_temp_19 = 84
	# gra_spill_temp_20 = 88
	# gra_spill_temp_21 = 92
	# gra_spill_temp_22 = 96
	# gra_spill_temp_23 = 100
	# gra_spill_temp_24 = 104
	# gra_spill_temp_25 = 108
	# gra_spill_temp_26 = 112
	# gra_spill_temp_27 = 116
	# gra_spill_temp_28 = 120
	# gra_spill_temp_29 = 124
	# gra_spill_temp_30 = 128
	# gra_spill_temp_31 = 132
	# gra_spill_temp_32 = 136
	# gra_spill_temp_33 = 140
	# gra_spill_temp_34 = 144
	# gra_spill_temp_35 = 148
	# gra_spill_temp_36 = 152
	# gra_spill_temp_37 = 156
	# gra_spill_temp_38 = 160
	# gra_spill_temp_39 = 164
	# gra_spill_temp_40 = 168
	# gra_spill_temp_41 = 172
	# gra_spill_temp_43 = 180

	entry	a1,240 #
	mov.n	a11,a3 # [0]
	mov.n	a12,a2 # [1]
	s32i	a5,a1,136 # [4] gra_spill_temp_30
	s32i	a6,a1,128 # [3] gra_spill_temp_32
	l16ui	a5,a1,272 # [5] id:663 channels+0x0
	s32i	a7,a1,72 # [6] gra_spill_temp_16
	l32i	a9,a1,264 # [1] id:664 activation_min+0x0
	l32i	a10,a1,268 # [2] id:666 activation_max+0x0
	s32i.n	a9,a1,0 # [4] activation_min
	s32i.n	a10,a1,4 # [3] activation_max
	addi.n	a8,a1,4 # [0] activation_max
	// broadcast activation bounds into q7/q6 as 32-bit lanes for vmin/vmax
	ee.vldbc.32	q7,a1 # [5] id:668 activation_min
	ee.vldbc.32	q6,a8 # [6] id:669 activation_max
	ee.zero.q	q4 # [0]
	extui	a10,a5,0,3 # [7]
	beqz.n	a10,.LBB3_esp_nn_avg_pool_s8_esp32s3_asm # [8], if (channels % 8 == 0)
	extui	a13,a5,0,2 # [0]
	beqz.n	a13,.LBB52_esp_nn_avg_pool_s8_esp32s3_asm # [1], if (channels % 4 == 0)

// exit
.Lt_0_44546: # 0x1e9
	retw.n # [0]

.LBB3_esp_nn_avg_pool_s8_esp32s3_asm: # 0x1eb // if (channels % 8 == 0)
	l16ui	a7,a1,256 # [1] id:671 pad_wd+0x0
	l16ui	a10,a1,260 # [5] id:670 pad_ht+0x0
	l32i	a15,a1,72 # [12] gra_spill_temp_16
	movi.n	a14,0 # [13]
	movi.n	a8,0 # [14]
	neg	a10,a10 # [15]
	s32i	a10,a1,56 # [16] gra_spill_temp_12
	s32i	a8,a1,44 # [17] gra_spill_temp_9
	s32i.n	a14,a1,20 # [18] gra_spill_temp_3
	sub	a9,a4,a10 # [19]
	s32i	a9,a1,40 # [20] gra_spill_temp_8
	mul16u	a15,a15,a5 # [21]
	neg	a13,a7 # [22]
	s32i	a13,a1,104 # [23] gra_spill_temp_24
	s32i.n	a15,a1,16 # [24] gra_spill_temp_2
	sub	a13,a3,a13 # [25]
	s32i.n	a13,a1,12 # [26] gra_spill_temp_1
	j	.Lt_0_28162 # [27]

.Lt_0_28418: # 0x24e
# Part of loop body line 44, head labeled .Lt_0_28162
	l32i	a15,a1,260 # [0] pad_ht
	l32i	a14,a1,56 # [1] gra_spill_temp_12
	l32i.n	a9,a1,16 # [2] gra_spill_temp_2
	l32i	a13,a1,244 # [3] stride_ht
	l32i	a10,a1,40 # [4] gra_spill_temp_8
	l32i	a8,a1,44 # [5] gra_spill_temp_9
	sub	a10,a10,a13 # [6]
	add.n	a8,a8,a9 # [7]
	add.n	a14,a14,a13 # [8]
	sub	a15,a15,a13 # [9]
	// NOTE: the incoming pad_ht stack slot is reused as a mutable counter here
	s32i	a15,a1,260 # [10] pad_ht
	s32i	a14,a1,56 # [11] gra_spill_temp_12
	s32i	a8,a1,44 # [12] gra_spill_temp_9
	s32i	a10,a1,40 # [13] gra_spill_temp_8
	l32i.n	a8,a1,20 # [14] gra_spill_temp_3
	l32i	a9,a1,72 # [15] gra_spill_temp_16
	addi.n	a8,a8,1 # [16]
	s32i.n	a8,a1,20 # [17] gra_spill_temp_3
	beq	a8,a9,.Lt_0_44546 # [18]

.Lt_0_28162: # 0x281
	l32i	a10,a1,128 # [0] gra_spill_temp_32
	beqz.n	a10,.Lt_0_28418 # [2]

.LBB7_esp_nn_avg_pool_s8_esp32s3_asm: # 0x286
# Part of loop body line 44, head labeled .Lt_0_28162
	s32i	a7,a1,112 # [0] gra_spill_temp_26
	movi.n	a10,0 # [1]
	l32i	a9,a1,260 # [2] pad_ht
	l32i.n	a6,a1,12 # [3] gra_spill_temp_1
	l32i	a8,a1,44 # [4] gra_spill_temp_9
	movi.n	a13,0 # [5]
	l32i	a15,a1,104 # [6] gra_spill_temp_24
	s32i	a15,a1,116 # [7] gra_spill_temp_27
	s32i	a13,a1,48 # [8] gra_spill_temp_10
	s32i	a8,a1,124 # [9] gra_spill_temp_29
	s32i	a6,a1,120 # [10] gra_spill_temp_28
	l32i	a8,a1,40 # [11] gra_spill_temp_8
	l32i	a6,a1,252 # [12] filter_ht
	movi.n	a13,0 # [13]
	max	a9,a9,a10 # [14]
	s32i	a9,a1,160 # [15] gra_spill_temp_38
	s32i	a13,a1,92 # [16] gra_spill_temp_21
	min	a6,a6,a8 # [17]
	bnez.n	a5,.LBB10_esp_nn_avg_pool_s8_esp32s3_asm # [18]

.Lt_0_29186: # 0x2ba
	l32i	a8,a1,116 # [0] gra_spill_temp_27
	l32i	a15,a1,120 # [1] gra_spill_temp_28
	l32i	a9,a1,48 # [2] gra_spill_temp_10
	l32i	a14,a1,240 # [3] stride_wd
	l32i	a10,a1,124 # [4] gra_spill_temp_29
	l32i	a13,a1,112 # [5] gra_spill_temp_26
	add.n	a10,a10,a5 # [6]
	s32i	a10,a1,124 # [7] gra_spill_temp_29
	sub	a13,a13,a14 # [8]
	add.n	a9,a9,a14 # [9]
	sub	a15,a15,a14 # [10]
	add.n	a8,a8,a14 # [11]
	s32i	a8,a1,116 # [12] gra_spill_temp_27
	s32i	a15,a1,120 # [13] gra_spill_temp_28
	s32i	a9,a1,48 # [14] gra_spill_temp_10
	s32i	a13,a1,112 # [15] gra_spill_temp_26
	l32i	a9,a1,92 # [16] gra_spill_temp_21
	l32i	a10,a1,128 # [17] gra_spill_temp_32
	addi.n	a9,a9,1 # [18]
	s32i	a9,a1,92 # [19] gra_spill_temp_21
	beq	a9,a10,.Lt_0_28418 # [20]

.Lt_0_28930: # 0x2f5
# Part of loop body line 46, head labeled .Lt_0_29186
	beqz.n	a5,.Lt_0_29186 # [0]

.LBB10_esp_nn_avg_pool_s8_esp32s3_asm: # 0x2f7
# Part of loop body line 44, head labeled .Lt_0_28162
	l32i	a14,a1,120 # [0] gra_spill_temp_28
	l32i	a13,a1,248 # [1] filter_wd
	l32i	a9,a1,136 # [2] gra_spill_temp_30
	l32i	a8,a1,124 # [3] gra_spill_temp_29
	movi.n	a15,0 # [4]
	s32i	a15,a1,24 # [5] gra_spill_temp_60
	add.n	a10,a8,a5 # [6]
	movi.n	a15,0 # [7]
	add.n	a8,a8,a9 # [8]
	min	a13,a13,a14 # [9]
	add.n	a10,a9,a10 # [10]
	s32i	a10,a1,180 # [11] gra_spill_temp_43
	s32i	a13,a1,76 # [12] gra_spill_temp_17
	l32i	a14,a1,112 # [13] gra_spill_temp_26
	s32i	a8,a1,148 # [14] gra_spill_temp_45
	max	a14,a14,a15 # [15]
	l32i	a15,a1,116 # [16] gra_spill_temp_27
	s32i	a14,a1,152 # [17] gra_spill_temp_63
	add.n	a8,a15,a14 # [18]
	s32i	a8,a1,36 # [19] gra_spill_temp_7
	add.n	a15,a15,a13 # [20]
	s32i	a15,a1,204 # [21] gra_spill_temp_39
	sub	a13,a13,a14 # [22]
	// NOTE(review): offset 280 is outside the 240-byte frame created by
	// `entry a1,240` and past the last stack argument (channels at 272) —
	// verify this scratch slot does not clobber the caller's stack.
	s32i	a13,a1,280 # [23] gra_spill_temp_58
	j	.Lt_0_29698 # [24]

.LBB13_esp_nn_avg_pool_s8_esp32s3_asm: # 0x33b
# Part of loop body line 16, head labeled .Lt_0_29698
	l32i	a10,a1,56 # [0] gra_spill_temp_12
	l32i	a14,a1,204 # [1] gra_spill_temp_39
	add.n	a10,a10,a15 # [2]
	mull	a10,a11,a10 # [3]
	movi.n	a15,0 # [4]
	add.n	a14,a10,a14 # [5]

.Lt_0_30466: # 0x34a
# Loop body line 61, nesting depth: 4, estimated iterations: 252
	l32i	a9,a1,76 # [0] gra_spill_temp_17
	l32i	a8,a1,152 # [1] gra_spill_temp_63
	add.n	a14,a14,a11 # [2]
	bge	a8,a9,.Lt_0_30722 # [3]

.LBB16_esp_nn_avg_pool_s8_esp32s3_asm: # 0x355
# Part of loop body line 61, head labeled .Lt_0_30466
	l32i	a3,a1,36 # [0] gra_spill_temp_7
	l32i	a2,a1,24 # [1] gra_spill_temp_4
	add.n	a3,a3,a10 # [2]
	mull	a3,a3,a5 # [3]
	movi.n	a8,0 # [4]
	add.n	a2,a2,a3 # [5]
	l32i	a3,a1,280 # [6] gra_spill_temp_58
	add.n	a2,a12,a2 # [7]
	loopgtz	a3,.LBB140_esp_nn_avg_pool_s8_esp32s3_asm # [8]

	// widen 8 s8 lanes to s32 (two sign-extension zips) and accumulate
	ee.vld.l.64.xp	q0,a2,a5 # [0*II+1] id:677
	ee.vcmp.lt.s8	q1,q0,q4 # [0*II+3]
	ee.vzip.8	q0,q1 # [0*II+4]
	ee.vcmp.lt.s16	q1,q0,q4 # [0*II+5]
	ee.vzip.16	q0,q1 # [0*II+6]
	ee.vadds.s32	q2,q2,q1 # [0*II+7]
	ee.vadds.s32	q3,q3,q0 # [0*II+8]
.LBB140_esp_nn_avg_pool_s8_esp32s3_asm: # 0x385
# Part of loop body line 61, head labeled .Lt_0_30466
	l32i	a2,a1,48 # [0] gra_spill_temp_10
	sub	a9,a7,a2 # [2]
	sub	a2,a2,a7 # [3]
	max	a9,a9,a8 # [4]
	l32i	a8,a1,248 # [5] filter_wd
	sub	a2,a11,a2 # [6]
	min	a8,a8,a2 # [7]
	sub	a8,a8,a9 # [8]
	add.n	a15,a15,a8 # [9]

.Lt_0_30722: # 0x39f
# Part of loop body line 61, head labeled .Lt_0_30466
	add.n	a10,a10,a11 # [0]
	addi.n	a13,a13,1 # [1]
	bne	a6,a13,.Lt_0_30466 # [2]

.Lt_0_29954: # 0x3a6
	srai	a2,a15,1 # [3]

// move data to general purpose registers and average
	ee.movi.32.a	q3,a9,0 # [0]
	ee.movi.32.a	q3,a4,1 # [0]
	blti	a9,1,.Lt_0_32258 # [4]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_32002 # [2]
.Lt_0_32258: # 0x45e
	sub	a9,a9,a2 # [0]
.Lt_0_32002: # 0x3b9
	blti	a4,1,.Lt_0_32770 # [1]
	add.n	a4,a2,a4 # [0]
	j	.Lt_0_32514 # [2]
.Lt_0_32770:
	sub	a4,a4,a2 # [0]
.Lt_0_32514: # 0x3c4
	quos	a9,a9,a15 # [1]
	quos	a4,a4,a15 # [1]
	ee.movi.32.q	q3,a9,0 # [0]
	ee.movi.32.q	q3,a4,1 # [1]
	ee.movi.32.a	q3,a9,2 # [2]
	ee.movi.32.a	q3,a14,3 # [0]
	blti	a9,1,.Lt_0_33282 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_33026 # [2]
.Lt_0_33282: # 0x470
	sub	a9,a9,a2 # [0]
.Lt_0_33026: # 0x3d5
	blti	a14,1,.Lt_0_33794 # [1]
	add.n	a14,a2,a14 # [0]
	j	.Lt_0_33538 # [2]
.Lt_0_33794: # 0x479
	sub	a14,a14,a2 # [0]
.Lt_0_33538: # 0x3e0
	quos	a9,a9,a15 # [1]
	quos	a14,a14,a15 # [1]
	ee.movi.32.q	q3,a9,2 # [0]
	ee.movi.32.q	q3,a14,3 # [1]
	ee.movi.32.a	q2,a9,0 # [0]
	ee.movi.32.a	q2,a4,1 # [0]
	blti	a9,1,.Lt_0_34306 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_34050 # [2]
.Lt_0_34306: # 0x482
	sub	a9,a9,a2 # [0]
.Lt_0_34050: # 0x3f1
	blti	a4,1,.Lt_0_34818 # [1]
	add.n	a4,a2,a4 # [0]
	j	.Lt_0_34562 # [2]
.Lt_0_34818: # 0x48b
	sub	a4,a4,a2 # [0]
.Lt_0_34562: # 0x3fc
	quos	a9,a9,a15 # [1]
	quos	a4,a4,a15 # [1]
	ee.movi.32.q	q2,a9,0 # [0]
	ee.movi.32.q	q2,a4,1 # [1]
	ee.movi.32.a	q2,a9,2 # [2]
	ee.movi.32.a	q2,a14,3 # [0]
	blti	a9,1,.Lt_0_35330 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_35074 # [2]
.Lt_0_35330: # 0x494
	sub	a9,a9,a2 # [0]
.Lt_0_35074: # 0x40d
	blti	a14,1,.Lt_0_35842 # [1]
	add.n	a14,a2,a14 # [0]
	j	.Lt_0_35586 # [2]
.Lt_0_35842: # 0x49d
	sub	a14,a14,a2 # [0]
.Lt_0_35586: # 0x418
	quos	a9,a9,a15 # [1]
	quos	a14,a14,a15 # [1]
	ee.movi.32.q	q2,a9,2 # [0]
	ee.movi.32.q	q2,a14,3 # [1]
	l32i	a9,a1,180 # [0] gra_spill_temp_43
	l32i	a14,a1,24 # [1] gra_spill_temp_4
	l32i	a13,a1,148 # [2] gra_spill_temp_45
	// clamp to activation bounds, then narrow s32 lanes back to s8
	ee.vmin.s32	q1,q3,q6 # [4]
	ee.vmax.s32	q1,q1,q7 # [5]
	ee.vmin.s32	q5,q2,q6 # [8]
	addi.n	a14,a14,8 # [9]
	s32i	a14,a1,24 # [10] gra_spill_temp_4
	ee.vmax.s32	q5,q5,q7 # [11]
	addi.n	a8,a13,8 # [12]
	s32i	a8,a1,148 # [13] gra_spill_temp_45
	ee.vunzip.16	q1,q5 # [14]
	ee.vunzip.8	q1,q5 # [15]
	ee.vst.l.64.ip	q1,a13,0 # [16] id:678
	bge	a8,a9,.Lt_0_29186 # [17]

.Lt_0_29698: # 0x44b
# Loop body line 16, nesting depth: 3, estimated iterations: 252
	mv.qr	q3,q4 # [0]
	l32i	a15,a1,160 # [1] gra_spill_temp_38
	mv.qr	q2,q4 # [2]
	mov.n	a13,a15 # [3]
	blt	a15,a6,.LBB13_esp_nn_avg_pool_s8_esp32s3_asm # [4]

.Lt_0_51458: # 0x459
# Part of loop body line 16, head labeled .Lt_0_29698
	movi.n	a15,0 # [0]
	j	.Lt_0_29954 # [1]

.LBB52_esp_nn_avg_pool_s8_esp32s3_asm: # 0x4a6 // if (channels % 4 == 0)
	l16ui	a7,a1,256 # [1] id:671 pad_wd+0x0
	l16ui	a13,a1,260 # [5] id:670 pad_ht+0x0
	s32i	a13,a1,64 # [8] gra_spill_temp_4
	l32i	a8,a1,72 # [12] gra_spill_temp_16
	movi.n	a15,0 # [13]
	movi.n	a9,0 # [14]
	neg	a13,a13 # [15]
	s32i	a13,a1,192 # [16] gra_spill_temp_36
	s32i	a9,a1,32 # [17] gra_spill_temp_6
	s32i.n	a15,a1,8 # [18] gra_spill_temp_0
	sub	a10,a4,a13 # [19]
	s32i	a10,a1,28 # [20] gra_spill_temp_5
	mul16u	a8,a8,a5 # [21]
	neg	a14,a7 # [22]
	s32i	a14,a1,104 # [23] gra_spill_temp_24
	s32i.n	a8,a1,16 # [24] gra_spill_temp_2
	sub	a14,a3,a14 # [25]
	s32i.n	a14,a1,12 # [26] gra_spill_temp_1
	j	.Lt_0_37890 # [27]

.Lt_0_38146: # 0x50b
# Part of loop body line 161, head labeled .Lt_0_37890
	l32i	a15,a1,64 # [0] gra_spill_temp_4
	l32i	a14,a1,192 # [1] gra_spill_temp_36
	l32i.n	a9,a1,16 # [2] gra_spill_temp_2
	l32i	a13,a1,244 # [3] stride_ht
	l32i	a10,a1,28 # [4] gra_spill_temp_5
	l32i	a8,a1,32 # [5] gra_spill_temp_6
	sub	a10,a10,a13 # [6]
	add.n	a8,a8,a9 # [7]
	add.n	a14,a14,a13 # [8]
	sub	a15,a15,a13 # [9]
	s32i	a15,a1,64 # [10] gra_spill_temp_4
	s32i	a14,a1,192 # [11] gra_spill_temp_36
	s32i	a8,a1,32 # [12] gra_spill_temp_6
	s32i	a10,a1,28 # [13] gra_spill_temp_5
	l32i.n	a8,a1,8 # [14] gra_spill_temp_0
	l32i	a9,a1,72 # [15] gra_spill_temp_16
	addi.n	a8,a8,1 # [16]
	s32i.n	a8,a1,8 # [17] gra_spill_temp_0
	sub	a8,a8,a9 # [18]
	beqz	a8,.Lt_0_44546 # [19]

.Lt_0_37890: # 0x541
# Loop body line 161, nesting depth: 1, estimated iterations: 252
	l32i	a10,a1,128 # [0] gra_spill_temp_32
	beqz.n	a10,.Lt_0_38146 # [2]

# Part of loop body line 161, head labeled .Lt_0_37890
	s32i	a7,a1,96 # [0] gra_spill_temp_22
	movi.n	a10,0 # [1]
	l32i	a9,a1,64 # [2] gra_spill_temp_4
	l32i.n	a6,a1,12 # [3] gra_spill_temp_1
	l32i	a8,a1,32 # [4] gra_spill_temp_6
	movi.n	a13,0 # [5]
	l32i	a15,a1,104 # [6] gra_spill_temp_24
	s32i	a15,a1,100 # [7] gra_spill_temp_23
	s32i	a13,a1,148 # [8] gra_spill_temp_35
	s32i	a8,a1,108 # [9] gra_spill_temp_25
	s32i	a6,a1,144 # [10] gra_spill_temp_24
	l32i	a8,a1,28 # [11] gra_spill_temp_5
	l32i	a6,a1,252 # [12] filter_ht
	max	a9,a9,a10 # [14]
	s32i	a9,a1,168 # [15] gra_spill_temp_40
	s32i	a13,a1,88 # [16] gra_spill_temp_20
	min	a6,a6,a8 # [17]
	bnez.n	a5,.LBB59_esp_nn_avg_pool_s8_esp32s3_asm # [18]

.Lt_0_38914: # 0x57a
# Loop body line 163
	l32i	a8,a1,100 # [0] gra_spill_temp_23
	l32i	a15,a1,144 # [1] gra_spill_temp_24
	l32i	a9,a1,148 # [2] gra_spill_temp_35
	l32i	a14,a1,240 # [3] stride_wd
	l32i	a10,a1,108 # [4] gra_spill_temp_25
	l32i	a13,a1,96 # [5] gra_spill_temp_22
	add.n	a10,a10,a5 # [6]
	s32i	a10,a1,108 # [7] gra_spill_temp_25
	sub	a13,a13,a14 # [8]
	add.n	a9,a9,a14 # [9]
	sub	a15,a15,a14 # [10]
	add.n	a8,a8,a14 # [11]
	s32i	a8,a1,100 # [12] gra_spill_temp_23
	s32i	a15,a1,144 # [13] gra_spill_temp_24
	s32i	a9,a1,148 # [14] gra_spill_temp_35
	s32i	a13,a1,96 # [15] gra_spill_temp_22
	l32i	a9,a1,88 # [16] gra_spill_temp_20
	l32i	a10,a1,128 # [17] gra_spill_temp_32
	addi.n	a9,a9,1 # [18]
	s32i	a9,a1,88 # [19] gra_spill_temp_20
	beq	a9,a10,.Lt_0_38146 # [20]
	beqz.n	a5,.Lt_0_38914 # [0]

.LBB59_esp_nn_avg_pool_s8_esp32s3_asm: # 0x5b7
# Part of loop body line 161, head labeled .Lt_0_37890
	l32i	a14,a1,144 # [0] gra_spill_temp_24
	l32i	a13,a1,248 # [1] filter_wd
	l32i	a9,a1,136 # [2] gra_spill_temp_30
	l32i	a8,a1,108 # [3] gra_spill_temp_25
	movi.n	a15,0 # [4]
	s32i	a15,a1,216 # [5] gra_spill_temp_52
	add.n	a10,a8,a5 # [6]
	add.n	a8,a8,a9 # [8]
	min	a13,a13,a14 # [9]
	add.n	a10,a9,a10 # [10]
	s32i	a10,a1,172 # [11] gra_spill_temp_41
	s32i	a13,a1,132 # [12] gra_spill_temp_31
	l32i	a14,a1,96 # [13] gra_spill_temp_22
	s32i	a8,a1,164 # [14] gra_spill_temp_39
	max	a14,a14,a15 # [15]
	l32i	a15,a1,100 # [16] gra_spill_temp_23
	s32i	a14,a1,208 # [17] gra_spill_temp_50
	add.n	a8,a15,a14 # [18]
	s32i	a8,a1,60 # [19] gra_spill_temp_13
	add.n	a15,a15,a13 # [20]
	s32i	a15,a1,196 # [21] gra_spill_temp_37
	sub	a13,a13,a14 # [22]
	s32i	a13,a1,52 # [23] gra_spill_temp_11
	j	.Lt_0_39426 # [24]

.LBB62_esp_nn_avg_pool_s8_esp32s3_asm: # 0x5fb
# Part of loop body line 173, head labeled .Lt_0_39426
	l32i	a10,a1,192 # [0] gra_spill_temp_36
	l32i	a14,a1,196 # [1] gra_spill_temp_37
	add.n	a10,a10,a15 # [2]
	mull	a10,a11,a10 # [3]
	movi.n	a15,0 # [4]
	add.n	a14,a10,a14 # [5]

.Lt_0_40194: # 0x60a
# Loop body line 178, nesting depth: 4, estimated iterations: 252
	l32i	a9,a1,132 # [0] gra_spill_temp_31
	l32i	a8,a1,208 # [1] gra_spill_temp_50
	add.n	a14,a14,a11 # [2]
	bge	a8,a9,.Lt_0_40450 # [3]

.LBB65_esp_nn_avg_pool_s8_esp32s3_asm: # 0x615
# Part of loop body line 178, head labeled .Lt_0_40194
	l32i	a3,a1,60 # [0] gra_spill_temp_13
	l32i	a2,a1,216 # [1] gra_spill_temp_52
	add.n	a3,a3,a10 # [2]
	mull	a3,a3,a5 # [3]
	l32i	a4,a1,52 # [4] gra_spill_temp_11
	add.n	a2,a2,a3 # [5]
	add.n	a2,a12,a2 # [6]
	loopgtz	a4,.LBB155_esp_nn_avg_pool_s8_esp32s3_asm # [7]

	ee.vldbc.32.xp	q0,a2,a5 # [0*II+0] id:684
	ee.vcmp.lt.s8	q1,q0,q4 # [0*II+2]
	ee.vzip.8	q0,q1 # [0*II+3]
	ee.vcmp.lt.s16	q1,q0,q4 # [0*II+4]
	ee.vzip.16	q0,q1 # [0*II+5]
	ee.vadds.s32	q2,q2,q0 # [0*II+6]
.LBB155_esp_nn_avg_pool_s8_esp32s3_asm: # 0x63e
# Part of loop body line 178, head labeled .Lt_0_40194
	l32i	a2,a1,148 # [0] gra_spill_temp_35
	movi.n	a8,0 # [1]
	sub	a9,a7,a2 # [2]
	sub	a2,a2,a7 # [3]
	max	a9,a9,a8 # [4]
	l32i	a8,a1,248 # [5] filter_wd
	sub	a2,a11,a2 # [6]
	min	a8,a8,a2 # [7]
	sub	a8,a8,a9 # [8]
	add.n	a15,a15,a8 # [9]

.Lt_0_40450: # 0x65a
# Part of loop body line 178, head labeled .Lt_0_40194
	add.n	a10,a10,a11 # [0]
	addi.n	a13,a13,1 # [1]
	bne	a6,a13,.Lt_0_40194 # [2]

.Lt_0_39682: # 0x661
# Part of loop body line 173, head labeled .Lt_0_39426
	srai	a2,a15,1 # [5]

// move to gp registers and average
	ee.movi.32.a	q2,a9,0 # [0]
	ee.movi.32.a	q2,a4,1 # [0]
	blti	a9,1,.Lt_0_41986 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_41730 # [2]
.Lt_0_41986: # 0x482
	sub	a9,a9,a2 # [0]
.Lt_0_41730: # 0x3f1
	blti	a4,1,.Lt_0_42498 # [1]
	add.n	a4,a2,a4 # [0]
	j	.Lt_0_42242 # [2]
.Lt_0_42498: # 0x48b
	sub	a4,a4,a2 # [0]
.Lt_0_42242: # 0x3fc
	quos	a9,a9,a15 # [1]
	quos	a4,a4,a15 # [1]
	ee.movi.32.q	q2,a9,0 # [0]
	ee.movi.32.q	q2,a4,1 # [1]
	ee.movi.32.a	q2,a9,2 # [2]
	ee.movi.32.a	q2,a14,3 # [0]
	blti	a9,1,.Lt_0_43010 # [3]
	add.n	a9,a9,a2 # [0]
	j	.Lt_0_42754 # [2]
.Lt_0_43010: # 0x494
	sub	a9,a9,a2 # [0]
.Lt_0_42754: # 0x40d
	blti	a14,1,.Lt_0_43522 # [1]
	add.n	a14,a2,a14 # [0]
	j	.Lt_0_43266 # [2]
.Lt_0_43522: # 0x49d
	sub	a14,a14,a2 # [0]
.Lt_0_43266: # 0x418
	quos	a9,a9,a15 # [1]
	quos	a14,a14,a15 # [1]
	ee.movi.32.q	q2,a9,2 # [0]
	ee.movi.32.q	q2,a14,3 # [1]
	l32i	a9,a1,172 # [0] gra_spill_temp_41
	l32i	a8,a1,164 # [1] gra_spill_temp_39
	l32i	a14,a1,216 # [2] gra_spill_temp_52
	addi.n	a14,a14,4 # [5]
	ee.vmin.s32	q2,q2,q6 # [6]
	s32i	a14,a1,216 # [7] gra_spill_temp_52
	ee.vmax.s32	q2,q2,q7 # [8]
	ee.vunzip.16	q2,q1 # [9]
	ee.vunzip.8	q2,q1 # [10]
	// store 4 bytes via the qacc_scratch slot at a1,0
	ee.vst.l.64.ip	q2,a1,0 # [11] id:691
	l32i.n	a13,a1,0 # [12] id:692
	s32i.n	a13,a8,0 # [13] id:693
	addi.n	a8,a8,4 # [14]
	s32i	a8,a1,164 # [15] gra_spill_temp_39
	bge	a8,a9,.Lt_0_38914 # [16]

.Lt_0_39426: # 0x6cb
	l32i	a15,a1,168 # [0] gra_spill_temp_40
	mv.qr	q2,q4 # [1]
	mov.n	a13,a15 # [2]
	blt	a15,a6,.LBB62_esp_nn_avg_pool_s8_esp32s3_asm # [3]

.Lt_0_52738: # 0x6d6
	movi.n	a15,0 # [0]
	j	.Lt_0_39682 # [1]

	.size	esp_nn_avg_pool_s8_esp32s3_asm, . - esp_nn_avg_pool_s8_esp32s3_asm


================================================
FILE: src/pooling/esp_nn_avg_pool_s8_esp32s3.c
================================================
/*
 * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/*
 * ESP32-S3 optimized avg pool wrapper.
 * Routes to existing assembly for channels%4==0,
 * provides a C fallback path for other cases.
*/ #include #include #include /* Existing S3 assembly (handles depth%4==0) */ extern void esp_nn_avg_pool_s8_esp32s3_asm(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels); void esp_nn_avg_pool_s8_esp32s3(const int8_t *input, const uint16_t input_wd, const uint16_t input_ht, int8_t *output, const uint16_t output_wd, const uint16_t output_ht, const uint16_t stride_wd, const uint16_t stride_ht, const uint16_t filter_wd, const uint16_t filter_ht, const uint16_t pad_wd, const uint16_t pad_ht, const int32_t activation_min, const int32_t activation_max, const uint16_t channels) { /* Use existing assembly for channels % 4 == 0 */ if (channels % 4 == 0) { esp_nn_avg_pool_s8_esp32s3_asm(input, input_wd, input_ht, output, output_wd, output_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); return; } /* C path with int16 accumulation for non-aligned channels */ int16_t acc_buf[channels]; int32_t base_y = -pad_ht; for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) { int32_t base_x = -pad_wd; for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) { int32_t fy_start = max(0, -base_y); int32_t fx_start = max(0, -base_x); int32_t fy_end = min(filter_ht, input_ht - base_y); int32_t fx_end = min(filter_wd, input_wd - base_x); int32_t filter_cnt = (fy_end - fy_start) * (fx_end - fx_start); memset(acc_buf, 0, channels * sizeof(int16_t)); for (int32_t fy = fy_start; fy < fy_end; fy++) { for (int32_t fx = fx_start; fx < fx_end; fx++) { int32_t in_idx = ((base_y + fy) * input_wd + (base_x + fx)) * channels; for (int c = 0; c < channels; c++) { acc_buf[c] += 
(int16_t)input[in_idx + c]; } } } int32_t half_cnt = filter_cnt / 2; int32_t out_idx = (out_y * output_wd + out_x) * channels; for (int c = 0; c < channels; c++) { int32_t result = acc_buf[c]; result = result > 0 ? (result + half_cnt) / filter_cnt : (result - half_cnt) / filter_cnt; result = max(result, activation_min); result = min(result, activation_max); output[out_idx + c] = (int8_t)result; } } } } ================================================ FILE: src/pooling/esp_nn_max_pool_ansi.c ================================================ // Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
#include <stdint.h>

/**
 * @brief   Max pooling (s8), portable C reference.
 *
 * Layout is HWC: for each output cell the filter window is clipped to the
 * input box, the running maximum is taken per channel, then the result is
 * clamped to [activation_min, activation_max].
 */
void esp_nn_max_pool_s8_ansi(const int8_t *input,
                             const uint16_t input_wd,
                             const uint16_t input_ht,
                             int8_t *output,
                             const uint16_t output_wd,
                             const uint16_t output_ht,
                             const uint16_t stride_wd,
                             const uint16_t stride_ht,
                             const uint16_t filter_wd,
                             const uint16_t filter_ht,
                             const uint16_t pad_wd,
                             const uint16_t pad_ht,
                             const int32_t activation_min,
                             const int32_t activation_max,
                             const uint16_t channels)
{
    for (int32_t out_y = 0; out_y < output_ht; out_y++) {
        const int32_t base_y = out_y * stride_ht - pad_ht;
        for (int32_t out_x = 0; out_x < output_wd; out_x++) {
            const int32_t base_x = out_x * stride_wd - pad_wd;

            /* Clip the filter window so it never crosses the input box. */
            const int32_t fy_lo = (base_y < 0) ? -base_y : 0;
            const int32_t fx_lo = (base_x < 0) ? -base_x : 0;
            const int32_t fy_hi = (filter_ht < input_ht - base_y) ? filter_ht : (input_ht - base_y);
            const int32_t fx_hi = (filter_wd < input_wd - base_x) ? filter_wd : (input_wd - base_x);

            int8_t *dst = output + (out_y * output_wd + out_x) * channels;
            for (int32_t ch = 0; ch < channels; ch++) {
                int8_t best = INT8_MIN;
                for (int32_t fy = fy_lo; fy < fy_hi; fy++) {
                    /* walk one input row of the window; channel stride is `channels` */
                    const int8_t *src = input +
                        ((base_y + fy) * input_wd + (base_x + fx_lo)) * channels + ch;
                    for (int32_t fx = fx_lo; fx < fx_hi; fx++, src += channels) {
                        if (*src > best) {
                            best = *src;
                        }
                    }
                }
                /* Activation clamp (low bound first, mirroring the reference). */
                if ((int32_t) best < activation_min) {
                    best = (int8_t) activation_min;
                }
                if ((int32_t) best > activation_max) {
                    best = (int8_t) activation_max;
                }
                dst[ch] = best;
            }
        }
    }
}
/*
 * Max pooling for s8 using the ESP32-P4 PIE SIMD unit.
 *
 * The channel dimension is vectorized: 16 channels per iteration via
 * esp.vmax.s8 (running maximum across the filter window); any remaining
 * channels (channels % 16) are handled by a scalar tail loop.
 *
 * Q-register usage: q0 = running max, q1 = loaded samples,
 * q4/q5 = broadcast activation_min/max, q6 = broadcast INT8_MIN seed.
 */
void esp_nn_max_pool_s8_esp32p4(const int8_t *input,
                                const uint16_t input_wd,
                                const uint16_t input_ht,
                                int8_t *output,
                                const uint16_t output_wd,
                                const uint16_t output_ht,
                                const uint16_t stride_wd,
                                const uint16_t stride_ht,
                                const uint16_t filter_wd,
                                const uint16_t filter_ht,
                                const uint16_t pad_wd,
                                const uint16_t pad_ht,
                                const int32_t activation_min,
                                const int32_t activation_max,
                                const uint16_t channels)
{
    /* Enable PIE (processor instruction extension unit) */
    asm volatile (
        "csrsi 0x7f2, 0b01 \n\t"
        "li x29, 0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );

    /* Broadcast activation_min and activation_max into vectors */
    int8_t act_min_val = (int8_t) activation_min;
    int8_t act_max_val = (int8_t) activation_max;
    int8_t int8_min_val = INT8_MIN;
    asm volatile (
        "mv x30, %0 \n\t"
        "esp.vldbc.8.ip q4, x30, 0 \n\t"  /* q4 = broadcast(activation_min) */
        "mv x30, %1 \n\t"
        "esp.vldbc.8.ip q5, x30, 0 \n\t"  /* q5 = broadcast(activation_max) */
        "mv x30, %2 \n\t"
        "esp.vldbc.8.ip q6, x30, 0 \n\t"  /* q6 = broadcast(INT8_MIN) for init */
        :: "r"(&act_min_val), "r"(&act_max_val), "r"(&int8_min_val)
        : "x30"
    );

    const int32_t ch_16 = channels >> 4;   /* number of full 16-ch blocks */

    int32_t base_y = -pad_ht;
    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {
        int32_t base_x = -pad_wd;
        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {
            /* Clip the filter window to the input box */
            int32_t filter_y_start = max(0, -base_y);
            int32_t filter_x_start = max(0, -base_x);
            int32_t filter_y_end = min(filter_ht, input_ht - base_y);
            int32_t filter_x_end = min(filter_wd, input_wd - base_x);

            int8_t *out_ptr = output + (out_y * output_wd + out_x) * channels;

            /* Process channels in blocks of 16 */
            int32_t ch_offset = 0;
            for (int32_t ch_blk = 0; ch_blk < ch_16; ch_blk++, ch_offset += 16) {
                /* Initialize running max to INT8_MIN (copy q6 -> q0;
                 * vmax of a register with itself is a move) */
                asm volatile ("esp.vmax.s8 q0, q6, q6 \n\t");

                /* Accumulate max across the filter window; consecutive fx
                 * samples for the same channel block are `channels` bytes
                 * apart (HWC layout). */
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    int32_t in_y = base_y + fy;
                    const int8_t *row_ptr = input + (in_y * input_wd + base_x + filter_x_start) * channels + ch_offset;
                    int32_t fx_count = filter_x_end - filter_x_start;
                    /* NOTE(review): the asm loop is do-while shaped (load
                     * before the bnez test), so fx_count == 0 would over-read
                     * one vector — confirm the clipped window is never empty
                     * here. Also assumes esp.vld.128.ip tolerates the
                     * (potentially unaligned) row_ptr — TODO confirm vs ISA. */
                    asm volatile (
                        "mv x30, %[ptr] \n\t"
                        "mv s7, %[cnt] \n\t"
                        "1: \n\t"
                        "esp.vld.128.ip q1, x30, 0 \n\t"
                        "esp.vmax.s8 q0, q0, q1 \n\t"
                        "add x30, x30, %[stride] \n\t"
                        "addi s7, s7, -1 \n\t"
                        "bnez s7, 1b \n\t"
                        :
                        : [ptr] "r"(row_ptr), [cnt] "r"(fx_count), [stride] "r"((int32_t)channels)
                        : "x30", "s7"
                    );
                }

                /* Apply activation clamp and store 16 channels */
                {
                    int8_t *store_ptr = out_ptr + ch_offset;
                    asm volatile (
                        "esp.vmax.s8 q0, q0, q4 \n\t"   /* max(result, act_min) */
                        "esp.vmin.s8 q0, q0, q5 \n\t"   /* min(result, act_max) */
                        "mv x30, %0 \n\t"
                        "esp.vst.128.ip q0, x30, 0 \n\t" /* store 16 channels */
                        :
                        : "r"(store_ptr)
                        : "x30", "memory"
                    );
                }
            }

            /* Scalar tail for the remaining (channels % 16) channels */
            for (int32_t ch_idx = ch_offset; ch_idx < channels; ch_idx++) {
                int8_t result = INT8_MIN;
                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {
                    for (int32_t fx = filter_x_start; fx < filter_x_end; fx++) {
                        int32_t in_y = base_y + fy;
                        int32_t in_x = base_x + fx;
                        int32_t input_index = (in_y * input_wd + in_x) * channels + ch_idx;
                        result = max(input[input_index], result);
                    }
                }
                /* NOTE(review): clamping through (int8_t) casts matches the
                 * SIMD path only for activation bounds within [-128, 127]. */
                result = max(result, (int8_t) activation_min);
                result = min(result, (int8_t) activation_max);
                out_ptr[ch_idx] = result;
            }
        }
    }
}
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. .text .align 4 .literal_position # Program Unit: esp_nn_max_pool_s8_esp32s3 .type esp_nn_max_pool_s8_esp32s3, @function .align 4 .global esp_nn_max_pool_s8_esp32s3 // no of channels must be multiple of 4 esp_nn_max_pool_s8_esp32s3: # 0x4 # int8_min = 0 # gra_spill_temp_0 = 4 # gra_spill_temp_1 = 8 # gra_spill_temp_2 = 12 # gra_spill_temp_3 = 16 # gra_spill_temp_4 = 20 # gra_spill_temp_5 = 24 # gra_spill_temp_6 = 28 # gra_spill_temp_7 = 32 # gra_spill_temp_8 = 36 # gra_spill_temp_9 = 40 # gra_spill_temp_10 = 44 # gra_spill_temp_11 = 48 # gra_spill_temp_12 = 52 # gra_spill_temp_13 = 56 # gra_spill_temp_14 = 60 # gra_spill_temp_15 = 64 # gra_spill_temp_16 = 68 # gra_spill_temp_17 = 72 # gra_spill_temp_18 = 76 # gra_spill_temp_19 = 80 # gra_spill_temp_20 = 84 # gra_spill_temp_21 = 88 # gra_spill_temp_22 = 92 # gra_spill_temp_23 = 96 // a2: input // a3: input_wd // a4: input_ht // a5: output // a6: output_wd // a7: output_ht // on stack: stride_wd = 120 // on stack: stride_ht = 124 // on stack: filter_wd = 128 // on stack: filter_ht = 132 // on stack: pad_wd = 136 // on stack: pad_ht = 140 // on stack: activation_min // on stack: activation_max // on stack: channels entry a1,120 # mov.n a12,a2 # [0] s32i a6,a1,4 # [2] gra_spill_temp_0 s32i a7,a1,68 # [3] gra_spill_temp_16 mov.n a11,a3 # [4] s32i a5,a1,96 # [5] gra_spill_temp_23 l16ui a5,a1,152 # [6] id:465 channels+0x0 movi a3,-128 # [7] s32i.n a3,a1,0 # [1] int8_min addi.n a9,a1,148 # [0] activation_max addi.n a15,a1,144 # [1] activation_min ee.vldbc.8 q3,a1 # [7] id:473 int8_min+0x0 ee.vldbc.8 q5,a15 # [8] 
id:470 activation_min+0x0 ee.vldbc.8 q4,a9 # [9] id:471 activation_max+0x0 extui a8,a5,0,3 # [8] beqz.n a8,.LBB3_esp_nn_max_pool_s8_esp32s3 # [9] // if (channels % 8 == 0) extui a14,a5,0,2 # [0] beqz.n a14,.LBB25_esp_nn_max_pool_s8_esp32s3 # [1] // if (channels % 4 == 0) retw.n # [0] // exit .LBB3_esp_nn_max_pool_s8_esp32s3: # 0x1c5 // if (channels % 8 == 0) l16ui a15,a1,136 # [1] id:475 pad_wd+0x0 l16ui a14,a1,140 # [4] id:474 pad_ht+0x0 movi.n a8,0 # [13] movi.n a10,0 # [15] s32i a14,a1,44 # [7] gra_spill_temp_10 neg a15,a15 # [12] mul16u a9,a6,a5 # [14] neg a14,a14 # [16] s32i a14,a1,92 # [17] gra_spill_temp_22 s32i a10,a1,52 # [18] gra_spill_temp_12 s32i a9,a1,60 # [19] gra_spill_temp_14 s32i.n a8,a1,36 # [16] gra_spill_temp_8 s32i a15,a1,56 # [21] gra_spill_temp_13 sub a13,a4,a14 # [22] s32i a13,a1,48 # [23] gra_spill_temp_11 sub a15,a11,a15 # [24] s32i.n a15,a1,40 # [25] gra_spill_temp_9 .Lt_0_21506: # 0x229 l32i a8,a1,4 # [0] gra_spill_temp_0 beqz.n a8,.Lt_0_21762 # [2] movi.n a10,0 # [0] l32i a9,a1,44 # [1] gra_spill_temp_10 l32i.n a15,a1,40 # [2] gra_spill_temp_9 l32i a8,a1,52 # [3] gra_spill_temp_12 l32i.n a13,a1,136 # [4] ,pad_wd l32i a14,a1,56 # [5] gra_spill_temp_13 s32i a14,a1,80 # [6] gra_spill_temp_19 s32i a13,a1,76 # [7] gra_spill_temp_18 s32i a8,a1,88 # [8] gra_spill_temp_21 s32i a15,a1,84 # [9] gra_spill_temp_20 l32i a8,a1,48 # [10] gra_spill_temp_11 max a9,a9,a10 # [11] l32i a15,a1,132 # [12] filter_ht s32i a9,a1,8 # [13] gra_spill_temp_1 movi.n a9,0 # [14] min a15,a15,a8 # [15] s32i a9,a1,64 # [16] gra_spill_temp_15 .Lt_0_22274: # 0x25d beqz.n a5,.Lt_0_22530 # [0] .LBB10_esp_nn_max_pool_s8_esp32s3: # 0x25f # Part of loop body line 46, head labeled .Lt_0_22274 l32i a6,a1,76 # [0] gra_spill_temp_18 l32i a13,a1,96 # [1] gra_spill_temp_23 l32i a8,a1,84 # [2] gra_spill_temp_20 l32i a7,a1,128 # [3] filter_wd l32i a10,a1,88 # [4] gra_spill_temp_21 movi.n a9,0 # [5] s32i a9,a1,20 # [6] gra_spill_temp_4 add.n a14,a10,a5 # [7] min a7,a7,a8 # [8] add.n 
a10,a10,a13 # [9] add.n a14,a13,a14 # [10] s32i a14,a1,12 # [11] gra_spill_temp_2 s32i a10,a1,16 # [12] gra_spill_temp_3 movi.n a8,0 # [13] l32i a10,a1,80 # [14] gra_spill_temp_19 max a6,a6,a8 # [15] sub a9,a7,a6 # [16] s32i a9,a1,28 # [17] gra_spill_temp_6 add.n a13,a10,a6 # [18] s32i a13,a1,24 # [19] gra_spill_temp_5 add.n a10,a10,a7 # [16] s32i a10,a1,72 # [21] gra_spill_temp_17 .Lt_0_23042: # 0x29a l32i a8,a1,8 # [0] gra_spill_temp_1 mv.qr q1,q3 # [1] mov.n a13,a8 # [2] bge a8,a15,.Lt_0_23298 # [3] .LBB13_esp_nn_max_pool_s8_esp32s3: # 0x2a5 # Part of loop body line 40, head labeled .Lt_0_23042 l32i a10,a1,92 # [0] gra_spill_temp_22 l32i a14,a1,72 # [1] gra_spill_temp_17 add.n a10,a10,a8 # [2] mull a10,a11,a10 # [3] add.n a14,a10,a14 # [5] .Lt_0_23810: # 0x2b2 add.n a14,a14,a11 # [0] addi.n a13,a13,1 # [1] bge a6,a7,.Lt_0_24066 # [2] .LBB16_esp_nn_max_pool_s8_esp32s3: # 0x2b9 l32i a3,a1,24 # [0] gra_spill_temp_5 l32i a2,a1,20 # [1] gra_spill_temp_4 add.n a3,a3,a10 # [2] mull a3,a3,a5 # [3] add.n a2,a2,a3 # [5] l32i a3,a1,28 # [6] gra_spill_temp_6 add.n a2,a12,a2 # [7] loopgtz a3,.LBB93_esp_nn_max_pool_s8_esp32s3 # [8] ee.vld.l.64.ip q0,a2,0 # [0*II+1] id:481 add.n a2,a2,a5 # [0*II+2] ee.vmax.s8 q1,q1,q0 # [0*II+3] .LBB93_esp_nn_max_pool_s8_esp32s3: # 0x2d8 .Lt_0_24066: # 0x2d8 add.n a10,a10,a11 # [0] bne a15,a13,.Lt_0_23810 # [1] .Lt_0_23298: # 0x2dd l32i a9,a1,12 # [0] gra_spill_temp_2 l32i a13,a1,20 # [1] gra_spill_temp_4 l32i a8,a1,16 # [2] gra_spill_temp_3 ee.vmin.s8 q2,q1,q4 # [3] ee.vmax.s8 q2,q2,q5 # [4] mov.n a10,a8 # [5] addi.n a13,a13,8 # [6] s32i a13,a1,20 # [7] gra_spill_temp_4 ee.vst.l.64.ip q2,a10,0 # [8] id:482 addi.n a8,a8,8 # [9] s32i a8,a1,16 # [10] gra_spill_temp_3 blt a8,a9,.Lt_0_23042 # [11] .Lt_0_22530: # 0x2fe l32i a13,a1,84 # [0] gra_spill_temp_20 l32i a14,a1,80 # [1] gra_spill_temp_19 l32i a10,a1,120 # [2] stride_wd l32i a8,a1,88 # [3] gra_spill_temp_21 l32i a9,a1,76 # [4] gra_spill_temp_18 add.n a8,a8,a5 # [5] s32i a8,a1,88 # [6] 
gra_spill_temp_21 sub a9,a9,a10 # [7] add.n a14,a14,a10 # [8] sub a13,a13,a10 # [9] s32i a13,a1,84 # [10] gra_spill_temp_20 s32i a14,a1,80 # [11] gra_spill_temp_19 s32i a9,a1,76 # [12] gra_spill_temp_18 l32i a14,a1,64 # [13] gra_spill_temp_15 l32i a8,a1,4 # [14] gra_spill_temp_0 addi.n a14,a14,1 # [15] s32i a14,a1,64 # [16] gra_spill_temp_15 sub a14,a14,a8 # [17] bnez a14,.Lt_0_22274 # [18] .Lt_0_21762: # 0x334 # Part of loop body line 20, head labeled .Lt_0_21506 l32i a8,a1,44 # [0] gra_spill_temp_10 l32i a15,a1,92 # [1] gra_spill_temp_22 l32i a10,a1,60 # [2] gra_spill_temp_14 l32i a14,a1,124 # [3] stride_ht l32i a13,a1,48 # [4] gra_spill_temp_11 l32i a9,a1,52 # [5] gra_spill_temp_12 sub a13,a13,a14 # [6] add.n a9,a9,a10 # [7] add.n a15,a15,a14 # [8] sub a8,a8,a14 # [9] s32i a8,a1,44 # [10] gra_spill_temp_10 s32i a15,a1,92 # [11] gra_spill_temp_22 s32i a9,a1,52 # [12] gra_spill_temp_12 s32i a13,a1,48 # [13] gra_spill_temp_11 l32i.n a9,a1,36 # [14] gra_spill_temp_8 l32i a10,a1,68 # [15] gra_spill_temp_16 addi.n a9,a9,1 # [16] s32i.n a9,a1,36 # [17] gra_spill_temp_8 sub a9,a9,a10 # [18] bnez a9,.Lt_0_21506 # [19] retw.n # [0] // exit .LBB25_esp_nn_max_pool_s8_esp32s3: # 0x36d // if (channels % 4 == 0) l16ui a10,a1,136 # [1] id:475 pad_wd+0x0 l16ui a9,a1,140 # [4] id:474 pad_ht+0x0 movi.n a13,0 # [13] movi.n a15,0 # [15] neg a10,a10 # [12] s32i a9,a1,44 # [7] gra_spill_temp_10 mul16u a14,a6,a5 # [14] neg a9,a9 # [16] s32i a9,a1,92 # [17] gra_spill_temp_22 s32i a15,a1,52 # [18] gra_spill_temp_12 s32i a14,a1,60 # [19] gra_spill_temp_14 s32i.n a13,a1,36 # [16] gra_spill_temp_8 s32i a10,a1,56 # [21] gra_spill_temp_13 sub a8,a4,a9 # [22] s32i a8,a1,48 # [23] gra_spill_temp_11 sub a10,a11,a10 # [24] s32i.n a10,a1,40 # [25] gra_spill_temp_9 .Lt_0_27138: # 0x3d5 l32i a13,a1,4 # [0] gra_spill_temp_0 beqz.n a13,.Lt_0_27394 # [2] .LBB29_esp_nn_max_pool_s8_esp32s3: # 0x3da # Part of loop body line 107, head labeled .Lt_0_27138 movi.n a10,0 # [0] l32i a9,a1,44 # [1] 
gra_spill_temp_10 l32i.n a15,a1,40 # [2] gra_spill_temp_9 l32i a8,a1,52 # [3] gra_spill_temp_12 l32i a14,a1,56 # [4] gra_spill_temp_13 l32i.n a13,a1,136 # [5] pad_wd s32i a13,a1,76 # [6] gra_spill_temp_18 s32i a14,a1,80 # [7] gra_spill_temp_19 s32i a8,a1,88 # [8] gra_spill_temp_21 s32i a15,a1,84 # [9] gra_spill_temp_20 l32i a8,a1,48 # [10] gra_spill_temp_11 l32i a15,a1,132 # [11] filter_ht movi.n a14,0 # [12] max a9,a9,a10 # [13] s32i a9,a1,8 # [14] gra_spill_temp_1 s32i a14,a1,64 # [15] gra_spill_temp_15 min a15,a15,a8 # [16] .Lt_0_27906: # 0x409 # Loop body line 109, nesting depth: 2, estimated iterations: 56 beqz.n a5,.Lt_0_28162 # [0] .LBB32_esp_nn_max_pool_s8_esp32s3: # 0x40b # Part of loop body line 109, head labeled .Lt_0_27906 l32i a6,a1,76 # [0] gra_spill_temp_18 l32i a13,a1,96 # [1] gra_spill_temp_23 l32i a8,a1,84 # [2] gra_spill_temp_20 l32i a7,a1,128 # [3] filter_wd l32i a10,a1,88 # [4] gra_spill_temp_21 movi.n a9,0 # [5] s32i a9,a1,32 # [6] gra_spill_temp_7 add.n a14,a10,a5 # [7] min a7,a7,a8 # [8] add.n a10,a10,a13 # [9] add.n a14,a13,a14 # [10] s32i a14,a1,12 # [11] gra_spill_temp_2 s32i a10,a1,16 # [12] gra_spill_temp_3 movi.n a8,0 # [13] l32i a10,a1,80 # [14] gra_spill_temp_19 max a6,a6,a8 # [15] sub a9,a7,a6 # [16] s32i a9,a1,28 # [17] gra_spill_temp_6 add.n a13,a10,a6 # [18] s32i a13,a1,24 # [19] gra_spill_temp_5 add.n a10,a10,a7 # [16] s32i a10,a1,72 # [21] gra_spill_temp_17 .Lt_0_28674: # 0x446 # Loop body line 8, nesting depth: 3, estimated iterations: 56 l32i a8,a1,8 # [0] gra_spill_temp_1 mv.qr q1,q3 # [1] mov.n a13,a8 # [2] bge a8,a15,.Lt_0_28930 # [3] .LBB35_esp_nn_max_pool_s8_esp32s3: # 0x451 # Part of loop body line 8, head labeled .Lt_0_28674 l32i a10,a1,92 # [0] gra_spill_temp_22 l32i a14,a1,72 # [1] gra_spill_temp_17 add.n a10,a10,a8 # [2] mull a10,a11,a10 # [3] add.n a14,a10,a14 # [5] .Lt_0_29442: # 0x45e add.n a14,a14,a11 # [0] addi.n a13,a13,1 # [1] bge a6,a7,.Lt_0_29698 # [2] .LBB38_esp_nn_max_pool_s8_esp32s3: # 0x465 l32i 
a3,a1,24 # [0] gra_spill_temp_5 l32i a2,a1,32 # [1] gra_spill_temp_7 add.n a3,a3,a10 # [2] mull a3,a3,a5 # [3] l32i a4,a1,28 # [4] gra_spill_temp_6 add.n a2,a2,a3 # [5] add.n a2,a12,a2 # [6] loopgtz a4,.LBB108_esp_nn_max_pool_s8_esp32s3 # [7] ee.vldbc.32 q0,a2 # [0*II+0] id:489 add.n a2,a2,a5 # [0*II+1] ee.vmax.s8 q1,q1,q0 # [0*II+2] .LBB108_esp_nn_max_pool_s8_esp32s3: # 0x482 .Lt_0_29698: # 0x482 add.n a10,a10,a11 # [0] bne a15,a13,.Lt_0_29442 # [1] .Lt_0_28930: # 0x487 # Part of loop body line 8, head labeled .Lt_0_28674 l32i a9,a1,12 # [0] gra_spill_temp_2 l32i a8,a1,16 # [1] gra_spill_temp_3 l32i a10,a1,32 # [3] gra_spill_temp_7 ee.vmin.s8 q5,q1,q4 # [4] ee.vmax.s8 q5,q5,q5 # [5] addi.n a10,a10,4 # [6] ee.movi.32.a q5,a13,0 s32i a10,a1,32 # [9] gra_spill_temp_7 s32i.n a13,a8,0 # [10] id:492 addi.n a8,a8,4 # [11] s32i a8,a1,16 # [12] gra_spill_temp_3 blt a8,a9,.Lt_0_28674 # [13] .Lt_0_28162: # 0x4ad # Part of loop body line 109, head labeled .Lt_0_27906 l32i a13,a1,84 # [0] gra_spill_temp_20 l32i a14,a1,80 # [1] gra_spill_temp_19 l32i a10,a1,120 # [2] stride_wd l32i a8,a1,88 # [3] gra_spill_temp_21 l32i a9,a1,76 # [4] gra_spill_temp_18 add.n a8,a8,a5 # [5] s32i a8,a1,88 # [6] gra_spill_temp_21 sub a9,a9,a10 # [7] add.n a14,a14,a10 # [8] sub a13,a13,a10 # [9] s32i a13,a1,84 # [10] gra_spill_temp_20 s32i a14,a1,80 # [11] gra_spill_temp_19 s32i a9,a1,76 # [12] gra_spill_temp_18 l32i a14,a1,64 # [13] gra_spill_temp_15 l32i a8,a1,4 # [14] gra_spill_temp_0 addi.n a14,a14,1 # [15] s32i a14,a1,64 # [16] gra_spill_temp_15 sub a14,a14,a8 # [17] bnez a14,.Lt_0_27906 # [18] .Lt_0_27394: # 0x4e3 # Part of loop body line 107, head labeled .Lt_0_27138 l32i a8,a1,44 # [0] gra_spill_temp_10 l32i a15,a1,92 # [1] gra_spill_temp_22 l32i a10,a1,60 # [2] gra_spill_temp_14 l32i a14,a1,124 # [3] stride_ht l32i a13,a1,48 # [4] gra_spill_temp_11 l32i a9,a1,52 # [5] gra_spill_temp_12 sub a13,a13,a14 # [6] add.n a9,a9,a10 # [7] add.n a15,a15,a14 # [8] sub a8,a8,a14 # [9] s32i a8,a1,44 # 
[10] gra_spill_temp_10 s32i a15,a1,92 # [11] gra_spill_temp_22 s32i a9,a1,52 # [12] gra_spill_temp_12 s32i a13,a1,48 # [13] gra_spill_temp_11 l32i.n a9,a1,36 # [14] gra_spill_temp_8 l32i a10,a1,68 # [15] gra_spill_temp_16 addi.n a9,a9,1 # [16] s32i.n a9,a1,36 # [17] gra_spill_temp_8 sub a9,a9,a10 # [18] bnez a9,.Lt_0_27138 # [19] retw.n # [0] // exit .size esp_nn_max_pool_s8_esp32s3, . - esp_nn_max_pool_s8_esp32s3 ================================================ FILE: src/softmax/esp_nn_softmax_ansi.c ================================================ // Copyright 2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "softmax_common.h" int32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height) { (void) width; (void) height; return 0; } void esp_nn_set_softmax_scratch_buf_ansi(void *buffer) { (void) buffer; return; } void esp_nn_softmax_s8_ansi(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data) { // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as // -32 before multiplying by input mult, and therefore as large as // -16 afterwards. Note that exp(-8) is definitely not insignificant to // accumulation, but exp(-16) definitely is. 
#define ACCUM_BITS 12 #define DIFF_BITS 5 const int32_t mask = (1 << shift); int32_t col = 0; const int8_t *in_ptr = input_data; int8_t *out_ptr = output_data; for (int row_idx = 0; row_idx < height; row_idx++) { int8_t max_in_row = in_ptr[0]; for (col = 1; col < width; col++) { max_in_row = max(max_in_row, in_ptr[col]); } int32_t input_diff = 0; int32_t sum_of_exps = 0; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); } } const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; out_ptr[col] = (int8_t) esp_nn_saturate8(result); } else { out_ptr[col] = -128; } } in_ptr += width; out_ptr += width; } } ================================================ FILE: src/softmax/esp_nn_softmax_opt.c ================================================ // Copyright 2022 Espressif Systems (Shanghai) PTE LTD // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "softmax_common.h" #include static int32_t *scratch_buf = NULL; /** * @brief Get scratch buffer size needed by softmax function * * @param width * @param height * @return size in bytes * * @note buffer must be 4 byte aligned */ int32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height) { (void) height; return width * 4; } /** * @brief Set scratch buffer to be used by softmax function * * @param buffer this can be NULL if one needs to unset it * must be aligned to 4 bytes */ void esp_nn_set_softmax_scratch_buf_opt(void *buffer) { scratch_buf = (int32_t *) buffer; } void esp_nn_softmax_s8_opt(const int8_t *input_data, const int32_t height, const int32_t width, const int32_t mult, const int32_t shift, const int32_t diff_min, int8_t *output_data) { if (scratch_buf == NULL) { printf("%s error! scratch buffer not set\n", __FUNCTION__); return; } // The representation chosen for the input to the exp() function is Q5.26. // We need to leave extra space since values that we skip might be as large as // -32 before multiplying by input mult, and therefore as large as // -16 afterwards. Note that exp(-8) is definitely not insignificant to // accumulation, but exp(-16) definitely is. 
#define ACCUM_BITS 12 #define DIFF_BITS 5 const int32_t mask = (1 << shift); int32_t col = 0; const int8_t *in_ptr = input_data; int8_t *out_ptr = output_data; for (int row_idx = 0; row_idx < height; row_idx++) { int8_t max_in_row = in_ptr[0]; for (col = 1; col < width; col++) { max_in_row = max(max_in_row, in_ptr[col]); } int32_t input_diff = 0; int32_t sum_of_exps = 0; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult); const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled); scratch_buf[col] = exp_raw; // store to avoid duplicate calculation later sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS); } } const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps); const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31)); const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8; for (col = 0; col < width; col++) { input_diff = in_ptr[col] - max_in_row; if (input_diff >= diff_min) { int32_t exp_raw = scratch_buf[col]; const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw); const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128; out_ptr[col] = (int8_t) esp_nn_saturate8(result); } else { out_ptr[col] = -128; } } in_ptr += width; out_ptr += width; } } ================================================ FILE: src/softmax/esp_nn_softmax_s8_esp32p4.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include "softmax_common.h" #include #include static int32_t *p4_scratch_buf = NULL; int32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height) { (void) height; return width * 4; } void esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer) { /* Enable PIE */ asm volatile ( "csrsi 0x7f2, 0b01 \n\t" "li x29, 
0b10 \n\t"
        "esp.movx.w.cfg x29 \n\t"
        ::: "x29"
    );
    p4_scratch_buf = (int32_t *) buffer;
}

/*
 * Softmax for s8 optimized for ESP32-P4.
 * Phase 1 (find-max) uses PIE esp.vmax.s8 on 16 elements at a time.
 * Phases 2-3 (exp + normalize) reuse exp values cached in the scratch
 * buffer, like the generic "opt" variant.
 */
void esp_nn_softmax_s8_esp32p4(const int8_t *input_data,
                               const int32_t height,
                               const int32_t width,
                               const int32_t mult,
                               const int32_t shift,
                               const int32_t diff_min,
                               int8_t *output_data)
{
    /* Scratch must have been installed via the set_scratch_buf call. */
    if (p4_scratch_buf == NULL) {
        printf("%s error! scratch buffer not set\n", __FUNCTION__);
        return;
    }

#define ACCUM_BITS 12
#define DIFF_BITS 5
    const int32_t mask = (1 << shift);
    int32_t col = 0;
    const int8_t *in_ptr = input_data;
    int8_t *out_ptr = output_data;

    for (int row_idx = 0; row_idx < height; row_idx++) {
        /* Phase 1: find max in row using PIE vectorization.
         * Auto-incrementing loads avoid a redundant mv per iteration. */
        int8_t max_in_row;
        if (width >= 16) {
            int32_t vec_count = (width >> 4);       /* 16-element groups */
            int32_t vec_processed = vec_count << 4; /* elements covered by SIMD */
            int32_t max_scalar;
            /* NOTE(review): first group is pre-loaded, so vec_count >= 1 is
             * guaranteed by the width >= 16 guard above. */
            asm volatile (
                "mv x30, %[ptr] \n\t"
                "esp.vld.128.ip q0, x30, 16 \n\t"  /* load first 16, advance */
                "addi %[cnt], %[cnt], -1 \n\t"     /* one group already loaded */
                "beqz %[cnt], 2f \n\t"
                "1: \n\t"
                "esp.vld.128.ip q1, x30, 16 \n\t"  /* load next 16, advance */
                "esp.vmax.s8 q0, q0, q1 \n\t"      /* running max */
                "addi %[cnt], %[cnt], -1 \n\t"
                "bnez %[cnt], 1b \n\t"
                "2: \n\t"
                "esp.max.s8.a q0, %[max] \n\t"     /* horizontal reduce */
                : [cnt] "+r"(vec_count), [max] "=r"(max_scalar)
                : [ptr] "r"(in_ptr)
                : "x30"
            );
            max_in_row = (int8_t) max_scalar;
            /* Scalar check of the remaining (< 16) elements */
            for (int32_t i = vec_processed; i < width; i++) {
                if (in_ptr[i] > max_in_row) max_in_row = in_ptr[i];
            }
        } else {
            /* Short rows: plain scalar scan */
            max_in_row = in_ptr[0];
            for (col = 1; col < width; col++) {
                max_in_row = max(max_in_row, in_ptr[col]);
            }
        }

        /* Phase 2: compute exp values (cached in scratch) and their sum */
        int32_t input_diff = 0;
        int32_t sum_of_exps = 0;
        for (col = 0; col < width; col++) {
            input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);
                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);
                p4_scratch_buf[col] = exp_raw;
                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);
            }
        }

        /* Phase 3: normalize using the reciprocal of the sum */
        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);
        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));
        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8;
        for (col = 0; col < width; col++) {
            input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                int32_t exp_raw = p4_scratch_buf[col];
                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);
                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;
                out_ptr[col] = (int8_t) esp_nn_saturate8(result);
            } else {
                out_ptr[col] = -128;   /* below cutoff: probability rounds to 0 */
            }
        }

        in_ptr += width;
        out_ptr += width;
    }
}
*/

#include #include "softmax_common.h"

/* Scratch area (one int32 per column) caching exp() values between passes. */
static int32_t *scratch_buf_s3 = NULL;

/* Scratch requirement: one int32 per column. */
int32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height)
{
    (void) height;
    return width * 4;
}

/* Install (or clear, with NULL) the scratch buffer; must be 4-byte aligned. */
void esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer)
{
    scratch_buf_s3 = (int32_t *) buffer;
}

/* Find max of int8 array — SIMD for len >= 32, scalar for smaller.
 * SIMD path: ee.vmax.s8 keeps a 16-lane running max in q0, which is then
 * spilled to tmp_buf and reduced horizontally in C. */
static inline int8_t find_max_s8(const int8_t *data, int32_t len)
{
    int8_t m = -128;
    int32_t idx = 0;
#if defined(__XTENSA__)
    if (len >= 32) {
        /* Use ee.vmax.s8 for 16 elements/cycle — only for len >= 32
         * to avoid potential alignment issues with small buffers.
         * NOTE(review): ee.vld.128.ip on `data` assumes unaligned loads
         * are acceptable — TODO confirm against the S3 TRM. */
        int8_t tmp_buf[16] __attribute__((aligned(16)));
        const int8_t *ptr = data;
        int8_t *buf_ptr = tmp_buf;
        int32_t simd_len = len & ~15;   /* round down to multiple of 16 */
        asm volatile (
            "ee.vld.128.ip q0, %[ptr], 16 \n\t"   /* q0 = running max */
            "movi.n %[idx], 16 \n\t"
            "j 2f \n\t"
            "1: \n\t"
            "ee.vld.128.ip q1, %[ptr], 16 \n\t"
            "ee.vmax.s8 q0, q0, q1 \n\t"
            "addi %[idx], %[idx], 16 \n\t"
            "2: \n\t"
            "blt %[idx], %[slen], 1b \n\t"
            /* Store vector max to tmp_buf for horizontal reduction */
            "ee.vst.128.ip q0, %[buf], 16 \n\t"
            : [idx] "+r"(idx), [ptr] "+r"(ptr), [buf] "+r"(buf_ptr)
            : [slen] "r"(simd_len)
            : "memory"
        );
        /* Horizontal reduction of the 16 per-lane maxima */
        for (int i = 0; i < 16; i++) {
            if (tmp_buf[i] > m) m = tmp_buf[i];
        }
        idx = simd_len;
    }
#endif
    /* Scalar for remainder or small arrays */
    for (; idx < len; idx++) {
        if (data[idx] > m) m = data[idx];
    }
    return m;
}

/* Quantized (s8) softmax for ESP32-S3: SIMD find-max, cached exp values,
 * and a 4x-unrolled normalization pass. */
void esp_nn_softmax_s8_esp32s3(const int8_t *input_data,
                               const int32_t height,
                               const int32_t width,
                               const int32_t mult,
                               const int32_t shift,
                               const int32_t diff_min,
                               int8_t *output_data)
{
    if (scratch_buf_s3 == NULL) {
        /* Fall through to opt version if scratch not set */
        /* NOTE(review): despite the comment above, this returns WITHOUT
         * writing any output and without calling the opt version — callers
         * get an untouched output buffer. Confirm whether the dispatcher
         * guarantees the scratch buffer is always set, or make this
         * actually delegate/report like esp_nn_softmax_s8_opt does. */
        return;
    }
#define ACCUM_BITS 12
    const int32_t mask = (1 << shift);
    const int8_t *in_ptr = input_data;
    int8_t *out_ptr = output_data;

    for (int row_idx = 0; row_idx < height; row_idx++) {
        /* Phase 1: Find max */
        int8_t max_in_row = find_max_s8(in_ptr, width);

        /* Phase 2: Compute exp and accumulate sum (exps cached in scratch) */
        int32_t sum_of_exps = 0;
        for (int col = 0; col < width; col++) {
            int32_t input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);
                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);
                scratch_buf_s3[col] = exp_raw;
                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);
            }
        }

        /* Phase 3: Compute normalization scale */
        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);
        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));
        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - 8;

        /* Phase 4: Normalize and output — unrolled 4x for reduced loop overhead */
        int col = 0;
        for (; col + 3 < width; col += 4) {
            for (int k = 0; k < 4; k++) {
                int32_t input_diff = in_ptr[col + k] - max_in_row;
                if (input_diff >= diff_min) {
                    int32_t exp_raw = scratch_buf_s3[col + k];
                    const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);
                    const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;
                    out_ptr[col + k] = (int8_t) esp_nn_saturate8(result);
                } else {
                    out_ptr[col + k] = -128;
                }
            }
        }
        /* Remainder */
        for (; col < width; col++) {
            int32_t input_diff = in_ptr[col] - max_in_row;
            if (input_diff >= diff_min) {
                int32_t exp_raw = scratch_buf_s3[col];
                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);
                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;
                out_ptr[col] = (int8_t) esp_nn_saturate8(result);
            } else {
                out_ptr[col] = -128;
            }
        }

        in_ptr += width;
        out_ptr += width;
    }
#undef ACCUM_BITS
}

================================================ FILE: src/softmax/softmax_common.h ================================================

// Copyright 2022 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance
with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include
#include

/* Branch-free selection helpers: the MASK macros evaluate to all-ones (~0)
 * or 0, and SELECT_USING_MASK picks (a) when mask is all-ones, else (b). */
#define MASK_IF_ZERO(x) (x) == 0 ? ~0 : 0
#define MASK_IF_NON_ZERO(x) (x) != 0 ? ~0 : 0
#define SELECT_USING_MASK(mask, a, b) ((mask) & (a)) ^ (~(mask) & (b))

/* Saturating rounding doubling high multiply / rounding right shift,
 * implemented elsewhere in the library. */
#define SAT_HIGH_MUL(x, y) esp_nn_sat_round_doubling_high_mul((x), (y))
#define DIV_POW2(x,y) esp_nn_div_by_power_of_two((x), (y))

/* Saturating multiply by 2^exp: the result clamps to INT32_MAX/INT32_MIN
 * when `val` would overflow out of the 32-bit range after the shift. */
__NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp)
{
    const int32_t thresh = ((1 << (31 - exp)) - 1);
    int32_t result = val << exp;
    result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), INT32_MAX, result);
    result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), INT32_MIN, result);
    return result;
}

/**
 * @brief Calculate `1 / (1 + x)` for x in [0, 1]
 *
 * @param val input value to calculate `1/(1+x)` for
 * @return `int32_t` result
 * @note Newton-Raphson division
 *
 * https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division
 * Refer to that page for the logic behind the 48/17 and 32/17 constants.
 * Pseudocode: https://en.wikipedia.org/wiki/Division_algorithm#Pseudocode
 */
__NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)
{
    /* half_denominator = (1 + x) / 2, rounded away from zero */
    const int64_t sum = (int64_t) val + INT32_MAX;
    const int32_t half_denominator = (int32_t) ((sum + (sum >= 0 ? 1 : -1)) / 2L);
    /* Initial estimate: x0 = 48/17 - (32/17) * half_denominator */
    int32_t constant_48_over_17 = 1515870810;
    int32_t constant_neg_32_over_17 = -1010580540;
    int32_t x = constant_48_over_17 + SAT_HIGH_MUL(half_denominator, constant_neg_32_over_17);
    /* Three Newton-Raphson refinement steps: x += x * (2 - d*x) */
    const int32_t fixed_2_one = (1 << 29);
    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);
    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);
    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);
    /* Undo the half-denominator scaling (multiply by 2). */
    return mul_power_of_2(x, 1);
}

#define ONE_OVER_ONE_X(x) esp_nn_one_over_one_plus_x_for_x_in_0_1((x))

/**
 * @brief Return exp(x) for x < 0.
 *
 */
__NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val)
{
    /* Split val into a fractional part in [-1/4, 0) and an integer-multiple
     * remainder that is folded back in via the table of constants below. */
    int32_t shift = 24;
    const int32_t one_quarter = (1 << shift);
    int32_t mask = one_quarter - 1;
    const int32_t val_mod_minus_quarter = (val & mask) - one_quarter;
    const int32_t remainder = val_mod_minus_quarter - val;

    // calculate exponent for x in [-1/4, 0) in `result`
    /* 4th-order Taylor-style polynomial in saturating fixed point. */
    const int32_t x = (val_mod_minus_quarter << 5) + (1 << 28);
    const int32_t x2 = SAT_HIGH_MUL(x, x);
    const int32_t x3 = SAT_HIGH_MUL(x2, x);
    const int32_t x4 = SAT_HIGH_MUL(x2, x2);
    const int32_t one_over_3 = 715827883;
    /* NOTE(review): despite the name, 1895147668 ~= 0.8825 * 2^31, which
     * looks like exp(-1/8) rather than 1/8 — verify against upstream. */
    const int32_t one_over_8 = 1895147668;
    const int32_t x4_over_4 = DIV_POW2(x4, 2);
    const int32_t x4_over_4_plus_x3_over_6_plus_x2_over_2 = DIV_POW2(SAT_HIGH_MUL(x4_over_4 + x3, one_over_3) + x2, 1);
    int32_t result = one_over_8 + SAT_HIGH_MUL(one_over_8, x + x4_over_4_plus_x3_over_6_plus_x2_over_2);

    /* Multiply in exp(-2^k) factors selected by the remainder bits;
     * `shift` is post-incremented so each invocation tests the next bit. */
#define SELECT_IF_NON_ZERO(x) { \
    mask = MASK_IF_NON_ZERO(remainder & (1 << shift++)); \
    result = SELECT_USING_MASK(mask, SAT_HIGH_MUL(result, x), result); \
}
    SELECT_IF_NON_ZERO(1672461947)
    SELECT_IF_NON_ZERO(1302514674)
    SELECT_IF_NON_ZERO(790015084)
    SELECT_IF_NON_ZERO(290630308)
    SELECT_IF_NON_ZERO(39332535)
    SELECT_IF_NON_ZERO(720401)
    SELECT_IF_NON_ZERO(242)
#undef SELECT_IF_NON_ZERO

    /* exp(0) == 1, represented as INT32_MAX in this fixed-point format. */
    mask = MASK_IF_ZERO(val);
    return SELECT_USING_MASK(mask, INT32_MAX, result);
}
================================================ FILE: test_app/CMakeLists.txt ================================================ # The following lines of boilerplate have to be in your project's # CMakeLists in this exact order for cmake to work correctly cmake_minimum_required(VERSION 3.5) set(EXTRA_COMPONENT_DIRS "../" "../tests/") set(IDF_EXCLUDE_COMPONENTS test test_app) include($ENV{IDF_PATH}/tools/cmake/project.cmake) project(test_app) ================================================ FILE: test_app/Makefile ================================================ # # This is a project Makefile. It is assumed the directory this Makefile resides in is a # project subdirectory. # PROJECT_NAME := test_app # This line has to be included into the make file # to include components that are located somewhere # but not in "component" directory EXTRA_COMPONENT_DIRS := $(realpath ../) EXCLUDE_COMPONENTS := test include $(IDF_PATH)/make/project.mk ================================================ FILE: test_app/main/CMakeLists.txt ================================================ set(COMPONENT_SRCS "main.c") set(COMPONENT_ADD_INCLUDEDIRS "") set(COMPONENT_PRIV_REQUIRES tests esp_timer) register_component() ================================================ FILE: test_app/main/component.mk ================================================ # # Main component makefile. # # This Makefile can be left empty. By default, it will take the sources in the # src/ directory, compile them and link them into lib(subdirectory_name).a # in the build directory. This behaviour is entirely configurable, # please read the ESP-IDF documents if you need to do this. 
#

================================================ FILE: test_app/main/main.c ================================================

/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include

/* On IDF >= 5.0 the cycle-counter API was renamed; alias the old name. */
#if __has_include("esp_idf_version.h")
#include
#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(5, 0, 0)
#define esp_cpu_get_ccount esp_cpu_get_cycle_count
#endif
#endif

static const char *TAG = "test_app";

/* Cycle counters captured around the ANSI-C and optimized kernel runs;
 * filled by the profile_* callbacks invoked from the test functions. */
static uint32_t start_c, start_opt, total_c, total_opt;

/* Start timing the ANSI-C reference implementation. */
void profile_c_start()
{
    /* initiate profiling */
    start_c = esp_cpu_get_ccount();
}

/* Stop timing the ANSI-C reference; returns elapsed CPU cycles. */
uint32_t profile_c_end()
{
    /* record profile number */
    total_c = esp_cpu_get_ccount() - start_c;
    return total_c;
}

/* Start timing the optimized implementation. */
void profile_opt_start()
{
    /* initiate profiling */
    start_opt = esp_cpu_get_ccount();
}

/* Stop timing the optimized implementation; returns elapsed CPU cycles. */
uint32_t profile_opt_end()
{
    /* record profile number */
    total_opt = esp_cpu_get_ccount() - start_opt;
    return total_opt;
}

/* Print cycle counts and ansi/opt speedup for the most recent test run.
 * Speedup is 0.00x when either counter was never recorded. */
static void print_profile(const char *kernel)
{
    float speedup = (total_c > 0 && total_opt > 0) ? (float)total_c / (float)total_opt : 0.0f;
    printf("PROFILE: %s, ansi=%"PRIu32", opt=%"PRIu32", speedup=%.2fx\n",
           kernel, total_c, total_opt, speedup);
}

/* Entry point: runs every s8 kernel test and prints its profile line. */
void app_main()
{
    /* s8 tests */
    ESP_LOGI(TAG, "Running s8 tests...");
    esp_nn_add_elementwise_s8_test();
    print_profile("add_s8");
    esp_nn_mul_elementwise_s8_test();
    print_profile("mul_s8");
    esp_nn_mul_broadcast_channel_s8_test();
    print_profile("mul_broadcast_ch_s8");
    esp_nn_depthwise_conv_s8_test();
    print_profile("depthwise_conv_s8");
    esp_nn_conv_s8_test();
    print_profile("conv_s8");
    esp_nn_relu6_s8_test();
    print_profile("relu6_s8");
    esp_nn_avg_pool_s8_test();
    print_profile("avg_pool_s8");
    esp_nn_max_pool_s8_test();
    print_profile("max_pool_s8");
    esp_nn_fully_connected_s8_test();
    print_profile("fc_s8");
    esp_nn_fully_connected_per_ch_s8_test();
    print_profile("fc_per_ch_s8");
    esp_nn_softmax_s8_test();
    print_profile("softmax_s8");
    esp_nn_hard_swish_s8_test();
    print_profile("hard_swish_s8");
    esp_nn_mean_nhwc_s8_test();
    print_profile("mean_nhwc_s8");
    ESP_LOGI(TAG, "s8 tests done!\n");

    /* u8 tests */
    //ESP_LOGI(TAG, "Running u8 tests...");
    //esp_nn_add_elementwise_u8_test();
    //esp_nn_depthwise_conv_u8_test();
    //esp_nn_conv_u8_test();
    //esp_nn_avg_pool_u8_test();
    //esp_nn_max_pool_u8_test();
    //esp_nn_fully_connected_u8_test();
    //ESP_LOGI(TAG, "u8 tests done!\n");
}

================================================ FILE: test_app/sdkconfig.defaults ================================================

#
# esp-nn
#
CONFIG_NN_OPTIMIZED=y

================================================ FILE: test_app/sdkconfig.defaults.esp32p4 ================================================

# Enables high speed SPIRAM and other options
CONFIG_IDF_EXPERIMENTAL_FEATURES=y

#
# ESP System Settings
#
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ=360
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_360=y

#
# ESP PSRAM
#
CONFIG_SPIRAM=y
CONFIG_SPIRAM_BOOT_INIT=y
CONFIG_SPIRAM_MODE_HEX=y
CONFIG_SPIRAM_SPEED_200M=y
CONFIG_SPIRAM_SPEED=200
CONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY=y
CONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=n CONFIG_SPIRAM_USE_CAPS_ALLOC=y CONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP=y # # GDB Stub # CONFIG_ESP_GDBSTUB_ENABLED=y CONFIG_ESP_SYSTEM_PANIC_GDBSTUB=y # # Heap memory debugging # # CONFIG_HEAP_POISONING_DISABLED is not set CONFIG_HEAP_POISONING_LIGHT=y ================================================ FILE: test_app/sdkconfig.defaults.esp32s3 ================================================ # Default configurations for ESP32-S3 CONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y # CONFIG_ESP32S3_SPIRAM_SUPPORT is not set CONFIG_ESP32S3_DATA_CACHE_64KB=y CONFIG_ESP32S3_DATA_CACHE_8WAYS=y CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y ================================================ FILE: tests/CMakeLists.txt ================================================ set(COMPONENT_ADD_INCLUDEDIRS ./include/) set(COMPONENT_SRCS "src/basic_math_test.c" "src/convolution_test.c" "src/fully_connected_test.c" "src/pooling_test.c" "src/relu_test.c" "src/softmax_test.c" "src/hard_swish_test.c" "src/mean_test.c") set(COMPONENT_REQUIRES ) set(COMPONENT_PRIV_REQUIRES esp-nn) register_component() target_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function) ================================================ FILE: tests/README.md ================================================ # Tests for esp_nn library - Include these in your test framework and run the framework. 
- For IDF test please refer `test_app`

================================================ FILE: tests/component.mk ================================================

#FIXME
COMPONENT_ADD_INCLUDEDIRS := include/
COMPONENT_SRCDIRS := src/

================================================ FILE: tests/include/test_functions.h ================================================

/*
 * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

/* int8_t ops tests */
void esp_nn_add_elementwise_s8_test();
void esp_nn_mul_elementwise_s8_test();
void esp_nn_mul_broadcast_channel_s8_test();
void esp_nn_depthwise_conv_s8_test();
void esp_nn_conv_s8_test();
void esp_nn_avg_pool_s8_test();
void esp_nn_max_pool_s8_test();
void esp_nn_fully_connected_s8_test();
void esp_nn_fully_connected_per_ch_s8_test();
void esp_nn_relu6_s8_test();
void esp_nn_softmax_s8_test();
void esp_nn_hard_swish_s8_test();
void esp_nn_mean_nhwc_s8_test();

/* uint8_t ops tests */
void esp_nn_add_elementwise_u8_test();
void esp_nn_depthwise_conv_u8_test();
void esp_nn_conv_u8_test();
void esp_nn_avg_pool_u8_test();
void esp_nn_max_pool_u8_test();
void esp_nn_fully_connected_u8_test();

/* instructions test functions */
void compare_instructions_test();
void arith_instructions_test();
void min_max_instructions_test();
void bitwise_instructions_test();
void load_store_instructions_test();

================================================ FILE: tests/include/test_utils.h ================================================

/*
 * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD
 *
 * SPDX-License-Identifier: Apache-2.0
 */

#include
#include
#include
#include

/* mult value range */
#define MULT_MAX INT32_MAX
#define MULT_MIN 0

/* shift value range */
#define SHIFT_MIN -31
#define SHIFT_MAX 30

/**
 * @brief callback function to run before C function
 */
void profile_c_start();

/**
 * @brief callback function to run after C function
 *
 * @return uint32_t cycles consumed running C function
 */
uint32_t profile_c_end();

/**
 * @brief callback function to run before optimized function
 */
void profile_opt_start();

/**
 * @brief callback function to run after optimized function
 *
 * @return uint32_t cycles consumed running optimized function
 */
uint32_t profile_opt_end();

#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
#define ANSI_COLOR_BLUE "\x1b[34m"
#define ANSI_COLOR_MAGENTA "\x1b[35m"
#define ANSI_COLOR_CYAN "\x1b[36m"
#define ANSI_COLOR_RESET "\x1b[0m"

/* Element-wise comparison of two arrays; GNU statement expression that
 * yields true when all `size` elements match. */
#define CHECK_EQUAL(ARRAY1, ARRAY2, size) ({ \
    bool res = true; \
    for (int _i = 0; _i < size; _i++) { \
        if (ARRAY1[_i] != ARRAY2[_i]) { \
            res = false; \
            break; \
        } \
    } \
    res; \
})

/* Print a width x height grid of int values, tab separated. */
#define PRINT_ARRAY_INT(ARRAY, width, height) ({ \
    int *_array = (int *) ARRAY; \
    for (int _j = 0; _j < height; _j++) { \
        for (int _i = 0; _i < width; _i++) { \
            printf("%d\t", _array[width * _j + _i]); \
        } \
        printf("\n"); \
    } \
    printf("\n"); \
})

/* Print a width x height grid of bytes in hex. */
#define PRINT_ARRAY_HEX(ARRAY, width, height) ({ \
    uint8_t *_array = (uint8_t *) ARRAY; \
    for (int _j = 0; _j < height; _j++) { \
        for (int _i = 0; _i < width; _i++) { \
            printf("%02x\t", _array[width * _j + _i]); \
        } \
        printf("\n"); \
    } \
    printf("\n"); \
})

/* Print a width x height grid of signed 8-bit values. */
#define PRINT_ARRAY_INT8(ARRAY, width, height) ({ \
    int8_t *_array = (int8_t *) ARRAY; \
    for (int _j = 0; _j < height; _j++) { \
        for (int _i = 0; _i < width; _i++) { \
            printf("%4d ", _array[width * _j + _i]); \
        } \
        printf("\n"); \
    } \
    printf("\n"); \
})

/* IDF_HEAP_CAPS is defined only when the project enables SPIRAM together
 * with a heap-caps allocation policy. */
#if CONFIG_IDF_CMAKE
#if ((CONFIG_SPIRAM || CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32S3_SPIRAM_SUPPORT) && \
    (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC))
#define IDF_HEAP_CAPS 1
#endif
#endif

#if IDF_HEAP_CAPS
#include "esp_heap_caps.h"
/* Try SPIRAM first, fall back to internal RAM */
static inline void *esp_nn_test_alloc(size_t size)
{
    void *ptr = heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);
    if (!ptr) {
        ptr = heap_caps_malloc(size, MALLOC_CAP_8BIT);
    }
    return ptr;
}
#define ESP_NN_TEST_ALLOC(SIZE) esp_nn_test_alloc(SIZE) #else #include #define ESP_NN_TEST_ALLOC(SIZE) malloc(SIZE) #endif ================================================ FILE: tests/src/basic_math_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include #include "test_utils.h" const int8_t test_add_in1[] = { 13, 26, -26, 26, -13, 13, -13, 13, -13, 13, -13, 13, -26, -51, -26, -51, -26, -39, -26, -39, -39, -39, -26, -51, -13, -13, -13, -13, -26, -13, -13, -13, -13, -13, 0, -26, 0, -13, 0, -26, -13, -26, -26, -26, -26, -26, -26, -26, -13, -13, 0, -26, -13, -26, -26, -26, 0, 0, -26, -13, 13, 0, 26, 0, 13, 0, 13, 0, 0, 0, 13, 0, 13, 26, -26, 13, -26, 13, -13, 13, -13, 13, -13, 13, -26, -26, -13, -26, -26, -26, -26, -26, -39, -26, -26, -26, -13, 0, -13, 0, -26, 0, -13, 0, -13, 0, -13, -13, 0, 0, 0, -13, -13, -13, -26, -13, -26, -13, -13, -13, -13, 0, 0, -13, -13, -13, -13, -13, 0, 0, -13, 0, 13, 13, 13, 0, 0, 0, 13, 13, 0, 0, 13, 13, 0, 26, 0, 13, 0, 13, 0, 13, 0, 13, 0, 13, 0, 13, 0, 0, 0, 0, 0, 13, 0, 0, 0, 0, 13, 13, 0, 13, 0, 13, 0, 13, 0, 13, 0, 13, 13, 13, 0, 13, 0, 13, 0, 13, 0, 13, 13, 13, 13, 13, 0, 13, 0, 13, 0, 13, 13, 13, 13, 13, 13, 13, 13, 13, 0, 13, 13, 13, 13, 13, 13, 13, }; const int8_t test_add_in2[] = { -128, -128, -103, -128, -77, -128, -52, -128, -26, -128, -1, -128, -128, -103, -103, -103, -77, -103, -52, -103, -26, -103, -1, -103, -128, -77, -103, -77, -77, -77, -52, -77, -26, -77, -1, -77, -128, -52, -103, -52, -77, -52, -52, -52, -26, -52, -1, -52, -128, -26, -103, -26, -77, -26, -52, -26, -26, -26, -1, -26, -128, -1, -103, -1, -77, -1, -52, -1, -26, -1, -1, -1, -128, -128, -103, -128, -77, -128, -52, -128, -26, -128, -1, -128, -128, -103, -103, -103, -77, -103, -52, -103, -26, -103, -1, -103, -128, -77, -103, -77, -77, -77, -52, -77, -26, -77, -1, -77, -128, 
-52, -103, -52, -77, -52, -52, -52, -26, -52, -1, -52, -128, -26, -103, -26, -77, -26, -52, -26, -26, -26, -1, -26, -128, -1, -103, -1, -77, -1, -52, -1, -26, -1, -1, -1, -128, -128, -103, -128, -77, -128, -52, -128, -26, -128, -1, -128, -128, -103, -103, -103, -77, -103, -52, -103, -26, -103, -1, -103, -128, -77, -103, -77, -77, -77, -52, -77, -26, -77, -1, -77, -128, -52, -103, -52, -77, -52, -52, -52, -26, -52, -1, -52, -128, -26, -103, -26, -77, -26, -52, -26, -26, -26, -1, -26, -128, -1, -103, -1, -77, -1, -52, -1, -26, -1, -1, -1, }; void esp_nn_add_elementwise_s8_test() { /* prepare data */ int size = 1600 + 8 + 7; /* odd len to test leftover */ int8_t *input1; int8_t *input2; int8_t *out_data_c; int8_t *out_data_opt; int8_t *input1_orig = NULL; int8_t *input2_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; int32_t input1_offset = 34; int32_t input2_offset = 35; int32_t output_offset = 36; int32_t input1_shift = -8; // right_shift amt always <= 0 int32_t input2_shift = -8; // right_shift amt always <= 0 int32_t output_shift = -9; // right_shift amt always <= 0 int32_t left_shift = 15; // always +ve int32_t input1_mult = INT32_MAX; int32_t input2_mult = INT32_MAX; int32_t output_mult = INT32_MAX; int32_t activation_min = -128; int32_t activation_max = 127; for (int itr = 0; itr < 10; itr++) { switch (itr) { case 0: // all zeros input1_offset = 0; input2_offset = 0; output_offset = 0; input1_mult = 0; input2_mult = 0; output_mult = 0; input1_shift = 0; input2_shift = 0; output_shift = 0; left_shift = 0; break; case 1: // hit min input1_offset = -127; input2_offset = -127; output_offset = -128; input1_mult = MULT_MIN; input2_mult = MULT_MIN; output_mult = MULT_MIN; input1_shift = 0; input2_shift = 0; output_shift = 0; left_shift = 0; break; case 2: // hit max input1_offset = 128; input2_offset = 128; output_offset = -127; input1_mult = MULT_MAX; input2_mult = MULT_MAX; output_mult = MULT_MAX; input1_shift = SHIFT_MIN; input2_shift = 
SHIFT_MIN; output_shift = SHIFT_MIN; left_shift = 30 - 8; // since input is 8 bits break; case 3: // hit extreme max input1_offset = 128; input2_offset = 128; output_offset = -127; input1_mult = MULT_MAX; input2_mult = MULT_MAX; output_mult = MULT_MAX; input1_shift = 0; input2_shift = 0; output_shift = 0; left_shift = 30 - 8; // -8 since input is 8 bit break; case 4: // from yolo model input1_offset = 64; input2_offset = 128; output_offset = -128; input1_mult = 1705397815; input2_mult = 1073741824; output_mult = 1756091225; input1_shift = -3; input2_shift = 0; output_shift = -19; left_shift = 20; size = 216; break; default: // practical random input input1_offset = rand() % 256 - 127; // range [-127, 128] input2_offset = rand() % 256 - 127; // range [-127, 128] output_offset = rand() % 256 - 128; // range [-128, 127] input1_mult = MULT_MAX / 2 + rand() % INT16_MAX; input2_mult = MULT_MAX / 2 + rand() % INT16_MAX; output_mult = MULT_MAX / 2 + rand() % INT16_MAX; input1_shift = -8 + rand() % 4; input2_shift = -8 + rand() % 4; output_shift = -8 + rand() % 4; left_shift = rand() % 15; } input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); goto elementwise_add_test_cleanup; } input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15); input2 = (int8_t *) (((uint32_t) input2_orig + 15) & ~15); if (itr == 4) { input2 = input2_orig; // unaligned input } out_data_c = (int8_t *) (((uint32_t)out_c_orig + 15) & ~15); out_data_opt = (int8_t *) (((uint32_t)out_opt_orig + 15) & ~15); if (itr == 4) { memcpy(input1, test_add_in1, size); memcpy(input2, test_add_in2, size); } else { for (int i = 0; i < size; ++i) { input1[i] = rand() % 256 - 
128; input2[i] = rand() % 256 - 128; } } if (itr == 0) { /* enable profiler */ profile_c_start(); } /* C function */ esp_nn_add_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, input1_mult, input2_mult, input1_shift, input2_shift, left_shift, out_data_c, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { profile_c_end(); profile_opt_start(); } /* Optimized function */ esp_nn_add_elementwise_s8(input1, input2, input1_offset, input2_offset, input1_mult, input2_mult, input1_shift, input2_shift, left_shift, out_data_opt, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { /* disable profiler */ profile_opt_end(); } bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); printf("Output: \n"); PRINT_ARRAY_INT8(out_data_opt, size, 1); printf("Expected: \n"); PRINT_ARRAY_INT8(out_data_c, size, 1); printf("Input1:\n"); PRINT_ARRAY_INT8(input1, size, 1); printf("Input2:\n"); PRINT_ARRAY_INT8(input2, size, 1); printf("in1_shift %"PRIi32", in2_shift %"PRIi32", left_shift %"PRIi32", out_shift %"PRIi32"\n", input1_shift, input2_shift, left_shift, output_shift); printf("in1_mult %"PRIi32", in2_mult %"PRIi32", out_mult %"PRIi32"\n", input1_mult, input2_mult, output_mult); goto elementwise_add_test_cleanup; } printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); elementwise_add_test_cleanup: if (input1_orig) { free(input1_orig); } if (input2_orig) { free(input2_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } } void esp_nn_mul_elementwise_s8_test() { /* prepare data */ int size = 1600 + 8 + 7; /* odd len to test leftover */ int8_t *input1; int8_t *input2; int8_t *out_data_c; int8_t *out_data_opt; int32_t input1_offset = 34; int32_t input2_offset = 35; int32_t output_offset = 36; int32_t output_shift = -7; int32_t 
output_mult = MULT_MAX; // max out_mult int32_t activation_min = -128; int32_t activation_max = 127; int8_t *input1_orig = NULL; int8_t *input2_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; for (int itr = 0; itr < 10; itr++) { switch (itr) { case 0: // all zeros input1_offset = 0; input2_offset = 0; output_offset = 0; output_mult = 0; output_shift = 0; break; case 1: // hit min input1_offset = -127; input2_offset = -127; output_offset = -128; output_mult = MULT_MIN; output_shift = 0; break; case 2: // hit max input1_offset = 128; input2_offset = 128; output_offset = -127; output_mult = MULT_MAX; output_shift = SHIFT_MIN; break; case 3: // hit extreme max input1_offset = 128; input2_offset = 128; output_offset = -127; output_mult = MULT_MAX; output_shift = 0; break; default: // practical random input input1_offset = rand() % 256 - 127; // range [-127, 128] input2_offset = rand() % 256 - 127; // range [-127, 128] output_offset = rand() % 256 - 128; // range [-128, 127] output_mult = MULT_MAX / 2 + rand() % INT16_MAX; output_shift = -8 + rand() % 4; size = 4 + rand() % 64; } input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); goto elementwise_mult_test_cleanup; } input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15); input2 = (int8_t *) (((uint32_t) input2_orig + 15) & ~15); if (itr == 4 || itr == 5) { input2 = input2_orig; // unaligned input } out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input1[i] = rand() % 256 - 128; input2[i] = rand() % 256 - 128; } if (itr == 0) { /* enable 
profiler */ profile_c_start(); } /* C function */ esp_nn_mul_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset, out_data_c, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { profile_c_end(); profile_opt_start(); } /* Optimized function */ esp_nn_mul_elementwise_s8(input1, input2, input1_offset, input2_offset, out_data_opt, output_offset, output_mult, output_shift, activation_min, activation_max, size); if (itr == 0) { /* disable profiler */ profile_opt_end(); } bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); printf("Output: \n"); PRINT_ARRAY_HEX(out_data_opt, size, 1); printf("Expected: \n"); PRINT_ARRAY_HEX(out_data_c, size, 1); printf("Input1:\n"); PRINT_ARRAY_HEX(input1, size, 1); printf("Input2:\n"); PRINT_ARRAY_HEX(input2, size, 1); goto elementwise_mult_test_cleanup; } printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); elementwise_mult_test_cleanup: if (input1_orig) { free(input1_orig); } if (input2_orig) { free(input2_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } } void esp_nn_mul_broadcast_channel_s8_test() { int total_spatial = 49; /* 7x7 feature map */ int channels = 64; int8_t *input1; int8_t *input2_per_ch; int8_t *out_data_c; int8_t *out_data_opt; int8_t *input1_orig = NULL; int8_t *input2_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; int32_t input1_offset = 34; int32_t input2_offset = 35; int32_t output_offset = 36; int32_t output_shift = -7; int32_t output_mult = MULT_MAX; int32_t activation_min = -128; int32_t activation_max = 127; for (int itr = 0; itr < 10; itr++) { switch (itr) { case 0: // all zeros input1_offset = 0; input2_offset = 0; output_offset = 0; output_mult = 0; output_shift = 0; total_spatial = 49; channels = 64; break; case 1: // hit min input1_offset = -127; input2_offset = 
-127; output_offset = -128; output_mult = MULT_MIN; output_shift = 0; break; case 2: // hit max input1_offset = 128; input2_offset = 128; output_offset = -127; output_mult = MULT_MAX; output_shift = SHIFT_MIN; break; case 3: // small channels (leftover only, no SIMD) input1_offset = 64; input2_offset = 32; output_offset = -10; output_mult = MULT_MAX / 2; output_shift = -5; total_spatial = 16; channels = 5; break; case 4: // unaligned channels (SIMD + leftover) total_spatial = 14; channels = 19; break; case 5: // typical SE-block: 7x7 spatial, 96 channels input1_offset = 128; input2_offset = 128; output_offset = -128; output_mult = 1705397815; output_shift = -3; total_spatial = 49; channels = 96; break; default: // random input1_offset = rand() % 256 - 127; input2_offset = rand() % 256 - 127; output_offset = rand() % 256 - 128; output_mult = MULT_MAX / 2 + rand() % INT16_MAX; output_shift = -8 + rand() % 4; total_spatial = 4 + rand() % 64; channels = 8 + rand() % 128; } int size = total_spatial * channels; input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(channels + 16); out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16); if (input1_orig == NULL || input2_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"%s error allocating buffers\n"ANSI_COLOR_RESET, __FUNCTION__); goto broadcast_mul_test_cleanup; } input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15); input2_per_ch = (int8_t *) (((uint32_t) input2_orig + 15) & ~15); out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); if (itr == 4) { input1 = input1_orig; // unaligned input } for (int i = 0; i < size; ++i) { input1[i] = rand() % 256 - 128; } for (int i = 0; i < channels; ++i) { input2_per_ch[i] = rand() % 256 - 128; } if (itr == 0) { profile_c_start(); } /* C reference */ 
esp_nn_mul_broadcast_channel_s8_ansi(input1, input2_per_ch, input1_offset, input2_offset, out_data_c, output_offset, output_mult, output_shift, activation_min, activation_max, total_spatial, channels); if (itr == 0) { profile_c_end(); profile_opt_start(); } /* Optimized function */ esp_nn_mul_broadcast_channel_s8(input1, input2_per_ch, input1_offset, input2_offset, out_data_opt, output_offset, output_mult, output_shift, activation_min, activation_max, total_spatial, channels); if (itr == 0) { profile_opt_end(); } bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"%s[%d] failed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); printf("spatial=%d channels=%d size=%d\n", total_spatial, channels, size); for (int idx = 0; idx < size; idx++) { if (out_data_c[idx] != out_data_opt[idx]) { printf("first mismatch at idx=%d (row=%d ch=%d): got %02x exp %02x\n", idx, idx / channels, idx % channels, (uint8_t)out_data_opt[idx], (uint8_t)out_data_c[idx]); // print 8 more mismatches int cnt = 0; for (int j = idx + 1; j < size && cnt < 8; j++) { if (out_data_c[j] != out_data_opt[j]) { printf(" mismatch at idx=%d (row=%d ch=%d): got %02x exp %02x\n", j, j / channels, j % channels, (uint8_t)out_data_opt[j], (uint8_t)out_data_c[j]); cnt++; } } break; } } goto broadcast_mul_test_cleanup; } printf(ANSI_COLOR_GREEN"%s[%d] passed\n"ANSI_COLOR_RESET, __FUNCTION__, itr); broadcast_mul_test_cleanup: if (input1_orig) { free(input1_orig); } if (input2_orig) { free(input2_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } } ================================================ FILE: tests/src/convolution_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" void esp_nn_depthwise_conv_s8_test() { uint32_t total_c = 0, 
total_opt = 0; int8_t *input = NULL, *filter_data = NULL; int8_t *out_data_c = NULL, *out_data_opt = NULL; int32_t *bias = NULL; int32_t input_offset = 5; /* some number in [-128, 127] */ int32_t out_offset = 7; int32_t activation_min = -125; int32_t activation_max = 120; void *scratch_buf = NULL; /* independent variables */ int input_wd, input_ht, channels; uint16_t filter_ht, filter_wd, ch_mult, out_wd, out_ht; uint16_t pad_wd, pad_ht, stride_wd, stride_ht; printf("\n######## Running %s ##########\n", __FUNCTION__); // run for 17 iterations for (int itr = 0; itr < 17; itr++) { /* prepare data */ switch (itr) { case 0: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) input_wd = 18; input_ht = 18; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 1: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (1,1) input_wd = 10; input_ht = 10; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 2: // (ch_mult 1, (channels % 8) = 0), filter (3,3), pad (1,1) input_wd = 10; input_ht = 10; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 24; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 3: // other filter sizes (ch_mult 1, (channels % 8) = 0) input_wd = 10; input_ht = 10; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 24; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 4: // other filter sizes (ch_mult 8 = 0) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 8; channels = 4; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 5: // other filter sizes (ch_mult 8 = 0) input_wd = 12; input_ht = 12; filter_ht = 5; filter_wd = 5; ch_mult = 8; channels = 4; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 6: // other filter sizes (ch_mult 4 = 0) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 4; channels = 
4; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 7: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 8: // same as case 7, with large parameters (reduced for non-PSRAM boards) input_wd = 28; input_ht = 28; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 64; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 9: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0) stride (2,2) input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 15: // ch=8, 3x3, pad=1 (person_detection model layer, ch<12 path) input_wd = 48; input_ht = 48; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 8; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 16: // ch=8, 3x3, pad=0, stride=2 (another ch<12 variant) input_wd = 12; input_ht = 12; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 8; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; default: input_wd = 6; input_ht = 6; filter_ht = 3; filter_wd = 3; ch_mult = 1; channels = 16; stride_wd = rand() % 2 + 1; stride_ht = stride_wd; pad_wd = stride_wd == 1 ? 
0 : rand() % 2;
                pad_ht = pad_wd;
                break;
        }

        /* Derive output dims from the test parameters:
         * with padding the strided input size is kept ("same"-style),
         * without padding it shrinks by the filter extent ("valid"). */
        if (pad_wd) {
            out_wd = (input_wd + stride_wd - 1) / stride_wd;
        } else {
            out_wd = (input_wd + stride_wd - filter_wd) / stride_wd;
        }
        if (pad_ht) {
            out_ht = (input_ht + stride_ht - 1) / stride_ht;
        } else {
            out_ht = (input_ht + stride_ht - filter_ht) / stride_ht;
        }

        int in_size = input_wd * input_ht * channels;
        int out_size = out_wd * out_ht * channels * ch_mult;
        /* +4 so the filter can be handed to the kernels at a 4-byte offset
         * below (filter_data + 4), exercising unaligned-filter handling */
        int filter_size = filter_wd * filter_ht * channels * ch_mult + 4;
        /* +1 entry: bias is passed as (bias + 1) to exercise unalignment */
        int bias_size = channels * ch_mult + 1;
        /* VLAs sized by the per-iteration test parameters */
        int32_t out_shift[channels * ch_mult];
        int32_t out_mult[channels * ch_mult];

        /* Over-allocate by 16 so working pointers can be aligned up below */
        int8_t *input_orig = ESP_NN_TEST_ALLOC(in_size + 16);
        int8_t *out_c_orig = ESP_NN_TEST_ALLOC(out_size + 16);
        int8_t *out_opt_orig = ESP_NN_TEST_ALLOC(out_size + 16);
        filter_data = ESP_NN_TEST_ALLOC(filter_size);
        bias = ESP_NN_TEST_ALLOC(bias_size * 4);
        if (bias == NULL || input_orig == NULL || filter_data == NULL || out_c_orig == NULL || out_opt_orig == NULL) {
            printf(ANSI_COLOR_RED"[%d] allocations failed\n"ANSI_COLOR_RESET, itr);
            goto dc_s8_cleanup;
        }

        /* Round pointers up to a 16-byte boundary (the allocations above
         * reserved the extra slack); presumably matches the alignment the
         * optimized kernels expect — the *_orig pointers are kept for free() */
        input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);
        out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);
        out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);

        /* Generate input data (non-negative range 0..127 here) */
        for (int i = 0; i < in_size; ++i) {
            input[i] = rand() % 128;
        }

        /* Generate filter data, full int8 range */
        for (int i = 0; i < filter_size; ++i) {
            filter_data[i] = rand() % 256 - 128;
        }

        /* Generate bias and per-channel quantization data */
        for (int i = 0; i < channels * ch_mult; ++i) {
            bias[i + 1] = rand() % INT16_MAX; // 0th index left for unalignment
            out_shift[i] = -8 + rand() % 3;
            out_mult[i] = 0x7eb0e200 + rand() % 50;
        }

        /* NOTE(review): the trailing positional `1` initializes the struct
         * member that follows .channels — TODO confirm against data_dims_t */
        data_dims_t input_dims = {.width = input_wd, .height = input_ht, .channels = channels, 1};
        data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = channels * ch_mult, 1};
        data_dims_t filter_dims = {.width = filter_wd,
.height = filter_ht, 0, 0}; dw_conv_params_t conv_params = {.in_offset = input_offset, .out_offset = out_offset, .ch_mult = ch_mult, .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht}, .dilation = {0, 0}, .activation = {activation_min, activation_max}}; quant_data_t quant_data = {.shift = out_shift, .mult = out_mult}; int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size(&input_dims, &filter_dims, &output_dims, &conv_params); if (scratch_buf_size > 0) { scratch_buf = ESP_NN_TEST_ALLOC(scratch_buf_size + 16); if (scratch_buf == NULL) { printf(ANSI_COLOR_RED"[%d] scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, itr, scratch_buf_size); goto dc_s8_cleanup; } int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); esp_nn_set_depthwise_conv_scratch_buf(scratch_buf + align_sz); } /* enable profiler */ profile_c_start(); /* C function */ esp_nn_depthwise_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data + 4, bias + 1, &output_dims, out_data_c, &conv_params, &quant_data); total_c = profile_c_end(); profile_opt_start(); /* Optimized function */ esp_nn_depthwise_conv_s8(&input_dims, input, &filter_dims, filter_data + 4, bias + 1, &output_dims, out_data_opt, &conv_params, &quant_data); /* disable profiler */ total_opt = profile_opt_end(); bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"[%3d] failed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d), filter: (%d, %d,%3d), ch_mult %d]\n"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, filter_wd, filter_ht, channels, ch_mult); #if 0 printf("Output: \n"); PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht); printf("Expected: \n"); PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht); printf("Input:\n"); PRINT_ARRAY_HEX(input, in_size / input_ht, input_ht); printf("Filter data:\n"); PRINT_ARRAY_HEX(filter_data + 4, (filter_size - 4) / filter_ht, filter_ht); printf("bias data:\n"); PRINT_ARRAY_INT(bias + 1, ch_mult * 
channels, 1); #endif goto dc_s8_cleanup; } printf(ANSI_COLOR_GREEN"[%3d] passed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d), filter: (%d, %d,%3d), ch_mult %d]"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, filter_wd, filter_ht, channels, ch_mult); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); dc_s8_cleanup: if (input_orig) { free(input_orig); } if (filter_data) { free(filter_data); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } if (bias) { free(bias); } if (scratch_buf) { free(scratch_buf); } } } void esp_nn_conv_s8_test() { uint32_t total_c = 0, total_opt = 0; int32_t input_offset = 5; /* some number in [-128, 127] */ int32_t activation_min = -125; int32_t activation_max = 122; int32_t out_offset = 3; void *scratch_buf = NULL; int8_t *input_orig = NULL; int8_t *out_c_orig = NULL; int8_t *out_opt_orig = NULL; int8_t *filter_data = NULL; int32_t *bias = NULL; int32_t *out_shift = NULL; int32_t *out_mult = NULL; /* independent variable */ int in_wd, in_ht, in_channels, out_channels; uint16_t filter_ht, filter_wd, out_wd, out_ht; uint16_t pad_wd, pad_ht, stride_wd, stride_ht; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int itr = 0; itr < 18; itr++) { /* Reset quant params to defaults each iteration */ input_offset = 5; out_offset = 3; activation_min = -125; activation_max = 122; switch (itr) { case 0: // ch % 8 == 0 && filter (1,1), padding (0,0) in_wd = 10; in_ht = 10; in_channels = 64; out_channels = 64; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 1: // ch % 4 == 0 && (in_wd * in_ht) % 16 == 0 in_wd = 4; in_ht = 4; in_channels = 20; out_channels = 8; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 2: // ch, filter (3x3x3) in_wd = 10; in_ht = 10; in_channels = 3; out_channels = 64; filter_ht = 3; filter_wd = 3; pad_wd = 0; pad_ht = 0; stride_wd = 1; 
stride_ht = 1; break; case 3: // remaining pad (0, 0) in_wd = 10; in_ht = 10; in_channels = 3; out_channels = 64; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 4: // unopt case in_wd = 10; in_ht = 10; in_channels = 12; out_channels = 64; filter_ht = 3; filter_wd = 3; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 5: // ch % 8 == 0 & stride (2,2) in_wd = 16; in_ht = 16; in_channels = 16; out_channels = 16; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 6: // ch % 8 == 0 && filter (1,1), padding (0,0) in_wd = 2; in_ht = 2; in_channels = 8; out_channels = 8; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; case 7: // ch == 3, pad (0, 0) in_wd = 112; in_ht = 112; in_channels = 3; out_channels = 16; filter_ht = 6; filter_wd = 6; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 8: // ch == 5, remaining pad (0, 0) in_wd = 8; in_ht = 8; in_channels = 5; out_channels = 16; filter_ht = 6; filter_wd = 6; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 9: // in_wd = 3; in_ht = 3; in_channels = 32; out_channels = 1; filter_ht = 3; filter_wd = 3; pad_wd = 1; pad_ht = 1; stride_wd = 1; stride_ht = 1; break; case 10: // needs right and bottom padding in_wd = 4; in_ht = 8; in_channels = 1; out_channels = 3; filter_ht = 3; filter_wd = 3; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 11: // needs right and bottom padding in_wd = 4; in_ht = 8; in_channels = 3; out_channels = 4; filter_ht = 3; filter_wd = 3; pad_wd = 0; pad_ht = 0; stride_wd = 2; stride_ht = 2; break; case 15: // 1x1 conv, large spatial, YOLO-like quant params in_wd = 48; in_ht = 48; in_channels = 32; out_channels = 32; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; // Override quant params to match YOLO Op[8] input_offset = 127; out_offset = 39; break; case 16: // 1x1, 
YOLO exact data: 48x48x32->32 with real filter/bias/quant in_wd = 48; in_ht = 48; in_channels = 32; out_channels = 32; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; input_offset = 127; out_offset = 39; activation_min = -128; activation_max = 127; break; case 17: // 1x1 conv with DELIBERATELY UNALIGNED filter + small out_shift // Tests both alignment (filter+5) AND transpose correctness (shift=-6 won't mask 8x error) in_wd = 24; in_ht = 24; in_channels = 32; out_channels = 32; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; input_offset = 110; /* typical YOLO value that exposed the bug */ out_offset = 39; activation_min = -128; activation_max = 127; break; default: // ch % 8 == 0 in_wd = 8; in_ht = 8; in_channels = 16; out_channels = 16; filter_ht = 1; filter_wd = 1; pad_wd = 0; pad_ht = 0; stride_wd = 1; stride_ht = 1; break; } int8_t *filter_data_orig_save = NULL; /* for case 17 unaligned filter restore */ /* prepare data */ if (pad_wd) { out_wd = (in_wd + stride_wd - 1) / stride_wd; } else { out_wd = (in_wd + stride_wd - filter_wd) / stride_wd; } if (pad_ht) { out_ht = (in_ht + stride_ht - 1) / stride_ht; } else { out_ht = (in_ht + stride_ht - filter_ht) / stride_ht; } int in_size = in_wd * in_ht * in_channels; int filter_size = filter_wd * filter_ht * in_channels * out_channels + 2; int out_size = out_wd * out_ht * out_channels; input_orig = ESP_NN_TEST_ALLOC(in_size + 16); out_c_orig = ESP_NN_TEST_ALLOC(out_size + 16); out_opt_orig = ESP_NN_TEST_ALLOC(out_size + 16); filter_data = ESP_NN_TEST_ALLOC(filter_size + 16); bias = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels); out_shift = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels); out_mult = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels); if (input_orig == NULL || filter_data == NULL || out_c_orig == NULL || out_opt_orig == NULL || bias == NULL || out_shift == NULL || out_mult == NULL) { 
printf(ANSI_COLOR_RED"[%3d] alloc failed (in=%d filter=%d out=%d)\n"ANSI_COLOR_RESET, itr, in_size, filter_size, out_size); goto conv_s8_cleanup; } int8_t *input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); int8_t *out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); int8_t *out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); /* Generate input data between -128 -> +127 */ for (int i = 0; i < in_size; ++i) { input[i] = rand() % 255 - 128; } /* Generate filter data between -128 -> +127 */ for (int i = 0; i < filter_size; ++i) { filter_data[i] = rand() % 256 - 128; } /* Case 17: deliberately misalign filter by 5 bytes to test alignment handling. * This reproduces the bug where ee.vld.l.64.ip ignores lower address bits. */ filter_data_orig_save = filter_data; if (itr == 17) { filter_data = filter_data + 5; /* misalign by 5 bytes (like YOLO's 0x3c05fe55) */ } /* Generate bias data */ for (int i = 0; i < out_channels; ++i) { bias[i] = (int32_t)rand() % UINT16_MAX + UINT8_MAX; } /* Shift and multiplier */ for (int i = 0; i < out_channels; ++i) { out_shift[i] = -10 + rand() % 2; out_mult[i] = 0x7f67f4f8 + rand() % 50; } /* Case 17: use small out_shift to expose transpose cross-position contamination. * out_shift=-6 (÷64) won't mask an 8x error like -10 (÷1024) would. 
*/ if (itr == 17) { for (int i = 0; i < out_channels; ++i) { out_shift[i] = -6; } } /* Case 16: override ALL data with exact YOLO Op[8] values */ if (itr == 16) { static const int8_t yolo_filter[] = { 6,127,57,21,23,8,5,109,2,15,-1,-99,14,7,-67,-59,-12,40,-90,16,-1,-3,25,7,17,-16,14,24,-53,-2,-110,-10, -6,-5,5,5,55,3,2,-6,-4,-17,0,17,-10,7,-3,-13,56,-3,-13,-83,-1,-4,-49,6,-127,1,5,1,8,-10,7,-2, 3,-1,-2,0,-29,-1,-5,-14,-2,-22,-1,-1,-9,1,-12,-18,-127,-1,-14,71,-1,0,-3,-2,-5,3,0,-4,0,-21,-1,-1, -13,-9,-20,-77,-2,-77,-20,59,127,-7,120,-51,-9,-47,50,45,11,8,17,8,112,-20,2,-50,-12,-34,-88,-14,-59,8,-29,2, -4,11,-32,-32,3,-4,5,-113,-11,2,-18,-13,-2,-7,127,8,8,7,2,6,-16,-3,1,-15,1,-5,-20,-2,13,1,24,3, -8,-7,1,-1,54,1,1,-1,7,-6,5,3,-4,-5,-2,11,-68,-3,-10,27,5,4,-61,4,-127,3,0,2,1,1,2,3, 0,-2,0,2,11,2,-3,3,1,-61,1,-5,127,2,-2,-5,8,0,27,9,2,-2,4,1,2,-1,2,-2,0,101,0,0, 1,2,-51,-1,6,-6,2,10,-7,-2,2,-19,2,-3,-115,127,12,12,6,1,0,-6,2,-22,5,-4,-18,3,1,-2,51,-2, -20,-21,-60,123,-4,127,-17,25,-80,-7,-95,-45,-7,11,49,51,14,0,-4,8,-73,-52,-5,-47,-11,-33,119,-21,-31,4,21,1, -1,4,-1,1,-2,1,-1,1,1,71,1,-3,-127,1,2,4,-1,3,46,1,2,-2,-7,-1,2,-2,1,0,-1,85,2,0, -2,-127,81,81,-5,71,18,22,80,14,83,58,14,-2,-14,36,6,29,-106,7,71,-46,-27,88,-19,-66,79,-13,77,7,66,-18, 46,17,-9,-24,3,17,-22,-4,-9,-1,-36,15,-1,-49,11,-3,5,-29,2,2,0,64,-1,-19,4,12,-4,32,9,-5,-9,-127, -30,6,105,-67,-16,-61,-45,110,-56,-15,-50,-54,-18,-37,14,36,19,1,21,22,-66,-13,2,127,-4,-52,-60,-22,92,-6,45,-7, -14,12,18,13,5,10,12,29,6,-10,2,-29,-3,-28,-3,-15,-2,39,127,14,3,-43,13,6,1,-103,9,11,4,-10,-5,-27, -88,-35,-15,7,4,-6,64,-2,-48,-3,-18,-8,-3,-71,-8,0,7,63,12,3,-7,74,0,16,1,-67,-10,-78,-9,-7,5,127, -3,7,2,4,15,2,0,9,2,127,2,-16,74,2,3,-5,9,1,29,13,2,-5,-16,1,8,-4,3,2,-2,-122,-2,-1, 8,-15,-2,3,11,-1,0,57,1,-7,2,19,-4,-2,-127,54,0,17,48,0,0,2,-2,-4,5,5,4,-5,-8,-7,-20,5, 11,39,-91,-65,11,-67,3,4,-56,-4,-66,-3,-3,5,4,8,-3,6,28,-8,-51,11,15,-106,10,23,-73,9,-127,-3,-78,8, 
8,-3,1,0,-127,0,-7,-5,-2,-17,-2,0,5,-3,-7,-5,-3,11,-10,-3,-2,-2,3,1,70,-3,2,-7,2,-17,0,-1, 5,-127,13,6,2,3,12,25,0,-3,-2,-45,0,-6,4,-6,9,-11,-19,6,0,4,-14,9,2,25,3,0,3,5,-5,3, -9,36,18,-4,-4,2,-19,-101,0,10,-9,-127,-4,-5,-37,82,-1,-20,6,-13,-2,-1,4,4,1,-11,3,-10,-27,5,-45,-3, -6,-13,6,-3,4,-1,-5,5,2,-4,-2,-5,-1,-16,-5,-1,0,-1,0,-1,-15,-127,-3,-2,0,23,-3,0,-1,2,0,16, 3,45,17,24,8,27,-5,42,25,-9,21,-47,-14,18,27,23,-4,-15,127,19,24,14,19,21,-2,39,23,9,14,-7,15,15, -127,-30,2,-10,3,-5,71,11,-16,0,-15,-18,3,-3,14,6,-1,73,-3,-1,-12,27,-2,-2,0,8,-7,-108,9,3,-6,8, 63,-47,0,-37,11,-20,-48,6,-19,-1,-18,13,-3,76,-18,15,3,-48,16,2,-4,-34,-6,3,7,-127,-7,58,1,-3,-23,108, 102,-1,0,3,11,1,-127,-7,-4,0,-2,-8,-13,-6,-6,-22,5,115,18,7,-1,-6,-4,3,-5,10,-1,-88,0,4,-1,7, 127,-5,12,-6,10,-13,-89,16,-20,1,-24,-12,-6,4,-4,-15,-3,-110,3,-6,-17,89,-10,9,13,-80,-18,105,-3,-4,3,-85, 2,7,2,-1,-25,-2,-5,-5,0,-25,-1,-6,-5,0,-14,-24,74,0,-13,-127,-1,0,-1,-1,-4,0,-1,-4,-2,-19,-8,1, -84,-1,-6,-2,-19,-4,105,-3,-2,8,-2,-32,2,-3,2,-21,1,-127,-10,5,-3,8,0,2,-5,-8,-4,85,1,10,4,2, 19,-15,0,1,95,2,-15,-2,0,-56,-3,-4,-24,-5,-2,3,-16,-6,-37,-6,-3,1,127,-1,-119,4,2,-13,0,-41,3,3, -10,-29,-13,-4,5,-9,-7,71,-6,7,-3,113,-2,0,51,-127,-10,-11,26,-3,-4,-1,0,-23,1,5,-7,-9,-20,8,-2,7, -66,-1,-1,-10,3,-31,43,9,-18,-9,-2,-22,-2,75,22,-1,5,39,-14,4,-5,-62,-2,3,-1,69,-19,-61,-17,-2,-8,-127 }; static const int8_t yolo_input[] = { -127,-65,-96,-127,-124,-100,-122,-127,-93,-122,-127,-127,-114,-91,-126,-105, -127,-127,-128,-118,-102,-127,-127,-93,-127,-126,-127,-103,-127,-124,-127,-127, -126,-63,-128,-128,-127,-127,-122,-118,-127,-126,-128,-114,-112,-122,-120,-122, -114,-127,-127,-114,-126,-118,-127,-127,-127,-124,-128,-100,-128,-124,-127,-107, -126,-63,-128,-128,-128,-126,-120,-118,-124,-126,-128,-112,-112,-122,-120,-122, -114,-127,-127,-114,-128,-120,-127,-124,-127,-124,-127,-98,-128,-124,-127,-105, -127,-62,-127,-127,-127,-128,-118,-114,-128,-126,-126,-112,-112,-124,-122,-124, 
-114,-127,-127,-114,-127,-120,-127,-128,-127,-122,-128,-100,-128,-124,-128,-105, -126,-63,-128,-127,-127,-128,-120,-116,-128,-124,-128,-112,-114,-122,-120,-124, -114,-127,-127,-112,-126,-118,-127,-127,-127,-124,-127,-98,-128,-124,-128,-105, -127,-63,-128,-128,-127,-128,-120,-114,-127,-124,-120,-112,-114,-122,-122,-124, -114,-127,-127,-114,-127,-120,-127,-127,-127,-124,-127,-98,-124,-124,-128,-107, -128,-67,-127,-126,-127,-127,-118,-112,-127,-124,-122,-111,-114,-128,-118,-127, -114,-127,-127,-114,-128,-118,-127,-127,-127,-122,-127,-102,-127,-124,-128,-102, -126,-69,-128,-128,-127,-127,-120,-112,-127,-124,-118,-111,-114,-124,-124,-126, -112,-127,-127,-116,-128,-120,-127,-127,-127,-124,-126,-105,-128,-124,-122,-107 }; static const int32_t yolo_bias[] = { 2420,1649,1302,1816,-446,1562,685,32,2503,-74,3143,463,1507,1883,-932,525, 1205,162,540,1680,1846,388,338,274,-433,502,817,1021,812,1371,-30,1525 }; static const int32_t yolo_shifts[] = { -8,-7,-6,-8,-6,-7,-8,-7,-8,-7,-9,-6,-8,-8,-7,-8,-8,-8,-7,-7,-7,-6,-8,-7,-7,-8,-8,-7,-8,-8,-8,-7 }; static const int32_t yolo_mults[] = { 0x52a119c9,0x53a7fce0,0x4430a104,0x5afd73fd,0x4a9394b6,0x5e2b6940,0x7c02c5c9,0x509cb64d, 0x5941a055,0x5d50f6be,0x60b9e0ad,0x41e9ef39,0x67d9347b,0x6b36dcc7,0x5406c784,0x70ae9dd9, 0x6a183a7f,0x78f48e0e,0x53e7df22,0x63cc6072,0x448b1623,0x4cd5d08c,0x6175e8be,0x5cd03362, 0x4de1312d,0x6c5bd16d,0x6e89094f,0x64a1947e,0x78e1060e,0x63d8179b,0x791c8d51,0x532420c2 }; memcpy(input, yolo_input, sizeof(yolo_input)); memcpy(filter_data, yolo_filter, sizeof(yolo_filter)); memcpy(bias, yolo_bias, sizeof(yolo_bias)); memcpy(out_shift, yolo_shifts, sizeof(yolo_shifts)); memcpy(out_mult, yolo_mults, sizeof(yolo_mults)); } data_dims_t input_dims = {.width = in_wd, .height = in_ht, .channels = in_channels, 1}; data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = out_channels, 1}; data_dims_t filter_dims = {.width = filter_wd, .height = filter_ht, .channels = in_channels, 1}; conv_params_t conv_params 
= {.in_offset = input_offset, .out_offset = out_offset, .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht}, .dilation = {0, 0}, .activation = {activation_min, activation_max}}; quant_data_t quant_data = {.shift = out_shift, .mult = out_mult}; int scratch_buf_size = esp_nn_get_conv_scratch_size(&input_dims, &filter_dims, &output_dims, &conv_params); if (scratch_buf_size > 0) { scratch_buf = ESP_NN_TEST_ALLOC(scratch_buf_size + 16); if (scratch_buf == NULL) { printf(ANSI_COLOR_RED"[%3d] scratch_buf alloc failed size %d\n"ANSI_COLOR_RESET, itr, scratch_buf_size); goto conv_s8_cleanup; } int align_sz = 16 - (((int32_t) scratch_buf) & 0xf); esp_nn_set_conv_scratch_buf(scratch_buf + align_sz); } /* enable profiler */ profile_c_start(); /* C function */ esp_nn_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data, bias, &output_dims, out_data_c, &conv_params, &quant_data); total_c = profile_c_end(); profile_opt_start(); /* Optimized function */ esp_nn_conv_s8(&input_dims, input, &filter_dims, filter_data, bias, &output_dims, out_data_opt, &conv_params, &quant_data); /* disable profiler */ total_opt = profile_opt_end(); bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"[%3d] failed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d,%3d), filter: (%d, %d,%3d)]\n"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, out_channels, filter_wd, filter_ht, in_channels); goto conv_s8_cleanup; } printf(ANSI_COLOR_GREEN"[%3d] passed [pad: (%d, %d), stride: (%d, %d)" " out: (%3d,%3d,%3d), filter: (%d, %d,%3d)]"ANSI_COLOR_RESET, itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht, out_channels, filter_wd, filter_ht, in_channels); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); conv_s8_cleanup: /* Restore original filter pointer (may have been offset for alignment test) */ filter_data = filter_data_orig_save; if (input_orig) { free(input_orig); input_orig = NULL; } if 
(filter_data) { free(filter_data); filter_data = NULL; } if (out_c_orig) { free(out_c_orig); out_c_orig = NULL; } if (out_opt_orig) { free(out_opt_orig); out_opt_orig = NULL; } if (bias) { free(bias); bias = NULL; } if (out_shift) { free(out_shift); out_shift = NULL; } if (out_mult) { free(out_mult); out_mult = NULL; } if (scratch_buf) { free(scratch_buf); scratch_buf = NULL; } } } ================================================ FILE: tests/src/fully_connected_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include "test_utils.h" void esp_nn_fully_connected_s8_test() { uint32_t total_c = 0, total_opt = 0; /* prepare data */ uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */ const int32_t max_out_ch = 16; const int32_t max_row_len = 271; uint16_t out_channels = 3; /* Use heap-allocated aligned buffers (matches TFLite real-world usage) */ int8_t *input_orig = malloc(max_row_len + 16); int8_t *filter_orig = malloc(max_row_len * max_out_ch + 16); int8_t *out_c_orig = malloc(max_out_ch + 16); int8_t *out_opt_orig = malloc(max_out_ch + 16); if (!input_orig || !filter_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); goto fc_s8_cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *filter_data = (int8_t *)(((uint32_t)filter_orig + 15) & ~15); int8_t *output_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *output_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); int32_t activation_min = -128; int32_t activation_max = 127; int32_t input_offset = 0; int32_t filter_offset = 0; int32_t out_shift = -10; int32_t out_offset = 5; int32_t out_mult = 0x59e492c4; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int itr = 0; itr < 15; itr++) { out_mult = 
INT32_MAX / row_len + rand() % INT16_MAX; /* fresh multiplier each iteration */
        /* Per-iteration test shape / quantization selection */
        switch (itr) {
            case 0:
                out_shift = -10;
                break;
            case 1:
                out_shift = SHIFT_MIN;
                break;
            case 2:
                out_shift = SHIFT_MAX;
                break;
            case 3:
                out_shift = 0;
                break;
            case 4:
                row_len = 1;
                out_channels = 16;
                out_shift = -10 + rand() % 5;
                break;
            case 5:
                row_len = 16;
                out_channels = 8;
                out_shift = -10 + rand() % 5;
                break;
            case 6:
                row_len = 8;
                out_channels = 8;
                out_shift = -10 + rand() % 5;
                break;
            case 7:
                row_len = 8;
                out_channels = 15;
                out_shift = -10 + rand() % 5;
                break;
            case 8:
                row_len = 8;
                out_channels = 1;
                out_shift = -10 + rand() % 5;
                break;
            default:
                row_len = rand() % 7 + 1;
                out_channels = 8;
                out_shift = -10 + rand() % 5;
                break;
        }
        /* NOTE(review): this override makes iteration 0 run with SHIFT_MAX,
         * shadowing the -10 assigned in case 0 above and duplicating case 2 —
         * confirm whether it is intentional or a leftover */
        if (itr == 0) {
            out_shift = SHIFT_MAX;
        }

        /* Generate input and filter data (full int8 range) */
        for (int i = 0; i < row_len; ++i) {
            input[i] = rand() % 256 - 128;
        }
        for (int i = 0; i < row_len * out_channels; ++i) {
            filter_data[i] = rand() % 256 - 128;
        }

        /* enable profiler */
        profile_c_start();
        /* C reference implementation */
        esp_nn_fully_connected_s8_ansi(input, input_offset, row_len, filter_data, filter_offset, NULL, output_c, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max);
        total_c = profile_c_end();
        profile_opt_start();
        /* Optimized function (must match the reference bit-exactly) */
        esp_nn_fully_connected_s8(input, input_offset, row_len, filter_data, filter_offset, NULL, output_opt, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max);
        /* disable profiler */
        total_opt = profile_opt_end();

        bool ret = CHECK_EQUAL(output_c, output_opt, out_channels);
        if (ret == false) {
            printf(ANSI_COLOR_RED"[%3d] failed\n"ANSI_COLOR_RESET, itr);
#if 0
            /* verbose dump of the failing iteration, disabled by default */
            printf("Output: \n");
            PRINT_ARRAY_HEX(output_opt, out_channels, 1);
            printf("Expected: \n");
            PRINT_ARRAY_HEX(output_c, out_channels, 1);
            printf("Input:\n");
            PRINT_ARRAY_HEX(input, row_len, 1);
            printf("Filter data:\n");
            PRINT_ARRAY_HEX(filter_data, row_len, out_channels);
            printf("Out shift: %d\n", out_shift);
            printf("Out mult: %x\n", out_mult);
#endif
            goto fc_s8_cleanup;
        }
printf(ANSI_COLOR_GREEN"[%3d] passed [row_len %"PRIu16", out_ch %"PRIu16"]"ANSI_COLOR_RESET, itr, row_len, out_channels); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); } fc_s8_cleanup: if (input_orig) { free(input_orig); } if (filter_orig) { free(filter_orig); } if (out_c_orig) { free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } void esp_nn_fully_connected_per_ch_s8_test() { uint32_t total_c = 0, total_opt = 0; /* prepare data */ uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */ const int32_t max_out_ch = 16; const int32_t max_row_len = 271; uint16_t out_channels = 3; /* Use heap-allocated aligned buffers (matches TFLite real-world usage) */ int8_t *input_orig = malloc(max_row_len + 16); int8_t *filter_orig = malloc(max_row_len * max_out_ch + 16); int8_t *out_c_orig = malloc(max_out_ch + 16); int8_t *out_opt_orig = malloc(max_out_ch + 16); if (!input_orig || !filter_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"%s allocations failed\n"ANSI_COLOR_RESET, __FUNCTION__); goto fc_per_ch_s8_buffers_cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *filter_data = (int8_t *)(((uint32_t)filter_orig + 15) & ~15); int8_t *output_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *output_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); int32_t activation_min = -128; int32_t activation_max = 127; int32_t input_offset = 0; int32_t filter_offset = 0; int32_t out_offset = 7; int32_t* out_mult = NULL; int32_t* out_shift = NULL; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int itr = 0; itr < 15; itr++) { int32_t out_shift_val = 0; switch (itr) { case 0: out_shift_val = -10; break; case 1: out_shift_val = SHIFT_MIN; break; case 2: out_shift_val = SHIFT_MAX; break; case 3: out_shift_val = 0; break; case 4: row_len = 1; out_channels = 16; break; case 5: row_len = 16; out_channels = 8; break; case 6: row_len = 8; out_channels = 8; break; case 7: 
row_len = 8; out_channels = 15; break; case 8: row_len = 8; out_channels = 1; break; default: row_len = rand() % 7 + 1; out_channels = 8; break; } out_mult = ESP_NN_TEST_ALLOC(out_channels * sizeof(int32_t)); out_shift = ESP_NN_TEST_ALLOC(out_channels * sizeof(int32_t)); if (out_shift == NULL || out_mult == NULL) { printf(ANSI_COLOR_RED"out_shift/out_mult allocations failed\n"ANSI_COLOR_RESET); goto fully_connected_per_ch_cleanup; } for (int i = 0; i < out_channels; i++) { out_mult[i] = INT32_MAX / row_len + rand() % INT16_MAX; if (i < 4) { out_shift[i] = out_shift_val; } else { out_shift[i] = -10 + rand() % 5; } } /* Generate input and filter data */ for (int i = 0; i < row_len; ++i) { input[i] = rand() % 256 - 128; } for (int i = 0; i < row_len * out_channels; ++i) { filter_data[i] = rand() % 256 - 128; } /* enable profiler */ profile_c_start(); /* C function */ esp_nn_fully_connected_per_ch_s8_ansi(input, input_offset, row_len, filter_data, filter_offset, NULL, output_c, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); total_c = profile_c_end(); profile_opt_start(); /* Optimized function */ esp_nn_fully_connected_per_ch_s8(input, input_offset, row_len, filter_data, filter_offset, NULL, output_opt, out_channels, out_offset, out_shift, out_mult, activation_min, activation_max); /* disable profiler */ total_opt = profile_opt_end(); bool ret = CHECK_EQUAL(output_c, output_opt, out_channels); if (ret == false) { printf(ANSI_COLOR_RED"[%3d] failed\n"ANSI_COLOR_RESET, itr); goto fully_connected_per_ch_cleanup; } printf(ANSI_COLOR_GREEN"[%3d] passed [row_len %"PRIu16", out_ch %"PRIu16"]"ANSI_COLOR_RESET, itr, row_len, out_channels); printf("\tcycles: c %8"PRIu32", opt %8"PRIu32"\n", total_c, total_opt); fully_connected_per_ch_cleanup: if (out_shift) { free(out_shift); } if (out_mult) { free(out_mult); } } fc_per_ch_s8_buffers_cleanup: if (input_orig) { free(input_orig); } if (filter_orig) { free(filter_orig); } if (out_c_orig) { 
free(out_c_orig); } if (out_opt_orig) { free(out_opt_orig); } } ================================================ FILE: tests/src/hard_swish_test.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include "test_utils.h" void esp_nn_hard_swish_s8_test() { /* Test with representative MobileNetV3 parameters */ const int test_sizes[] = {1, 8, 16, 32, 100, 1024, 12544}; const int num_tests = sizeof(test_sizes) / sizeof(test_sizes[0]); /* Typical quantization params from MobileNetV3 layers */ const int16_t input_zp = -128; const int16_t output_mult_fxp = 19661; /* typical value */ const int16_t reluish_mult_fxp = 22938; /* typical value */ const int16_t output_zp = -128; /* Test all three branches: exp > 0, exp < 0, exp == 0 */ int32_t reluish_exps[] = {2, -1, 0}; int32_t output_exps[] = {-1, -2, -1}; printf("\n######## Running %s ##########\n", __FUNCTION__); /* Set up scratch buffer for LUT-based optimization */ int32_t scratch_size = esp_nn_get_hard_swish_scratch_size(); void *scratch_buf = NULL; if (scratch_size > 0) { scratch_buf = malloc(scratch_size); if (scratch_buf) { esp_nn_set_hard_swish_scratch_buf(scratch_buf); } } for (int t = 0; t < num_tests; t++) { int size = test_sizes[t]; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(size + 16); int8_t *out_opt_orig = malloc(size + 16); if (!input_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"hard_swish [%d] alloc failed\n"ANSI_COLOR_RESET, t); goto cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *out_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *out_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); for (int i = 0; i < size; i++) { input[i] = rand() % 256 - 128; } for (int exp_idx = 0; exp_idx < 3; exp_idx++) { /* ANSI C reference */ profile_c_start(); 
esp_nn_hard_swish_s8_ansi(input, out_c, size, input_zp, output_mult_fxp, reluish_mult_fxp, reluish_exps[exp_idx], output_exps[exp_idx], output_zp); profile_c_end(); /* Optimized */ profile_opt_start(); esp_nn_hard_swish_s8(input, out_opt, size, input_zp, output_mult_fxp, reluish_mult_fxp, reluish_exps[exp_idx], output_exps[exp_idx], output_zp); profile_opt_end(); bool ret = CHECK_EQUAL(out_c, out_opt, size); if (!ret) { printf(ANSI_COLOR_RED"hard_swish [size=%d, exp=%d] failed\n"ANSI_COLOR_RESET, size, (int)reluish_exps[exp_idx]); goto cleanup; } } printf(ANSI_COLOR_GREEN"hard_swish [%2d] passed [size %d]\n"ANSI_COLOR_RESET, t, size); cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } if (scratch_buf) free(scratch_buf); } ================================================ FILE: tests/src/mean_test.c ================================================ /* * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" void esp_nn_mean_nhwc_s8_test() { /* Test dimensions matching MobileNetV3 SE blocks */ struct { int height, width, channels; } test_cases[] = { {7, 7, 16}, /* small SE block */ {7, 7, 72}, /* medium SE block */ {14, 14, 40}, /* larger spatial */ {14, 14, 120}, /* larger channels */ {28, 28, 24}, /* early layer SE */ {1, 1, 576}, /* degenerate 1x1 */ {3, 3, 96}, /* small spatial */ }; const int num_tests = sizeof(test_cases) / sizeof(test_cases[0]); const int32_t input_zp = -128; const int32_t output_zp = -128; const int32_t multiplier = 1073741824; /* typical */ const int32_t shift = -1; printf("\n######## Running %s ##########\n", __FUNCTION__); for (int t = 0; t < num_tests; t++) { int h = test_cases[t].height; int w = test_cases[t].width; int c = test_cases[t].channels; int input_size = h * w * c; int8_t *input_orig = malloc(input_size + 16); int8_t 
*out_c_orig = malloc(c + 16); int8_t *out_opt_orig = malloc(c + 16); if (!input_orig || !out_c_orig || !out_opt_orig) { printf(ANSI_COLOR_RED"mean [%d] alloc failed\n"ANSI_COLOR_RESET, t); goto cleanup; } int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15); int8_t *out_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15); int8_t *out_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15); for (int i = 0; i < input_size; i++) { input[i] = rand() % 256 - 128; } /* ANSI C reference */ profile_c_start(); esp_nn_mean_nhwc_s8_ansi(input, out_c, h, w, c, input_zp, output_zp, multiplier, shift); profile_c_end(); /* Optimized */ profile_opt_start(); esp_nn_mean_nhwc_s8(input, out_opt, h, w, c, input_zp, output_zp, multiplier, shift); profile_opt_end(); bool ret = CHECK_EQUAL(out_c, out_opt, c); if (!ret) { printf(ANSI_COLOR_RED"mean [%d] failed [%dx%dx%d]\n"ANSI_COLOR_RESET, t, h, w, c); goto cleanup; } printf(ANSI_COLOR_GREEN"mean [%2d] passed [%dx%dx%d]\n"ANSI_COLOR_RESET, t, h, w, c); cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } } ================================================ FILE: tests/src/pooling_test.c ================================================ /* * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" static void run_avg_pool_test(uint16_t input_wd, uint16_t input_ht, uint16_t channels, uint16_t filter_wd, uint16_t filter_ht, uint16_t stride_wd, uint16_t stride_ht, uint16_t pad_wd, uint16_t pad_ht, int iter) { const int32_t activation_min = -128; const int32_t activation_max = 127; const uint16_t out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1; const uint16_t out_ht = (input_ht + 2 * pad_ht - filter_ht) / stride_ht + 1; const int size = input_wd * input_ht * channels; const int out_size = out_wd * out_ht * channels; int8_t 
*input = NULL, *output_c = NULL, *output_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(out_size + 16); int8_t *out_opt_orig = malloc(out_size + 16); if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"avg_pool [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto avg_pool_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); output_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); output_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 256 - 128; } profile_c_start(); esp_nn_avg_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); profile_c_end(); profile_opt_start(); esp_nn_avg_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); profile_opt_end(); bool ret = CHECK_EQUAL(output_c, output_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"avg_pool [%d] failed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); goto avg_pool_cleanup; } printf(ANSI_COLOR_GREEN"avg_pool [%2d] passed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); avg_pool_cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } void esp_nn_avg_pool_s8_test() { int iter = 0; /* Original test case */ run_avg_pool_test(16, 16, 16, 3, 3, 1, 1, 1, 1, iter++); /* Varying channel counts */ run_avg_pool_test(16, 16, 4, 3, 3, 1, 1, 1, 1, iter++); run_avg_pool_test(16, 16, 8, 3, 3, 1, 1, 1, 1, iter++); run_avg_pool_test(16, 16, 32, 3, 3, 1, 1, 1, 1, 
iter++); run_avg_pool_test(16, 16, 64, 3, 3, 1, 1, 1, 1, iter++); /* Note: non-multiple-of-4 channels not supported by S3 optimized path */ /* Different filter sizes */ run_avg_pool_test(16, 16, 16, 1, 1, 1, 1, 0, 0, iter++); run_avg_pool_test(16, 16, 16, 2, 2, 1, 1, 0, 0, iter++); run_avg_pool_test(16, 16, 16, 5, 5, 1, 1, 2, 2, iter++); /* Stride > 1 */ run_avg_pool_test(16, 16, 16, 3, 3, 2, 2, 1, 1, iter++); run_avg_pool_test(24, 24, 32, 3, 3, 2, 2, 1, 1, iter++); /* Person detection final pooling: 6x6x128, filter 6x6 */ run_avg_pool_test(6, 6, 128, 6, 6, 1, 1, 0, 0, iter++); /* No padding */ run_avg_pool_test(16, 16, 16, 3, 3, 1, 1, 0, 0, iter++); } static void run_max_pool_test(uint16_t input_wd, uint16_t input_ht, uint16_t channels, uint16_t filter_wd, uint16_t filter_ht, uint16_t stride_wd, uint16_t stride_ht, uint16_t pad_wd, uint16_t pad_ht, int iter) { const int32_t activation_min = -128; const int32_t activation_max = 127; const uint16_t out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1; const uint16_t out_ht = (input_ht + 2 * pad_ht - filter_ht) / stride_ht + 1; const int size = input_wd * input_ht * channels; const int out_size = out_wd * out_ht * channels; int8_t *input = NULL, *output_c = NULL, *output_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(out_size + 16); int8_t *out_opt_orig = malloc(out_size + 16); if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"max_pool [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto max_pool_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); output_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); output_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 256 - 128; } profile_c_start(); esp_nn_max_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, 
activation_max, channels); profile_c_end(); profile_opt_start(); esp_nn_max_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht, stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht, activation_min, activation_max, channels); profile_opt_end(); bool ret = CHECK_EQUAL(output_c, output_opt, out_size); if (ret == false) { printf(ANSI_COLOR_RED"max_pool [%d] failed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); goto max_pool_cleanup; } printf(ANSI_COLOR_GREEN"max_pool [%2d] passed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\n"ANSI_COLOR_RESET, iter, input_wd, input_ht, channels, filter_wd, filter_ht, stride_wd, stride_ht, pad_wd, pad_ht); max_pool_cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); } void esp_nn_max_pool_s8_test() { int iter = 0; /* Original test case */ run_max_pool_test(16, 16, 16, 3, 3, 1, 1, 1, 1, iter++); /* Varying channel counts */ run_max_pool_test(16, 16, 4, 3, 3, 1, 1, 1, 1, iter++); run_max_pool_test(16, 16, 8, 3, 3, 1, 1, 1, 1, iter++); run_max_pool_test(16, 16, 32, 3, 3, 1, 1, 1, 1, iter++); run_max_pool_test(16, 16, 64, 3, 3, 1, 1, 1, 1, iter++); /* Note: non-multiple-of-4 channels not supported by S3 optimized path */ /* Different filter sizes */ run_max_pool_test(16, 16, 16, 1, 1, 1, 1, 0, 0, iter++); run_max_pool_test(16, 16, 16, 2, 2, 1, 1, 0, 0, iter++); run_max_pool_test(16, 16, 16, 5, 5, 1, 1, 2, 2, iter++); /* Stride > 1 */ run_max_pool_test(16, 16, 16, 3, 3, 2, 2, 1, 1, iter++); run_max_pool_test(24, 24, 32, 3, 3, 2, 2, 1, 1, iter++); /* Person detection final pooling-like: 6x6x128 */ run_max_pool_test(6, 6, 128, 6, 6, 1, 1, 0, 0, iter++); /* No padding */ run_max_pool_test(16, 16, 16, 3, 3, 1, 1, 0, 0, iter++); } ================================================ FILE: tests/src/relu_test.c ================================================ /* * 
SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include "test_utils.h" static void run_relu6_test(int size, int iter) { int8_t *input = NULL, *inout_ansi = NULL, *inout_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *inout_c_orig = malloc(size + 16); int8_t *inout_opt_orig = malloc(size + 16); if (input_orig == NULL || inout_c_orig == NULL || inout_opt_orig == NULL) { printf(ANSI_COLOR_RED"relu6 [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto relu6_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); inout_ansi = (int8_t *) (((uint32_t) inout_c_orig + 15) & ~15); inout_opt = (int8_t *) (((uint32_t) inout_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 255 - 128; inout_ansi[i] = input[i]; inout_opt[i] = input[i]; } profile_c_start(); esp_nn_relu6_s8_ansi(inout_ansi, size); profile_c_end(); profile_opt_start(); esp_nn_relu6_s8(inout_opt, size); profile_opt_end(); bool ret = CHECK_EQUAL(inout_ansi, inout_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"relu6 [%d] failed [size %d]\n"ANSI_COLOR_RESET, iter, size); goto relu6_cleanup; } printf(ANSI_COLOR_GREEN"relu6 [%2d] passed [size %d]\n"ANSI_COLOR_RESET, iter, size); relu6_cleanup: if (input_orig) free(input_orig); if (inout_c_orig) free(inout_c_orig); if (inout_opt_orig) free(inout_opt_orig); } void esp_nn_relu6_s8_test() { int iter = 0; /* Original test case: odd size with leftover */ run_relu6_test(1600 + 8 + 7, iter++); /* Very small sizes (< 8 elements, below SIMD width) */ run_relu6_test(1, iter++); run_relu6_test(3, iter++); run_relu6_test(7, iter++); /* Between 8 and 16 (partial SIMD) */ run_relu6_test(8, iter++); run_relu6_test(12, iter++); run_relu6_test(15, iter++); /* Exact multiple of 16 (full SIMD, no leftover) */ run_relu6_test(16, iter++); run_relu6_test(32, iter++); run_relu6_test(256, iter++); /* Non-aligned sizes 
*/ run_relu6_test(17, iter++); run_relu6_test(33, iter++); run_relu6_test(100, iter++); } ================================================ FILE: tests/src/softmax_test.c ================================================ /* * SPDX-FileCopyrightText: 2022-2026 Espressif Systems (Shanghai) CO LTD * * SPDX-License-Identifier: Apache-2.0 */ #include #include #include #include #include #include #include #include "test_utils.h" static void run_softmax_test(int32_t height, int32_t width, int32_t mult, int32_t shift, int32_t diff_min, int iter) { void *scratch_buf = NULL, *scratch_buf_orig = NULL; const int size = width * height; int8_t *input = NULL, *out_ansi = NULL, *out_opt = NULL; int8_t *input_orig = malloc(size + 16); int8_t *out_c_orig = malloc(size + 16); int8_t *out_opt_orig = malloc(size + 16); if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) { printf(ANSI_COLOR_RED"softmax [%d] allocations failed\n"ANSI_COLOR_RESET, iter); goto softmax_cleanup; } input = (int8_t *) (((uint32_t) input_orig + 15) & ~15); out_ansi = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15); out_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15); for (int i = 0; i < size; ++i) { input[i] = rand() % 255 - 128; } profile_c_start(); esp_nn_softmax_s8_ansi(input, height, width, mult, shift, diff_min, out_ansi); profile_c_end(); int32_t scratch_buf_size = esp_nn_get_softmax_scratch_size(width, height); if (scratch_buf_size) { scratch_buf_orig = malloc(scratch_buf_size * 4 + 16); if (scratch_buf_orig == NULL) { printf(ANSI_COLOR_RED"softmax [%d] scratch alloc failed size %"PRIi32"\n"ANSI_COLOR_RESET, iter, scratch_buf_size); goto softmax_cleanup; } scratch_buf = (void *)(((uint32_t) scratch_buf_orig + 15) & ~15); esp_nn_set_softmax_scratch_buf(scratch_buf); } profile_opt_start(); esp_nn_softmax_s8(input, height, width, mult, shift, diff_min, out_opt); profile_opt_end(); bool ret = CHECK_EQUAL(out_ansi, out_opt, size); if (ret == false) { printf(ANSI_COLOR_RED"softmax [%d] 
failed [h %"PRIi32", w %"PRIi32", mult %"PRIi32", shift %"PRIi32", diff_min %"PRIi32"]\n"ANSI_COLOR_RESET, iter, height, width, mult, shift, diff_min); printf("Output: \n"); PRINT_ARRAY_HEX(out_opt, width, height); printf("Expected: \n"); PRINT_ARRAY_HEX(out_ansi, width, height); goto softmax_cleanup; } printf(ANSI_COLOR_GREEN"softmax [%2d] passed [h %"PRIi32", w %"PRIi32", mult %"PRIi32", shift %"PRIi32"]\n"ANSI_COLOR_RESET, iter, height, width, mult, shift); softmax_cleanup: if (input_orig) free(input_orig); if (out_c_orig) free(out_c_orig); if (out_opt_orig) free(out_opt_orig); if (scratch_buf_orig) free(scratch_buf_orig); } void esp_nn_softmax_s8_test() { int iter = 0; /* Original test case */ run_softmax_test(8, 32, INT32_MAX / 2, 7, -128, iter++); /* Small output classes (person_detection: 2, micro_speech: 4) */ run_softmax_test(1, 2, INT32_MAX / 2, 7, -128, iter++); run_softmax_test(1, 4, INT32_MAX / 2, 7, -128, iter++); /* Single element (degenerate) */ run_softmax_test(1, 1, INT32_MAX / 2, 7, -128, iter++); /* Medium width */ run_softmax_test(1, 10, INT32_MAX / 2, 7, -128, iter++); run_softmax_test(4, 10, INT32_MAX / 2, 7, -128, iter++); /* Large width (ImageNet-class) */ run_softmax_test(1, 1000, INT32_MAX / 2, 7, -128, iter++); /* Large height */ run_softmax_test(64, 32, INT32_MAX / 2, 7, -128, iter++); /* Varying diff_min */ run_softmax_test(8, 32, INT32_MAX / 2, 7, -64, iter++); run_softmax_test(8, 32, INT32_MAX / 2, 7, -32, iter++); run_softmax_test(8, 32, INT32_MAX / 2, 7, 0, iter++); /* Varying multiplier and shift */ run_softmax_test(8, 32, INT32_MAX / 4, 5, -128, iter++); run_softmax_test(8, 32, INT32_MAX, 10, -128, iter++); /* Odd width (non-aligned) */ run_softmax_test(8, 17, INT32_MAX / 2, 7, -128, iter++); run_softmax_test(8, 3, INT32_MAX / 2, 7, -128, iter++); }