[
  {
    "path": ".github/workflows/upload_component.yml",
    "content": "name: Push esp-nn to IDF Component Registry\n\non:\n  push:\n    branches:\n      - master\n\njobs:\n  upload_components:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@v4\n\n      - name: Upload esp-nn to IDF Component Registry\n        uses: espressif/upload-components-ci-action@v1\n        with:\n          namespace: \"espressif\"\n          name: \"esp-nn\"\n          api_token: ${{ secrets.IDF_COMPONENT_API_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": ".config\n*.o\n*.i\n*.s\n*.orig\n*.pyc\n\n# gtags\nGTAGS\nGRTAGS\nGPATH\n\n# emacs\n.dir-locals.el\n\n# emacs temp file suffixes\n*~\n.#*\n\\#*#\n\n# eclipse setting\n.settings\n\n# MacOS directory files\n.DS_Store\n\n# Example project files\nexamples/**/sdkconfig\nexamples/**/sdkconfig.old\nexamples/**/build\n\n# Test app files\ntest_app/build\ntest_app/sdkconfig\ntest_app/sdkconfig.old\n\n# Doc build artifacts\ndocs/_build/\ndocs/doxygen-warning-log.txt\ndocs/sphinx-warning-log.txt\ndocs/sphinx-warning-log-sanitized.txt\ndocs/xml/\ndocs/xml_in/\ndocs/man/\ndocs/doxygen_sqlite3.db\n\nTEST_LOGS\n\n\n# gcov coverage reports\n*.gcda\n*.gcno\ncoverage.info\ncoverage_report/\n\n# VS Code Settings\n.vscode/\n"
  },
  {
    "path": ".gitlab-ci.yml",
    "content": "stages:\n  - build\n\n# Avoid running duplicate pipeline\nworkflow:\n  rules:\n    - if: '$CI_PIPELINE_SOURCE == \"merge_request_event\"'\n    - if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH'\n\nvariables:\n  GIT_STRATEGY: fetch\n  GIT_SUBMODULE_STRATEGY: recursive\nbefore_script:\n    - mkdir -p ~/.ssh\n    - chmod 700 ~/.ssh\n    - echo -n $GITLAB_KEY_TMP > ~/.ssh/id_rsa_base64\n    - base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 > ~/.ssh/id_rsa\n    - chmod 600 ~/.ssh/id_rsa\n    - echo -e \"Host gitlab.espressif.cn\\n\\tStrictHostKeyChecking no\\n\" >> ~/.ssh/config\n    - |\n      if [ -n \"$IDF_COMPONENT_MGR_VER\" ]; then\n        pip install idf-component-manager==$IDF_COMPONENT_MGR_VER\n      fi\n\n.test_build: &test_build\n    # Build examples\n    - for TARGET in $EXAMPLE_TARGETS; do\n    - idf.py set-target $TARGET build\n    - done\n\n.build_template:\n  stage: build\n  image: espressif/idf:latest\n  tags:\n    - build\n  variables:\n    PEDANTIC_FLAGS: \"-Werror -Wno-error=cpp -Werror=unused-variable -Werror=unused-but-set-variable -Werror=unused-function\"\n    EXTRA_CFLAGS: \"${PEDANTIC_FLAGS}\"\n    EXTRA_CXXFLAGS: \"${PEDANTIC_FLAGS}\"\n  rules:\n    - if: '$CI_PIPELINE_SOURCE == \"schedule\"'\n      when: never\n    - when: always\n  script:\n    - cd ${CI_PROJECT_DIR}/test_app\n    # build examples\n    - *test_build\n    - cd ${CI_PROJECT_DIR}\n\nbuild_idf_v5.5:\n  extends: .build_template\n  image: espressif/idf:release-v5.5\n  variables:\n    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3 esp32p4\n\nbuild_idf_v5.2:\n  extends: .build_template\n  image: espressif/idf:release-v5.2\n  variables:\n    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3\n\nbuild_idf_v5.0:\n  extends: .build_template\n  image: espressif/idf:release-v5.0\n  variables:\n    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3\n\nbuild_idf_v4.4:\n  extends: .build_template\n  image: espressif/idf:release-v4.4\n  variables:\n    EXAMPLE_TARGETS: esp32 esp32s3 esp32c3\n    
IDF_COMPONENT_MGR_VER: \"1.2.0\"\n\nbuild_idf_v4.3:\n  extends: .build_template\n  image: espressif/idf:release-v4.3\n  variables:\n    EXAMPLE_TARGETS: esp32\n\nbuild_idf_v4.2:\n  extends: .build_template\n  image: espressif/idf:release-v4.2\n  variables:\n    EXAMPLE_TARGETS: esp32\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.5)\n\nset(c_srcs\n    \"src/activation_functions/esp_nn_relu_ansi.c\"\n    \"src/activation_functions/esp_nn_hard_swish_ansi.c\"\n    \"src/common/esp_nn_mean_ansi.c\"\n    \"src/basic_math/esp_nn_add_ansi.c\"\n    \"src/basic_math/esp_nn_mul_ansi.c\"\n    \"src/convolution/esp_nn_conv_ansi.c\"\n    \"src/convolution/esp_nn_conv_opt.c\"\n    \"src/convolution/esp_nn_depthwise_conv_ansi.c\"\n    \"src/convolution/esp_nn_depthwise_conv_opt.c\"\n    \"src/fully_connected/esp_nn_fully_connected_ansi.c\"\n    \"src/softmax/esp_nn_softmax_ansi.c\"\n    \"src/softmax/esp_nn_softmax_opt.c\"\n    \"src/logistic/esp_nn_logistic_ansi.c\"\n    \"src/pooling/esp_nn_avg_pool_ansi.c\"\n    \"src/pooling/esp_nn_max_pool_ansi.c\")\n\nif(CONFIG_IDF_TARGET_ESP32S3)\n    set(s3_srcs\n        \"src/common/esp_nn_common_functions_esp32s3.S\"\n        \"src/common/esp_nn_dot_s8_esp32s3.S\"\n        \"src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S\"\n        \"src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S\"\n        \"src/activation_functions/esp_nn_relu_s8_esp32s3.S\"\n        \"src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c\"\n        \"src/common/esp_nn_mean_s8_esp32s3.c\"\n        \"src/basic_math/esp_nn_add_s8_esp32s3.S\"\n        \"src/basic_math/esp_nn_mul_s8_esp32s3.S\"\n        \"src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S\"\n        \"src/convolution/esp_nn_conv_esp32s3.c\"\n        \"src/convolution/esp_nn_conv_s8_1x1_esp32s3.c\"\n        \"src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c\"\n        \"src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c\"\n        \"src/convolution/esp_nn_conv_s16_mult8_esp32s3.S\"\n        \"src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S\"\n        \"src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S\"\n        \"src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S\"\n        
\"src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S\"\n        \"src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S\"\n        \"src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S\"\n        \"src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S\"\n        \"src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S\"\n        \"src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S\"\n        \"src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S\"\n        \"src/fully_connected/esp_nn_fully_connected_esp32s3.c\"\n        \"src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S\"\n        \"src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S\"\n        \"src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S\"\n        \"src/pooling/esp_nn_max_pool_s8_esp32s3.S\"\n        \"src/pooling/esp_nn_avg_pool_s8_esp32s3.c\"\n        \"src/pooling/esp_nn_avg_pool_s8_esp32s3.S\"\n        \"src/softmax/esp_nn_softmax_s8_esp32s3.c\")\nendif()\n\nif(CONFIG_IDF_TARGET_ESP32P4)\n    set(p4_srcs\n        \"src/common/esp_nn_mean_s8_esp32p4.c\"\n        \"src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S\"\n        \"src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c\"\n        \"src/activation_functions/esp_nn_relu_s8_esp32p4.c\"\n        \"src/basic_math/esp_nn_add_s8_esp32p4.c\"\n        \"src/basic_math/esp_nn_mul_s8_esp32p4.c\"\n        \"src/convolution/esp_nn_conv_esp32p4.c\"\n        \"src/convolution/esp_nn_depthwise_conv_esp32p4.c\"\n        \"src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c\"\n        \"src/pooling/esp_nn_avg_pool_s8_esp32p4.c\"\n        \"src/pooling/esp_nn_max_pool_s8_esp32p4.c\"\n        \"src/softmax/esp_nn_softmax_s8_esp32p4.c\")\nendif()\n\nidf_component_register(SRCS \"${c_srcs}\"\n                            \"${s3_srcs}\"\n                            \"${p4_srcs}\"\n                       INCLUDE_DIRS \"include\" 
\"src/common\")\n\nif(CONFIG_IDF_TARGET_ESP32S3)\n    target_compile_options(${COMPONENT_LIB} PRIVATE -mlongcalls -fno-unroll-loops -O2 -Wno-unused-function)\nelse()\n    target_compile_options(${COMPONENT_LIB} PRIVATE  -O2 -Wno-unused-function)\nendif()\n\nif(CONFIG_NN_SKIP_NUDGE)\n    target_compile_definitions(${COMPONENT_LIB} PRIVATE SKIP_NUDGE)\nendif()\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing\n\nContributions to ESP-NN project in the form of pull requests, bug reports, and feature requests are welcome!\n\nThis document covers various topics related to contributions to the ESP-NN projects. Please read it if you plan to submit a PR!\n\n## CLA\n\nWe require accepting the contributor's license agreement for all pull requests. When opening a pull request the first time you will be prompted to sign the CLA by the [CLA Assistant](https://cla-assistant.io/) service.\n\n## Large-scale Changes\n\nIf you'd like to propose a change to the existing APIs or a large-scale refactoring of the implementation, we recommend opening an issue first to discuss this.\n\n## Updating the Benchmarks Table\n\nThe benchmarks table in [README.md](README.md) contains benchmarks for ESP32-S3. The benchmarks are collected by running the app in [test_app](test_app/) directory. Please update this table if you have changed the implementations of some of the functions or added the new ones.\n\n## Releasing a new version\n\nMaintainers should follow the steps below to release a new version of ESP-NN component. Assuming the new version is `vX.Y.Z`:\n\n1. Ensure you are on the latest `master` branch:\n   ```bash\n   git checkout master\n   git pull --ff-only origin master\n   ```\n1. Create the new tag:\n   ```bash\n   git tag -s -a -m \"vX.Y.Z\" vX.Y.Z\n   ```\n1. Push the tag and the branch to the internal repository:\n   ```bash\n   git push origin vX.Y.Z\n   ```\n1. CI will automatically push the tag to Github and will upload the new version to the IDF Component Registry.\n1. Go to https://github.com/espressif/esp-nn/releases and create a release from the tag vX.Y.Z.\n1. Write the release notes and publish the release.\n"
  },
  {
    "path": "Kconfig.projbuild",
    "content": "menu \"ESP-NN\"\n\nchoice NN_OPTIMIZATIONS\n   bool \"Optimization for nn functions\"\n   default NN_OPTIMIZED\n   help\n      Use ANSI-C versions for verification and debug purpose.\n      Optimisations are automatically picked up for a chipset.\n      For ESP32-S3, assembly optimisations are selected.\n      For other platforms(viz., ESP32, ESP32-C3), generic optimisations are used.\n\nconfig NN_ANSI_C\n   bool \"ANSI C\"\n   help\n      ANSI C versions for verification and debug purposes.\nconfig NN_OPTIMIZED\n   bool \"Optimized versions\"\n   help\n      Optimisations are automatically picked up for a chipset.\n      For ESP32-S3, assembly optimisations are selected.\n      For other platforms(viz., ESP32, ESP32-C3), generic optimisations are used.\nendchoice\n\nconfig NN_OPTIMIZATIONS\n   int\n   default 0 if NN_ANSI_C\n   default 1 if NN_OPTIMIZED\n\nconfig NN_SKIP_NUDGE\n   bool \"Use fast (non-bit-exact) requantization\"\n   depends on NN_OPTIMIZED\n   default n\n   help\n      When enabled, kernels use a faster requantize path that may differ\n      from the TFLite reference by +/-1 LSB at half-shift boundaries.\n      On ESP32-S3, this also skips the nudge addition in the assembly\n      requantize for ~20% speedup.\n      Leave disabled for bit-exact behavior (recommended for tests and\n      for matching reference outputs).\n\nendmenu\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "# ESP-NN\n\nThe library contains optimised NN (Neural Network) functions for various Espressif chips.\n\n* Supported platforms:\n   * TensorFlow Lite Micro (TFLite Micro). Repo can be found [here](https://github.com/espressif/tflite-micro-esp-examples)\n\n* Supported ESP chips include:\n   * ESP32-S3 (Assembly versions optimised to benefit from vector instructions of ESP32-S3)\n   * ESP32-P4 (Optimised using PIE/QACC SIMD instructions)\n   * ESP32 (Generic optimisations)\n   * ESP32-C3 (Generic optimisations)\n\n## Performance\n\n### Kernelwise performance for s8 versions:\n\n  * Kernelwise performance on ESP32-P4 chip\n    * Numbers are ticks taken for kernel to execute\n    * Chip config: 360MHz, SPI-RAM: HEX 200MHz, L2-Cache: 128KB\n\n    | Function        | ANSI C  | Optimized | Opt Ratio | Data info   | Memory    |\n    | ----------------| --------|---------|---------|-------------|-----------|\n    | elementwise_add | 190786  | 88451   | 2.16    | size = 1615 | External  |\n    | elementwise_mul | 76585   | 47601   | 1.60    | size = 1615 | External  |\n    | convolution     | 4005512 | 572459  | 7.00    | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External |\n    | convolution     | 249700  | 71104   | 3.51    | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External |\n    | convolution     | 816975  | 533318  | 1.53    | input(10,10), filter(64x3x3x3), pad(0,0), stride(1,1) | External |\n    | depthwise conv  | 962834  | 482389  | 2.00    | input (16, 16), pad(0,0), stride(1,1) filter: 1x3x3x16 | External |\n    | depthwise conv  | 1365066 | 703989  | 1.94    | input (12, 12), pad(1,1), stride(1,1)  filter: 8x5x5x4 | External |\n    | max pool        | 482184  | 24178   | 19.94   | input(16,16), filter (1x3x3x16) | Internal |\n    | avg pool        | 303210  | 84401   | 3.59    | input(16,16), filter (1x3x3x16) | Internal |\n    | fully connected | 7650    | 915     | 8.36    | len: 271, ch = 3 | Internal |\n    | 
prelu (relu6)   | 1195    | 154     | 7.76    | size, 1615  | Internal  |\n    | softmax         | 14260   | 8587    | 1.66    | width: 256  | Internal  |\n    | hard_swish      | 703970  | 516582  | 1.36    | size: 12544 | External  |\n    | mean            | 10113   | 4686    | 2.16    | 7x7x16     | Internal  |\n\n\n  * Kernelwise performance on ESP32-S3 chip\n    * Numbers are ticks taken for kernel to execute\n    * Chip config: 240MHz, SPI: QPI 80MHz, Data cache: 64KB\n\n    | Function        | ANSI C   | Optimized | Opt Ratio | Data info   | Memory    |\n    | ----------------| ---------|-----------|-----------|-------------|-----------|\n    | elementwise_add | 281337   | 74440     | 3.78      | size = 1615 | External  |\n    | elementwise_mul | 122703   | 35002     | 3.51      | size = 1615 | External  |\n    | convolution     | 4712500  | 331008    | 14.24     | input(10,10), filter(64x1x1x64), pad(0,0), stride(1,1) | External |\n    | convolution     | 312754   | 39022     | 8.01      | input(8,8), filter(16x1x1x16), pad(0,0), stride(1,1) | External |\n    | convolution     | 2193289  | 394842    | 5.55      | input(8,8), filter(64x3x3x3), pad(0,0), stride(1,1) | External |\n    | depthwise conv  | 1159831  | 184176    | 6.30      | input(18,18), pad(0,0), stride(1,1), filter: 1x3x3x16 | External |\n    | depthwise conv  | 1671363  | 372435    | 4.49      | input(12,12), pad(1,1), stride(1,1), filter: 8x5x5x4 | External |\n    | max pool        | 376294   | 48069     | 7.83      | input(16,16), filter(1x3x3x16) | Internal |\n    | avg pool        | 427293   | 118052    | 3.62      | input(16,16), filter(1x3x3x16) | Internal |\n    | fully connected | 8443     | 1078      | 7.83      | len: 271, ch = 3 | Internal |\n    | softmax         | 15209    | 11107     | 1.37      | h: 8, w: 32 | Internal  |\n    | prelu (relu6)   | 1125     | 98        | 11.48     | size: 1615  | Internal  |\n\n\n### Model-level performance:\n\n  * **Person Detection** (Visual 
Wake Words, INT8 quantized — from [esp-tflite-micro](https://github.com/espressif/esp-tflite-micro))\n    * Numbers are time (ms) for `invoke()` call, using internal memory\n\n    | Chip     | CPU Freq | without ESP-NN | with ESP-NN |\n    | -------- | -------- | -------------- | ----------- |\n    | ESP32-P4 | 360MHz   | 1395ms         | 73ms        |\n    | ESP32-S3 | 240MHz   | 2300ms         | 54ms        |\n    | ESP32    | 240MHz   | 4084ms         | 380ms       |\n    | ESP32-C3 | 160MHz   | 3355ms         | 426ms       |\n\n  * **MobileNetV3 Small** (INT8 quantized, 224x224x3, 1000 classes)\n\n    | Chip     | CPU Freq | without ESP-NN | with ESP-NN |\n    | -------- | -------- | -------------- | ----------- |\n    | ESP32-S3 | 240MHz   | 26000ms        | 1434ms      |\n    | ESP32-P4 | 360MHz   | 11600ms        | 1050ms      |\n\n> **Note**:\n  - The above is time taken for execution of the `invoke()` call\n  - SPIRAM used for TensorArena.\n  - Person detection on ESP32-S3 with internal RAM: 47ms\n  - ESP32-P4 optimisation is work in progress\n  - `Without ESP-NN` case is when `esp-nn` is completely disabled by removing below flag from [CMakeLists.txt](CMakeLists.txt):\n    ```cmake\n      # enable ESP-NN optimizations by Espressif\n      target_compile_options(${COMPONENT_LIB} PRIVATE -DESP_NN)\n    ```\n\n\n## Configuration\n\n  * To configure, please use `idf.py menuconfig` and under `ESP-NN` select `NN_OPTIMIZATIONS`\n  * There are two options presented:\n     * Optimized versions\n     * ANSI C\n\n  * Default selection is for `Optimized versions`. 
For ESP32-S3 and ESP32-P4, assembly versions are automatically selected, whereas for other chips (viz., ESP32, ESP32-C3), generic optimisations are selected.\n  * For debugging purposes, you may want to select `ANSI C` reference versions.\n\n\n## Contributing\n\nIf you encounter an issue with ESP-NN, or wish to submit a feature request, please use the Issues section on GitHub.\n\nFor general questions related to this library, please use the esp32.com forum.\n\nPlease check [CONTRIBUTING.md](CONTRIBUTING.md) for further information if you'd like to contribute to ESP-NN.\n\n## Copyrights and License\n\nAll original source code in this repository is Copyright (C) 2020-2021 Espressif Systems. This source code is licensed under the Apache License 2.0 as described in the file LICENSE.\n"
  },
  {
    "path": "idf_component.yml",
    "content": "version: \"1.2.3\"\ndescription: Optimized NN (Neural Network) functions for Espressif chips\nurl: https://github.com/espressif/esp-nn\nrepository: https://github.com/espressif/esp-nn.git\nissues: https://github.com/espressif/esp-nn/issues\ndependencies:\n  idf:\n    version: \">=4.2\"\nfiles:\n  exclude:\n    - test_app\n    - tests\n"
  },
  {
    "path": "include/esp_nn.h",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#pragma once\n\n#if defined(CONFIG_NN_OPTIMIZED)\n// select apt optimisations\n#ifdef CONFIG_IDF_TARGET_ESP32P4\n#define ARCH_ESP32_P4 1\n#endif\n#ifdef CONFIG_IDF_TARGET_ESP32S3\n#define ARCH_ESP32_S3 1\n#endif\n#ifdef CONFIG_IDF_TARGET_ESP32\n#define ARCH_ESP32 1\n#endif\n#endif\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n/* reference kernels included by default */\n#include \"esp_nn_ansi_headers.h\"\n\n#if defined(CONFIG_NN_OPTIMIZED)\n#if defined(ARCH_ESP32_P4)\n#include \"esp_nn_esp32p4.h\"\n#elif defined(ARCH_ESP32_S3)\n#include \"esp_nn_esp32s3.h\"\n#else // for other platforms use generic optimisations\n#include \"esp_nn_generic_opt.h\"\n#endif // #if defined(ARCH_ESP32_S3)\n#else\n#include \"esp_nn_ansi_c.h\"\n#endif\n\n#ifdef __cplusplus\n}\n#endif\n"
  },
  {
    "path": "include/esp_nn_ansi_c.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/**\n * @file        Header definitions to include for ANSI C versions.\n *              These are just typedefs to pick up ANSI versions.\n */\n\n#pragma once\n\n#include \"esp_nn_defs.h\"\n#include \"esp_nn_ansi_headers.h\"\n\n#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi\n#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi\n#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi\n\n#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_ansi\n\n#define esp_nn_conv_s8 esp_nn_conv_s8_ansi\n\n#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_ansi\n#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_ansi\n\n#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_ansi\n#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_ansi\n\n#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi\n#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi\n#define esp_nn_get_hard_swish_scratch_size() 0\n#define esp_nn_set_hard_swish_scratch_buf(buf)\n#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi\n\n#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi\n#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi\n\n#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi\n#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi\n\n#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_ansi\n#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_ansi\n#define esp_nn_softmax_s8 esp_nn_softmax_s8_ansi\n\n#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi\n#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi\n#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi\n"
  },
  {
    "path": "include/esp_nn_ansi_headers.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#pragma once\n\n/**\n * @file        Header definitions to include for esp_nn reference functions\n */\n\n#include \"esp_nn_defs.h\"\n/************************** Basic math functions ****************************/\n\n/**\n * @brief       elementwise addition\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *\n *              shift values are expected to be <= 0\n */\nvoid esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,\n                                    const int8_t *input2_data,\n                                    const int32_t input1_offset,\n                                    const int32_t input2_offset,\n                                    const int32_t input1_mult,\n                                    const int32_t input2_mult,\n                                    const int32_t input1_shift,\n                                    const int32_t input2_shift,\n                                    const int32_t left_shift,\n                                    int8_t *output,\n                                    const int32_t out_offset,\n                                    const int32_t out_mult,\n                                    const int32_t out_shift,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max,\n                                    const int32_t size);\n/**\n * @brief       elementwise multiplication\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *\n *              output shift is expected to be <= 0\n */\nvoid esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,\n                                    const int8_t 
*input2_data,\n                                    const int32_t input1_offset,\n                                    const int32_t input2_offset,\n                                    int8_t *output,\n                                    const int32_t out_offset,\n                                    const int32_t out_mult,\n                                    const int32_t out_shift,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max,\n                                    const int32_t size);\n\n/**\n * @brief       broadcast MUL for [H,W,C] * [1,1,C] pattern (SE-block)\n *\n * @note        input2_per_ch has `channels` elements, broadcast to all spatial positions.\n *              Uses fast requantization (constant nudge).\n */\nvoid esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,\n                                      const int8_t *input2_per_ch,\n                                      const int32_t input1_offset,\n                                      const int32_t input2_offset,\n                                      int8_t *output,\n                                      const int32_t output_offset,\n                                      const int32_t output_mult,\n                                      const int32_t output_shift,\n                                      const int32_t activation_min,\n                                      const int32_t activation_max,\n                                      const int32_t total_spatial,\n                                      const int32_t channels);\n\n\n/************************** Convolution functions *****************************/\n\n/**\n * @brief       depthwise convolution per channel\n *\n * @note        inputs type: int8_t, output: int8_t\n *              Version used in tflite is per channel.\n *              This version follows the same footsteps.\n *              Meaning, it has per out_channel shift and multiplier 
for\n *              requantization\n *\n *              optimization notes: Though input_offset is int32 type,\n *              offset values are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,\n                                   const int8_t *input_data,\n                                   const data_dims_t *filter_dims,\n                                   const int8_t *filter_data,\n                                   const int32_t *bias,\n                                   const data_dims_t *output_dims,\n                                   int8_t *out_data,\n                                   const dw_conv_params_t *conv_params,\n                                   const quant_data_t *quant_data);\n\n/**\n * @brief       2d-convolution channelwise\n *\n * @note        operation: result += (input + offset) * filter\n *\n *              inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_conv_s8_ansi(const data_dims_t *input_dims,\n                         const int8_t *input_data,\n                         const data_dims_t *filter_dims,\n                         const int8_t *filter_data,\n                         const int32_t *bias,\n                         const data_dims_t *output_dims,\n                         int8_t *out_data,\n                         const conv_params_t *conv_params,\n                         const quant_data_t *quant_data);\n\nint esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,\n                                      const data_dims_t *filter_dims,\n                                      const data_dims_t *output_dims,\n                                      const conv_params_t *conv_params);\nvoid esp_nn_set_conv_scratch_buf_ansi(const void *buf);\n\nint esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims,\n                                               
 const data_dims_t *filter_dims,\n                                                const data_dims_t *output_dims,\n                                                const dw_conv_params_t *conv_params);\nvoid esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf);\n\n/************************** Activation functions *****************************/\n\n/**\n * @brief       relu6\n *\n * @note        inout: int8_t\n */\nvoid esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size);\n\n/**\n * @brief       hard_swish activation: y = x * relu6(x + 3) / 6\n *\n * @note        Quantized int8 fixed-point implementation\n */\nvoid esp_nn_hard_swish_s8_ansi(const int8_t *input, int8_t *output,\n                                const int32_t size,\n                                const int16_t input_zero_point,\n                                const int16_t output_mult_fxp,\n                                const int16_t reluish_mult_fxp,\n                                const int32_t reluish_mult_exp,\n                                const int32_t output_mult_exp,\n                                const int16_t output_zero_point);\n\n/**\n * @brief       mean reduction over spatial dims (H,W) for NHWC int8 tensor\n *\n * @note        Specialized for 4D [N,H,W,C] → [N,1,1,C] reduction.\n *              Used by Squeeze-and-Excite in MobileNetV3.\n */\nvoid esp_nn_mean_nhwc_s8_ansi(const int8_t *input, int8_t *output,\n                               const int32_t height, const int32_t width,\n                               const int32_t channels,\n                               const int32_t input_zero_point,\n                               const int32_t output_zero_point,\n                               const int32_t multiplier,\n                               const int32_t shift);\n\n/************************** Pooling functions *****************************/\n\n\n/**\n * @brief       max_pool\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: 
although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_max_pool_s8_ansi(const int8_t *input,\n                             const uint16_t input_wd,\n                             const uint16_t input_ht,\n                             int8_t *output,\n                             const uint16_t output_wd,\n                             const uint16_t output_ht,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const uint16_t filter_wd,\n                             const uint16_t filter_ht,\n                             const uint16_t pad_wd,\n                             const uint16_t pad_ht,\n                             const int32_t activation_min,\n                             const int32_t activation_max,\n                             const uint16_t channels);\n\n/**\n * @brief       avg_pool\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_avg_pool_s8_ansi(const int8_t *input,\n                             const uint16_t input_wd,\n                             const uint16_t input_ht,\n                             int8_t *output,\n                             const uint16_t output_wd,\n                             const uint16_t output_ht,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const uint16_t filter_wd,\n                             const uint16_t filter_ht,\n                             const uint16_t pad_wd,\n                             const uint16_t pad_ht,\n                             const int32_t activation_min,\n                             const int32_t activation_max,\n                             const uint16_t channels);\n\n\n/************************** Fully connected functions ***********************/\n\n/**\n * 
@brief       fully connected\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_fully_connected_s8_ansi(const int8_t *input_data,\n                                    const int32_t input_offset,\n                                    const uint16_t row_len,\n                                    const int8_t *filter_data,\n                                    const int32_t filter_offset,\n                                    const int32_t *bias,\n                                    int8_t *out_data,\n                                    const uint16_t out_channels,\n                                    const int32_t out_offset,\n                                    const int32_t out_shift,\n                                    const int32_t out_mult,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max);\n\n/**\n * @brief       fully connected with per-channel requantization\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *              out_mult, out_shift: int32_t* containing per-channel data\n */\nvoid esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data,\n                                    const int32_t input_offset,\n                                    const uint16_t row_len,\n                                    const int8_t *filter_data,\n                                    const int32_t filter_offset,\n                                    const int32_t *bias,\n                                    int8_t *out_data,\n                                    const uint16_t out_channels,\n                                    const int32_t out_offset,\n                                    const int32_t* out_shift,\n                                    const int32_t* out_mult,\n                                    
const int32_t activation_min,\n                                    const int32_t activation_max);\n\n/**\n * @brief   Get scratch buffer size needed by softmax function\n *\n * @param   width\n * @param   height\n * @return  size in bytes\n *\n * @note    buffer must be 4 byte aligned\n */\nint32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height);\n\n/* ANSI C function to be hooked up when optimised version needed */\nint32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height);\n\n/**\n * @brief   Set scratch buffer to be used by softmax function\n *\n * @param   buffer  this can be NULL if one needs to unset it\n *                  must be aligned to 4 bytes\n */\nvoid esp_nn_set_softmax_scratch_buf_ansi(void *buffer);\n\n/**\n * @brief       reference softmax function\n *\n * @note        inputs type: int8_t, output: int8_t\n */\nvoid esp_nn_softmax_s8_ansi(const int8_t *input_data,\n                            const int32_t height,\n                            const int32_t width,\n                            const int32_t mult,\n                            const int32_t shift,\n                            const int32_t diff_min,\n                            int8_t *output_data);\n\n\n//////////////////////////// Generic optimisations /////////////////////////////\n\n/************************** Convolution functions *****************************/\n\n/**\n * @brief       2d-convolution channelwise optimized version\n *\n * @note        operation: result += (input + offset) * filter\n *\n *              inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_conv_s8_opt(const data_dims_t *input_dims,\n                        const int8_t *input_data,\n                        const data_dims_t *filter_dims,\n                        const int8_t *filter_data,\n                        const int32_t *bias,\n                 
       const data_dims_t *output_dims,\n                        int8_t *out_data,\n                        const conv_params_t *conv_params,\n                        const quant_data_t *quant_data);\n\n/**\n * @brief       depthwise convolution per channel optimized version\n *\n * @note        inputs type: int8_t, output: int8_t\n *              Version used in tflite is per channel.\n *              This version follows the same footsprints.\n *              Meaning, it has per out_channel shift and multiplier for\n *              requantization\n *\n *              optimization notes: Though input_offset is int32 type,\n *              offset values are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,\n                                  const int8_t *input_data,\n                                  const data_dims_t *filter_dims,\n                                  const int8_t *filter_data,\n                                  const int32_t *bias,\n                                  const data_dims_t *output_dims,\n                                  int8_t *out_data,\n                                  const dw_conv_params_t *conv_params,\n                                  const quant_data_t *quant_data);\n\nint esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,\n                                     const data_dims_t *filter_dims,\n                                     const data_dims_t *output_dims,\n                                     const conv_params_t *conv_params);\nvoid esp_nn_set_conv_scratch_buf_opt(const void *buf);\n\nint esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims,\n                                               const data_dims_t *filter_dims,\n                                               const data_dims_t *output_dims,\n                                               const dw_conv_params_t *conv_params);\nvoid esp_nn_set_depthwise_conv_scratch_buf_opt(const void 
*buf);\n\n/* ANSI C function to be hooked up when optimised version needed */\nvoid esp_nn_set_softmax_scratch_buf_opt(void *buffer);\n\n/**\n * @brief       optimised version of softmax function\n *\n * @note        the function uses an extra buffer (4 * width bytes);\n *              hence, the scratch buffer must be set before calling this.\n */\nvoid esp_nn_softmax_s8_opt(const int8_t *input_data,\n                           const int32_t height,\n                           const int32_t width,\n                           const int32_t mult,\n                           const int32_t shift,\n                           const int32_t diff_min,\n                           int8_t *output_data);\n\n/**\n * @brief       Get scratch buffer size for int8 logistic (sigmoid).\n * @return      256 (size of LUT in bytes)\n */\nint32_t esp_nn_get_logistic_s8_scratch_size_ansi(void);\n\n/**\n * @brief       Prepare LUT for int8 logistic (sigmoid).\n *              Call once during model preparation after scratch is allocated.\n *\n * @param       scratch_buf         Scratch buffer (256 bytes, from get_scratch_size)\n * @param       input_zero_point    Input quantization zero point\n * @param       input_scale         Input quantization scale (float)\n *\n * @note        Output quantization is fixed: scale=1/256, zero_point=-128.\n */\nvoid esp_nn_logistic_s8_prepare_ansi(int8_t *scratch_buf,\n                                      int32_t input_zero_point,\n                                      float input_scale);\n\n/**\n * @brief       Apply int8 logistic (sigmoid) using precomputed LUT.\n *\n * @param       input       Input int8 data\n * @param       output      Output int8 data\n * @param       size        Number of elements\n * @param       scratch_buf 256-byte LUT from esp_nn_logistic_s8_prepare()\n */\nvoid esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output,\n                              int32_t size, const int8_t *scratch_buf);\n"
  },
  {
    "path": "include/esp_nn_defs.h",
    "content": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#pragma once\n\n#include <stdint.h>\n\n/**\n * @brief structure to club data dims\n * this structure can be used for input, output and filter\n */\ntypedef struct data_dims {\n    int32_t width;\n    int32_t height;\n    int32_t channels;\n\n    int32_t extra; // can be used as batch or any other param\n} data_dims_t;\n\n/**\n * @brief 2d data structure (width, height)\n *\n */\ntypedef struct data_2d {\n    int32_t width;\n    int32_t height;\n} data_2d_t;\n\n/**\n * @brief min/max activation\n */\ntypedef struct act_params {\n    int32_t min;\n    int32_t max;\n} act_params_t;\n\n/**\n * @brief per channel quant data\n *\n * @note number of shift and mult elements are equal to output channels\n */\ntypedef struct quant_data {\n    int32_t *shift;\n    int32_t *mult;\n} quant_data_t;\n\n/**\n * @brief params specific to convolution 2d\n *\n */\ntypedef struct conv_params {\n    int32_t in_offset;\n    int32_t out_offset;\n    data_2d_t stride;\n    data_2d_t padding;\n    data_2d_t dilation;\n    act_params_t activation;\n} conv_params_t;\n\n/**\n * @brief params specific to depthwise convolution 2d\n *\n */\ntypedef struct dw_conv_params {\n    int32_t in_offset;\n    int32_t out_offset;\n    int32_t ch_mult; // channel multiplier. 
(in_ch * ch_mult = out_ch)\n    data_2d_t stride;\n    data_2d_t padding;\n    data_2d_t dilation;\n    act_params_t activation;\n} dw_conv_params_t;\n"
  },
  {
    "path": "include/esp_nn_esp32p4.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/**\n * @file        Header definitions to include for esp_nn optimized functions for\n *              the ESP32-P4 platform\n */\n\n#pragma once\n\n#include \"esp_nn_defs.h\"\n#include \"esp_nn_ansi_headers.h\"\n\n/**\n * @brief       2d - convolution channelwise\n *\n * @note        operation: result += (input + offset) * filter\n *\n *              inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims,\n                            const int8_t *input_data,\n                            const data_dims_t *filter_dims,\n                            const int8_t *filter_data,\n                            const int32_t *bias,\n                            const data_dims_t *output_dims,\n                            int8_t *output_data,\n                            const conv_params_t *conv_params,\n                            const quant_data_t *quant_data);\n\nint esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims,\n                                         const data_dims_t *filter_dims,\n                                         const data_dims_t *output_dims,\n                                         const conv_params_t *conv_params);\nvoid esp_nn_set_conv_scratch_buf_esp32p4(const void *buf);\n\n/********************** function defines ***************************/\n\n\n\n#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi\n\nvoid esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,\n                                        const int8_t *input2_data,\n                                        const int32_t input1_offset,\n                                        const int32_t input2_offset,\n                                        const int32_t 
input1_mult,\n                                        const int32_t input2_mult,\n                                        const int32_t input1_shift,\n                                        const int32_t input2_shift,\n                                        const int32_t left_shift,\n                                        int8_t *output,\n                                        const int32_t out_offset,\n                                        const int32_t out_mult,\n                                        const int32_t out_shift,\n                                        const int32_t activation_min,\n                                        const int32_t activation_max,\n                                        const int32_t size);\n#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32p4\n\nvoid esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,\n                                        const int8_t *input2_data,\n                                        const int32_t input1_offset,\n                                        const int32_t input2_offset,\n                                        int8_t *output,\n                                        const int32_t out_offset,\n                                        const int32_t out_mult,\n                                        const int32_t out_shift,\n                                        const int32_t activation_min,\n                                        const int32_t activation_max,\n                                        const int32_t size);\n#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32p4\n\nvoid esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims,\n                                       const int8_t *input_data,\n                                       const data_dims_t *filter_dims,\n                                       const int8_t *filter_data,\n                                       const int32_t *bias,\n                                       
const data_dims_t *output_dims,\n                                       int8_t *out_data,\n                                       const dw_conv_params_t *conv_params,\n                                       const quant_data_t *quant_data);\nint esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims,\n                                                    const data_dims_t *filter_dims,\n                                                    const data_dims_t *output_dims,\n                                                    const dw_conv_params_t *conv_params);\nvoid esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf);\n#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32p4\n\n#define esp_nn_conv_s8 esp_nn_conv_s8_esp32p4\n\n#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32p4\n#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32p4\n\n#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32p4\n#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32p4\n\n/* Functions not yet optimized for P4 - use ANSI fallback */\nvoid esp_nn_hard_swish_s8_esp32p4(const int8_t *input, int8_t *output,\n                                   const int32_t size,\n                                   const int16_t input_zero_point,\n                                   const int16_t output_mult_fxp,\n                                   const int16_t reluish_mult_fxp,\n                                   const int32_t reluish_mult_exp,\n                                   const int32_t output_mult_exp,\n                                   const int16_t output_zero_point);\n#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32p4\n#define esp_nn_get_hard_swish_scratch_size() 0\n#define esp_nn_set_hard_swish_scratch_buf(buf)\n\nvoid esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input, int8_t *output,\n                                  const int32_t height, 
const int32_t width,\n                                  const int32_t channels,\n                                  const int32_t input_zero_point,\n                                  const int32_t output_zero_point,\n                                  const int32_t multiplier,\n                                  const int32_t shift);\n#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32p4\n\nvoid esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size);\n#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32p4\n\nvoid esp_nn_avg_pool_s8_esp32p4(const int8_t *input,\n                                 const uint16_t input_wd,\n                                 const uint16_t input_ht,\n                                 int8_t *output,\n                                 const uint16_t output_wd,\n                                 const uint16_t output_ht,\n                                 const uint16_t stride_wd,\n                                 const uint16_t stride_ht,\n                                 const uint16_t filter_wd,\n                                 const uint16_t filter_ht,\n                                 const uint16_t pad_wd,\n                                 const uint16_t pad_ht,\n                                 const int32_t activation_min,\n                                 const int32_t activation_max,\n                                 const uint16_t channels);\n#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32p4\nvoid esp_nn_max_pool_s8_esp32p4(const int8_t *input,\n                                 const uint16_t input_wd,\n                                 const uint16_t input_ht,\n                                 int8_t *output,\n                                 const uint16_t output_wd,\n                                 const uint16_t output_ht,\n                                 const uint16_t stride_wd,\n                                 const uint16_t stride_ht,\n                                 const uint16_t filter_wd,\n                                 
const uint16_t filter_ht,\n                                 const uint16_t pad_wd,\n                                 const uint16_t pad_ht,\n                                 const int32_t activation_min,\n                                 const int32_t activation_max,\n                                 const uint16_t channels);\n#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32p4\n\nvoid esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,\n                                        const int32_t input_offset,\n                                        const uint16_t row_len,\n                                        const int8_t *filter_data,\n                                        const int32_t filter_offset,\n                                        const int32_t *bias,\n                                        int8_t *out_data,\n                                        const uint16_t out_channels,\n                                        const int32_t out_offset,\n                                        const int32_t out_shift,\n                                        const int32_t out_mult,\n                                        const int32_t activation_min,\n                                        const int32_t activation_max);\nvoid esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,\n                                        const int32_t input_offset,\n                                        const uint16_t row_len,\n                                        const int8_t *filter_data,\n                                        const int32_t filter_offset,\n                                        const int32_t *bias,\n                                        int8_t *out_data,\n                                        const uint16_t out_channels,\n                                        const int32_t out_offset,\n                                        const int32_t *out_shift,\n                                        const int32_t *out_mult,\n        
                                const int32_t activation_min,\n                                        const int32_t activation_max);\n#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32p4\n#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32p4\n\nint32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height);\nvoid esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer);\nvoid esp_nn_softmax_s8_esp32p4(const int8_t *input_data,\n                                const int32_t height,\n                                const int32_t width,\n                                const int32_t mult,\n                                const int32_t shift,\n                                const int32_t diff_min,\n                                int8_t *output_data);\n#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32p4\n#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32p4\n#define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32p4\n\n#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi\n#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi\n#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi\n"
  },
  {
    "path": "include/esp_nn_esp32s3.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/**\n * @file        Header definitions to include for esp_nn optimized functions for\n *              the ESP32-S3 platform\n */\n\n#pragma once\n\n#include \"esp_nn_defs.h\"\n#include \"esp_nn_ansi_headers.h\"\n\n/************************** Basic math functions *****************************/\n\n\n/**\n * @brief       elementwise addition\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *\n *              shift values are expected to be <= 0\n */\nvoid esp_nn_add_elementwise_s8_esp32s3(const int8_t *input1_data,\n                                       const int8_t *input2_data,\n                                       const int32_t input1_offset,\n                                       const int32_t input2_offset,\n                                       const int32_t input1_mult,\n                                       const int32_t input2_mult,\n                                       const int32_t input1_shift,\n                                       const int32_t input2_shift,\n                                       const int32_t left_shift,\n                                       int8_t *output,\n                                       const int32_t out_offset,\n                                       const int32_t out_mult,\n                                       const int32_t out_shift,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max,\n                                       const int32_t size);\n\n/**\n * @brief       elementwise multiplication\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *\n *              output shift is 
expected to be <= 0\n */\nvoid esp_nn_mul_elementwise_s8_esp32s3(const int8_t *input1_data,\n                                       const int8_t *input2_data,\n                                       const int32_t input1_offset,\n                                       const int32_t input2_offset,\n                                       int8_t *output,\n                                       const int32_t out_offset,\n                                       const int32_t out_mult,\n                                       const int32_t out_shift,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max,\n                                       const int32_t size);\n\n\n/************************** Convolution functions *****************************/\n\n/**\n * @brief       depthwise convolution per channel\n *\n * @note        inputs type: int8_t, output: int8_t\n *              Version used in tflite is per channel.\n *              This version follows the same footsteps.\n *              Meaning, it has per out_channel shift and multiplier for\n *              requantization\n *\n *              optimization notes: Though input_offset is int32 type,\n *              offset values are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims,\n                                      const int8_t *input_data,\n                                      const data_dims_t *filter_dims,\n                                      const int8_t *filter_data,\n                                      const int32_t *bias,\n                                      const data_dims_t *output_dims,\n                                      int8_t *output_data,\n                                      const dw_conv_params_t *conv_params,\n                                      const quant_data_t *quant_data);\n\n/**\n * @brief       2d - convolution channelwise\n *\n * @note   
     operation: result += (input + offset) * filter\n *\n *              inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims,\n                            const int8_t *input_data,\n                            const data_dims_t *filter_dims,\n                            const int8_t *filter_data,\n                            const int32_t *bias,\n                            const data_dims_t *output_dims,\n                            int8_t *output_data,\n                            const conv_params_t *conv_params,\n                            const quant_data_t *quant_data);\n\nint esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims,\n                                         const data_dims_t *filter_dims,\n                                         const data_dims_t *output_dims,\n                                         const conv_params_t *conv_params);\nvoid esp_nn_set_conv_scratch_buf_esp32s3(const void *buf);\n\nint esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims,\n                                                   const data_dims_t *filter_dims,\n                                                   const data_dims_t *output_dims,\n                                                   const dw_conv_params_t *conv_params);\nvoid esp_nn_set_depthwise_conv_scratch_buf_esp32s3(const void *buf);\n\n/************************** Pooling functions *****************************/\n\n/**\n * @brief       max_pool\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_max_pool_s8_esp32s3(const int8_t *input,\n                                const uint16_t input_wd,\n                                const uint16_t input_ht,\n                                int8_t *output,\n                  
              const uint16_t output_wd,\n                                const uint16_t output_ht,\n                                const uint16_t stride_wd,\n                                const uint16_t stride_ht,\n                                const uint16_t filter_wd,\n                                const uint16_t filter_ht,\n                                const uint16_t pad_wd,\n                                const uint16_t pad_ht,\n                                const int32_t activation_min,\n                                const int32_t activation_max,\n                                const uint16_t channels);\n\n/**\n * @brief       avg_pool\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n */\nvoid esp_nn_avg_pool_s8_esp32s3(const int8_t *input,\n                                const uint16_t input_wd,\n                                const uint16_t input_ht,\n                                int8_t *output,\n                                const uint16_t output_wd,\n                                const uint16_t output_ht,\n                                const uint16_t stride_wd,\n                                const uint16_t stride_ht,\n                                const uint16_t filter_wd,\n                                const uint16_t filter_ht,\n                                const uint16_t pad_wd,\n                                const uint16_t pad_ht,\n                                const int32_t activation_min,\n                                const int32_t activation_max,\n                                const uint16_t channels);\n\n\n/************************** Fully connected functions *****************************/\n\n/**\n * @brief       fully connected\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *\n *              Current 
version works only on aligned input.\n *              row_len and channels should both be multiple of 8.\n */\nvoid esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,\n                                       const int32_t input_offset,\n                                       const uint16_t row_len,\n                                       const int8_t *filter_data,\n                                       const int32_t filter_offset,\n                                       const int32_t *bias,\n                                       int8_t *out_data,\n                                       const uint16_t out_channels,\n                                       const int32_t out_offset,\n                                       const int32_t out_shift,\n                                       const int32_t out_mult,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max);\n\n/**\n * @brief       fully connected - per channel\n *\n * @note        inputs type: int8_t, output: int8_t\n *              input offsets: although int32_t, they are contained in 8 bits [-128, 127]\n *              out_mult, out_shift: int32_t* containing per-channel data\n *\n *              Current version works only on aligned input.\n *              row_len and channels should both be multiple of 8.\n */\nvoid esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data,\n                                       const int32_t input_offset,\n                                       const uint16_t row_len,\n                                       const int8_t *filter_data,\n                                       const int32_t filter_offset,\n                                       const int32_t *bias,\n                                       int8_t *out_data,\n                                       const uint16_t out_channels,\n                                       const int32_t out_offset,\n                    
                   const int32_t* out_shift,\n                                       const int32_t* out_mult,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max);\n\n/**\n * @brief       relu6\n *\n * @note        inout: int8_t\n */\nvoid esp_nn_relu6_s8_esp32s3(int8_t *data, uint16_t size);\n\n/********************** function defines ***************************/\n\n#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_esp32s3\n#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_esp32s3\n\nvoid esp_nn_mul_broadcast_channel_s8_esp32s3(const int8_t *input1,\n                                              const int8_t *input2_per_ch,\n                                              const int32_t input1_offset,\n                                              const int32_t input2_offset,\n                                              int8_t *output,\n                                              const int32_t output_offset,\n                                              const int32_t output_mult,\n                                              const int32_t output_shift,\n                                              const int32_t activation_min,\n                                              const int32_t activation_max,\n                                              const int32_t total_spatial,\n                                              const int32_t channels);\n#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_esp32s3\n\n#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_esp32s3\n\n#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_esp32s3\n#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_esp32s3\n\n#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_esp32s3\n#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_esp32s3\n\n#define esp_nn_conv_s8 
esp_nn_conv_s8_esp32s3\n\n#define esp_nn_relu6_s8 esp_nn_relu6_s8_esp32s3\n\nint32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void);\nvoid esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf);\nvoid esp_nn_hard_swish_s8_esp32s3(const int8_t *input, int8_t *output,\n                                   const int32_t size,\n                                   const int16_t input_zero_point,\n                                   const int16_t output_mult_fxp,\n                                   const int16_t reluish_mult_fxp,\n                                   const int32_t reluish_mult_exp,\n                                   const int32_t output_mult_exp,\n                                   const int16_t output_zero_point);\n#define esp_nn_get_hard_swish_scratch_size esp_nn_get_hard_swish_scratch_size_esp32s3\n#define esp_nn_set_hard_swish_scratch_buf esp_nn_set_hard_swish_scratch_buf_esp32s3\n#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_esp32s3\n\nvoid esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input, int8_t *output,\n                                  const int32_t height, const int32_t width,\n                                  const int32_t channels,\n                                  const int32_t input_zero_point,\n                                  const int32_t output_zero_point,\n                                  const int32_t multiplier,\n                                  const int32_t shift);\n#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_esp32s3\n\n#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_esp32s3\n#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_esp32s3\n\n#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_esp32s3\n#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_esp32s3\n\nint32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height);\nvoid esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer);\nvoid esp_nn_softmax_s8_esp32s3(const int8_t *input_data, const int32_t height,\n                
                const int32_t width, const int32_t mult,\n                                const int32_t shift, const int32_t diff_min,\n                                int8_t *output_data);\n\n#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_esp32s3\n#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_esp32s3\n#define esp_nn_softmax_s8 esp_nn_softmax_s8_esp32s3\n\n/* Logistic (sigmoid) — LUT-based, same impl for all targets */\n#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi\n#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi\n#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi\n"
  },
  {
    "path": "include/esp_nn_generic_opt.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/**\n * @file        Header definitions to include for esp_nn generic optimisations\n *              For functions that do not have optimisations, the _ansi versions are picked.\n */\n\n#pragma once\n\n#include \"esp_nn_defs.h\"\n#include \"esp_nn_ansi_headers.h\"\n\n#define esp_nn_add_elementwise_s8 esp_nn_add_elementwise_s8_ansi\n#define esp_nn_mul_elementwise_s8 esp_nn_mul_elementwise_s8_ansi\n#define esp_nn_mul_broadcast_channel_s8 esp_nn_mul_broadcast_channel_s8_ansi\n\n#define esp_nn_depthwise_conv_s8 esp_nn_depthwise_conv_s8_opt\n\n#define esp_nn_conv_s8 esp_nn_conv_s8_opt\n\n#define esp_nn_get_conv_scratch_size esp_nn_get_conv_scratch_size_opt\n#define esp_nn_set_conv_scratch_buf esp_nn_set_conv_scratch_buf_opt\n\n#define esp_nn_get_depthwise_conv_scratch_size esp_nn_get_depthwise_conv_scratch_size_opt\n#define esp_nn_set_depthwise_conv_scratch_buf esp_nn_set_depthwise_conv_scratch_buf_opt\n\n#define esp_nn_relu6_s8 esp_nn_relu6_s8_ansi\n#define esp_nn_hard_swish_s8 esp_nn_hard_swish_s8_ansi\n#define esp_nn_get_hard_swish_scratch_size() 0\n#define esp_nn_set_hard_swish_scratch_buf(buf)\n#define esp_nn_mean_nhwc_s8 esp_nn_mean_nhwc_s8_ansi\n\n#define esp_nn_avg_pool_s8 esp_nn_avg_pool_s8_ansi\n#define esp_nn_max_pool_s8 esp_nn_max_pool_s8_ansi\n\n#define esp_nn_fully_connected_s8 esp_nn_fully_connected_s8_ansi\n#define esp_nn_fully_connected_per_ch_s8 esp_nn_fully_connected_per_ch_s8_ansi\n\n#define esp_nn_get_softmax_scratch_size esp_nn_get_softmax_scratch_size_opt\n#define esp_nn_set_softmax_scratch_buf esp_nn_set_softmax_scratch_buf_opt\n#define esp_nn_softmax_s8 esp_nn_softmax_s8_opt\n\n#define esp_nn_get_logistic_s8_scratch_size esp_nn_get_logistic_s8_scratch_size_ansi\n#define esp_nn_logistic_s8_prepare esp_nn_logistic_s8_prepare_ansi\n#define esp_nn_logistic_s8 esp_nn_logistic_s8_ansi\n"
  },
  {
    "path": "src/activation_functions/esp_nn_hard_swish_ansi.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * HardSwish activation function: y = x * relu6(x + 3) / 6\n * Quantized int8 implementation using fixed-point arithmetic.\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\n/*\n * Saturating left shift for int16\n */\nstatic inline int16_t sat_left_shift_s16(int16_t val, int shift)\n{\n    int32_t result = (int32_t)val << shift;\n    if (result > 32767) return 32767;\n    if (result < -32768) return -32768;\n    return (int16_t)result;\n}\n\n/*\n * SaturatingRoundingDoublingHighMul for int16: (a * b + (1<<14)) >> 15\n */\nstatic inline int16_t sat_round_dbl_high_mul_s16(int16_t a, int16_t b)\n{\n    if (a == b && a == -32768) return 32767;\n    int32_t ab = (int32_t)a * (int32_t)b;\n    return (int16_t)((ab + (1 << 14)) >> 15);\n}\n\n/*\n * SaturatingDoublingHighMul (NOT rounding): (a * b) >> 15\n */\nstatic inline int16_t sat_dbl_high_mul_s16(int16_t a, int16_t b)\n{\n    if (a == b && a == -32768) return 32767;\n    return (int16_t)(((int32_t)a * (int32_t)b) / (1 << 15));\n}\n\n/*\n * RoundingDivideByPOT for int16\n */\nstatic inline int16_t rounding_div_pot_s16(int16_t val, int exponent)\n{\n    int32_t mask = (1 << exponent) - 1;\n    int32_t remainder = val & mask;\n    int32_t threshold = (mask >> 1) + (val < 0 ? 1 : 0);\n    return (int16_t)((val >> exponent) + (remainder > threshold ? 
1 : 0));\n}\n\nvoid esp_nn_hard_swish_s8_ansi(const int8_t *input,\n                                int8_t *output,\n                                const int32_t size,\n                                const int16_t input_zero_point,\n                                const int16_t output_mult_fxp,\n                                const int16_t reluish_mult_fxp,\n                                const int32_t reluish_mult_exp,\n                                const int32_t output_mult_exp,\n                                const int16_t output_zero_point)\n{\n    for (int i = 0; i < size; i++) {\n        const int16_t in_val = input[i] - input_zero_point;\n        const int16_t in_hires = in_val * 128; /* << 7 */\n\n        /* Scale input to output scale */\n        const int16_t in_on_out_scale = sat_round_dbl_high_mul_s16(in_hires, output_mult_fxp);\n\n        /* Compute reluish value: maps input from [-3,3] to [-1,1] */\n        int16_t reluish = in_hires;\n        if (reluish_mult_exp > 0) {\n            reluish = sat_left_shift_s16(reluish, reluish_mult_exp - 1);\n        }\n        reluish = sat_round_dbl_high_mul_s16(reluish, reluish_mult_fxp);\n        if (reluish_mult_exp > 0) {\n            reluish = sat_left_shift_s16(reluish, 1);\n        }\n        if (reluish_mult_exp < 0) {\n            reluish = rounding_div_pot_s16(reluish, -reluish_mult_exp);\n        }\n\n        /* Convert from [-1,1] to [0,1] */\n        reluish = (reluish + (1 << 15)) >> 1;\n\n        /* Multiply: output = reluish * input_on_output_scale */\n        const int16_t pre_out = sat_dbl_high_mul_s16(reluish, in_on_out_scale);\n\n        /* Final shift and offset */\n        int16_t out_val = rounding_div_pot_s16(pre_out, -output_mult_exp);\n        out_val += output_zero_point;\n        if (out_val > 127) out_val = 127;\n        if (out_val < -128) out_val = -128;\n        output[i] = (int8_t)out_val;\n    }\n}\n"
  },
  {
    "path": "src/activation_functions/esp_nn_hard_swish_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * ESP32-P4 optimized HardSwish with:\n * 1. Branch hoisting (borrowed from S3): dispatch on reluish_mult_exp ONCE\n * 2. 2x loop unrolling for better ILP on RISC-V pipeline\n * 3. All int16 arithmetic - no 64-bit multiply bottleneck\n */\n\n#include <stdint.h>\n\nstatic inline __attribute__((always_inline))\nint16_t sat_rnd_dbl_hi_mul(int16_t a, int16_t b) {\n    if (__builtin_expect(a == b && a == -32768, 0)) return 32767;\n    return (int16_t)(((int32_t)a * (int32_t)b + (1 << 14)) >> 15);\n}\n\nstatic inline __attribute__((always_inline))\nint16_t sat_dbl_hi_mul(int16_t a, int16_t b) {\n    if (__builtin_expect(a == b && a == -32768, 0)) return 32767;\n    return (int16_t)(((int32_t)a * (int32_t)b) >> 15);\n}\n\nstatic inline __attribute__((always_inline))\nint16_t sat_left_shift_s16(int32_t val) {\n    if (val > 32767) return 32767;\n    if (val < -32768) return -32768;\n    return (int16_t)val;\n}\n\nstatic inline __attribute__((always_inline))\nint16_t rounding_div_pot_s16(int16_t val, int exp) {\n    int32_t mask = (1 << exp) - 1;\n    int32_t remainder = val & mask;\n    int32_t threshold = (mask >> 1) + (val < 0 ? 1 : 0);\n    return (int16_t)((val >> exp) + (remainder > threshold ? 
1 : 0));\n}\n\n/* Core output computation shared by all paths */\nstatic inline __attribute__((always_inline))\nint8_t hard_swish_output(int16_t reluish, int16_t in_on_out_scale,\n                          int neg_out_exp, int16_t output_zero_point) {\n    int16_t pre = sat_dbl_hi_mul(reluish, in_on_out_scale);\n    int16_t ov = rounding_div_pot_s16(pre, neg_out_exp);\n    int32_t result = ov + output_zero_point;\n    if (result > 127) result = 127;\n    if (result < -128) result = -128;\n    return (int8_t)result;\n}\n\nvoid esp_nn_hard_swish_s8_esp32p4(const int8_t *input,\n                                   int8_t *output,\n                                   const int32_t size,\n                                   const int16_t input_zero_point,\n                                   const int16_t output_mult_fxp,\n                                   const int16_t reluish_mult_fxp,\n                                   const int32_t reluish_mult_exp,\n                                   const int32_t output_mult_exp,\n                                   const int16_t output_zero_point)\n{\n    const int neg_out_exp = -output_mult_exp;\n    int i = 0;\n\n    /* Branch on reluish_mult_exp ONCE - 3 specialized loops */\n    if (reluish_mult_exp > 0) {\n        const int ls1 = reluish_mult_exp - 1;\n\n        for (; i <= size - 2; i += 2) {\n            int16_t iv0 = input[i] - input_zero_point;\n            int16_t iv1 = input[i+1] - input_zero_point;\n            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;\n\n            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);\n            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);\n\n            int16_t rv0 = sat_left_shift_s16((int32_t)hi0 << ls1);\n            int16_t rv1 = sat_left_shift_s16((int32_t)hi1 << ls1);\n            rv0 = sat_rnd_dbl_hi_mul(rv0, reluish_mult_fxp);\n            rv1 = sat_rnd_dbl_hi_mul(rv1, reluish_mult_fxp);\n            rv0 = sat_left_shift_s16((int32_t)rv0 * 2);\n            rv1 = 
sat_left_shift_s16((int32_t)rv1 * 2);\n\n            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);\n            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);\n\n            output[i]   = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);\n            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);\n        }\n    } else if (reluish_mult_exp < 0) {\n        const int neg_relu_exp = -reluish_mult_exp;\n\n        for (; i <= size - 2; i += 2) {\n            int16_t iv0 = input[i] - input_zero_point;\n            int16_t iv1 = input[i+1] - input_zero_point;\n            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;\n\n            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);\n            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);\n\n            int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp);\n            int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp);\n            rv0 = rounding_div_pot_s16(rv0, neg_relu_exp);\n            rv1 = rounding_div_pot_s16(rv1, neg_relu_exp);\n\n            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);\n            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);\n\n            output[i]   = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);\n            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);\n        }\n    } else {\n        for (; i <= size - 2; i += 2) {\n            int16_t iv0 = input[i] - input_zero_point;\n            int16_t iv1 = input[i+1] - input_zero_point;\n            int16_t hi0 = iv0 * 128, hi1 = iv1 * 128;\n\n            int16_t on0 = sat_rnd_dbl_hi_mul(hi0, output_mult_fxp);\n            int16_t on1 = sat_rnd_dbl_hi_mul(hi1, output_mult_fxp);\n            int16_t rv0 = sat_rnd_dbl_hi_mul(hi0, reluish_mult_fxp);\n            int16_t rv1 = sat_rnd_dbl_hi_mul(hi1, reluish_mult_fxp);\n\n            rv0 = (int16_t)(((int32_t)rv0 + 32768) >> 1);\n            rv1 = (int16_t)(((int32_t)rv1 + 32768) >> 1);\n\n          
  output[i]   = hard_swish_output(rv0, on0, neg_out_exp, output_zero_point);\n            output[i+1] = hard_swish_output(rv1, on1, neg_out_exp, output_zero_point);\n        }\n    }\n\n    /* Scalar remainder */\n    for (; i < size; i++) {\n        int16_t iv = input[i] - input_zero_point;\n        int16_t hi = iv * 128;\n        int16_t on_out = sat_rnd_dbl_hi_mul(hi, output_mult_fxp);\n\n        int16_t rv = hi;\n        if (reluish_mult_exp > 0)\n            rv = sat_left_shift_s16((int32_t)rv << (reluish_mult_exp - 1));\n        rv = sat_rnd_dbl_hi_mul(rv, reluish_mult_fxp);\n        if (reluish_mult_exp > 0)\n            rv = sat_left_shift_s16((int32_t)rv * 2);\n        if (reluish_mult_exp < 0)\n            rv = rounding_div_pot_s16(rv, -reluish_mult_exp);\n\n        rv = (int16_t)(((int32_t)rv + 32768) >> 1);\n        output[i] = hard_swish_output(rv, on_out, neg_out_exp, output_zero_point);\n    }\n}\n"
  },
  {
    "path": "src/activation_functions/esp_nn_hard_swish_s8_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * ESP32-S3 optimized HardSwish using 256-byte lookup table.\n *\n * Key insight: HardSwish maps int8 -> int8 with fixed quantization parameters\n * per layer. Only 256 possible input values exist. We precompute the full\n * mapping on each call using the ANSI reference (bit-exact), then the inner\n * loop is a single byte load per element.\n *\n * Scratch buffer: 512 bytes (256 for the LUT input values + 256 for the LUT\n * itself), set via esp_nn_set_hard_swish_scratch_buf.\n */\n\n#include <stdint.h>\n#include <stddef.h>\n\n/* Use ANSI C reference to build LUT — guarantees bit-exact match */\nextern void esp_nn_hard_swish_s8_ansi(const int8_t *input,\n                                       int8_t *output,\n                                       const int32_t size,\n                                       const int16_t input_zero_point,\n                                       const int16_t output_mult_fxp,\n                                       const int16_t reluish_mult_fxp,\n                                       const int32_t reluish_mult_exp,\n                                       const int32_t output_mult_exp,\n                                       const int16_t output_zero_point);\n\nstatic int8_t *hard_swish_scratch = NULL;\n\nint32_t esp_nn_get_hard_swish_scratch_size_esp32s3(void)\n{\n    return 512; /* 256 for lut_input + 256 for lut output */\n}\n\nvoid esp_nn_set_hard_swish_scratch_buf_esp32s3(void *buf)\n{\n    hard_swish_scratch = (int8_t *)buf;\n}\n\nvoid esp_nn_hard_swish_s8_esp32s3(const int8_t *input,\n                                   int8_t *output,\n                                   const int32_t size,\n                                   const int16_t input_zero_point,\n                                   const int16_t output_mult_fxp,\n                                   const int16_t reluish_mult_fxp,\n                                   const int32_t 
reluish_mult_exp,\n                                   const int32_t output_mult_exp,\n                                   const int16_t output_zero_point)\n{\n    if (!hard_swish_scratch) {\n        /* No scratch — fall back to ANSI */\n        esp_nn_hard_swish_s8_ansi(input, output, size,\n                                   input_zero_point, output_mult_fxp,\n                                   reluish_mult_fxp, reluish_mult_exp,\n                                   output_mult_exp, output_zero_point);\n        return;\n    }\n\n    /* Build 256-byte LUT using ANSI reference (bit-exact).\n     * lut[i] = hardswish((int8_t)i) for the given quant params.\n     * Indexed by (uint8_t)input_val for direct lookup. */\n    int8_t *lut_input = hard_swish_scratch;\n    int8_t *lut = hard_swish_scratch + 256;\n\n    for (int i = 0; i < 256; i++) {\n        lut_input[i] = (int8_t)i;\n    }\n    esp_nn_hard_swish_s8_ansi(lut_input, lut, 256,\n                               input_zero_point, output_mult_fxp,\n                               reluish_mult_fxp, reluish_mult_exp,\n                               output_mult_exp, output_zero_point);\n\n    /* Apply LUT — one byte load per element */\n    for (int i = 0; i < size; i++) {\n        output[i] = lut[(uint8_t)input[i]];\n    }\n}\n"
  },
  {
    "path": "src/activation_functions/esp_nn_relu_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n#include <stdlib.h>\n\n#include <common_functions.h>\n\nvoid esp_nn_relu6_s8_ansi(int8_t *data, uint16_t size)\n{\n    int32_t i;\n\n    for (i = 0; i < size; i++) {\n        int32_t ip = data[i];\n\n        ip = max(ip, 0);\n        data[i] = min(ip, 6);\n    }\n}\n"
  },
  {
    "path": "src/activation_functions/esp_nn_relu_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n\n/**\n * In-place ReLU6 for s8 data using ESP32-P4 PIE SIMD.\n * Clamps each element to [0, 6].\n * Processes 16 elements per iteration via 128-bit vector ops.\n */\nvoid esp_nn_relu6_s8_esp32p4(int8_t *data, uint16_t size)\n{\n    /* Enable PIE */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    int i = 0;\n\n    if (size >= 16) {\n        /* Broadcast 0 into q2 and 6 into q3 */\n        const int8_t zero_val = 0;\n        const int8_t six_val = 6;\n\n        asm volatile (\n            \"esp.vldbc.8.ip  q2, %0, 0   \\n\\t\"\n            \"esp.vldbc.8.ip  q3, %1, 0   \\n\\t\"\n            :: \"r\"(&zero_val), \"r\"(&six_val)\n        );\n\n        int count = size >> 4;\n        int stride = 16;\n\n        asm volatile (\n            \"mv     x30, %[ptr]             \\n\\t\"\n            \"mv     x31, %[cnt]             \\n\\t\"\n\n            \"1:                             \\n\\t\"\n            \"esp.vld.128.ip   q0, x30, 0    \\n\\t\"  /* load 16 bytes, no auto-increment */\n            \"esp.vmax.s8      q0, q0, q2    \\n\\t\"  /* max(val, 0) */\n            \"esp.vmin.s8      q0, q0, q3    \\n\\t\"  /* min(val, 6) */\n            \"esp.vst.128.xp   q0, x30, %[stride] \\n\\t\"  /* store and advance ptr by 16 */\n            \"addi   x31, x31, -1            \\n\\t\"\n            \"bnez   x31, 1b                 \\n\\t\"\n\n            :\n            : [ptr] \"r\"(data), [cnt] \"r\"(count), [stride] \"r\"(stride)\n            : \"x30\", \"x31\", \"memory\"\n        );\n\n        i = count << 4;\n    }\n\n    /* Handle remaining elements scalar */\n    for (; i < size; i++) {\n        int32_t val = data[i];\n        if (val < 0) val = 0;\n        if (val > 6) 
val = 6;\n        data[i] = (int8_t) val;\n    }\n}\n"
  },
  {
    "path": "src/activation_functions/esp_nn_relu_s8_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n\n    .text\n    .align  4\n    .literal_position\n\n# in place relu6 function. a2: data, a3: size\n    # Program Unit: esp_nn_relu6_s8_esp32s3\n    .type   esp_nn_relu6_s8_esp32s3, @function\n    .align   4\n    .global esp_nn_relu6_s8_esp32s3\n\nesp_nn_relu6_s8_esp32s3:\n    entry   a1,48                       #\n    mov.n   a9,a2                       # [0], data\n    mov.n   a7,a3                       # [1], size\n\n // process multiple of 16\n    movi.n      a4,6                    # [4]\n    s8i         a4,a1,0                     # [5]  six\n    addi        a10,a3,-7                   # [2]\n    ee.vldbc.8  q1,a1               # [6]  id:72 six+0x0\n    blti        a3,16,.Lt_0_5634            # [7]\n\n    srai        a8,a3,4                     # [0]\n    ee.zero.q   q2                      # [1]\n    loopgtz     a8,.LBB37_esp_nn_relu6_s8_esp32s3   # [3]\n\n    ee.vld.128.ip   q0,a2,0             # [0*II+0]  id:73\n    ee.vmax.s8      q0,q0,q2            # [0*II+2]\n    ee.vmin.s8      q0,q0,q1            # [0*II+3]\n    ee.vst.128.ip   q0,a2,16            # [0*II+4]  id:74\n.LBB37_esp_nn_relu6_s8_esp32s3: # 0x34\n\n    slli    a8,a8,4                     # [0]\n\n // remaining multiple of 8 data\n    bge     a8,a10,.Lt_0_3586           # [1]\n\n.Lt_0_3842: # 0x3a\n    sub     a6,a7,a8                    # [0]\n    
srai    a6,a6,3                     # [1]\n    loopgtz a6,.LBB52_esp_nn_relu6_s8_esp32s3   # [2]\n\n    ee.vld.l.64.ip  q0,a2,0         # [0*II+0]  id:75\n    ee.vmax.s8      q0,q0,q2            # [0*II+2]\n    ee.vmin.s8      q0,q0,q1            # [0*II+3]\n    ee.vst.l.64.ip  q0,a2,8         # [0*II+4]  id:76\n\n.LBB52_esp_nn_relu6_s8_esp32s3: # 0x4f\n    addx8   a8,a6,a8                    # [0]\n\n.Lt_0_3586: # 0x52\n // process leftover\n    bge     a8,a7,.Lt_0_6402            # [0]\n\n.Lt_0_4866: # 0x55\n    movi.n  a5,0                    # [0]\n    sub     a3,a7,a8                    # [1]\n    add.n   a2,a8,a9                    # [2]\n    l8ui    a6,a2,0                     # [3]  id:78\n    addi.n  a3,a3,-1                # [4]\n    sext    a6,a6,7\n    max     a6,a5,a6                    # [6]\n    min     a6,a4,a6                    # [7]\n    s8i     a6,a2,0                     # [8]  id:79\n\n    loopgtz a3,.LBB67_esp_nn_relu6_s8_esp32s3   # [9]\n\n    l8ui    a3,a2,1                     # [0*II+0]  id:78\n    addi.n  a2,a2,1                 # [1*II+1]\n    sext    a3,a3,7\n    max     a3,a5,a3                    # [0*II+3]\n    min     a3,a4,a3                    # [0*II+4]\n    s8i     a3,a2,0                     # [0*II+5]  id:79\n.LBB67_esp_nn_relu6_s8_esp32s3: # 0x81\n\n.Lt_0_6402: # 0x83\n    retw.n                          # [0]\n\n.Lt_0_5634: # 0x85\n    blti    a10,1,.Lt_0_5890            # [0]\n\n    movi.n  a8,0                    # [0]\n    ee.zero.q   q2                      # [1]\n    j   .Lt_0_3842                      # [2]\n\n.Lt_0_5890: # 0x90\n    beqz.n  a3,.Lt_0_6402           # [0]\n\n    movi.n  a8,0                    # [0]\n    j   .Lt_0_4866                      # [1]\n\n    .size   esp_nn_relu6_s8_esp32s3, . - esp_nn_relu6_s8_esp32s3\n"
  },
  {
    "path": "src/basic_math/esp_nn_add_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n\n#include <common_functions.h>\n\nvoid esp_nn_add_elementwise_u8_ansi(const uint8_t *input1_data,\n                                    const uint8_t *input2_data,\n                                    const int32_t input1_offset,\n                                    const int32_t input2_offset,\n                                    const int32_t input1_mult,\n                                    const int32_t input2_mult,\n                                    const int32_t input1_shift,\n                                    const int32_t input2_shift,\n                                    const int32_t left_shift,\n                                    uint8_t *output,\n                                    const int32_t out_offset,\n                                    const int32_t out_mult,\n                                    const int32_t out_shift,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max,\n                                    const int32_t size)\n{\n    for (int i = 0; i < size; i++) {\n        int32_t tmp1 = input1_data[i] + input1_offset;\n        int32_t tmp2 = input2_data[i] + input2_offset;\n\n        tmp1 <<= left_shift;\n        tmp2 <<= left_shift;\n\n        tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, 
input1_mult);\n        tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult);\n\n        tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift);\n        tmp2 = esp_nn_div_by_power_of_two(tmp2, -input2_shift);\n\n        int32_t out = tmp1 + tmp2;\n        out = esp_nn_sat_round_doubling_high_mul(out, out_mult);\n        out = esp_nn_div_by_power_of_two(out, -out_shift);\n        out = out + out_offset;\n\n        out = max(activation_min, min(out, activation_max));\n        output[i] = (uint8_t) out;\n    }\n}\n\nvoid esp_nn_add_elementwise_s8_ansi(const int8_t *input1_data,\n                                    const int8_t *input2_data,\n                                    const int32_t input1_offset,\n                                    const int32_t input2_offset,\n                                    const int32_t input1_mult,\n                                    const int32_t input2_mult,\n                                    const int32_t input1_shift,\n                                    const int32_t input2_shift,\n                                    const int32_t left_shift,\n                                    int8_t *output,\n                                    const int32_t out_offset,\n                                    const int32_t out_mult,\n                                    const int32_t out_shift,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max,\n                                    const int32_t size)\n{\n    for (int i = 0; i < size; i++) {\n        int32_t tmp1 = input1_data[i] + input1_offset;\n        int32_t tmp2 = input2_data[i] + input2_offset;\n\n        tmp1 <<= left_shift;\n        tmp2 <<= left_shift;\n\n        tmp1 = esp_nn_sat_round_doubling_high_mul(tmp1, input1_mult);\n        tmp2 = esp_nn_sat_round_doubling_high_mul(tmp2, input2_mult);\n\n        tmp1 = esp_nn_div_by_power_of_two(tmp1, -input1_shift);\n        tmp2 = 
esp_nn_div_by_power_of_two(tmp2, -input2_shift);\n\n        int32_t out = tmp1 + tmp2;\n        out = esp_nn_sat_round_doubling_high_mul(out, out_mult);\n        out = esp_nn_div_by_power_of_two(out, -out_shift);\n        out = out + out_offset;\n\n        out = max(activation_min, min(out, activation_max));\n        output[i] = (int8_t) out;\n    }\n}\n"
  },
  {
    "path": "src/basic_math/esp_nn_add_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\n/**\n * Optimized elementwise add for s8 on ESP32-P4.\n * Uses fast multiply-by-quantized-mult and 2x unrolling.\n */\n\n/* Inline the core requantization to avoid function call overhead */\n/* Inlined fast requant using explicit RISC-V mul/mulh to avoid\n * compiler generating 64-bit multiply helper calls */\nstatic inline __attribute__((always_inline))\nint32_t add_requant(int32_t val, int32_t mult, int32_t neg_shift)\n{\n    /* Use C 64-bit multiply - compiler already generates mul+mulh pair at -O2 */\n    int64_t prod64 = (int64_t)val * mult + ((int64_t)1 << 30);\n    int32_t result = (int32_t)(prod64 >> 31);\n\n    if (neg_shift > 0) {\n        int32_t rnd = (1 << (neg_shift - 1)) - (result < 0);\n        result = (result + rnd) >> neg_shift;\n    }\n    return result;\n}\n\nvoid esp_nn_add_elementwise_s8_esp32p4(const int8_t *input1_data,\n                                        const int8_t *input2_data,\n                                        const int32_t input1_offset,\n                                        const int32_t input2_offset,\n                                        const int32_t input1_mult,\n                                        const int32_t input2_mult,\n                                        const int32_t input1_shift,\n                                        const int32_t input2_shift,\n                                        const int32_t left_shift,\n                                        int8_t *output,\n                                        const int32_t out_offset,\n                                        const int32_t out_mult,\n                                        const int32_t out_shift,\n                                        const int32_t activation_min,\n                                        const int32_t activation_max,\n  
                                      const int32_t size)\n{\n    const int32_t neg_in1_shift = -input1_shift;\n    const int32_t neg_in2_shift = -input2_shift;\n    const int32_t neg_out_shift = -out_shift;\n\n    int i = 0;\n    /* Process 2 at a time - C inline requant lets compiler optimize across calls */\n    for (; i <= size - 2; i += 2) {\n        int32_t a0 = (input1_data[i + 0] + input1_offset) << left_shift;\n        int32_t b0 = (input2_data[i + 0] + input2_offset) << left_shift;\n\n        a0 = add_requant(a0, input1_mult, neg_in1_shift);\n        b0 = add_requant(b0, input2_mult, neg_in2_shift);\n        int32_t out0 = add_requant(a0 + b0, out_mult, neg_out_shift) + out_offset;\n        out0 = max(activation_min, min(out0, activation_max));\n\n        int32_t a1 = (input1_data[i + 1] + input1_offset) << left_shift;\n        int32_t b1 = (input2_data[i + 1] + input2_offset) << left_shift;\n\n        a1 = add_requant(a1, input1_mult, neg_in1_shift);\n        b1 = add_requant(b1, input2_mult, neg_in2_shift);\n        int32_t out1 = add_requant(a1 + b1, out_mult, neg_out_shift) + out_offset;\n        out1 = max(activation_min, min(out1, activation_max));\n\n        output[i + 0] = (int8_t) out0;\n        output[i + 1] = (int8_t) out1;\n    }\n\n    for (; i < size; i++) {\n        int32_t tmp1 = (input1_data[i] + input1_offset) << left_shift;\n        int32_t tmp2 = (input2_data[i] + input2_offset) << left_shift;\n\n        tmp1 = add_requant(tmp1, input1_mult, neg_in1_shift);\n        tmp2 = add_requant(tmp2, input2_mult, neg_in2_shift);\n\n        int32_t out = add_requant(tmp1 + tmp2, out_mult, neg_out_shift) + out_offset;\n        out = max(activation_min, min(out, activation_max));\n        output[i] = (int8_t) out;\n    }\n}\n"
  },
  {
    "path": "src/basic_math/esp_nn_add_s8_esp32s3.S",
    "content": "// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .align  4\n    .literal_position\n    .literal    .nudge_val, 1073741824\n\n    # Program Unit: esp_nn_add_elementwise_s8_esp32s3\n    .type   esp_nn_add_elementwise_s8_esp32s3, @function\n    .align   4\n    .global esp_nn_add_elementwise_s8_esp32s3\n\nesp_nn_add_elementwise_s8_esp32s3:  # 0x4\n    # temp_neg_out_shift = 0\n    # temp_neg_input2_shift = 4\n    # temp_neg_input1_shift = 8\n    # gra_spill_temp_2 = 12\n    # gra_spill_temp_3 = 16\n    # gra_spill_temp_4 = 20\n    # gra_spill_temp_5 = 24\n    # gra_spill_temp_6 = 28\n    # gra_spill_temp_7 = 32\n    # gra_spill_temp_8 = 36\n    # gra_spill_temp_9 = 40\n    # gra_spill_temp_10 = 44\n    # gra_spill_temp_11 = 48\n    # gra_spill_temp_12 = 52\n    # gra_spill_temp_13 = 56\n\n // a2 : *input1_data\n // a3 : *input2_data\n // a4 : input1_offset\n // a5 : input2_offset\n // a6 : input1_mult\n // a7 : input2_mult\n // On stack:\n // 80: input1_shift\n // 84: input2_shift\n // 88: left_shift\n // 92: *output\n // 96: out_offset\n // 100: out_mult, loaded in `a8`\n // 104: out_shift\n // 108: activation_min\n // 112: activation_max\n // 116: size\n\n    entry       a1,80                      #\n    s32i.n      a4,a1,48                    # [10]  gra_spill_temp_11, input1_offset\n    s32i.n      a5,a1,52                    # [0]  gra_spill_temp_12, 
input2_offset\n    s32i.n      a2,a1,32                 # [5]  gra_spill_temp_7, input1_data\n    s32i.n      a3,a1,12                    # [3]  gra_spill_temp_2, input2_data\n\n    l32i        a12,a1,116                  # [11]  id:720 size+0x0\n    mov.n       a14,a2                      # [6]\n    mov.n       a10,a3                      # [8]\n    blti        a12,1,.exit           # [1] // exit\n\n    l32i        a3,a1,80                   # [0]  id:721 input1_shift+0x0\n    l32i        a13,a1,84                  # [1]  id:722 input2_shift+0x0\n    l32i        a2,a1,104                   # [8]  id:723 out_shift+0x0\n    l32i        a8,a1,100                   # [1]  out_mult\n\n    neg         a3,a3                       # [12]\n    neg         a13,a13                     # [7]\n    neg         a2,a2                       # [11]\n\n    s32i.n      a3,a1,8                    # [12]  temp_neg_input1_shift, -input1_shift\n    s32i.n      a13,a1,4                   # [7]  temp_neg_input2_shift, -input2_shift\n    s32i.n      a2,a1,0                    # [16]  temp_neg_out_shift, -out_shift\n\n    movi.n      a5,1\n    addi        a9,a3,-1\n    ssl         a9\n    sll         a15,a5\n    s32i.n      a15,a1,16               # gra_spill_temp_3, 1 << (exponent - 1) for input1\n\n    addi        a9,a13,-1\n    ssl         a9\n    sll         a15,a5\n    s32i.n      a15,a1,20               # gra_spill_temp_4, 1 << (exponent - 1) for input2\n\n    addi        a9,a2,-1\n    ssl         a9\n    sll         a15,a5\n    s32i.n      a15,a1,24               # gra_spill_temp_5, 1 << (exponent - 1) for out\n\n    movi.n      a2,0\n    blti        a12,12,.process_leftover          # [23]\n\n    // skip to leftover routine if inputs are unaligned\n    or          a9,a14,a10\n    extui       a9,a9,0,4\n    bnez        a9,.process_leftover\n\n    l32i        a9,a1,92                   # [17]  id:1279 output+0x0\n\n    l32i        a13,a1,116                  # [20]\n    srai        
a13,a13,3                   # [21]\n    s32i.n      a13,a1,56                   # [22]  gra_spill_temp_13\n\n    movi.n      a13,8\n    s32i.n      a13,a1,28               # gra_spill_temp_6, mult_of8 counter\n\n    ee.zero.q       q6                      # [8]\n\n.vector_loop: // process 8 values in one go\n    l32i            a15,a1,88                  # [6]  left_shift\n    ee.vld.l.64.ip  q0,a14,8        # [9]  id:729\n    s32i.n          a9,a1,44                    # [10]  gra_spill_temp_10, out_ptr\n    s32i.n          a14,a1,40                   # [20]  gra_spill_temp_9\n    wsr.sar         a15                     # [21] load left shift\n\n    addi.n          a15,a1,48                   # [14]\n    ee.vldbc.16     q7,a15              # [21]  id:1277 input1_offset\n    ee.vcmp.lt.s8   q5,q0,q6            # [29]\n    ee.vzip.8       q0,q5                   # [31], 20 bits\n    ee.vadds.s16    q0,q0,q7            # [34], add offset\n    ee.vcmp.lt.s16  q2,q0,q6        # [36]\n    ee.vzip.16      q0,q2               # [39], 32 bits\n    ee.vsl.32       q0,q0                   # [41] left_shift\n    ee.vsl.32       q2,q2                   # [42] left_shift\n\n    l32r            a9,.nudge_val              # [15], nudge\n\n// mulhi32 for q0\n    ee.movi.32.a    q0,a3,2             # [44]\n    ee.movi.32.a    q0,a4,3             # [45]\n    ee.movi.32.a    q0,a14,1            # [46]\n    ee.movi.32.a    q0,a5,0             # [62]\n\n    mulsh           a13,a6,a3                   # [51]\n    mull            a3,a6,a3                    # [53]\n\n    mulsh           a12,a6,a4                   # [50]\n    mull            a4,a6,a4                    # [55]\n\n    mulsh           a15,a6,a14                  # [48]\n    mull            a14,a6,a14                  # [49]\n\n    ssai            31                          # [47]\n\n    add             a3,a3,a9\n    saltu           a2,a3,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a3\n\n    add          
   a4,a4,a9\n    saltu           a2,a4,a9\n    add.n           a12,a12,a2\n    src             a12,a12,a4\n    ee.movi.32.q    q0,a13,2            # [62]\n\n    add             a14,a14,a9\n    saltu           a2,a14,a9\n    add.n           a15,a15,a2\n    src             a15,a15,a14\n    ee.movi.32.q    q0,a12,3            # [62]\n\n    mulsh           a13,a6,a5                   # [51]\n    mull            a5,a6,a5                    # [53]\n    ee.movi.32.q    q0,a15,1            # [62]\n\n    add             a5,a5,a9\n    saltu           a2,a5,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a5\n    ee.movi.32.q    q0,a13,0            # [62]\n\n\n// mulhi32 for q2\n    ee.movi.32.a    q2,a3,2             # [44]\n    ee.movi.32.a    q2,a4,3             # [45]\n    ee.movi.32.a    q2,a14,1            # [46]\n    ee.movi.32.a    q2,a5,0             # [62]\n\n    mulsh           a13,a6,a3                   # [51]\n    mull            a3,a6,a3                    # [53]\n\n    mulsh           a12,a6,a4                   # [50]\n    mull            a4,a6,a4                    # [55]\n\n    mulsh           a15,a6,a14                  # [48]\n    mull            a14,a6,a14                  # [49]\n\n    ssai            31                          # [47]\n\n    add             a3,a3,a9\n    saltu           a2,a3,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a3\n\n    add             a4,a4,a9\n    saltu           a2,a4,a9\n    add.n           a12,a12,a2\n    src             a12,a12,a4\n    ee.movi.32.q    q2,a13,2            # [62]\n\n    add             a14,a14,a9\n    saltu           a2,a14,a9\n    add.n           a15,a15,a2\n    src             a15,a15,a14\n    ee.movi.32.q    q2,a12,3            # [62]\n\n    mulsh           a13,a6,a5                   # [51]\n    mull            a5,a6,a5                    # [53]\n    ee.movi.32.q    q2,a15,1            # [62]\n\n    l32i            a3,a1,8                    # [12]  
temp_neg_input1_shift, -input1_shift\n    add             a5,a5,a9\n    saltu           a2,a5,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a5\n    ee.movi.32.q    q2,a13,0            # [62]\n\n\n    blti            a3,1, .skip_div_by2_in0\n\n    addi.n          a13,a1,16\n    ee.vcmp.lt.s32  q1,q0,q6\n    ee.vcmp.lt.s32  q3,q2,q6\n    ee.vldbc.32     q5,a13      // 1 << (exponent - 1)\n    wsr.sar         a3          // load right_shift\n    ee.vadds.s32    q0,q0,q1    // subtract 1 `if (val < 0)`\n    ee.vadds.s32    q2,q2,q3    // subtract 1 `if (val < 0)`\n    ee.vadds.s32    q0,q0,q5\n    ee.vadds.s32    q2,q2,q5\n    ee.vsr.32       q0,q0\n    ee.vsr.32       q2,q2\n\n.skip_div_by2_in0:\n\n\n    ee.vld.l.64.ip  q1,a10,8        # [11]  id:1290\n    addi.n          a15,a1,52                   # [12]\n    ee.vldbc.16     q7,a15              # [19]  id:1278 input2_offset\n    l32i            a15,a1,88                  # [6]  left_shift\n    s32i            a10,a1,36                   # [14]  gra_spill_temp_8\n    ee.vcmp.lt.s8   q3,q1,q6            # [271]\n    wsr.sar         a15                     # [21], load shift for left shift\n    ee.vzip.8       q1,q3                   # [274], 20 bits\n    ee.vadds.s16    q1,q1,q7            # [281]\n    ee.vcmp.lt.s16  q3,q1,q6        # [282]\n    ee.vzip.16      q1,q3               # [283], 32 bits\n    ee.vsl.32       q1,q1                   # [284]\n    ee.vsl.32       q3,q3                   # [285]\n\n\n// mulhi32 for q1\n    ee.movi.32.a    q1,a3,2             # [44]\n    ee.movi.32.a    q1,a4,3             # [45]\n    ee.movi.32.a    q1,a14,1            # [46]\n    ee.movi.32.a    q1,a5,0             # [62]\n\n    mulsh           a13,a7,a3                   # [51]\n    mull            a3,a7,a3                    # [53]\n\n    mulsh           a12,a7,a4                   # [50]\n    mull            a4,a7,a4                    # [55]\n\n    mulsh           a15,a7,a14                  # [48]\n    
mull            a14,a7,a14                  # [49]\n\n    ssai            31                          # [47]\n\n    add             a3,a3,a9\n    saltu           a2,a3,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a3\n\n    add             a4,a4,a9\n    saltu           a2,a4,a9\n    add.n           a12,a12,a2\n    src             a12,a12,a4\n    ee.movi.32.q    q1,a13,2            # [62]\n\n    add             a14,a14,a9\n    saltu           a2,a14,a9\n    add.n           a15,a15,a2\n    src             a15,a15,a14\n    ee.movi.32.q    q1,a12,3            # [62]\n\n    mulsh           a13,a7,a5                   # [51]\n    mull            a5,a7,a5                    # [53]\n    ee.movi.32.q    q1,a15,1            # [62]\n\n    add             a5,a5,a9\n    saltu           a2,a5,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a5\n    ee.movi.32.q    q1,a13,0            # [62]\n\n\n// mulhi32 for q3\n    ee.movi.32.a    q3,a3,2             # [44]\n    ee.movi.32.a    q3,a4,3             # [45]\n    ee.movi.32.a    q3,a14,1            # [46]\n    ee.movi.32.a    q3,a5,0             # [62]\n\n    mulsh           a13,a7,a3                   # [51]\n    mull            a3,a7,a3                    # [53]\n\n    mulsh           a12,a7,a4                   # [50]\n    mull            a4,a7,a4                    # [55]\n\n    mulsh           a15,a7,a14                  # [48]\n    mull            a14,a7,a14                  # [49]\n\n    ssai            31                          # [47]\n\n    add             a3,a3,a9\n    saltu           a2,a3,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a3\n\n    add             a4,a4,a9\n    saltu           a2,a4,a9\n    add.n           a12,a12,a2\n    src             a12,a12,a4\n    ee.movi.32.q    q3,a13,2            # [62]\n\n    add             a14,a14,a9\n    saltu           a2,a14,a9\n    add.n           a15,a15,a2\n    src             a15,a15,a14\n    ee.movi.32.q    q3,a12,3   
         # [62]\n\n    mulsh           a13,a7,a5                   # [51]\n    mull            a5,a7,a5                    # [53]\n    ee.movi.32.q    q3,a15,1            # [62]\n    l32i            a14,a1,4                   # [7]  temp_neg_input2_shift, -input2_shift\n\n    add             a5,a5,a9\n    saltu           a2,a5,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a5\n    ee.movi.32.q    q3,a13,0            # [62]\n\n    // multiplication results: q0-q2 & q1-q3\n\n\n    blti            a14,1, .skip_div_by2_in1\n\n    addi.n          a5,a1,20\n    ee.vcmp.lt.s32  q4,q1,q6\n    ee.vcmp.lt.s32  q5,q3,q6\n    ee.vldbc.32     q7,a5       // 1 << (exponent - 1)\n    wsr.sar         a14         // load right_shift\n    ee.vadds.s32    q4,q4,q7    // subtract 1 `if (val < 0)`\n    ee.vadds.s32    q5,q5,q7    // subtract 1 `if (val < 0)`\n    ee.vadds.s32    q1,q1,q4\n    ee.vadds.s32    q3,q3,q5\n    ee.vsr.32       q1,q1\n    ee.vsr.32       q3,q3\n\n.skip_div_by2_in1:\n\n    ee.vadds.s32        q0,q0,q1\n    ee.vadds.s32        q1,q2,q3\n\n// mulhi32 for q0\n    ee.movi.32.a    q0,a3,2             # [44]\n    ee.movi.32.a    q0,a4,3             # [45]\n    ee.movi.32.a    q0,a14,1            # [46]\n    ee.movi.32.a    q0,a5,0             # [62]\n\n    mulsh           a13,a8,a3                   # [51]\n    mull            a3,a8,a3                    # [53]\n\n    mulsh           a12,a8,a4                   # [50]\n    mull            a4,a8,a4                    # [55]\n\n    mulsh           a15,a8,a14                  # [48]\n    mull            a14,a8,a14                  # [49]\n\n    ssai            31                          # [47]\n\n    add             a3,a3,a9\n    saltu           a2,a3,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a3\n\n    add             a4,a4,a9\n    saltu           a2,a4,a9\n    add.n           a12,a12,a2\n    src             a12,a12,a4\n    ee.movi.32.q    q0,a13,2            # [62]\n\n    add    
         a14,a14,a9\n    saltu           a2,a14,a9\n    add.n           a15,a15,a2\n    src             a15,a15,a14\n    ee.movi.32.q    q0,a12,3            # [62]\n\n    mulsh           a13,a8,a5                   # [51]\n    mull            a5,a8,a5                    # [53]\n    ee.movi.32.q    q0,a15,1            # [62]\n\n    add             a5,a5,a9\n    saltu           a2,a5,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a5\n    ee.movi.32.q    q0,a13,0            # [62]\n\n\n// mulhi32 for q1\n    ee.movi.32.a    q1,a3,2             # [44]\n    ee.movi.32.a    q1,a4,3             # [45]\n    ee.movi.32.a    q1,a14,1            # [46]\n    ee.movi.32.a    q1,a5,0             # [62]\n\n    mulsh           a13,a8,a3                   # [51]\n    mull            a3,a8,a3                    # [53]\n\n    mulsh           a12,a8,a4                   # [50]\n    mull            a4,a8,a4                    # [55]\n\n    mulsh           a15,a8,a14                  # [48]\n    mull            a14,a8,a14                  # [49]\n\n    ssai            31                          # [47]\n\n    add             a3,a3,a9\n    saltu           a2,a3,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a3\n\n    add             a4,a4,a9\n    saltu           a2,a4,a9\n    add.n           a12,a12,a2\n    src             a12,a12,a4\n    ee.movi.32.q    q1,a13,2            # [62]\n\n    add             a14,a14,a9\n    saltu           a2,a14,a9\n    add.n           a15,a15,a2\n    src             a15,a15,a14\n    ee.movi.32.q    q1,a12,3            # [62]\n\n    mulsh           a13,a8,a5                   # [51]\n    mull            a5,a8,a5                    # [53]\n    ee.movi.32.q    q1,a15,1            # [62]\n    l32i            a14,a1,0                   # [738]  temp_neg_out_shift, -out_shift\n\n    add             a5,a5,a9\n    saltu           a2,a5,a9\n    add.n           a13,a13,a2\n    src             a13,a13,a5\n    ee.movi.32.q    q1,a13,0  
          # [62]\n\n\n    //q0-q1 has output\n\n    blti            a14,1,.skip_div_by2_out\n    addi.n          a5,a1,24\n    ee.vcmp.lt.s32  q2,q0,q6\n    ee.vcmp.lt.s32  q3,q1,q6\n    ee.vldbc.32     q5,a5       // 1 << (exponent - 1)\n    wsr.sar         a14         // load right shift\n    ee.vadds.s32    q0,q0,q2    // subtract 1 `if (val < 0)`\n    ee.vadds.s32    q1,q1,q3    // subtract 1 `if (val < 0)`\n    ee.vadds.s32    q0,q0,q5\n    ee.vadds.s32    q1,q1,q5\n    ee.vsr.32       q0,q0\n    ee.vsr.32       q1,q1\n\n.skip_div_by2_out:\n\n// add offset and apply activation\n    addi            a15,a1,96\n    ee.vldbc.32     q3,a15              # [809]  id:802 out_offset\n    ee.vadds.s32    q0,q0,q3            # [811]\n    ee.vadds.s32    q1,q1,q3            # [812]\n    addi            a13,a1,108\n    addi            a14,a1,112\n    ee.vldbc.32     q3,a14              # [813]  id:803 activation_max\n    ee.vmin.s32     q0,q0,q3            # [815]\n    ee.vmin.s32     q1,q1,q3            # [816]\n    ee.vldbc.32     q3,a13              # [817]  id:804 activation_min\n    l32i            a13,a1,4                   # [818]  temp_neg_input2_shift\n    ee.vmax.s32     q1,q1,q3            # [819]\n    ee.vmax.s32     q0,q0,q3            # [820]\n\n//pack the data and store\n    l32i.n          a9,a1,44                    # [784]  gra_spill_temp_10\n    ee.vunzip.16    q0,q1               # [821]\n    ee.vunzip.8     q0,q1               # [822]\n    l32i.n          a13,a1,28           # gra_spill_temp_6, multiple of 8 index\n    ee.vst.l.64.ip  q0,a9,8             # [823]  id:805\n    l32i            a15,a1,116                  # [1], size\n    l32i.n          a14,a1,40                   # [20]  gra_spill_temp_9\n    l32i.n          a10,a1,36                   # [14]  gra_spill_temp_8\n    addi            a13,a13,8\n    s32i.n          a13,a1,28           # gra_spill_temp_6\n    bge             a15,a13,.vector_loop\n\n    l32i.n  a2,a1,56                # [0]  
gra_spill_temp_13\n\n// check for leftover\n    l32i    a10,a1,116                  # [1]\n    slli    a2,a2,3                     # [2]\n    bge     a2,a10,.exit          # [3] // done, exit\n\n.process_leftover:\n    l32i.n  a3,a1,48                    # [1]  gra_spill_temp_11\n    l32i.n  a12,a1,52                   # [2]  gra_spill_temp_12\n\n    l32i.n  a10,a1,12                   # [3]  gra_spill_temp_2\n    l32i.n  a14,a1,32                # [8]  gra_spill_temp_7\n    add.n   a10,a2,a10                  # [5]\n    add.n   a14,a2,a14                  # [6]\n    l8ui    a14,a14,0                   # [7]  id:809, input1\n    l8ui    a10,a10,0                   # [12]  id:1370, input2\n\n    sext    a14,a14,7                   # [9]\n    sext    a10,a10,7                   # [10]\n    add.n   a10,a10,a12                 # [11] // add offset2\n    add.n   a14,a14,a3                  # [16] // add offset1\n    l32i    a12,a1,88                  # [13]  left_shift\n\n    // sat_round_doubling_high_mul step for input1 and input2\n    ssl     a12                         # [15]\n    sll     a10,a10                     # [20]\n    sll     a14,a14                     # [17]\n\n    l32r            a12,.nudge_val             # [0], nudge\n\n    // a13,a3 are free, a12: nudge, a6:mult1\n    mulsh           a13,a14,a6\n    mull            a9,a14,a6\n    ssai            31\n\n    add             a9,a9,a12\n    saltu           a3,a9,a12\n    add.n           a13,a13,a3\n    src             a14,a13,a9 //result in a14\n\n    mulsh           a13,a10,a7\n    mull            a9,a10,a7\n    ssai            31\n\n    add             a9,a9,a12\n    saltu           a3,a9,a12\n    add.n           a13,a13,a3\n    src             a10,a13,a9 //result in a10\n\n// divide_by_power_of2_step for input1 (a14), input2 (a10)\n// free registers: a13, a12, a9, a3\n\n    l32i.n          a12,a1,8   // -input1_shift\n    l32i.n          a13,a1,4   // -input2_shift\n\n    blti            
a12,1,.skip_div_by2_in0_remain\n    l32i.n          a3,a1,16    // 1 << (exponent - 1)\n    extui           a9,a14,31,1\n    ssr             a12         // load right_shift\n    sub             a3,a3,a9    // 1 << (exponent - 1) - (val < 0)\n    add             a14,a14,a3\n    sra             a14,a14\n.skip_div_by2_in0_remain:\n\n    blti            a13,1,.skip_div_by2_in1_remain\n    l32i.n          a3,a1,20    // 1 << (exponent - 1)\n    extui           a9,a10,31,1\n    ssr             a13         // load right_shift\n    sub             a3,a3,a9    // 1 << (exponent - 1) - (val < 0)\n    add             a10,a10,a3\n    sra             a10,a10\n.skip_div_by2_in1_remain:\n\n// process output\n    l32r            a12,.nudge_val             # [0], nudge\n    l32i            a13,a1,0                   // -out_shift\n    add.n           a10,a10,a14                 # [45]\n\n// multiply and pick high32\n    mulsh           a3,a10,a8\n    mull            a10,a10,a8\n    ssai            31                          # [0]\n    add             a10,a10,a12\n    saltu           a9,a10,a12\n    add             a12,a3,a9\n    src             a12,a12,a10\n\n// div by power of 2 for output\n\n    l32i            a9,a1,96                   # [31]  out_offset\n    blti            a13,1,.skip_div_by2_out_remain\n    l32i.n          a3,a1,24    // 1 << (exponent - 1)\n    extui           a14,a12,31,1\n    ssr             a13         // load right_shift\n    sub             a3,a3,a14   // 1 << (exponent - 1) - (val < 0)\n    add             a12,a12,a3\n    sra             a12,a12\n.skip_div_by2_out_remain:\n\n// add offset\n    add.n   a9,a9,a12                   # [33]\n\n// apply activation\n    l32i    a13,a1,112                  # [34]  activation_max\n    l32i    a12,a1,108                  # [35]  activation_min\n    min     a13,a13,a9                      # [36]\n    l32i    a9,a1,92                   # [37]  output\n    max     a13,a13,a12                     # [38]\n    add.n 
  a9,a2,a9                    # [39]\n    s8i     a13,a9,0                    # [40]  id:1371\n    l32i    a12,a1,116\n    addi.n  a2,a2,1                 # [41]\n    blt     a2,a12,.process_leftover\n\n.exit:\n    retw.n                          # [0]\n\n    .size   esp_nn_add_elementwise_s8_esp32s3, . - esp_nn_add_elementwise_s8_esp32s3\n"
  },
  {
    "path": "src/basic_math/esp_nn_mul_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n\n#include <common_functions.h>\n\nvoid esp_nn_mul_elementwise_s8_ansi(const int8_t *input1_data,\n                                    const int8_t *input2_data,\n                                    const int32_t input1_offset,\n                                    const int32_t input2_offset,\n                                    int8_t *output,\n                                    const int32_t out_offset,\n                                    const int32_t out_mult,\n                                    const int32_t out_shift,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max,\n                                    const int32_t size)\n{\n    for (int i = 0; i < size; i++) {\n        int32_t tmp1 = input1_data[i] + input1_offset;\n        int32_t tmp2 = input2_data[i] + input2_offset;\n\n        int32_t out = tmp1 * tmp2;\n        out = esp_nn_multiply_by_quantized_mult(out, out_mult, out_shift);\n        out = out + out_offset;\n\n        out = max(activation_min, min(out, activation_max));\n        output[i] = (int8_t) out;\n    }\n}\n\nvoid esp_nn_mul_broadcast_channel_s8_ansi(const int8_t *input1,\n                                          const int8_t *input2_per_ch,\n                                          const 
int32_t input1_offset,\n                                          const int32_t input2_offset,\n                                          int8_t *output,\n                                          const int32_t output_offset,\n                                          const int32_t output_mult,\n                                          const int32_t output_shift,\n                                          const int32_t activation_min,\n                                          const int32_t activation_max,\n                                          const int32_t total_spatial,\n                                          const int32_t channels)\n{\n    for (int s = 0; s < total_spatial; s++) {\n        const int8_t *in_row = input1 + s * channels;\n        int8_t *out_row = output + s * channels;\n        for (int c = 0; c < channels; c++) {\n            int32_t val = ((int32_t)in_row[c] + input1_offset) *\n                          ((int32_t)input2_per_ch[c] + input2_offset);\n            val = esp_nn_multiply_by_quantized_mult(val, output_mult, output_shift);\n            val += output_offset;\n            val = max(val, activation_min);\n            val = min(val, activation_max);\n            out_row[c] = (int8_t)val;\n        }\n    }\n}\n"
  },
  {
    "path": "src/basic_math/esp_nn_mul_broadcast_s8_esp32s3.S",
    "content": "// Copyright 2026 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// Optimized broadcast MUL for SE-block pattern: [H,W,C] * [1,1,C]\n// Processes 8 channels at a time using S3 SIMD.\n\n    .text\n    .align  4\n    .literal_position\n    .literal    .LC_nudge, 1073741824   // 1 << 30\n\n    .type   esp_nn_mul_broadcast_channel_s8_esp32s3, @function\n    .align  4\n    .global esp_nn_mul_broadcast_channel_s8_esp32s3\n\n// void esp_nn_mul_broadcast_channel_s8_esp32s3(\n//     const int8_t *input1,           // a2\n//     const int8_t *input2_per_ch,    // a3\n//     const int32_t input1_offset,    // a4\n//     const int32_t input2_offset,    // a5\n//     int8_t *output,                 // a6\n//     const int32_t output_offset,    // a7\n//     const int32_t output_mult,      // stack+120\n//     const int32_t output_shift,     // stack+124\n//     const int32_t activation_min,   // stack+128\n//     const int32_t activation_max,   // stack+132\n//     const int32_t total_spatial,    // stack+136\n//     const int32_t channels);        // stack+140\n\n// Stack frame layout (entry a1, 120):\n//  0: to_add (for div by power of 2)\n//  4: input2_per_ch (saved)\n//  8: output base (saved)\n// 12: channels\n// 16: input1 base (saved)\n// 20: right_shift\n// 24: input1_offset (saved)\n// 28: input2_offset (saved)\n// 32: spatial counter\n// 36: out_ptr (current)\n// 40: out_offset (from a7)\n// 44: 
input1_offset (for vldbc)\n// 48: input2_offset (for vldbc)\n\nesp_nn_mul_broadcast_channel_s8_esp32s3:\n    entry   a1, 120\n\n    // Save args\n    s32i.n  a3, a1, 4               // input2_per_ch base\n    s32i.n  a6, a1, 8               // output base\n    s32i.n  a2, a1, 16              // input1 base\n    s32i.n  a4, a1, 24              // input1_offset\n    s32i.n  a5, a1, 28              // input2_offset\n    s32i    a7, a1, 40              // out_offset\n\n    l32i    a8, a1, 136             // total_spatial\n    l32i    a9, a1, 140             // channels\n    s32i.n  a9, a1, 12              // save channels\n\n    blti    a8, 1, .Lexit           // no spatial positions\n    blti    a9, 1, .Lexit           // no channels\n\n    // Prepare shift values\n    l32i    a15, a1, 124            // output_shift\n    movi.n  a11, 0\n    max     a14, a15, a11           // left_shift = max(shift, 0)\n    sub     a4, a14, a15            // right_shift = left_shift - shift\n    s32i.n  a4, a1, 20              // save right_shift\n\n    l32i    a13, a1, 120            // output_mult\n    l32r    a4, .LC_nudge           // nudge = 1 << 30\n\n    // Store offsets for vldbc\n    l32i    a8, a1, 136             // reload total_spatial\n    s32i    a5, a1, 48              // input2_offset for vldbc\n    l32i.n  a5, a1, 24              // input1_offset\n    s32i    a5, a1, 44              // input1_offset for vldbc\n\n    // Init spatial counter\n    movi.n  a10, 0\n    s32i    a10, a1, 32             // spatial counter = 0\n\n    // Pointers: a2 = input1 (current), a3 = input2_per_ch (reloaded each row),\n    //           a6 = output (current)\n\n.Lspatial_loop:\n    l32i    a8, a1, 136             // total_spatial\n    l32i    a10, a1, 32             // spatial counter\n    bge     a10, a8, .Lexit\n\n    // Reset input2 pointer for each spatial position\n    l32i.n  a3, a1, 4               // input2_per_ch base\n\n    // Channel counter\n    l32i.n  a9, a1, 12              
// channels\n    movi.n  a11, 0                  // channel index\n\n    blti    a9, 8, .Lchannel_leftover\n\n    // Check alignment for SIMD path\n    or      a8, a2, a3\n    or      a8, a8, a6\n    extui   a8, a8, 0, 4\n    bnez    a8, .Lchannel_leftover\n\n    // Setup SIMD constants\n    ee.zero.q   q1                  // zero register\n    addi    a8, a1, 44\n    ee.vldbc.16 q0, a8              // input1_offset broadcast\n    addi    a8, a1, 48\n    ee.vldbc.16 q7, a8              // input2_offset broadcast\n    st.qr   q0, a1, 64             // save for reload in loop\n\n.Lchannel_simd_loop:\n    addi    a8, a9, -7              // channels - 7\n    blt     a11, a8, .Lchannel_simd_body\n    j       .Lchannel_leftover\n\n.Lchannel_simd_body:\n    ld.qr           q4, a1, 64             // input1_offset\n    ee.vld.l.64.ip  q2, a2, 8              // load 8 input1 values\n    movi.n          a7, 16\n    ee.vld.h.64.ip  q2, a3, 8              // load 8 input2 values (per-ch)\n    wsr.sar         a7\n    ee.vcmp.lt.s8   q5, q2, q1             // sign extend\n    ee.vzip.8       q2, q5                 // interleave to 16-bit\n    ee.vadds.s16    q5, q5, q7             // add input2_offset\n    ee.vadds.s16    q4, q2, q4             // add input1_offset\n    ee.vmul.s16     q3, q4, q5             // multiply (high part)\n    ssai            0                      // sar = 0\n    ee.vmul.s16     q2, q4, q5             // multiply (low part)\n\n    // Requantize 8 results (same pattern as elementwise mul)\n    wsr.sar         a14                     // left_shift\n    ee.vzip.16      q2, q3\n    ee.vsl.32       q6, q2                  // left shift first 4\n\n    ssai            31\n\n    // Element 2 of q6\n    ee.movi.32.a    q6, a8, 2\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a5, a8, a4\n    add.n           a5, a5, a7\n    src             a5, a5, a8\n    // Element 3\n    ee.movi.32.a    q6, 
a8, 3\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a12, a8, a4\n    add.n           a12, a12, a7\n    src             a12, a12, a8\n    ee.movi.32.q    q2, a5, 2\n    ee.movi.32.q    q2, a12, 3\n    // Element 1\n    ee.movi.32.a    q6, a8, 1\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a5, a8, a4\n    add.n           a5, a5, a7\n    src             a5, a5, a8\n    // Element 0\n    ee.movi.32.a    q6, a8, 0\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a12, a8, a4\n    add.n           a12, a12, a7\n    src             a12, a12, a8\n    ee.movi.32.q    q2, a5, 1\n    ee.movi.32.q    q2, a12, 0\n\n    // Second group of 4 (q3)\n    wsr.sar         a14                     // left_shift\n    ee.vsl.32       q4, q3\n\n    ssai            31\n\n    ee.movi.32.a    q4, a8, 2\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a5, a8, a4\n    add.n           a5, a5, a7\n    src             a5, a5, a8\n    ee.movi.32.a    q4, a8, 3\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a12, a8, a4\n    add.n           a12, a12, a7\n    src             a12, a12, a8\n    ee.movi.32.q    q0, a5, 2\n    ee.movi.32.q    q0, a12, 3\n    ee.movi.32.a    q4, a8, 1\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a5, a8, a4\n    add.n           a5, a5, a7\n    src             a5, a5, a8\n    ee.movi.32.a    q4, a8, 0\n    mulsh           a7, a13, a8\n    mull            a8, a13, a8\n    add.n           a8, a4, a8\n    saltu           a12, a8, a4\n    add.n           a12, a12, a7\n    src             a12, a12, a8\n    ee.movi.32.q    q0, a5, 1\n    ee.movi.32.q    q0, 
a12, 0\n\n    // Divide by power of 2 (right_shift)\n    l32i.n          a5, a1, 20              // right_shift\n    movi.n          a7, 1\n\n    blti            a5, 1, .Lskip_div\n\n    ee.vcmp.lt.s32  q5, q2, q1\n    ee.vcmp.lt.s32  q6, q0, q1\n    addi.n          a8, a5, -1\n    ssl             a8\n    sll             a7, a7                  // to_add = 1 << (right_shift - 1)\n    s32i.n          a7, a1, 0\n    ee.vldbc.32     q4, a1                  // broadcast to_add\n    wsr.sar         a5\n    ee.vadds.s32    q5, q4, q5\n    ee.vadds.s32    q5, q2, q5\n    ee.vsr.32       q2, q5\n    wsr.sar         a5\n    ee.vadds.s32    q5, q4, q6\n    ee.vadds.s32    q5, q0, q5\n    ee.vsr.32       q0, q5\n\n.Lskip_div:\n    // Add output offset, apply activation\n    addi            a8, a1, 132\n    ee.vldbc.32     q4, a8                  // activation_max\n    addi            a5, a1, 40\n    ee.vldbc.32     q6, a5                  // output_offset\n    addi            a7, a1, 128\n    ee.vadds.s32    q0, q0, q6              // add offset\n    ee.vadds.s32    q2, q2, q6\n    ee.vldbc.32     q6, a7                  // activation_min\n    ee.vmin.s32     q0, q0, q4\n    ee.vmin.s32     q2, q2, q4\n    ee.vmax.s32     q0, q0, q6\n    ee.vmax.s32     q2, q2, q6\n\n    // Pack 32-bit -> 8-bit and store\n    ee.vunzip.16    q2, q0\n    ee.vunzip.8     q2, q0\n    ee.vst.l.64.ip  q2, a6, 8\n\n    addi            a11, a11, 8             // channel index += 8\n    j               .Lchannel_simd_loop\n\n.Lchannel_leftover:\n    // Process remaining channels one by one\n    l32i.n  a9, a1, 12              // channels\n    bge     a11, a9, .Lspatial_next\n\n    ssl     a14                     // left_shift\n    l32i.n  a8, a1, 24              // input1_offset\n    l8ui    a10, a2, 0              // *input1\n    sext    a10, a10, 7\n    add.n   a10, a10, a8            // + input1_offset\n    l32i.n  a8, a1, 28              // input2_offset\n    l8ui    a12, a3, 0              // 
*input2_per_ch\n    sext    a12, a12, 7\n    add.n   a12, a12, a8            // + input2_offset\n    mull    a10, a10, a12           // multiply\n\n    // Requantize\n    sll     a10, a10                // left shift\n\n    l32i.n  a9, a1, 20              // right_shift\n    mulsh   a8, a10, a13\n    mull    a12, a10, a13\n    ssai    31\n    add.n   a12, a4, a12\n    saltu   a10, a12, a4\n    add.n   a10, a10, a8\n    src     a10, a10, a12           // result\n\n    blti    a9, 1, .Lskip_div_scalar\n\n    addi    a8, a9, -1\n    ssl     a8\n    movi    a7, 1\n    sll     a7, a7                  // to_add\n    extui   a8, a10, 31, 1          // sign bit (1 if neg, 0 if pos)\n    sub     a10, a10, a8            // val -= sign (fast rounding)\n    add     a10, a10, a7\n    ssr     a9\n    sra     a10, a10\n\n.Lskip_div_scalar:\n    l32i    a8, a1, 40              // output_offset\n    l32i    a7, a1, 128             // activation_min\n    l32i    a12, a1, 132            // activation_max\n    add.n   a10, a10, a8\n    min     a10, a10, a12\n    max     a10, a10, a7\n    s8i     a10, a6, 0              // store\n\n    addi    a2, a2, 1               // input1++\n    addi    a3, a3, 1               // input2++\n    addi    a6, a6, 1               // output++\n    addi    a11, a11, 1             // channel index++\n    j       .Lchannel_leftover\n\n.Lspatial_next:\n    l32i    a10, a1, 32             // spatial counter\n    addi    a10, a10, 1\n    s32i    a10, a1, 32\n    j       .Lspatial_loop\n\n.Lexit:\n    retw.n\n\n    .size   esp_nn_mul_broadcast_channel_s8_esp32s3, . - esp_nn_mul_broadcast_channel_s8_esp32s3\n"
  },
  {
    "path": "src/basic_math/esp_nn_mul_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\n/**\n * Elementwise multiply for s8 optimized for ESP32-P4.\n * Uses inlined fast requantization with 4x unrolled loop.\n * Interleaves independent computations to hide latency.\n */\nvoid esp_nn_mul_elementwise_s8_esp32p4(const int8_t *input1_data,\n                                        const int8_t *input2_data,\n                                        const int32_t input1_offset,\n                                        const int32_t input2_offset,\n                                        int8_t *output,\n                                        const int32_t out_offset,\n                                        const int32_t out_mult,\n                                        const int32_t out_shift,\n                                        const int32_t activation_min,\n                                        const int32_t activation_max,\n                                        const int32_t size)\n{\n    const int32_t left_shift = out_shift > 0 ? 
out_shift : 0;\n    const int32_t right_shift = left_shift - out_shift;\n    const int64_t nudge = (int64_t)1 << 30;\n\n    int i = 0;\n    for (; i <= size - 4; i += 4) {\n        int32_t prod0 = (input1_data[i+0] + input1_offset) * (input2_data[i+0] + input2_offset);\n        int32_t prod1 = (input1_data[i+1] + input1_offset) * (input2_data[i+1] + input2_offset);\n        int32_t prod2 = (input1_data[i+2] + input1_offset) * (input2_data[i+2] + input2_offset);\n        int32_t prod3 = (input1_data[i+3] + input1_offset) * (input2_data[i+3] + input2_offset);\n\n        int32_t s0 = prod0 << left_shift;\n        int32_t s1 = prod1 << left_shift;\n        int32_t s2 = prod2 << left_shift;\n        int32_t s3 = prod3 << left_shift;\n\n        int32_t r0 = (int32_t)(((int64_t)s0 * out_mult + nudge) >> 31);\n        int32_t r1 = (int32_t)(((int64_t)s1 * out_mult + nudge) >> 31);\n        int32_t r2 = (int32_t)(((int64_t)s2 * out_mult + nudge) >> 31);\n        int32_t r3 = (int32_t)(((int64_t)s3 * out_mult + nudge) >> 31);\n\n        if (right_shift > 0) {\n            int32_t rnd = (1 << (right_shift - 1));\n            r0 = (r0 + rnd - (r0 < 0)) >> right_shift;\n            r1 = (r1 + rnd - (r1 < 0)) >> right_shift;\n            r2 = (r2 + rnd - (r2 < 0)) >> right_shift;\n            r3 = (r3 + rnd - (r3 < 0)) >> right_shift;\n        }\n\n        r0 = max(activation_min, min(r0 + out_offset, activation_max));\n        r1 = max(activation_min, min(r1 + out_offset, activation_max));\n        r2 = max(activation_min, min(r2 + out_offset, activation_max));\n        r3 = max(activation_min, min(r3 + out_offset, activation_max));\n\n        output[i+0] = (int8_t) r0;\n        output[i+1] = (int8_t) r1;\n        output[i+2] = (int8_t) r2;\n        output[i+3] = (int8_t) r3;\n    }\n\n    for (; i < size; i++) {\n        int32_t prod = (input1_data[i] + input1_offset) * (input2_data[i] + input2_offset);\n        int32_t out = esp_nn_requantize(prod, out_mult, out_shift);\n     
   out = max(activation_min, min(out + out_offset, activation_max));\n        output[i] = (int8_t) out;\n    }\n}\n"
  },
  {
    "path": "src/basic_math/esp_nn_mul_s8_esp32s3.S",
    "content": "// Copyright 2021-2023 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .align  4\n    .literal_position\n    .literal    .LC0_26_123, 1073741824 // `1 << 30`\n\n    # Program Unit: esp_nn_mul_elementwise_s8_esp32s3\n    .type   esp_nn_mul_elementwise_s8_esp32s3, @function\n    .align   4\n    .global esp_nn_mul_elementwise_s8_esp32s3\n\nesp_nn_mul_elementwise_s8_esp32s3:  # 0x4\n    # to_add = 0\n    # gra_spill_temp_0 = 4\n    # gra_spill_temp_1 = 8\n    # gra_spill_temp_2 = 12\n    # gra_spill_temp_3 = 16\n    # gra_spill_temp_4 = 20\n    # gra_spill_temp_5 = 24\n    # gra_spill_temp_6 = 28\n    # gra_spill_temp_7 = 32\n    # gra_spill_temp_8 = 36\n    # gra_spill_temp_<> = 40\n    # gra_spill_temp_<> = 44\n    # gra_spill_temp_<> = 48\n    # gra_spill_temp_13 = 64\n\n // registers:\n // a2: const int8_t *input1_data\n // a3: const int8_t *input2_data\n // a4: const int32_t input1_offset\n // a5: const int32_t input2_offset\n // a6: int8_t *output\n // a7: const int32_t out_offset\n\n // on stack:\n // 120: const int32_t out_mult\n // 124: const int32_t out_shift\n // 128: const int32_t activation_min\n // 132: const int32_t activation_max\n // 136: const int32_t size\n\n    entry   a1,120                      #\n    s32i.n  a4,a1,24                # [0]  gra_spill_temp_5, input1_offset\n    s32i.n  a5,a1,28                # [1]  gra_spill_temp_12, input2_offset\n\n    s32i.n  
a3,a1,4                     # [5]  gra_spill_temp_0, input2\n    mov.n   a10,a3                      # [6]\n    l32i    a3,a1,136                   # [18]  id:361 size+0x0\n    mov.n   a9,a6                       # [2] // out_addr\n    blti    a3,1,.exit            # [0] // exit\n\n    s32i.n  a2,a1,16                    # [9]  gra_spill_temp_3, input1\n    s32i    a7,a1,40                    # [4]  id:358 out_offset+0x0\n    movi.n  a11,0                       # [3]\n    mov.n   a12,a2                      # [10]\n    s32i    a4,a1,44                # [13]  id:356 input1_offset+0x0\n    s32i    a5,a1,48                # [14]  id:357 input2_offset+0x0\n    movi.n  a2,1                    # [15]\n\n    l32i    a15,a1,124                  # [3]  id:362 out_shift+0x0\n    l32i    a13,a1,120                  # [4]  id:363 out_mult+0x0\n    s32i.n  a6,a1,8                 # [1]  gra_spill_temp_1, out_addr\n    max     a14,a15,a11                 # [11] left_shift\n    sub     a4,a14,a15              # right_shift\n    s32i.n  a4,a1,20                # [9]  gra_spill_temp_4\n    l32r    a4,.LC0_26_123              // nudge `1 << 30`, must be live in a4 when branching straight to leftover\n\n    blti    a3,8,.process_leftover             # [20]\n\n    // skip to leftover routine if inputs or output are unaligned\n    or          a6,a12,a10\n    or          a6,a6,a9\n    extui       a6,a6,0,4\n    bnez        a6,.process_leftover\n\n    // `size >= 8`, s3 optimisation path...\n    ee.zero.q   q1                      # [0]\n    addi    a4,a1,44                # [7]\n    addi    a8,a1,48                    # [8]\n    ee.vldbc.16 q0,a4               # [17]  id:359 input1_offset\n    ee.vldbc.16 q7,a8               # [16]  id:360 input2_offset\n    l32r    a4,.LC0_26_123              # [12]\n    movi    a8, 8\n    st.qr   q0,a1,64                    # [19]  gra_spill_temp_13\n    s32i.n  a8,a1,12                # [6]  gra_spill_temp_2\n\n.Lt_0_7682: # 0x60\n    s32i            a9,a1,36                    # [1]  gra_spill_temp_8, out_addr\n    ld.qr           q4,a1,64                    # [2]  gra_spill_temp_13, input1_offset\n 
   ee.vld.l.64.ip  q2,a12,8        # [4]  id:367, input1_ptr\n    movi.n          a7,16                   # [3]\n    ee.vld.h.64.ip  q2,a10,8        # [5]  id:368, input2_ptr\n    wsr.sar         a7                      # [6]\n    ee.vcmp.lt.s8   q5,q2,q1            # [7]\n    ee.vzip.8       q2,q5               # [8]\n    ee.vadds.s16    q5,q5,q7            # [9] input2_offset\n    ee.vadds.s16    q4,q2,q4            # [10] input1_offset\n    ee.vmul.s16     q3,q4,q5            # [11]\n    wsr.sar         a11                         # [12]\n    ee.vmul.s16     q2,q4,q5            # [13]\n\n    wsr.sar         a14                     # [14] left_shift\n    ee.vzip.16      q2,q3               # [15]\n    ee.vsl.32       q6,q2                   # [16] left_shift\n    ssai            31                          # [17]\n\n    ee.movi.32.a    q6,a3,2             # [18]\n    ee.movi.32.a    q6,a8,3             # [26]\n\n    mulsh           a6,a13,a3                   # [19]\n    mull            a3,a13,a3                   # [20]\n    mulsh           a7,a13,a8                   # [27]\n    add.n           a3,a4,a3                    # [22]\n    saltu           a2,a3,a4                    # [23]\n    add.n           a2,a2,a6                    # [24]\n    src             a2,a2,a3                    # [25]\n\n    mull            a6,a13,a8                   # [28]\n    add.n           a6,a4,a6                    # [30]\n    saltu           a9,a6,a4                    # [31]\n    add.n           a9,a9,a7                    # [32]\n    src             a9,a9,a6                    # [33]\n    ee.movi.32.q    q2,a2,2             # [53]\n    ee.movi.32.q    q2,a9,3             # [54]\n\n    ee.movi.32.a    q6,a6,1             # [34]\n    mulsh           a7,a13,a6                   # [35]\n    mull            a6,a13,a6                   # [36]\n    add.n           a6,a4,a6                    # [38]\n    saltu           a3,a6,a4                    # [39]\n    add.n           
a3,a3,a7                    # [16]\n    src             a3,a3,a6                    # [41]\n    ee.movi.32.a    q6,a2,0             # [42]\n    mulsh           a8,a13,a2                   # [43]\n    mull            a7,a13,a2                   # [4]\n    add.n           a7,a4,a7                    # [46]\n    saltu           a6,a7,a4                    # [47]\n    add.n           a6,a6,a8                    # [24]\n    src             a6,a6,a7                    # [49]\n    ee.movi.32.q    q2,a3,1             # [28]\n    ee.movi.32.q    q2,a6,0             # [50]\n\n    wsr.sar         a14                     # [10]\n    ee.vsl.32       q4,q3                   # [11]\n    ee.movi.32.a    q4,a2,2             # [13]\n    mulsh           a3,a13,a2                   # [14]\n    mull            a2,a13,a2                   # [15]\n    ssai            31                          # [12]\n    add.n           a2,a4,a2                    # [17]\n    saltu           a5,a2,a4                # [18]\n    add.n           a5,a5,a3                # [19]\n    src             a5,a5,a2                    # [20]\n    ee.movi.32.a    q4,a3,3             # [21]\n    mulsh           a6,a13,a3                   # [22]\n    mull            a3,a13,a3                   # [23]\n    add.n           a3,a4,a3                    # [25]\n    saltu           a8,a3,a4                    # [26]\n    add.n           a8,a8,a6                    # [27]\n    src             a8,a8,a3                    # [28]\n    ee.movi.32.q    q0,a5,2             # [24]\n    ee.movi.32.q    q0,a8,3             # [51]\n\n    ee.movi.32.a    q4,a7,1             # [29]\n    mulsh           a6,a13,a7                   # [30]\n    mull            a3,a13,a7                   # [31]\n    add.n           a3,a4,a3                    # [33]\n    saltu           a2,a3,a4                    # [34]\n    add.n           a2,a2,a6                    # [35]\n    src             a2,a2,a3                    # [36]\n    ee.movi.32.a    
q4,a6,0             # [37]\n    mulsh           a7,a13,a6                   # [38]\n    mull            a6,a13,a6                   # [39]\n    add.n           a6,a4,a6                    # [41]\n    saltu           a3,a6,a4                    # [42]\n    add.n           a3,a3,a7                    # [43]\n    src             a3,a3,a6                    # [4]\n    ee.movi.32.q    q0,a2,1             # [47]\n    ee.movi.32.q    q0,a3,0             # [46]\n\n    l32i.n          a5,a1,20                # [0]  gra_spill_temp_4, right_shift\n    movi.n          a7,1                    # [51]\n\n    blti            a5,1,.skip_div_by_pow_of_2\n// divide by power of 2\n    ee.vcmp.lt.s32  q5,q2,q1        # [56]\n    ee.vcmp.lt.s32  q6,q0,q1        # [28]\n\n    addi.n          a8,a5,-1                # [1]\n    ssl             a8                          # [2]\n    sll             a7,a7                       # [3]\n    s32i.n          a7,a1,0                 # [4]  to_add\n    ee.vldbc.32     q4,a1               # [5]  id:376 to_add\n\n    wsr.sar         a5                      # [6]\n    ee.vadds.s32    q5,q4,q5            # [7]\n    ee.vadds.s32    q5,q2,q5            # [8]\n    ee.vsr.32       q2,q5                   # [9]\n\n    wsr.sar         a5                      # [5]\n    ee.vadds.s32    q5,q4,q6            # [9]\n    ee.vadds.s32    q5,q0,q5            # [11]\n    ee.vsr.32       q0,q5                   # [12]\n.skip_div_by_pow_of_2:\n\n// add offset, apply activation\n    addi            a8,a1,132                   # [54]\n    ee.vldbc.32     q4,a8               # [55]  id:385 activation_max\n    addi            a5,a1,40                    # [8]\n    ee.vldbc.32     q6,a5               # [10]  id:384 out_offset\n    addi            a7,a1,128                   # [4]\n    ee.vadds.s32    q0,q0,q6            # [13] // add out_offset\n    ee.vadds.s32    q2,q2,q6            # [14] // add out_offset\n    ee.vldbc.32     q6,a7               # [16]  id:386 
activation_min\n    ee.vmin.s32     q0,q0,q4            # [17]\n    ee.vmin.s32     q2,q2,q4            # [15]\n    ee.vmax.s32     q0,q0,q6            # [18]\n    ee.vmax.s32     q2,q2,q6            # [19]\n\n// pack and store\n    ee.vunzip.16    q2,q0               # [20]\n    ee.vunzip.8     q2,q0               # [21]\n    l32i.n          a7,a1,12 // count\n    l32i            a9,a1,36                    # [55]  gra_spill_temp_8\n    l32i.n          a3,a1,136               # [1] , size\n    ee.vst.l.64.ip  q2,a9,8         # [22]  id:387\n    addi            a7,a7,8\n    s32i.n          a7,a1,12 // increment count\n    bge             a3,a7,.Lt_0_7682\n\n    addi            a11,a7,-8\n    bge             a11,a3,.exit  # [3] // exit\n\n.process_leftover:\n    sub     a8,a3,a11                   # [1]\n    loopgtz a8,.LBB33_esp_nn_mul_elementwise_s8_esp32s3     # [9]\n\n    ssl     a14                         # [0] left_shift\n    l32i.n  a8,a1,24                # [1]  gra_spill_temp_5, input1_offset\n    l32i.n  a10,a1,4                # [2]  gra_spill_temp_0, input2\n    l32i.n  a12,a1,16               # [3]  gra_spill_temp_3, input1\n    add.n   a10,a11,a10                 # [4], input2\n    add.n   a12,a11,a12                 # [5], input1\n    l8ui    a12,a12,0                   # [6]  id:390\n    l8ui    a10,a10,0                   # [7]  id:391\n    sext    a12,a12,7                   # [8]\n    add.n   a12,a12,a8                  # [9]\n    l32i.n  a8,a1,28                # [10]  gra_spill_temp_12, input2_offset\n    sext    a10,a10,7                   # [11]\n    add.n   a10,a10,a8                  # [12]\n    mull    a10,a12,a10                 # [13] // multiplication result\n\n// multiply by quantised mult\n    l32i.n  a9,a1,20                # [0]  gra_spill_temp_4, load right_shift\n\n    sll     a10,a10                     # [15] // left shift\n\n    mulsh   a3,a10,a13                  # [1]\n    mull    a8,a10,a13                  # [6]\n    ssai   
 31                          # [0]\n    add.n   a6,a8,a4                    # [8]\n    saltu   a8,a6,a8                    # [9]\n    add.n   a8,a8,a3                    # [10]\n    src     a3,a8,a6                    # [19] // result\n\n    blti    a9, 1, .skip_div_by_pow_of_2_remains\n// divide by power of 2\n    // calculate to_add = `1 << (exponent - 1)`\n    addi    a6,a9,-1\n    ssl     a6                          # [23]\n    movi    a7,1\n    sll     a7,a7                       // to_add\n\n    extui   a8,a3,31,1                  # [24], sign (1 if negative, 0 otherwise)\n    sub     a3,a3,a8            // subtract sign: to_add - (val < 0)\n    add     a3,a3,a7            // add to_add\n\n    ssr     a9                          # [20] load right_shift\n    sra     a3,a3               // right shift\n.skip_div_by_pow_of_2_remains:\n\n    l32i.n  a6,a1,40                    # [32], out_offset\n    l32i.n  a8,a1,132                   # [35], act_max\n    l32i.n  a7,a1,128                   # [36], act_min\n\n// add offset and apply activation\n    add.n   a3,a3,a6                    # [34], offset added\n    min     a8,a8,a3                    # [37]\n    l32i.n  a3,a1,8                 # [38]  gra_spill_temp_1, load base out_addr\n    max     a8,a8,a7                    # [39]\n\n// store\n    add.n   a3,a11,a3                   # [16], add index from `a11`\n    s8i     a8,a3,0                     # [41]  id:392 // store\n    addi.n  a11,a11,1               # [42]  // inc index\n\n.LBB33_esp_nn_mul_elementwise_s8_esp32s3:   # 0x2ed\n.exit:\n    retw.n                          # [0]\n\n    .size   esp_nn_mul_elementwise_s8_esp32s3, . - esp_nn_mul_elementwise_s8_esp32s3\n"
  },
  {
    "path": "src/common/common_functions.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#pragma once\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <string.h>\n\n/**\n * c99 standard still doesn't strictly inline functions\n * We need to use attribute as well to do this.\n */\n#define __NN_FORCE_INLINE__ __attribute((always_inline)) static inline\n\n/* min/max macros */\n#ifndef max\n#define max(a, b) ({            \\\n    __typeof__ (a) _a = (a);    \\\n    __typeof__ (b) _b = (b);    \\\n    _a > _b ? _a : _b;          \\\n})\n\n#define min(a, b) ({            \\\n    __typeof__ (a) _a = (a);    \\\n    __typeof__ (b) _b = (b);    \\\n    _a < _b ? _a : _b;          \\\n})\n#endif\n\n/**\n * @brief       count the leading zero bits of a 32 bit value\n *\n * @param       in  value to inspect\n * @return      number of leading zero bits; 32 when `in` is 0\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_clz32(uint32_t in)\n{\n#if CONFIG_IDF_TARGET_ARCH_XTENSA\n    __asm__ volatile(\"nsau %0, %0\" : \"+r\" (in));\n    return in;\n#elif defined(__GNUC__)\n    /* __builtin_clz(0) is undefined; return 32 like the portable fallback below */\n    return in ? __builtin_clz(in) : 32;\n#else\n    int32_t count = 32;\n    uint32_t x = in, y = in >> 16;\n    if (y != 0) {\n        count -= 16;\n        x = y;\n    }\n    y = x >> 8;\n    if (y != 0) {\n        count -= 8;\n        x = y;\n    }\n    y = x >> 4;\n    if (y != 0) {\n        count -= 4;\n        x = y;\n    }\n    y = x >> 2;\n    if (y != 0) {\n        count -= 2;\n        x = y;\n    }\n    y = x >> 1;\n    if (y != 0) {\n        return count - 2;\n    }\n    return count - x;\n#endif\n}\n\n/**\n * Signed saturate a 32 bit value to 8 bits keeping output in 32 bit variable.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_saturate8(int32_t in)\n{\n#if CONFIG_IDF_TARGET_ARCH_XTENSA\n    __asm__ volatile(\"clamps %0, %0, 7\" : \"+a\"(in));\n    return in;\n#else\n    return max(INT8_MIN, min(in, INT8_MAX));\n#endif\n}\n\n/**\n * Divide a 64 bit value by 2^31, truncating toward zero, and return the\n * result converted to 32 bits.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_pick_sat_high32_of64(int64_t val64)\n{\n    int32_t sign = (int32_t) (val64 >> 63);\n    int32_t to_add = sign & ((1ul << 31) - 1);\n    return (int32_t) ((int64_t) (val64 + to_add) >> 31);\n}\n\n/**\n * Rounding, doubling high multiply: returns (in0 * in1) / 2^31 with a\n * sign-dependent nudge added before the division. The input pair\n * (INT32_MIN, INT32_MIN) saturates to INT32_MAX.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_sat_round_doubling_high_mul(int32_t in0, int32_t in1)\n{\n    int32_t result;\n    int64_t in0_64 = (int64_t) in0;\n    bool overflow = (in0 == in1) && (in0 == (int32_t) INT32_MIN);\n\n    /* Nudge value */\n    int64_t nudge_val = 1 << 30;\n    if ((in0 < 0) ^ (in1 < 0)) {\n        nudge_val = 1 - nudge_val;\n    }\n\n    /* Multiply and add nudge */\n    int64_t mult = in0_64 * in1 + nudge_val;\n\n    /* Round and pickup 32 bits */\n    result = esp_nn_pick_sat_high32_of64(mult);\n\n    return overflow ? INT32_MAX : result;\n}\n\n/**\n * fast version\n * this will fail for values closer to INT32_MAX and INT32_MIN by `1 << (exponent - 1)`.\n * We can afford to do this because we are at the very last stage of filter.\n * Also it is pretty rare condition as our output is going to be 8 bit.\n * `exponent` must be >= 1 because `1 << (exponent - 1)` is evaluated.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two_fast(int32_t val, int32_t exponent)\n{\n    int32_t to_add = (1 << (exponent - 1)) - (val < 0);\n    return (int32_t) ((val + to_add) >> exponent);\n}\n\n/**\n * Rounding divide of `val` by 2^exponent; exact halves are rounded away\n * from zero.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_div_by_power_of_two(int32_t val, int32_t exponent)\n{\n    int32_t result;\n\n    const int32_t mask = (1 << exponent) - 1;\n    const int32_t remainder = val & mask;\n\n    result = val >> exponent;\n    int32_t threshold = (mask >> 1) + (result < 0);\n\n    if (remainder > threshold) {\n        result += 1;\n    }\n    return result;\n}\n\n/**\n * Requantize `x`: a positive `shift` is applied as a left shift before the\n * rounding doubling high multiply by `mult`, a negative `shift` as a rounding\n * right shift after it.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult(int32_t x, int32_t mult, int32_t shift)\n{\n    int32_t left_shift = shift > 0 ? shift : 0;\n    int32_t right_shift = shift > 0 ? 0 : -shift;\n    int32_t result = esp_nn_sat_round_doubling_high_mul(x * (1 << left_shift), mult);\n    return esp_nn_div_by_power_of_two(result, right_shift);\n}\n\n#if CONFIG_IDF_TARGET_ESP32P4\n/** PIE enable macro - call once before using any esp.* instructions */\n#define ESP_NN_PIE_ENABLE() do { \\\n    asm volatile ( \\\n        \"csrsi  0x7f2, 0b01        \\n\\t\" \\\n        \"li     x29, 0b10          \\n\\t\" \\\n        \"esp.movx.w.cfg x29        \\n\\t\" \\\n        ::: \"x29\" \\\n    ); \\\n} while(0)\n\n/** Extract 16 int32 per-lane results from QACC into array */\n#define ESP_NN_QACC_EXTRACT_S32(dst) do { \\\n    asm volatile ( \\\n        \"mv                      x30, %0     \\n\\t\" \\\n        \"esp.st.qacc.l.l.128.ip  x30, 16     \\n\\t\" \\\n        \"esp.st.qacc.l.h.128.ip  x30, 16     \\n\\t\" \\\n        \"esp.st.qacc.h.l.128.ip  x30, 16     \\n\\t\" \\\n        \"esp.st.qacc.h.h.128.ip  x30, 0      \\n\\t\" \\\n        :: \"r\"(dst) \\\n        : \"x30\", \"memory\" \\\n    ); \\\n} while(0)\n#endif /* CONFIG_IDF_TARGET_ESP32P4 - PIE_ENABLE and QACC_EXTRACT */\n\n/**\n * 2-wide interleaved requant macro for ESP32-P4 RISC-V.\n * Interleaves mulh across two independent elements for pipeline fill.\n * Outputs r0, r1 as requantized int32 values (before offset/clamp).\n */\n#if CONFIG_IDF_TARGET_ESP32P4\n#define ESP_NN_REQUANT_2X(x0, x1, m0, m1, s0, s1, r0, r1) do { \\\n    int32_t _ls0 = (s0) > 0 ? (s0) : 0; \\\n    int32_t _ls1 = (s1) > 0 ? (s1) : 0; \\\n    int32_t _v0 = (x0) << _ls0; \\\n    int32_t _v1 = (x1) << _ls1; \\\n    int32_t _rs0 = _ls0 - (s0); \\\n    int32_t _rs1 = _ls1 - (s1); \\\n    int32_t _hi0, _lo0, _hi1, _lo1; \\\n    asm volatile ( \\\n        \"mulh  %[h0], %[v0], %[mm0]  \\n\\t\" \\\n        \"mulh  %[h1], %[v1], %[mm1]  \\n\\t\" \\\n        \"mul   %[l0], %[v0], %[mm0]  \\n\\t\" \\\n        \"mul   %[l1], %[v1], %[mm1]  \\n\\t\" \\\n        : [h0] \"=&r\"(_hi0), [h1] \"=&r\"(_hi1), \\\n          [l0] \"=&r\"(_lo0), [l1] \"=&r\"(_lo1) \\\n        : [v0] \"r\"(_v0), [v1] \"r\"(_v1), \\\n          [mm0] \"r\"((int32_t)(m0)), [mm1] \"r\"((int32_t)(m1)) \\\n    ); \\\n    /* Add nudge (1<<30) and extract bits [31:62] */ \\\n    uint32_t _n = 0x40000000u; \\\n    uint32_t _a0 = (uint32_t)_lo0 + _n; \\\n    _hi0 += (_a0 < (uint32_t)_lo0); \\\n    (r0) = (_hi0 << 1) | (_a0 >> 31); \\\n    uint32_t _a1 = (uint32_t)_lo1 + _n; \\\n    _hi1 += (_a1 < (uint32_t)_lo1); \\\n    (r1) = (_hi1 << 1) | (_a1 >> 31); \\\n    /* Right shift with rounding */ \\\n    if (_rs0) { (r0) = ((r0) + (1 << (_rs0 - 1)) - ((r0) < 0)) >> _rs0; } \\\n    if (_rs1) { (r1) = ((r1) + (1 << (_rs1 - 1)) - ((r1) < 0)) >> _rs1; } \\\n} while(0)\n#endif\n\n/**\n * Faster variant of esp_nn_multiply_by_quantized_mult: always adds the\n * positive nudge, performs no saturation and uses\n * esp_nn_div_by_power_of_two_fast for the right shift.\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_multiply_by_quantized_mult_fast(int32_t x, int32_t mult, int32_t shift)\n{\n    int32_t left_shift = max(shift, 0);\n    int32_t right_shift = left_shift - shift;\n\n    int64_t nudge_val = 1 << 30;\n    int64_t in0_64 = (int64_t) (x << left_shift);\n\n    /* Multiply and add nudge */\n    int64_t mult_64 = in0_64 * mult + nudge_val;\n    int32_t result = (int32_t) (mult_64 >> 31);\n    if (right_shift) {\n        result = esp_nn_div_by_power_of_two_fast(result, right_shift);\n    }\n    return result;\n}\n\n/*\n * Unified requantize wrapper. Defining either SKIP_NUDGE (legacy) or\n * CONFIG_NN_SKIP_NUDGE (Kconfig-driven) selects the faster, non-bit-exact\n * path; otherwise the bit-exact TFLite-reference path is used.\n */\n#if defined(SKIP_NUDGE) || defined(CONFIG_NN_SKIP_NUDGE)\n#define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult_fast((x), (m), (s))\n#else\n#define esp_nn_requantize(x, m, s) esp_nn_multiply_by_quantized_mult((x), (m), (s))\n#endif\n\n/**\n * Copy `src` (input_ht rows of input_wd * channels values) into `dst`,\n * surrounding it with `pad_ht` rows and `pad_wd` columns of `pad_val` on\n * every side.\n */\nstatic void esp_nn_aligned_s8_pad_with_value(const int8_t *src, int8_t *dst,\n                                             const uint16_t input_wd,\n                                             const uint16_t input_ht,\n                                             const uint16_t channels,\n                                             const int32_t pad_val,\n                                             const uint16_t pad_wd,\n                                             const uint16_t pad_ht)\n{\n    /* memset with pad_val */\n    memset(dst, pad_val, ((input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht)) * channels);\n    dst += (pad_wd + input_wd + pad_wd) * pad_ht * channels;\n\n    for (int i = 0; i < input_ht; i++) {\n        dst += pad_wd * channels;\n        for (int j = 0; j < input_wd * channels; j++) {\n            *dst++ = *src++;\n        }\n        dst += pad_wd * channels;\n    }\n}\n\n/**\n * Copy `src` into `dst`, appending `pad_wd` columns of `pad_val` after every\n * row and `pad_ht` rows of `pad_val` after the last row.\n */\nstatic void esp_nn_aligned_s8_pad_end_with_value(const int8_t *src, int8_t *dst,\n                                                 const uint16_t input_wd,\n                                                 const uint16_t input_ht,\n                                                 const uint16_t channels,\n                                                 const int32_t pad_val,\n                                                 const uint16_t pad_wd,\n                                                 const uint16_t pad_ht)\n{\n    for (int i = 0; i < input_ht; i++) {\n        for (int j = 0; j < input_wd * channels; j++) {\n            *dst++ = *src++;\n        }\n        if (pad_wd) {\n            memset(dst, pad_val, pad_wd * channels);\n            dst += pad_wd * channels;\n        }\n    }\n    /* pad end `pad_ht` lines at end */\n    if (pad_ht) {\n        memset(dst, pad_val, (input_wd + pad_wd) * pad_ht * channels);\n    }\n}\n\n/**\n * @brief       convert 8 bit input data to 16 bit\n *\n * @param       src int8_t source data\n * @param       dst int16_t dst data\n * @param       size length of data\n * @param       offset  offset to be added to src data. Range: [-128, 127]\n */\n__NN_FORCE_INLINE__ void esp_nn_s8_to_s16_with_offset(const int8_t *src, int16_t *dst,\n                                                      const int size, const int32_t offset)\n{\n    int i = 0;\n    /* full pairs only, so an odd `size` never touches index `size` */\n    for (; i < size - 1; i += 2) {\n        dst[i + 0] = src[i + 0] + offset;\n        dst[i + 1] = src[i + 1] + offset;\n    }\n    /* lone trailing element when `size` is odd */\n    if(i < size) {\n        dst[i] = src[i] + offset;\n    }\n}\n\n/**\n * @brief       convert 8 bit input data to 16 bit\n *\n * @param       src int8_t source data\n * @param       dst int16_t dst data\n * @param       size length of data\n */\n__NN_FORCE_INLINE__ void esp_nn_s8_to_s16(const int8_t *src, int16_t *dst, const int size)\n{\n    int i = 0;\n    /* full pairs only, so an odd `size` never touches index `size` */\n    for (; i < size - 1; i += 2) {\n        dst[i + 0] = src[i + 0];\n        dst[i + 1] = src[i + 1];\n    }\n    /* lone trailing element when `size` is odd */\n    if(i < size) {\n        dst[i] = src[i];\n    }\n}\n\n#if CONFIG_IDF_TARGET_ESP32S3\n/**\n * @brief       s8 dot product — both pointers 16-byte aligned.\n *              Uses ACCX accumulator with fused MAC+load.\n *\n * @param       a       input data (16-byte aligned)\n * @param       b       filter data (16-byte aligned)\n * @param       len     number of elements (must be multiple of 16, >= 16)\n * @return      int32_t dot product result\n */\nextern int32_t esp_nn_dot_s8_aligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len);\n\n/**\n * @brief       s8 dot product — input aligned, filter may be unaligned.\n *              Uses USAR+QUP pattern for filter data.\n *\n * @param       a       input data (16-byte aligned)\n * @param       b       filter data (may be unaligned)\n * @param       len_div16  number of 16-element chunks (>= 1)\n * @return      int32_t dot product result\n */\nextern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a, const int8_t *b, int32_t len_div16);\n#endif\n"
  },
  {
    "path": "src/common/esp_nn_common_functions_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n\t.text\n\n\t# Program Unit: esp_nn_aligned_s8_to_s16_with_offset_esp32s3\n\t.type\tesp_nn_aligned_s8_to_s16_with_offset_esp32s3, @function\n\t.align\t 4\n\t.global esp_nn_aligned_s8_to_s16_with_offset_esp32s3\n\nesp_nn_aligned_s8_to_s16_with_offset_esp32s3:\t# 0x30d\n\n\tentry\ta1,48                   \t#\n\tmov.n\ta10,a2                  \t# // src\n\tmov.n\ta9,a3                   \t# // dst\n\tmov.n\ta8,a4                   \t# // size\n\ts32i.n\ta5,a1,12               \t# [3] // offset\n\taddi.n\ta2,a1,12               \t# [4]\n\n\tblti\ta4,32,.Lt_2_6402         \t# [5] if (size < 32) goto unopt\n\n\taddi.n\ta6,a8,-1               \t# [0]\n\tee.zero.q\tq5                  \t# [1]\n\tee.vldbc.16\tq4,a2             \t# [2]  id:136 offset\n\tmov.n\ta3,a10                  \t# [3]\n\tmov.n\ta2,a9                   \t# [4]\n\tee.vld.128.ip\tq0,a3,16        \t# [5]  id:137\n\tee.vld.128.ip\tq1,a3,16        \t# [6]  id:138\n\tee.vcmp.lt.s8\tq2,q0,q5        \t# [7]\n\tee.vzip.8\tq0,q2               \t# [8]\n\tee.vadds.s16\tq0,q0,q4         \t# [9]\n\tee.vadds.s16.st.incp\tq0,a2,q0,q2,q4 \t# [10]  id:139\n\tblti\ta4,64,.Lt_2_7170         \t# [11]\n\n\taddi\ta5,a4,-32                \t# [0]\n\tsrai\ta5,a5,5                  \t# [1]\n\tslli\ta4,a5,5                  \t# 
[2]\n\tloopgtz\ta5,.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 \t# [3]\n\n\tee.vst.128.ip\tq0,a2,16        \t# [0*II+0]  id:140\n\tee.vcmp.lt.s8\tq0,q1,q5        \t# [0*II+1]\n\tee.vzip.8\tq1,q0               \t# [0*II+2]\n\tee.vadds.s16.ld.incp\tq2,a3,q3,q1,q4 \t# [0*II+3]  id:141\n\tee.vadds.s16.st.incp\tq3,a2,q0,q0,q4 \t# [0*II+4]  id:142\n\tee.vcmp.lt.s8\tq3,q2,q5        \t# [0*II+5]\n\tee.vst.128.ip\tq0,a2,16        \t# [0*II+6]  id:143\n\tee.vzip.8\tq2,q3               \t# [0*II+7]\n\tee.vadds.s16.ld.incp\tq1,a3,q0,q2,q4 \t# [0*II+8]  id:144\n\tee.vadds.s16.st.incp\tq0,a2,q0,q3,q4 \t# [0*II+9]  id:145\n\n.LBB37_esp_nn_aligned_s8_to_s16_with_offset_esp32s3:\t# 0x36d\n\taddi\ta4,a4,32                 \t# [0]\n\n.Lt_2_3842:\t# 0x370\n\tee.vst.128.ip\tq0,a2,16        \t# [0]  id:146\n\tee.vcmp.lt.s8\tq2,q1,q5        \t# [1]\n\tee.vzip.8\tq1,q2               \t# [2]\n\tee.vadds.s16\tq2,q2,q4         \t# [3]\n\tee.vadds.s16\tq3,q1,q4         \t# [4]\n\tee.vst.128.ip\tq3,a2,16        \t# [5]  id:147\n\tee.vst.128.ip\tq2,a2,16        \t# [6]  id:148\n\tbge\ta4,a6,.Lt_2_4866          \t# [7]\n\n\tl32i.n\ta5,a1,12               \t# [0]  id:135 offset+0x0\n\n.Lt_2_5122:\t# 0x38a\n\tmov.n\ta11,a4                  \t# [0]\n\tadd.n\ta2,a4,a10               \t# [1]\n # 576          dst[i + 0] = src[i + 0] + offset;\n\tl8ui\ta7,a2,0                  \t# [2]  id:149\n\taddx2\ta6,a4,a9                \t# [3]\n\tsext\ta7,a7,7                  \t# [4]\n\tadd.n\ta7,a7,a5                \t# [5]\n\ts16i\ta7,a6,0                  \t# [6]  id:150\n # 577          dst[i + 1] = src[i + 1] + offset;\n\tl8ui\ta3,a2,1                  \t# [7]  id:151\n\tsub\ta7,a8,a4                  \t# [8]\n\taddi.n\ta2,a2,2                \t# [9]\n\tsrai\ta7,a7,1                  \t# [10]\n\tsext\ta3,a3,7                  \t# [11]\n\tadd.n\ta3,a3,a5                \t# [12]\n\ts16i\ta3,a6,2                  \t# [13]  id:152\n\taddi.n\ta3,a7,-1               \t# 
[14]\n\tloopgtz\ta3,.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3 \t# [15]\n\n\tl8ui\ta3,a2,0                  \t# [0*II+0]  id:149\n\taddi.n\ta6,a6,4                \t# [1*II+1]\n\tsext\ta3,a3,7                  \t# [0*II+2]\n\tadd.n\ta3,a3,a5                \t# [0*II+3]\n\ts16i\ta3,a6,0                  \t# [0*II+4]  id:150\n\tl8ui\ta3,a2,1                  \t# [0*II+5]  id:151\n\taddi.n\ta2,a2,2                \t# [0*II+6]\n\tsext\ta3,a3,7                  \t# [0*II+7]\n\tadd.n\ta3,a3,a5                \t# [0*II+8]\n\ts16i\ta3,a6,2                  \t# [0*II+9]  id:152\n\n.LBB52_esp_nn_aligned_s8_to_s16_with_offset_esp32s3:\t# 0x3ce\n\taddx2\ta4,a7,a11               \t# [0]\n\n.Lt_2_4866:\t# 0x3d1\n\tbge\ta4,a8,.Lt_2_7682          \t# [0]\n\n # 580          dst[i] = src[i] + offset;\n\taddx2\ta11,a4,a9               \t# [0]\n\tadd.n\ta8,a4,a10               \t# [1]\n\tl8ui\ta8,a8,0                  \t# [2]  id:153\n\tl32i.n\ta12,a1,12              \t# [3]  id:135 offset+0x0\n\tsext\ta8,a8,7                  \t# [4]\n\tadd.n\ta8,a8,a12               \t# [5]\n\ts16i\ta8,a11,0                 \t# [6]  id:154\n\tretw.n                        \t# [7]\n\n.Lt_2_6402:\t# 0x3e8\n\tblti\ta4,2,.Lt_2_6658          \t# [0]\n\n\tmovi.n\ta4,0                   \t# [0]\n\tj\t.Lt_2_5122                  \t# [1]\n\n.Lt_2_7682:\t# 0x3f0\n\tretw.n                        \t# [0]\n\n.Lt_2_6658:\t# 0x3f2\n\tblti\ta4,1,.Lt_2_7682          \t# [0]\n\n\tl8ui\ta11,a10,0                \t# [0]  id:153\n\tsext\ta11,a11,7                \t# [2]\n\tadd.n\ta11,a11,a5              \t# [3]\n\ts16i\ta11,a3,0                 \t# [4]  id:154\n\tretw.n                        \t# [5]\n\n.Lt_2_7170:\t# 0x402\n\tmovi.n\ta4,32                  \t# [0]\n\tj\t.Lt_2_3842                  \t# [1]\n\n\t.size\tesp_nn_aligned_s8_to_s16_with_offset_esp32s3, . 
- esp_nn_aligned_s8_to_s16_with_offset_esp32s3\n\n\n\t.literal_position\n\n\t# Program Unit: esp_nn_s8_to_s16_esp32s3\n\t.type\tesp_nn_s8_to_s16_esp32s3, @function\n\t.align\t 4\n\t.global esp_nn_s8_to_s16_esp32s3\n\nesp_nn_s8_to_s16_esp32s3:\t# 0x40b\n\tentry\ta1,32                   \t#\n\tmov.n\ta9,a2 // src\n\tmov.n\ta8,a3 // dst\n\tmov.n\ta7,a4 // size\n    blti\ta4,1,.Lt_3_4866  // size == 0\n\tblti\ta4,16,.Lt_3_4610 // if (size < 16) jump to unopt path\n\n // load align_len to sar_byte\n\textui\ta2,a2,0,4               \t# [0]\n\twur.sar_byte\ta2               \t# [1]\n\tmov.n\ta2,a9                   \t# [2]\n\n // preload\n\tee.vld.128.ip\tq0,a2,16\n\tee.vld.128.ip\tq1,a2,16\n    ee.zero.q\t    q4\n # 672\n # 673      for (i = 16; i < size - 15; i += 16) {\n\tblti\ta4,32,.Lt_3_5378         \t# [5]\n\taddi\ta6,a4,-16                \t# [1]\n\tsrai\ta6,a6,4                  \t# [2]\n\tslli\ta4,a6,4                  \t# [3]\n\tloopgtz\ta6,.LBB35_esp_nn_s8_to_s16_esp32s3 \t# [4]\n\n\tee.src.q.qup\tq2,q0,q1         \t# [0*II+0]\n\tee.vcmp.lt.s8\tq3,q2,q4        \t# [0*II+1] // sign\n\tee.vld.128.ip\tq1,a2,16        \t# [0*II+2] // for next iteration\n\tee.vzip.8\tq2,q3               \t# [0*II+3]\n\tee.vst.128.ip\tq2,a3,16        \t# [0*II+4]  id:93\n\tee.vst.128.ip\tq3,a3,16        \t# [0*II+5]  id:94\n\n.LBB35_esp_nn_s8_to_s16_esp32s3:\t# 0x449\n\taddi\ta4,a4,16                 \t# [0]\n\n.Lt_3_2050:\t# 0x44c\n\tee.src.q.qup\tq5,q0,q1         \t# [0]\n\tee.vcmp.lt.s8\tq3,q5,q4        \t# [1]\n\tee.vzip.8\tq5,q3               \t# [2]\n\tee.vst.128.ip\tq5,a3,16        \t# [3]  id:96\n\tee.vst.128.ip\tq3,a3,16        \t# [4]  id:97\n # 687\n # 688  skip_to_remains_s8_to_s16:\n # 689      for (; i < size; i += 2) {\n\tbge\ta4,a7,.Lt_3_4866          \t# [5]\n\n.Lt_3_3330:\t# 0x45e\n\tmov.n\ta11,a4                  \t# [0]\n\tadd.n\ta2,a4,a9                \t# [1]\n # 690          dst[i + 0] = src[i + 0];\n\tl8ui\ta10,a2,0                 \t# [2]  
id:98\n\taddx2\ta5,a4,a8                \t# [3]\n\tsext\ta10,a10,7                \t# [4]\n\ts16i\ta10,a5,0                 \t# [5]  id:99\n # 691          dst[i + 1] = src[i + 1];\n\tl8ui\ta3,a2,1                  \t# [6]  id:100\n\tsub\ta10,a7,a4                 \t# [7]\n\taddi.n\ta2,a2,2                \t# [8]\n\taddi.n\ta10,a10,1              \t# [9]\n\tsrai\ta10,a10,1                \t# [10]\n\tsext\ta3,a3,7                  \t# [11]\n\ts16i\ta3,a5,2                  \t# [12]  id:101\n\taddi.n\ta3,a10,-1              \t# [13]\n\tloopgtz\ta3,.LBB50_esp_nn_s8_to_s16_esp32s3 \t# [14]\n\n\tl8ui\ta3,a2,0                  \t# [0*II+0]  id:98\n\taddi.n\ta5,a5,4                \t# [1*II+1]\n\tsext\ta3,a3,7                  \t# [0*II+2]\n\ts16i\ta3,a5,0                  \t# [0*II+3]  id:99\n\tl8ui\ta3,a2,1                  \t# [0*II+4]  id:100\n\taddi.n\ta2,a2,2                \t# [0*II+5]\n\tsext\ta3,a3,7                  \t# [0*II+6]\n\ts16i\ta3,a5,2                  \t# [0*II+7]  id:101\n\n.LBB50_esp_nn_s8_to_s16_esp32s3:\t# 0x49c\n\taddx2\ta4,a10,a11              \t# [0]\n # 692      }\n # 693      if(i < size) {\n\tbge\ta4,a7,.Lt_3_4866          \t# [1]\n\n # 694          dst[i] = src[i];\n\tadd.n\ta11,a4,a9               \t# [0]\n\tl8ui\ta11,a11,0                \t# [1]  id:102\n\taddx2\ta12,a4,a8               \t# [2]\n\tsext\ta11,a11,7                \t# [3]\n\ts16i\ta11,a12,0                \t# [4]  id:103\n\tretw.n                        \t# [5]\n\n.Lt_3_4610:\t# 0x4b2\n\tmovi.n\ta4,0                   \t# [0]\n\tj\t.Lt_3_3330                  \t# [1]\n\n.Lt_3_4866:\t# 0x4ba\n\tretw.n                        \t# [0]\n\n.Lt_3_5378:\t# 0x4bc\n\tmovi.n\ta4,16                  \t# [1]\n\tj\t.Lt_3_2050                  \t# [2]\n\n\t.size\tesp_nn_s8_to_s16_esp32s3, . - esp_nn_s8_to_s16_esp32s3\n"
  },
  {
    "path": "src/common/esp_nn_dot_s8_esp32s3.S",
    "content": "//\n// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n//\n// Reusable s8 dot product kernels for ESP32-S3.\n// Used by conv im2col, FC, and any kernel that reduces to a dot product.\n//\n// esp_nn_dot_s8_aligned_esp32s3:\n//   Both input and filter 16-byte aligned. Uses ee.vld.128.ip + fused MAC.\n//\n// esp_nn_dot_s8_unaligned_esp32s3:\n//   Input aligned, filter may be unaligned. Uses USAR+QUP for filter.\n//\n\n    .text\n    .align  4\n\n// ============================================================\n// esp_nn_dot_s8_aligned_esp32s3\n// Both pointers must be 16-byte aligned.\n// a2: input_data (aligned)\n// a3: filter_data (aligned)\n// a4: len (must be multiple of 16, >= 16)\n// Returns: int32_t dot product in a2\n// ============================================================\n    .type   esp_nn_dot_s8_aligned_esp32s3, @function\n    .align  4\n    .global esp_nn_dot_s8_aligned_esp32s3\n\nesp_nn_dot_s8_aligned_esp32s3:\n    entry   a1, 32\n\n    ee.zero.accx\n    beqz    a4, .Lalign_done\n\n    // Compute loop count and remainder\n    srli    a5, a4, 4               // a5 = len / 16\n    beqz    a5, .Lalign_done\n\n    // Prime: load first pair\n    ee.vld.128.ip   q0, a2, 16\n    ee.vld.128.ip   q1, a3, 16\n    addi            a5, a5, -1\n    beqz            a5, .Lalign_last\n\n    // Main loop: fused MAC + load\n    loopgtz a5, .Lalign_loop_end\n    ee.vmulas.s8.accx.ld.ip  q0, a2, 16, q0, q1\n    ee.vld.128.ip   q1, a3, 16\n.Lalign_loop_end:\n\n.Lalign_last:\n    // Final MAC\n    ee.vmulas.s8.accx  q0, q1\n\n.Lalign_done:\n    // Read lower 32 bits of ACCX (sufficient for int8 dot products)\n    nop\n    nop\n    rur.accx_0 a2\n\n    retw.n\n\n    .size   esp_nn_dot_s8_aligned_esp32s3, . - esp_nn_dot_s8_aligned_esp32s3\n\n\n// ============================================================\n// esp_nn_dot_s8_unaligned_esp32s3\n// Input must be 16-byte aligned. 
Filter can be unaligned.\n// Uses USAR+QUP pattern for filter loads.\n// a2: input_data (aligned)\n// a3: filter_data (may be unaligned)\n// a4: len_div16 (>= 1)\n// Returns: int32_t dot product in a2\n// ============================================================\n    .type   esp_nn_dot_s8_unaligned_esp32s3, @function\n    .align  4\n    .global esp_nn_dot_s8_unaligned_esp32s3\n\nesp_nn_dot_s8_unaligned_esp32s3:\n    entry   a1, 32\n\n    ee.zero.accx\n    beqz    a4, .Lunalign_done\n\n    // Prime: first unaligned filter load (sets SAR_BYTE)\n    ee.ld.128.usar.ip   q0, a3, 16\n\n    // Check if we can do 2x unrolled (need >= 2 iterations)\n    srai    a5, a4, 1               // a5 = len_div16 / 2\n    beqz    a5, .Lunalign_single\n\n    // Load first input + filter pair for unrolled loop\n    ee.vld.128.ip       q1, a2, 16\n    ee.ld.128.usar.ip   q2, a3, 16\n\n    // 2x unrolled main loop\n    loopgtz a5, .Lunalign_loop2_end\n\n    ee.src.q.qup        q4, q0, q2         // align filter[i]\n    ee.vld.128.ip       q3, a2, 16         // input[i+1]\n    ee.vmulas.s8.accx   q4, q1             // MAC filter[i] * input[i]\n    ee.ld.128.usar.ip   q0, a3, 16         // filter chunk[i+2]\n    ee.src.q.qup        q5, q2, q0         // align filter[i+1]\n    ee.vld.128.ip       q1, a2, 16         // input[i+2] (primed for next)\n    ee.vmulas.s8.accx   q5, q3             // MAC filter[i+1] * input[i+1]\n    ee.ld.128.usar.ip   q2, a3, 16         // filter chunk[i+3]\n\n.Lunalign_loop2_end:\n\n    // Check if there's a remaining single iteration (odd len_div16)\n    bbci    a4, 0, .Lunalign_done_mac\n\n    // Odd remainder: the 2x loop already loaded q0/q2 for the next chunk.\n    // Just qup the filter and MAC with the primed input (q1).\n    // But q1 was loaded as input[i+2] in the last loop iteration — we need\n    // to re-read the correct input. 
Actually, q1 is already the right input.\n    // q0 and q2 are the filter chunks ready for qup.\n    ee.src.q.qup        q4, q0, q2\n    ee.vmulas.s8.accx   q4, q1\n    j                   .Lunalign_done_mac\n\n.Lunalign_single:\n    // Called when len_div16 < 2 (single chunk only)\n    ee.vld.128.ip       q1, a2, 16\n    ee.ld.128.usar.ip   q2, a3, 16\n    ee.src.q.qup        q4, q0, q2\n    ee.vmulas.s8.accx   q4, q1\n\n.Lunalign_done_mac:\n.Lunalign_done:\n    // 2-cycle gap before ACCX read\n    movi.n  a3, 0\n    nop\n    ee.srs.accx a2, a3, 0\n\n    retw.n\n\n    .size   esp_nn_dot_s8_unaligned_esp32s3, . - esp_nn_dot_s8_unaligned_esp32s3\n"
  },
  {
    "path": "src/common/esp_nn_mean_ansi.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * Quantized mean reduction over spatial dimensions (axes 1,2).\n * Specialized for 4D tensors [N, H, W, C] → [N, 1, 1, C].\n * This is the common case in Squeeze-and-Excite blocks.\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\nvoid esp_nn_mean_nhwc_s8_ansi(const int8_t *input,\n                               int8_t *output,\n                               const int32_t height,\n                               const int32_t width,\n                               const int32_t channels,\n                               const int32_t input_zero_point,\n                               const int32_t output_zero_point,\n                               const int32_t multiplier,\n                               const int32_t shift)\n{\n    const int32_t num_elements = height * width;\n\n    for (int c = 0; c < channels; c++) {\n        /* Sum over spatial dimensions */\n        int32_t sum = 0;\n        for (int h = 0; h < height; h++) {\n            for (int w = 0; w < width; w++) {\n                sum += input[(h * width + w) * channels + c];\n            }\n        }\n\n        /* Apply zero point correction */\n        sum -= num_elements * input_zero_point;\n\n        /* Requantize: multiply_by_quantized_mult(sum, multiplier, shift) */\n        int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);\n        result += output_zero_point;\n        result = max(result, -128);\n        result = min(result, 127);\n        output[c] = (int8_t)result;\n    }\n}\n"
  },
  {
    "path": "src/common/esp_nn_mean_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * ESP32-P4 optimized spatial mean reduction using QACC per-lane accumulation.\n * Processes 16 channels in parallel via esp.vmulas.s8.qacc (same pattern as avg_pool).\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\n/**\n * @brief       mean over height and width of an NHWC int8 tensor\n *\n * Per channel: sum the `height * width` inputs, subtract\n * `height * width * input_zero_point`, requantize with `multiplier`/`shift`,\n * add `output_zero_point` and clamp to [-128, 127].\n *\n * @param       input               input data, read as input[hw * channels + c]\n * @param       output              one int8 result per channel\n * @param       height              input height\n * @param       width               input width\n * @param       channels            number of channels\n * @param       input_zero_point    zero point removed from every input element\n * @param       output_zero_point   zero point added to the requantized result\n * @param       multiplier          requantization multiplier\n * @param       shift               requantization shift\n */\nvoid esp_nn_mean_nhwc_s8_esp32p4(const int8_t *input,\n                                  int8_t *output,\n                                  const int32_t height,\n                                  const int32_t width,\n                                  const int32_t channels,\n                                  const int32_t input_zero_point,\n                                  const int32_t output_zero_point,\n                                  const int32_t multiplier,\n                                  const int32_t shift)\n{\n    const int32_t num_elements = height * width;\n    const int8_t one_val = 1;\n\n    /*\n     * The vector loop below decrements its counter before testing it, so a\n     * count <= 0 would wrap around, and it loads 16 channels at\n     * `input + hw * channels + ch` with esp.vld.128.ip. Use it only when there\n     * is at least one spatial position and every such address is 16-byte\n     * aligned; everything else goes through the scalar loop, which produces\n     * the same sums.\n     * NOTE(review): assumes esp.vld.128.ip needs 16-byte aligned addresses as\n     * on ESP32-S3 - confirm against the ESP32-P4 TRM.\n     */\n    const int32_t use_pie = (num_elements > 0) && ((channels & 15) == 0) &&\n                            (((uintptr_t) input & 15) == 0);\n    const int32_t ch_16 = use_pie ? (channels >> 4) : 0;\n\n    if (ch_16 > 0) {\n        /* q7 is loaded inside the per-block asm, so only PIE needs enabling here */\n        ESP_NN_PIE_ENABLE();\n    }\n\n    /* Process all channels - QACC for 16-channel blocks, scalar for remainder */\n    int ch = 0;\n    for (int ch_blk = 0; ch_blk < ch_16; ch_blk++, ch += 16) {\n        /* Single asm block: broadcast ones, zero QACC, accumulate all spatial\n         * positions. Keeping in one block prevents compiler from clobbering\n         * q7 between the broadcast and the MAC loop.\n         * The memory clobber is required because the asm reads `one_val` and\n         * the input buffer through plain register operands. */\n        const int8_t *base_ptr = input + ch;\n        asm volatile (\n            /* Broadcast 1 into q7 */\n            \"mv     x30, %[one]             \\n\\t\"\n            \"esp.vldbc.8.ip q7, x30, 0      \\n\\t\"\n            /* Zero QACC */\n            \"esp.zero.qacc                   \\n\\t\"\n            /* Accumulate loop: stride = channels between spatial positions */\n            \"mv     x30, %[base]            \\n\\t\"\n            \"mv     s7,  %[cnt]             \\n\\t\"\n            \"1:                             \\n\\t\"\n            \"esp.vld.128.ip  q0, x30, 0     \\n\\t\"\n            \"esp.vmulas.s8.qacc q0, q7      \\n\\t\"\n            \"add    x30, x30, %[stride]     \\n\\t\"\n            \"addi   s7, s7, -1              \\n\\t\"\n            \"bnez   s7, 1b                  \\n\\t\"\n            :\n            : [one] \"r\"(&one_val), [base] \"r\"(base_ptr),\n              [cnt] \"r\"(num_elements), [stride] \"r\"((int32_t)channels)\n            : \"x30\", \"s7\", \"memory\"\n        );\n\n        int32_t sums[16] __attribute__((aligned(16)));\n        ESP_NN_QACC_EXTRACT_S32(sums);\n\n        int32_t zp_correction = num_elements * input_zero_point;\n        for (int k = 0; k < 16; k++) {\n            int32_t result = sums[k] - zp_correction;\n            result = esp_nn_multiply_by_quantized_mult(result, multiplier, shift);\n            result += output_zero_point;\n            result = max(result, -128);\n            result = min(result, 127);\n            output[ch + k] = (int8_t)result;\n        }\n    }\n\n    /* Scalar path: every channel the vector loop above did not handle */\n    for (; ch < channels; ch++) {\n        int32_t sum = 0;\n        for (int hw = 0; hw < num_elements; hw++) {\n            sum += input[hw * channels + ch];\n        }\n        sum -= num_elements * input_zero_point;\n        int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);\n        result += output_zero_point;\n        result = max(result, -128);\n        result = min(result, 127);\n        output[ch] = (int8_t)result;\n    }\n}\n"
  },
  {
    "path": "src/common/esp_nn_mean_s8_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * ESP32-S3 optimized mean reduction for NHWC int8 tensors.\n * Uses int16 accumulation for small spatial sizes (H*W <= 256),\n * int32 for larger. Accumulates all channels at once per spatial position.\n */\n\n#include <stdint.h>\n#include <string.h>\n#include <common_functions.h>\n\nvoid esp_nn_mean_nhwc_s8_esp32s3(const int8_t *input,\n                                  int8_t *output,\n                                  const int32_t height,\n                                  const int32_t width,\n                                  const int32_t channels,\n                                  const int32_t input_zero_point,\n                                  const int32_t output_zero_point,\n                                  const int32_t multiplier,\n                                  const int32_t shift)\n{\n    const int32_t num_elements = height * width;\n    const int32_t zp_correction = num_elements * input_zero_point;\n\n    if (num_elements <= 256 && channels <= 512) {\n        /* int16 accumulation (safe: 256 * 127 = 32,512 < 32,767) */\n        /* Process 8 channels at a time using int16 accumulators */\n        int16_t acc16[channels];\n        memset(acc16, 0, channels * sizeof(int16_t));\n\n        const int8_t *ptr = input;\n        for (int i = 0; i < num_elements; i++) {\n            /* Inner loop — compiler should auto-vectorize with -O2 */\n            for (int c = 0; c < channels; c++) {\n                acc16[c] += (int16_t)ptr[c];\n            }\n            ptr += channels;\n        }\n\n        /* Requantize per channel */\n        for (int c = 0; c < channels; c++) {\n            int32_t sum = (int32_t)acc16[c] - zp_correction;\n            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);\n            result += output_zero_point;\n            result = max(result, -128);\n      
      result = min(result, 127);\n            output[c] = (int8_t)result;\n        }\n    } else if (channels <= 512) {\n        /* int32 accumulation for larger spatial sizes */\n        int32_t acc[channels];\n        memset(acc, 0, channels * sizeof(int32_t));\n\n        const int8_t *ptr = input;\n        for (int i = 0; i < num_elements; i++) {\n            for (int c = 0; c < channels; c++) {\n                acc[c] += ptr[c];\n            }\n            ptr += channels;\n        }\n\n        for (int c = 0; c < channels; c++) {\n            int32_t sum = acc[c] - zp_correction;\n            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);\n            result += output_zero_point;\n            result = max(result, -128);\n            result = min(result, 127);\n            output[c] = (int8_t)result;\n        }\n    } else {\n        /* Per-channel fallback for huge channel counts */\n        for (int c = 0; c < channels; c++) {\n            int32_t sum = 0;\n            for (int i = 0; i < num_elements; i++) {\n                sum += input[i * channels + c];\n            }\n            sum -= zp_correction;\n            int32_t result = esp_nn_multiply_by_quantized_mult(sum, multiplier, shift);\n            result += output_zero_point;\n            result = max(result, -128);\n            result = min(result, 127);\n            output[c] = (int8_t)result;\n        }\n    }\n}\n"
  },
  {
    "path": "src/common/esp_nn_multiply_by_quantized_mult_esp32p4.S",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * Fast 2-wide requantization for ESP32-P4 (RISC-V).\n * Interleaves mul/mulh across 2 elements for better pipeline utilization.\n * Uses a0-a7 and t0-t6 only (no callee-saved registers needed).\n *\n * void esp_nn_requant_2x_esp32p4(\n *     int32_t x0,       // a0\n *     int32_t x1,       // a1\n *     int32_t mult0,    // a2\n *     int32_t mult1,    // a3\n *     int32_t shift0,   // a4\n *     int32_t shift1,   // a5\n *     int32_t *out      // a6: pointer to store 2 results\n * );\n */\n\n    .text\n    .align  4\n    .global esp_nn_requant_2x_esp32p4\n    .type   esp_nn_requant_2x_esp32p4, @function\n\nesp_nn_requant_2x_esp32p4:\n    /* Compute left_shift and apply */\n    mv      t0, a0              /* x0 */\n    mv      t1, a1              /* x1 */\n    bgez    a4, .Lls0_pos\n    mv      t6, zero            /* ls0 = 0 */\n    j       .Lls0_done\n.Lls0_pos:\n    sll     t0, t0, a4          /* x0 <<= shift0 (positive = left shift) */\n    mv      t6, a4              /* ls0 = shift0 */\n.Lls0_done:\n    sub     a4, t6, a4          /* rs0 = ls0 - shift0 */\n\n    bgez    a5, .Lls1_pos\n    mv      t6, zero\n    j       .Lls1_done\n.Lls1_pos:\n    sll     t1, t1, a5\n    mv      t6, a5\n.Lls1_done:\n    sub     a5, t6, a5          /* rs1 = ls1 - shift1 */\n\n    /* ---- Interleaved 64-bit multiply ---- */\n    /* mulh first (both elements), then mul (both elements) */\n    mulh    t2, t0, a2          /* hi0 */\n    mulh    t3, t1, a3          /* hi1 */\n    mul     t0, t0, a2          /* lo0 */\n    mul     t1, t1, a3          /* lo1 */\n\n    /* Add nudge and combine: result = ((hi:lo) + (1<<30)) >> 31 */\n    li      t4, 0x40000000      /* nudge = 1 << 30 */\n\n    add     t5, t0, t4          /* lo0 + nudge */\n    sltu    t6, t5, t0          /* carry0 */\n    add     t2, t2, t6          /* hi0 += carry0 */\n    
srli    t5, t5, 31          /* (lo0+nudge) >> 31 */\n    slli    t0, t2, 1           /* hi0 << 1 */\n    or      t0, t0, t5          /* result0 */\n\n    add     t5, t1, t4          /* lo1 + nudge */\n    sltu    t6, t5, t1          /* carry1 */\n    add     t3, t3, t6          /* hi1 += carry1 */\n    srli    t5, t5, 31\n    slli    t1, t3, 1\n    or      t1, t1, t5          /* result1 */\n\n    /* ---- Right shift with rounding ---- */\n    li      t4, 1\n\n    beqz    a4, .Lskip_rs0\n    addi    t5, a4, -1\n    sll     t5, t4, t5          /* round0 = 1 << (rs0-1) */\n    srai    t6, t0, 31          /* -1 if negative, 0 otherwise */\n    add     t5, t5, t6          /* round0 += sign */\n    add     t0, t0, t5\n    sra     t0, t0, a4\n.Lskip_rs0:\n\n    beqz    a5, .Lskip_rs1\n    addi    t5, a5, -1\n    sll     t5, t4, t5\n    srai    t6, t1, 31\n    add     t5, t5, t6\n    add     t1, t1, t5\n    sra     t1, t1, a5\n.Lskip_rs1:\n\n    /* Store results */\n    sw      t0, 0(a6)\n    sw      t1, 4(a6)\n    ret\n\n    .size   esp_nn_requant_2x_esp32p4, . - esp_nn_requant_2x_esp32p4\n"
  },
  {
    "path": "src/common/esp_nn_multiply_by_quantized_mult_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// the macro `use_nudge` enables adding rounding factor similar to tflite implementation\n// this barely changes any accuracy\n// keep this disabled for better performance\n\n#ifndef SKIP_NUDGE\n    # set SKIP_NUDGE flag for ~20% faster (but not bit-exact) quantisation\n    .set use_nudge, 1\n#endif\n\n    .text\n    .literal_position\n    .literal    .nudge_val, 1073741824          # 1 << 30\n\n    .type   esp_nn_multiply_by_quantized_mult_asm_esp32s3, @function\n    .align   4\n    .global esp_nn_multiply_by_quantized_mult_asm_esp32s3\n\nesp_nn_multiply_by_quantized_mult_asm_esp32s3:  # 0x4\n    # to_add = 4\n\n    entry       a1,32\n    wsr.sar     a3\n    ee.zero.q   q2\n\n    bltz        a3,     .skip_left_shift\n    ee.vsl.32   q0,q0                   # [13]\n.skip_left_shift:\n\n    ssai    31                      # [15]\n\n# move data to general purpose registers\n    ee.movi.32.a    q0,a12,0            # [17]\n    ee.movi.32.a    q0,a13,1            # [16]\n    ee.movi.32.a    q0,a14,2            # [18]\n    ee.movi.32.a    q0,a15,3            # [19]\n\n.ifdef use_nudge\n    l32r            a6,.nudge_val\n.endif\n\n# perform 64 bit mult\n    mulsh   a4,a2,a12                   # [22]\n    mulsh   a11,a2,a13                  # [23]\n    mulsh   a10,a2,a14                  # [21]\n    mulsh   a8,a2,a15                 
  # [20]\n    mull    a12,a2,a12                  # [24]\n    mull    a13,a2,a13                  # [25]\n    mull    a14,a2,a14                  # [26]\n    mull    a15,a2,a15                  # [27]\n\n# add nudge_val and discard low31\n\n.ifdef use_nudge\n    add.n           a14,a6,a14                  # [41]\n    saltu           a2,a14,a6                   # [44]\n    add.n           a10,a10,a2                  # [45]\n\n    add.n           a13,a6,a13                  # [47]\n    saltu           a9,a13,a6                   # [50]\n    add.n           a11,a11,a9                  # [51]\n.endif\n\n    src             a10,a10,a14                     # [88]\n    src             a11,a11,a13                 # [78]\n    ee.movi.32.q    q0,a10,2\n    ee.movi.32.q    q0,a11,1\n\n.ifdef use_nudge\n    add.n           a15,a6,a15                  # [36]\n    saltu           a2,a15,a6                   # [39]\n    add.n           a8,a8,a2                    # [40]\n\n    add.n           a12,a6,a12                  # [54]\n    saltu           a10,a12,a6                  # [57]\n    add.n           a4,a4,a10                   # [58]\n.endif\n\n    src             a8,a8,a15                  # [95]\n    src             a4,a4,a12                  # [69] # discard lower 31 bits\n    ee.movi.32.q    q0,a8,3\n    ee.movi.32.q    q0,a4,0\n\n    bgez    a3, .skip_div_by_power_of_2\n\n    neg     a5,a3                       # [0]  right_shift/exponent = -shift\n    ee.vcmp.lt.s32  q2,q0,q2        # [97]\n    addi.n          a7,a5,-1                # [0]  exponent - 1\n    ssl             a7                      # [1]\n    movi.n          a6,1                    # [92]\n    sll             a6,a6                   # [2]\n    s32i.n          a6,a1,4                 # [3]  to_add\n    addi.n          a4,a1,4                 # [94]  to_add_addr\n    ee.vldbc.32     q1,a4           # [4]  id:148 to_add\n    wsr.sar         a5\n    ee.vadds.s32    q1,q1,q2\n    ee.vadds.s32    q0,q0,q1\n    
ee.vsr.32       q0,q0\n\n.skip_div_by_power_of_2:\n    retw.n                          # [9]\n\n    .size   esp_nn_multiply_by_quantized_mult_asm_esp32s3, . - esp_nn_multiply_by_quantized_mult_asm_esp32s3\n"
  },
  {
    "path": "src/common/esp_nn_multiply_by_quantized_mult_ver1_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n// quantisation version where we deal with different shifts and mults.\n\n    .set use_nudge, 1\n\n    .text\n    .literal_position\n    .literal    .LC3_19_48, 1073741824\n\n    # Program Unit: esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n    .type   esp_nn_multiply_by_quantized_mult_ver1_esp32s3, @function\n    .align   4\n    .global esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\nesp_nn_multiply_by_quantized_mult_ver1_esp32s3:    # 0x1ee\n    entry       a1,32                       #\n    ee.zero.q   q3                      # [0]\n    l32i.n      a8,a3,0                 # [5]  id:200 // shift0\n    l32i.n      a7,a3,4                 # [2]  id:201 // shift1\n    l32i.n      a12,a2,0                # [3]  id:204 // mult0\n    l32i.n      a15,a2,4                # [1]  id:205 // mult1\n    movi.n      a10,0                   # [7]\n\n    max             a6,a10,a8                   # [1] // left_shift0\n    max             a5,a10,a7                   # [7] // left_shift1\n    sub             a8,a6,a8                    # [2] // right_shift0\n    sub             a7,a5,a7                    # [8] // right_shift1\n\n    ee.movi.32.a    q0,a9,0             # [4]\n    ee.movi.32.a    q0,a11,1            # [11]\n    ssl             a6                          # [3]\n    sll             a9,a9                       # [4]\n    
mulsh           a4,a12,a9                   # [6]\n    mull            a12,a12,a9                  # [9]\n    ssl             a5                          # [10]\n    sll             a11,a11                         # [12]\n    mulsh           a14,a15,a11                 # [14]\n    mull            a15,a15,a11                 # [16]\n    l32r            a13,.LC3_19_48              # [23]\n\n    ee.movi.32.q    q0,a9,0             # [5]\n    ee.movi.32.q    q0,a11,1            # [15]\n\n\n    l32i.n          a6,a3,8                 # [6]  id:202 // shift2\n    l32i.n          a9,a2,8                 # [19]  id:206 // mult2\n    max             a5,a10,a6                   # [0] // left_shift2\n    sub             a6,a5,a6                    # [24] // right_shift2\n\n\n    ee.movi.32.a    q0,a11,2            # [17]\n    ssl             a5                          # [13]\n    sll             a11,a11                     # [18]\n    ee.movi.32.q    q0,a11,2            # [20]\n    mulsh           a5,a9,a11                  # [21]\n    mull            a9,a9,a11                   # [22]\n    mov             a11, a5\n\n// add nudge to result0 & result1\n    add.n           a12,a13,a12                 # [25]\n    saltu           a5,a12,a13                  # [26]\n    add.n           a15,a13,a15                 # [27]\n    add.n           a5,a5,a4                    # [28]\n    saltu           a4,a15,a13                  # [29]\n    add.n           a4,a4,a14                   # [30]\n\n\n    l32i.n          a14,a3,12               # [31]  id:203 // shift3\n    add.n           a9,a13,a9                   # [32] // add nudge low2\n    max             a10,a10,a14                 # [33]  // left_shift3\n    sub             a14,a10,a14                 # [34]  // right_shift3\n    ssl             a10                         # [35]\n    ee.movi.32.a    q0,a10,3            # [36]\n    sll             a10,a10                     # [37]\n\n// select high32 from result0 and result1\n    
ssai            31                          # [39]\n    src             a5,a5,a12                   # [40]\n    src             a4,a4,a15                   # [41]\n    movi.n          a12,1                   # [42]\n    ee.movi.32.q    q0,a5,0             # [43]\n    saltu           a15,a9,a13                  # [44]\n    add.n           a15,a15,a11                 # [45]\n    ee.movi.32.q    q0,a4,1             # [46]\n    l32i.n          a11,a2,12               # [47]  id:207 // mult3\n    src             a15,a15,a9                  # [48]\n    ee.movi.32.q    q0,a15,2            # [49]\n    mull            a9,a11,a10                  # [50]\n    mulsh           a11,a11,a10                 # [51]\n    add.n           a9,a13,a9                   # [52]\n    saltu           a13,a9,a13                  # [53]\n    add.n           a13,a13,a11                 # [54]\n    src             a13,a13,a9                  # [55]\n    ee.movi.32.q    q0,a13,3            # [57]\n\n// divide_by_power_of2_step\n    ssl             a8                          # [56]\n    sll             a9,a12                      # [58]\n    ssl             a7                          # [59]\n    addi.n          a9,a9,-1                # [60]\n    ee.movi.32.q    q2,a9,0             # [61]\n    sll             a11,a12                     # [62]\n    addi.n          a11,a11,-1              # [63]\n    ssl             a6                          # [64]\n    sll             a10,a12                     # [65]\n    ee.movi.32.q    q2,a11,1            # [66]\n    ssl             a14                         # [67]\n    addi.n          a10,a10,-1              # [68]\n    ee.movi.32.q    q2,a10,2            # [69]\n    sll             a9,a12                      # [70]\n    addi.n          a9,a9,-1                # [71]\n    ee.movi.32.q    q2,a9,3             # [74]\n    ee.andq         q1,q0,q2                # [75]\n\n    ssr             a8                          # [72]\n    sra             a5,a5     
                  # [73]\n    ssr             a7                          # [76]\n    sra             a4,a4                       # [78]\n    ssr             a6                          # [79]\n    sra             a15,a15                     # [81]\n    ssr             a14                         # [82]\n    sra             a13,a13                     # [84]\n    wsr.sar         a12                     # [85]\n\n    ee.movi.32.q    q7,a5,0             # [77]\n    ee.movi.32.q    q7,a4,1             # [80]\n    ee.movi.32.q    q7,a15,2            # [83]\n    ee.movi.32.q    q7,a13,3            # [86]\n\n    ee.vcmp.lt.s32  q3,q7,q3        # [87]\n    ee.vsr.32       q2,q2                   # [88]\n    ee.vsubs.s32    q2,q2,q3            # [89]\n    ee.vcmp.gt.s32  q1,q1,q2        # [90]\n    ee.vsubs.s32    q0,q7,q1            # [91]\n\n// return\n    retw.n                          # [92]\n\n    .size   esp_nn_multiply_by_quantized_mult_ver1_esp32s3, . - esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <esp_nn_defs.h>\n\n#include <common_functions.h>\n\nint esp_nn_get_conv_scratch_size_ansi(const data_dims_t *input_dims,\n                                      const data_dims_t *filter_dims,\n                                      const data_dims_t *output_dims,\n                                      const conv_params_t *conv_params)\n{\n    return 0;\n}\n\nvoid esp_nn_set_conv_scratch_buf_ansi(const void *buf)\n{\n\n}\n\n/**\n * Assumption 1: i/p channels == o/p channels\n * Assumption 2: Pointers are valid\n * Assumption 3: dilation width = 1\n */\nvoid esp_nn_conv_u8_ansi(const uint8_t *input_data,\n                         const uint16_t input_wd,\n                         const uint16_t input_ht,\n                         const uint16_t in_channels,\n                         const int32_t input_offset,\n                         const uint16_t pad_wd,\n                         const uint16_t pad_ht,\n                         const uint16_t stride_wd,\n                         const uint16_t stride_ht,\n                         const uint8_t *filter_data,\n                         const uint16_t filter_wd,\n                         const uint16_t filter_ht,\n                         const int32_t filter_offset,\n                         const int32_t *bias,\n                         uint8_t *out_data,\n          
               const uint16_t out_wd,\n                         const uint16_t out_ht,\n                         const uint16_t out_channels,\n                         const int32_t out_offset,\n                         const int32_t out_shift,\n                         const int32_t out_mult,\n                         const int32_t activation_min,\n                         const int32_t activation_max)\n{\n    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n            for (int out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {//channel_loop\n                int32_t result = 0;\n\n                /* Select filter so as the point doesn't lie outside block */\n                int filter_y_start = max(0, -base_y);\n                int filter_x_start = max(0, -base_x);\n                int filter_y_end = min(filter_ht, input_ht - base_y);\n                int filter_x_end = min(filter_wd, input_wd - base_x);\n\n                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                    const int32_t idx_y = base_y + filter_y_idx;\n                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                        const int32_t idx_x = base_x + filter_x_idx;\n                        for (int in_ch_idx = 0; in_ch_idx < in_channels; in_ch_idx++) {\n                            int32_t input_index = (idx_y * input_wd + idx_x) * in_channels + in_ch_idx;\n                            int32_t filter_index = ((out_ch_idx * filter_ht + filter_y_idx)\n                                                    * filter_wd + filter_x_idx) * in_channels\n                                                   + in_ch_idx;\n                            int32_t input_val = 
input_data[input_index] + input_offset;\n                            int32_t filter_val = filter_data[filter_index] + filter_offset;\n                            result += input_val * filter_val;\n                        }\n                    }\n                }\n                if (bias) {\n                    result += bias[out_ch_idx];\n                }\n                result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift);\n                result += out_offset;\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n\n                int out_index = (out_y * out_wd + out_x) * out_channels + out_ch_idx;\n                out_data[out_index] = (uint8_t) result;\n            }\n        }\n    }\n}\n\n/**\n * Assumption 1: i/p channels == o/p channels\n * Assumption 2: Pointers are valid\n * Assumption 3: dilation width = 1\n */\nvoid esp_nn_conv_s8_ansi(const data_dims_t *input_dims,\n                         const int8_t *input_data,\n                         const data_dims_t *filter_dims,\n                         const int8_t *filter_data,\n                         const int32_t *bias,\n                         const data_dims_t *output_dims,\n                         int8_t *out_data,\n                         const conv_params_t *conv_params,\n                         const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = 
filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_channels = output_dims->channels;\n    const int32_t *out_shift = quant_data->shift;\n    const int32_t *out_mult = quant_data->mult;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    /* Fall back to in_channels when filter_dims->channels is unset (legacy callers). */\n    const uint16_t filter_ch = filter_dims->channels ? filter_dims->channels : in_channels;\n    const int32_t groups = in_channels / filter_ch;\n    const int32_t filters_per_group = out_channels / groups;\n\n    int32_t out_ch_idx, out_y, out_x, in_ch_idx, filter_y_idx, filter_x_idx;\n\n    for (out_y = 0; out_y < out_ht; out_y++) {\n        for (out_x = 0; out_x < out_wd; out_x++) {\n            for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n                int32_t conv_out = 0;\n                const int32_t group = out_ch_idx / filters_per_group;\n                const int32_t in_ch_start = group * filter_ch;\n\n                const int32_t base_y = stride_ht * out_y - pad_ht;\n                const int32_t base_x = stride_wd * out_x - pad_wd;\n\n                const int32_t filter_y_start = max(0, -base_y);\n                const int32_t filter_x_start = max(0, -base_x);\n\n                const int32_t filter_y_end = min(filter_ht, input_ht - base_y);\n                const int32_t filter_x_end = min(filter_wd, input_wd - base_x);\n\n                for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                    for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                        const int32_t in_row = base_y + filter_y_idx;\n                        const int32_t in_col = base_x + filter_x_idx;\n                        int32_t input_base_offset = (in_row * input_wd + 
in_col) * in_channels + in_ch_start;\n                        int32_t filter_base_offset = out_ch_idx * filter_ch * filter_ht * filter_wd +\n                                                       (filter_y_idx * filter_wd + filter_x_idx) * filter_ch;\n                        for (in_ch_idx = 0; in_ch_idx < filter_ch; in_ch_idx++) {\n                            conv_out +=\n                                (input_data[input_base_offset + in_ch_idx] + input_offset) *\n                                filter_data[filter_base_offset + in_ch_idx];\n                        }\n                    }\n                }\n                if (bias) {\n                    conv_out += bias[out_ch_idx];\n                }\n                conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_data++ = (int8_t) conv_out;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2024-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/**\n * Optimization strategies used:\n * The optimizations below handle any size of input/filter:\n *\n * 1. For filter wdxht = 1x1 (Refer esp_nn_conv_s8_mult8_1x1_esp32p4 function)\n *      - For this specific version, the strategy we employ:\n *          > This particular filter has only the channel\n *              dimension and we have `out_ch` number of such filters.\n *          > We take 8 input lines at a time and transpose those.\n *          > Keep loading and multiplying filter values one by one,\n *              to produce 8 outputs in parallel\n *\n * 2. General version: (Refer esp_nn_conv_s8_filter_aligned_input_padded_esp32p4)\n *      - For all other cases:\n *          > Consider `filter_wd * in_ch` as a single row. That many values can\n *              be continuously loaded from inputs as well.\n *          > multiply accumulate into a single filter output.\n *          > To speed things up further, we pre-calculate\n *              (filter * in_offset + bias term) earlier and add it at the end of filter\n *\n *      About ((filter * in_offset + bias term)) accumulate term:\n *          > The conv operation before requantization is as follows:\n *              for i in filter_size:\n *                  conv_out += (input + input_offset) * filter;\n *               conv_out += bias\n *\n *          > where input_offset is a constant term; hence, we can see that\n *              this term can be precalculated as:\n *                  for i in filter_size:\n *                      acc_term += input_offset * filter[i];\n *                  acc_term += bias\n *              OR\n *                   for i in filter_size:\n *                      acc_term += filter[i]; // accumulate filter values\n *                  acc_term = acc_term * input_offset + bias\n *\n *\n * In both the above versions we align 
the filter if needed, pad the input with\n *       -input_offset if needed and extend the channels to make those multiple\n *       of 8/16 as per function needs\n */\n\n#include <stdio.h>\n#include <esp_nn_defs.h>\n#include <esp_nn_ansi_headers.h>\n#include \"esp_nn_generic_opt.h\"\n\n#include <common_functions.h>\n\nstatic int16_t *scratch_buffer = NULL;\n\n/**\n * Reusable PIE-accelerated dot product (same as FC version).\n * Processes 32 elements/iter (double-pump) for len >= 32,\n * 16 elements/iter for len >= 16, scalar remainder.\n */\nstatic inline __attribute__((always_inline))\nint32_t pie_dot_s8(const int8_t *a, const int8_t *b, int32_t len)\n{\n    int32_t result = 0;\n    int32_t idx = 0;\n\n    if (len >= 32) {\n        asm volatile (\n            \"esp.zero.xacc                          \\n\\t\"\n            \"mv     x30, %[in]                      \\n\\t\"\n            \"mv     x31, %[flt]                     \\n\\t\"\n            \"li     %[idx], 32                      \\n\\t\"\n            \"addi   s7, %[len], -31                 \\n\\t\"\n            \"esp.vld.128.ip  q0, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q2, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"esp.vld.128.ip  q3, x31, 16            \\n\\t\"\n            \"j      2f                              \\n\\t\"\n            \"1:                                     \\n\\t\"\n            \"esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"esp.vmulas.s8.xacc.ld.ip q2, x30, 16, q2, q3 \\n\\t\"\n            \"esp.vld.128.ip  q3, x31, 16            \\n\\t\"\n            \"addi   %[idx], %[idx], 32              \\n\\t\"\n            \"2:                                     \\n\\t\"\n            \"blt    %[idx], s7, 1b                  \\n\\t\"\n            \"esp.vmulas.s8.xacc  q0, q1             \\n\\t\"\n            
\"esp.vmulas.s8.xacc  q2, q3             \\n\\t\"\n            \"addi   s7, %[len], -15                 \\n\\t\"\n            \"bge    %[idx], s7, 3f                  \\n\\t\"\n            \"esp.vld.128.ip  q0, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"esp.vmulas.s8.xacc  q0, q1             \\n\\t\"\n            \"addi   %[idx], %[idx], 16              \\n\\t\"\n            \"3:                                     \\n\\t\"\n            \"esp.movx.r.xacc.l   x30                \\n\\t\"\n            \"mv     %[res], x30                     \\n\\t\"\n            : [idx] \"+r\"(idx), [res] \"=r\"(result)\n            : [in] \"r\"(a), [flt] \"r\"(b), [len] \"r\"(len)\n            : \"x30\", \"x31\", \"s7\"\n        );\n    } else if (len >= 16) {\n        asm volatile (\n            \"esp.zero.xacc                          \\n\\t\"\n            \"mv     x30, %[in]                      \\n\\t\"\n            \"mv     x31, %[flt]                     \\n\\t\"\n            \"li     %[idx], 16                      \\n\\t\"\n            \"addi   s7, %[len], -15                 \\n\\t\"\n            \"esp.vld.128.ip  q0, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"j      5f                              \\n\\t\"\n            \"4:                                     \\n\\t\"\n            \"esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"addi   %[idx], %[idx], 16              \\n\\t\"\n            \"5:                                     \\n\\t\"\n            \"blt    %[idx], s7, 4b                  \\n\\t\"\n            \"esp.vmulas.s8.xacc  q0, q1             \\n\\t\"\n            \"esp.movx.r.xacc.l   x30                \\n\\t\"\n            \"mv     %[res], x30                     \\n\\t\"\n            : [idx] \"+r\"(idx), [res] \"=r\"(result)\n            : [in] 
\"r\"(a), [flt] \"r\"(b), [len] \"r\"(len)\n            : \"x30\", \"x31\", \"s7\"\n        );\n    }\n\n    for (; idx < len; idx++) {\n        result += (int32_t)a[idx] * (int32_t)b[idx];\n    }\n    return result;\n}\n\n/**\n * Batched 1x1 conv using QACC per-lane: processes 16 pixels simultaneously.\n * Transposes input so each QACC lane = one pixel, then broadcasts filter\n * coefficients for per-lane accumulation. Critical for small in_ch where\n * XACC can't be used (in_ch < 16).\n *\n * For in_ch=8: 4.5x faster than scalar per-pixel approach.\n */\n__attribute__((noinline))\nstatic void conv_1x1_batch16(const int8_t *pixel_ptrs[16],\n                      const int8_t *filter_data,\n                      const int32_t *filter_sum,\n                      const int32_t *bias,\n                      int8_t *out_ptrs[16],\n                      int32_t in_ch, int32_t out_ch,\n                      int32_t out_offset,\n                      const int32_t *out_mult, const int32_t *out_shift,\n                      int32_t act_min, int32_t act_max)\n{\n    /* Ensure PIE is enabled (might be lost across noinline function call) */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    /* Transpose: arrange 16 pixels' data as ch0[p0..p15], ch1[p0..p15], ... */\n    int8_t transposed[16 * 16] __attribute__((aligned(16)));  /* in_ch <= 16 for this path */\n    for (int c = 0; c < in_ch; c++) {\n        for (int p = 0; p < 16; p++) {\n            transposed[c * 16 + p] = pixel_ptrs[p][c];\n        }\n    }\n\n    /* For each output channel: QACC per-lane MAC with broadcast filter.\n     * Use single asm block for zero + accumulate loop to prevent\n     * q register clobber between separate asm blocks. 
*/\n    const int8_t *filt = filter_data;\n    for (int32_t oc = 0; oc < out_ch; oc++) {\n        /* Single asm: zero QACC, then loop over in_ch channels:\n         * broadcast filter[ch], load 16 transposed pixels, MAC per-lane */\n        asm volatile (\n            \"esp.zero.qacc                       \\n\\t\"\n            \"mv     x30, %[trans]                \\n\\t\"  /* transposed base */\n            \"mv     x31, %[flt]                  \\n\\t\"  /* filter base */\n            \"mv     s7,  %[cnt]                  \\n\\t\"  /* in_ch count */\n            \"1:                                  \\n\\t\"\n            \"esp.vld.128.ip  q0, x30, 16         \\n\\t\"  /* load 16 pixel values, advance by 16 */\n            \"esp.vldbc.8.ip  q1, x31, 1          \\n\\t\"  /* broadcast filter[ch], advance by 1 */\n            \"esp.vmulas.s8.qacc q0, q1           \\n\\t\"\n            \"addi   s7, s7, -1                   \\n\\t\"\n            \"bnez   s7, 1b                       \\n\\t\"\n            :\n            : [trans] \"r\"(transposed), [flt] \"r\"(filt), [cnt] \"r\"(in_ch)\n            : \"x30\", \"x31\", \"s7\"\n        );\n\n        /* Extract 16 results */\n        int32_t results[16] __attribute__((aligned(16)));\n        ESP_NN_QACC_EXTRACT_S32(results);\n\n        /* Add filter_sum + bias, requant, clamp, store for each pixel */\n        int32_t fs = filter_sum[oc];\n        int32_t b = bias ? 
bias[oc] : 0;\n        int32_t combined = fs + b;\n        int32_t m = out_mult[oc];\n        int32_t s = out_shift[oc];\n\n        for (int p = 0; p < 16; p++) {\n            int32_t r = results[p] + combined;\n            r = esp_nn_multiply_by_quantized_mult(r, m, s);\n            r += out_offset;\n            r = max(r, act_min);\n            r = min(r, act_max);\n            out_ptrs[p][oc] = (int8_t) r;\n        }\n\n        filt += in_ch;\n    }\n}\n\n__attribute__ ((noinline))\nstatic void esp_nn_conv_s8_1x1(const data_dims_t *input_dims,\n                               const int8_t *input_data,\n                               const int8_t *filter_data,\n                               const int32_t *bias,\n                               const data_dims_t *output_dims,\n                               int8_t *out_data,\n                               const conv_params_t *conv_params,\n                               const quant_data_t *quant_data,\n                               void *scratch)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t in_channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_channels = output_dims->channels;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    int32_t *filter_sum = (int32_t *) scratch; // alignment of 4 bytes assumed\n\n    /* pre-calculate filter_sum * input_offset */\n    const int8_t *filter_ptr = filter_data;\n    for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n        int32_t sum = 0;\n        int32_t in_ch_idx = 0;\n        for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) {\n            sum += *filter_ptr++;\n            sum += *filter_ptr++;\n            sum += 
*filter_ptr++;\n            sum += *filter_ptr++;\n        }\n        for (; in_ch_idx < in_channels; in_ch_idx ++) {\n            sum += *filter_ptr++;\n        }\n        filter_sum[out_ch_idx] = sum * input_offset;\n    }\n\n    /* When in_ch < 16: use the QACC batch path (16 pixels at a time).\n     * QACC batch: transpose pixels, broadcast filter, per-lane MAC.\n     * Remaining pixels (fewer than 16) use the scalar fallback loop below. */\n    if (in_channels < 16) {\n        /* Enable PIE for QACC */\n        asm volatile (\n            \"csrsi  0x7f2, 0b01        \\n\\t\"\n            \"li     x29, 0b10          \\n\\t\"\n            \"esp.movx.w.cfg x29        \\n\\t\"\n            ::: \"x29\"\n        );\n\n        int32_t total_pixels = out_wd * out_ht;\n        int32_t pix = 0;\n\n        /* Process batches of 16 pixels using QACC per-lane */\n        for (; pix <= total_pixels - 16; pix += 16) {\n            const int8_t *pp[16];\n            int8_t *op[16];\n            for (int p = 0; p < 16; p++) {\n                pp[p] = input_data + (pix + p) * in_channels;\n                op[p] = out_data + (pix + p) * out_channels;\n            }\n            conv_1x1_batch16(pp, filter_data, filter_sum, bias, op,\n                             in_channels, out_channels, out_offset,\n                             quant_data->mult, quant_data->shift,\n                             activation_min, activation_max);\n        }\n\n        /* Remaining pixels (< 16): scalar fallback */\n        for (; pix < total_pixels; pix++) {\n            const int8_t *inp = input_data + pix * in_channels;\n            filter_ptr = filter_data;\n            for (int32_t oc = 0; oc < out_channels; oc++) {\n                int32_t conv_out = 0;\n                for (int32_t ic = 0; ic < in_channels; ic++) {\n                    conv_out += inp[ic] 
* filter_ptr[ic];\n                }\n                conv_out += filter_sum[oc];\n                if (bias) conv_out += bias[oc];\n                conv_out = esp_nn_multiply_by_quantized_mult(conv_out,\n                    quant_data->mult[oc], quant_data->shift[oc]);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                out_data[pix * out_channels + oc] = (int8_t) conv_out;\n                filter_ptr += in_channels;\n            }\n        }\n        return;\n    }\n\n    for (int32_t in_row = 0; in_row < out_ht; in_row++) {\n        for (int32_t in_col = 0; in_col < out_wd; in_col++) {\n            const int32_t *out_mult = quant_data->mult;\n            const int32_t *out_shift = quant_data->shift;\n            filter_ptr = filter_data;\n            const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels;\n            for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n                /* initializations */\n                int32_t conv_out = 0;\n                const int8_t *input_ptr = input_base_ptr;\n\n                int32_t in_ch_idx = 0;\n#if 1 // inline asm\n                // for now check for the alignment as well\n                if (in_channels < 16) {// || ((uint32_t) input_ptr & 15) || ((uint32_t) filter_ptr & 15)) {\n                    goto skip_asm;\n                }\n\n                asm volatile (\n                    \"li %0, 16                      \\n\\t\"\n                    \"addi s7, %4, -15               \\n\\t\"\n                    \"mv x30, %1                     \\n\\t\"\n                    \"mv x31, %2                     \\n\\t\"\n                    \"esp.zero.xacc                  \\n\\t\"\n                    \"esp.vld.128.ip  q0, x30, 16    \\n\\t\"\n                    \"esp.vld.128.ip  q1, x31, 16    \\n\\t\"\n\n                    \"j .loop16_end  
\\n\\t\"\n\n                    \".loop16_start:      \\n\\t\"\n                    \"esp.vmulas.s8.xacc.ld.ip  q0, x30, 16, q0, q1   \\n\\t\"\n                    \"esp.vld.128.ip  q1, x31, 16                     \\n\\t\"\n                    \"addi %0, %0, 16                \\n\\t\"   // in_ch_idx += 16\n\n                    \".loop16_end:    \\n\\t\"\n                    \"blt %0, s7, .loop16_start \\n\\t\"  // loop while in_ch_idx < `in_channels - 15`\n\n                    // write back input_ptr and filter_ptr, do the final MAC, read conv_out from XACC\n                    \"mv %1, x30                     \\n\\t\"\n                    \"mv %2, x31                     \\n\\t\"\n                    \"esp.vmulas.s8.xacc  q0, q1     \\n\\t\"\n                    \"esp.movx.r.xacc.l  %3          \\n\\t\"\n\n                    : \"+r\" (in_ch_idx), \"+r\" (input_ptr), \"+r\" (filter_ptr), \"=r\" (conv_out)\n                    :  \"r\"(in_channels)\n                    : \"x30\", \"x31\", \"s7\"\n                );\nskip_asm:\n#endif\n                for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) {\n                    conv_out += *input_ptr++ * *filter_ptr++;\n                    conv_out += *input_ptr++ * *filter_ptr++;\n                    conv_out += *input_ptr++ * *filter_ptr++;\n                    conv_out += *input_ptr++ * *filter_ptr++;\n                }\n\n                for (; in_ch_idx < in_channels; in_ch_idx++) {\n                    conv_out += *input_ptr++ * *filter_ptr++;\n                }\n                conv_out = conv_out + filter_sum[out_ch_idx];\n                if (bias) {\n                    conv_out += bias[out_ch_idx];\n                }\n                conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_data++ = (int8_t) conv_out;\n            }\n        }\n    
}\n}\n\n__attribute__ ((noinline))\nstatic void esp_nn_conv_s8_padded(\n        const data_dims_t *input_dims,\n        const int8_t *input_data,\n        const data_dims_t *filter_dims,\n        const int8_t *filter_data,\n        const int32_t *bias,\n        const data_dims_t *output_dims,\n        int8_t *out_data,\n        const conv_params_t *conv_params,\n        const quant_data_t *quant_data,\n        void *scratch)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_channels = output_dims->channels;\n    const int32_t *out_shift = quant_data->shift;\n    const int32_t *out_mult = quant_data->mult;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */\n    if (in_channels != filter_dims->channels) {\n        esp_nn_conv_s8_ansi(input_dims, input_data, filter_dims, filter_data,\n                            bias, output_dims, out_data, conv_params, quant_data);\n        return;\n    }\n\n    int32_t *filter_sum = (int32_t *) scratch; // alignment of 4 bytes assumed\n\n    /* pre-calculate filter_sum * input_offset */\n    const int8_t *filter_ptr = filter_data;\n    for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n        int32_t sum = 0;\n        int32_t filter_len = filter_wd * filter_ht * in_channels;\n        
int32_t filter_idx = 0;\n        for (; filter_idx < filter_len - 3; filter_idx += 4) {\n            sum += *filter_ptr++;\n            sum += *filter_ptr++;\n            sum += *filter_ptr++;\n            sum += *filter_ptr++;\n        }\n        for (; filter_idx < filter_len; filter_idx++) {\n            sum += *filter_ptr++;\n        }\n        filter_sum[out_ch_idx] = sum * input_offset;\n    }\n\n    const int32_t row_size = filter_wd * in_channels;\n\n    bool right_pad = max(0, ((out_wd - 1) * stride_wd + filter_wd - input_wd));\n    bool bottom_pad = max(0, ((out_ht - 1) * stride_ht + filter_ht - input_ht));\n\n    for (int32_t out_y = 0; out_y < out_ht - bottom_pad; out_y++) {\n        for (int32_t out_x = 0; out_x < out_wd - right_pad; out_x++) {\n            const int32_t base_y = stride_ht * out_y;\n            const int32_t base_x = stride_wd * out_x;\n            const int32_t *out_mult_ptr = out_mult;\n            const int32_t *out_shift_ptr = out_shift;\n            const int32_t *bias_ptr = bias;\n            const int8_t *filter_data_ptr = filter_data;\n            for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n                int32_t conv_out = 0, filter_y_idx;\n                if (row_size >= 16) {\n                    asm volatile(\"esp.zero.xacc                  \\n\\t\");\n                }\n\n                for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) {\n                    const int32_t in_row = base_y + filter_y_idx;\n                    const int32_t in_col = base_x;\n                    const int8_t *input_data_ptr =\n                            input_data + (in_row * input_wd + in_col) * in_channels;\n                    int32_t row_idx = 0;\n#if 1 // inline asm\n                // for now check for the alignment as well\n                if (row_size < 16) {// || ((uint32_t) input_ptr & 15) || ((uint32_t) filter_ptr & 15)) {\n                    goto skip_asm_pad0;\n                }\n\n  
              asm volatile (\n                    \"li %0, 16                      \\n\\t\"\n                    \"addi s7, %3, -15               \\n\\t\"\n                    \"mv x30, %1                     \\n\\t\"\n                    \"mv x31, %2                     \\n\\t\"\n                    \"esp.vld.128.ip  q0, x30, 16    \\n\\t\"\n                    \"esp.vld.128.ip  q1, x31, 16    \\n\\t\"\n\n                    \"j .loop16_pad0_end  \\n\\t\"\n\n                    \".loop16_pad0_start:      \\n\\t\"\n                    \"esp.vmulas.s8.xacc.ld.ip  q0, x30, 16, q0, q1   \\n\\t\"\n                    \"esp.vld.128.ip  q1, x31, 16                     \\n\\t\"\n                    \"addi %0, %0, 16                \\n\\t\"   // in_ch_idx += 16\n\n                    \".loop16_pad0_end:    \\n\\t\"\n                    \"blt %0, s7, .loop16_pad0_start \\n\\t\"  // if in_ch_idx < `in_channels - 15` abort\n\n                    // move input_ptr, filter_ptr and conv_out\n                    \"mv %1, x30                     \\n\\t\"\n                    \"mv %2, x31                     \\n\\t\"\n                    \"esp.vmulas.s8.xacc  q0, q1     \\n\\t\"\n\n                    : \"+r\" (row_idx), \"+r\" (input_data_ptr), \"+r\" (filter_data_ptr)\n                    :  \"r\"(row_size)\n                    : \"x30\", \"x31\", \"s7\"\n                );\nskip_asm_pad0:\n#endif\n                    for (; row_idx < row_size - 3; row_idx += 4) {\n                        conv_out += *input_data_ptr++ * *filter_data_ptr++;\n                        conv_out += *input_data_ptr++ * *filter_data_ptr++;\n                        conv_out += *input_data_ptr++ * *filter_data_ptr++;\n                        conv_out += *input_data_ptr++ * *filter_data_ptr++;\n                    }\n                    for (; row_idx < row_size; row_idx++) {\n                        conv_out += *input_data_ptr++ * *filter_data_ptr++;\n                    }\n                }\n              
  if (row_size >= 16) {\n                    asm volatile (\n                        \"esp.movx.r.xacc.l  x30   \\n\\t\"\n                        \"add %0, %0, x30          \\n\\t\"\n                        : \"+r\" (conv_out)\n                        :\n                        : \"x30\"\n                    );\n                }\n                /* add input_offset term */\n                conv_out += filter_sum[out_ch_idx];\n\n                if (bias) {\n                    conv_out += *bias_ptr++;\n                }\n                conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_data++ = (int8_t) conv_out;\n            }\n        }\n\n        for (int32_t out_x = out_wd - right_pad; out_x < out_wd; out_x++) {\n            const int32_t base_y = stride_ht * out_y;\n            const int32_t base_x = stride_wd * out_x;\n            const int32_t *out_mult_ptr = out_mult;\n            const int32_t *out_shift_ptr = out_shift;\n            const int32_t *bias_ptr = bias;\n            for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n                int32_t conv_out = 0, filter_y_idx;\n                for (filter_y_idx = 0; filter_y_idx < filter_ht; filter_y_idx++) {\n                    for (int32_t filter_x_idx = 0; filter_x_idx < filter_wd - right_pad; filter_x_idx++) {\n                        const int32_t in_row = base_y + filter_y_idx;\n                        const int32_t in_col = base_x + filter_x_idx;\n\n                        const int8_t *input_ptr = input_data +\n                                        (in_row * input_wd + in_col) * in_channels;\n                        const int8_t *filter_ptr = filter_data +\n                                        out_ch_idx * in_channels * filter_ht * filter_wd +\n                       
                 (filter_y_idx * filter_wd + filter_x_idx) * in_channels;\n                        int32_t in_ch_idx = 0;\n                        for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) {\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                        }\n                        for (; in_ch_idx < in_channels; in_ch_idx ++) {\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                        }\n                    }\n                }\n\n                if (bias) {\n                    conv_out += *bias_ptr++;\n                }\n                conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_data++ = (int8_t) conv_out;\n            }\n        }\n    }\n\n    // Calculate the last row if needed\n    if (bottom_pad) {\n        int in_row = input_dims->height - filter_dims->height + 1;\n        esp_nn_conv_s8_opt(&(data_dims_t){input_dims->width, 2, input_dims->channels, 0},\n                            input_data + in_row * input_dims->width * input_dims->channels,\n                            filter_dims, filter_data, bias,\n                            &(data_dims_t){output_dims->width, 1, output_dims->channels, 0},\n                            out_data, conv_params, quant_data);\n    }\n}\n\n/* L1D cache budget: use half of 64KB to leave room for filter streaming */\n#define L1D_BUDGET 32768\n\n/**\n * Im2col convolution for small in_ch where filter_wd * in_ch < 16.\n *\n * Instead of padding 
channels (81% wasted MACs for in_ch=3),\n * concatenates the entire filter window into one contiguous vector:\n *   window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27)\n *\n * For each output pixel: copy the input window into a contiguous scratch\n * buffer, then use PIE dot product on the full window. No wasted MACs.\n *\n * Scratch layout: [filter_sum | im2col_buf]\n *   im2col_buf = filter_wd * filter_ht * in_ch bytes\n */\n__attribute__ ((noinline))\nstatic void esp_nn_conv_s8_im2col(\n        const data_dims_t *input_dims,\n        const int8_t *input_data,\n        const data_dims_t *filter_dims,\n        const int8_t *filter_data,\n        const int32_t *bias,\n        const data_dims_t *output_dims,\n        int8_t *out_data,\n        const conv_params_t *conv_params,\n        const quant_data_t *quant_data,\n        void *scratch)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_ch = input_dims->channels;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_ch = output_dims->channels;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    const int32_t window_len = filter_wd * filter_ht * in_ch;\n    const int8_t pad_val = (int8_t)(-input_offset);\n\n    /* Scratch: filter_sum[out_ch] + im2col_buf[window_len] */\n    int32_t *filter_sum = (int32_t *)scratch;\n    int8_t 
*im2col_buf = (int8_t *)scratch + out_ch * sizeof(int32_t);\n\n    /* Pre-compute filter_sum * input_offset */\n    const int8_t *fptr = filter_data;\n    for (int32_t oc = 0; oc < out_ch; oc++) {\n        int32_t sum = 0;\n        for (int32_t fi = 0; fi < window_len; fi++) {\n            sum += *fptr++;\n        }\n        filter_sum[oc] = sum * input_offset;\n    }\n\n    /* Process each output pixel */\n    int8_t *out_ptr = out_data;\n    for (int32_t out_y = 0; out_y < out_ht; out_y++) {\n        for (int32_t out_x = 0; out_x < out_wd; out_x++) {\n            const int32_t base_y = out_y * stride_ht - pad_ht;\n            const int32_t base_x = out_x * stride_wd - pad_wd;\n\n            /* Copy input window into contiguous im2col buffer */\n            int8_t *buf = im2col_buf;\n            for (int32_t fy = 0; fy < filter_ht; fy++) {\n                int32_t in_y = base_y + fy;\n                for (int32_t fx = 0; fx < filter_wd; fx++) {\n                    int32_t in_x = base_x + fx;\n                    if (in_y >= 0 && in_y < input_ht && in_x >= 0 && in_x < input_wd) {\n                        const int8_t *src = input_data + (in_y * input_wd + in_x) * in_ch;\n                        for (int c = 0; c < in_ch; c++) {\n                            *buf++ = src[c];\n                        }\n                    } else {\n                        /* Padding pixel */\n                        for (int c = 0; c < in_ch; c++) {\n                            *buf++ = pad_val;\n                        }\n                    }\n                }\n            }\n\n            /* Dot product against each output channel's filter */\n            const int32_t *out_mult = quant_data->mult;\n            const int32_t *out_shift = quant_data->shift;\n            const int8_t *filter_ptr = filter_data;\n\n            for (int32_t oc = 0; oc < out_ch; oc++) {\n                int32_t conv_out = pie_dot_s8(im2col_buf, filter_ptr, window_len);\n                conv_out += 
filter_sum[oc];\n                if (bias) conv_out += bias[oc];\n                conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_ptr++ = (int8_t) conv_out;\n                filter_ptr += window_len;\n            }\n        }\n    }\n}\n\n/**\n * Tiled convolution: process T output rows at a time.\n * Converts padded conv into a series of no-pad sub-problems by\n * copying/padding input tiles into the scratch buffer.\n *\n * This keeps the working set in L1D for large input tensors.\n * Reuses the existing esp_nn_conv_s8_padded PIE inner loop per tile.\n */\n__attribute__ ((noinline))\nstatic void esp_nn_conv_s8_tiled(\n        const data_dims_t *input_dims,\n        const int8_t *input_data,\n        const data_dims_t *filter_dims,\n        const int8_t *filter_data,\n        const int32_t *bias,\n        const data_dims_t *output_dims,\n        int8_t *out_data,\n        const conv_params_t *conv_params,\n        const quant_data_t *quant_data,\n        void *scratch)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_ch = input_dims->channels;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_ch = output_dims->channels;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const int32_t input_offset = conv_params->in_offset;\n\n    /* Check if we need channel padding for PIE (row_size must be >= 16) */\n    int new_ch = in_ch;\n    int need_ch_pad = 0;\n    if (filter_wd * in_ch < 16) {\n      
  new_ch = (16 + filter_wd - 1) / filter_wd;  /* minimum channels for PIE */\n        new_ch = (new_ch + 15) & ~15;                /* align to 16 */\n        need_ch_pad = 1;\n    }\n    int padded_input_wd = input_wd + 2 * pad_wd;\n\n    /* Scratch layout:\n     * [0] filter_sum: out_ch * 4 bytes\n     * [after filter_sum] aligned_filter (if ch padding): filter_wd * filter_ht * new_ch * out_ch\n     * [after filter] tile_input_buf: variable per tile\n     */\n    int32_t *filter_sum = (int32_t *) scratch;\n    int filter_sum_size = out_ch * sizeof(int32_t);\n\n    /* Pre-compute filter_sum * input_offset (once for entire layer) */\n    const int8_t *fptr = filter_data;\n    for (int32_t oc = 0; oc < out_ch; oc++) {\n        int32_t sum = 0;\n        int32_t flen = filter_wd * filter_ht * in_ch;\n        for (int32_t fi = 0; fi < flen; fi++) {\n            sum += *fptr++;\n        }\n        filter_sum[oc] = sum * input_offset;\n    }\n\n    /* Channel-pad filter if needed (pad with 0s - doesn't affect filter_sum) */\n    int8_t *aligned_filter = NULL;\n    int aligned_filter_size = 0;\n    if (need_ch_pad) {\n        aligned_filter = (int8_t *)scratch + filter_sum_size;\n        aligned_filter_size = filter_wd * filter_ht * new_ch * out_ch;\n        memset(aligned_filter, 0, aligned_filter_size);\n        const int8_t *src_f = filter_data;\n        int8_t *dst_f = aligned_filter;\n        for (int oc = 0; oc < out_ch; oc++) {\n            for (int fh = 0; fh < filter_ht; fh++) {\n                for (int fw = 0; fw < filter_wd; fw++) {\n                    memcpy(dst_f, src_f, in_ch);\n                    src_f += in_ch;\n                    dst_f += new_ch;  /* zero-padded channels */\n                }\n            }\n        }\n    }\n\n    /* Tile input buffer starts after filter_sum + aligned_filter */\n    int8_t *tile_buf = (int8_t *)scratch + filter_sum_size + aligned_filter_size;\n\n    /* Use effective channel count for tile buffer sizing */\n    int 
eff_ch = need_ch_pad ? new_ch : in_ch;\n    int tile_input_row_bytes = padded_input_wd * eff_ch;\n\n    /* Compute tile height T (output rows per tile) */\n    int tile_T = out_ht;\n    int total_input_bytes = padded_input_wd * (input_ht + 2 * pad_ht) * eff_ch;\n    int used_scratch = filter_sum_size + aligned_filter_size;\n    if (total_input_bytes + used_scratch > L1D_BUDGET) {\n        int budget_for_input = L1D_BUDGET - used_scratch;\n        int min_input_rows = filter_ht;\n        if (min_input_rows * tile_input_row_bytes <= budget_for_input) {\n            tile_T = (budget_for_input - filter_ht * tile_input_row_bytes)\n                     / (stride_ht * tile_input_row_bytes) + 1;\n            if (tile_T < 1) tile_T = 1;\n            if (tile_T > out_ht) tile_T = out_ht;\n        }\n    }\n\n    /* Process tiles */\n    const int8_t *use_filter = need_ch_pad ? aligned_filter : filter_data;\n    data_dims_t eff_filter_dims = {filter_wd, filter_ht, eff_ch, 0};\n\n    for (int32_t tile_y = 0; tile_y < out_ht; tile_y += tile_T) {\n        int32_t actual_T = min(tile_T, out_ht - tile_y);\n\n        int32_t in_row_start = tile_y * stride_ht - pad_ht;\n        int32_t in_row_end = (tile_y + actual_T - 1) * stride_ht + filter_ht - 1;\n        int32_t tile_input_ht = in_row_end - in_row_start + 1;\n\n        /* Copy/pad input rows into tile buffer, with channel padding if needed */\n        int8_t pad_val = (int8_t)(-input_offset);\n        int8_t *dst = tile_buf;\n\n        for (int32_t row = in_row_start; row <= in_row_end; row++) {\n            if (row < 0 || row >= input_ht) {\n                memset(dst, pad_val, padded_input_wd * eff_ch);\n            } else {\n                /* For each pixel in padded row */\n                int8_t *row_dst = dst;\n                /* Left padding */\n                for (int px = 0; px < pad_wd; px++) {\n                    memset(row_dst, pad_val, eff_ch);\n                    row_dst += eff_ch;\n                }\n         
       /* Valid pixels - with optional channel padding */\n                const int8_t *row_src = input_data + row * input_wd * in_ch;\n                if (need_ch_pad) {\n                    for (int px = 0; px < input_wd; px++) {\n                        memcpy(row_dst, row_src, in_ch);\n                        if (eff_ch > in_ch) {\n                            memset(row_dst + in_ch, pad_val, eff_ch - in_ch);\n                        }\n                        row_src += in_ch;\n                        row_dst += eff_ch;\n                    }\n                } else {\n                    memcpy(row_dst, row_src, input_wd * in_ch);\n                    row_dst += input_wd * in_ch;\n                }\n                /* Right padding */\n                for (int px = 0; px < pad_wd; px++) {\n                    memset(row_dst, pad_val, eff_ch);\n                    row_dst += eff_ch;\n                }\n            }\n            dst += padded_input_wd * eff_ch;\n        }\n\n        /* Sub-problem with pad=0, effective channels */\n        data_dims_t tile_input_dims = {padded_input_wd, tile_input_ht, eff_ch, 0};\n        data_dims_t tile_output_dims = {out_wd, actual_T, out_ch, 0};\n        conv_params_t tile_conv_params = *conv_params;\n        tile_conv_params.padding.width = 0;\n        tile_conv_params.padding.height = 0;\n\n        esp_nn_conv_s8_padded(&tile_input_dims, tile_buf,\n                              &eff_filter_dims, use_filter, bias,\n                              &tile_output_dims,\n                              out_data + tile_y * out_wd * out_ch,\n                              &tile_conv_params, quant_data,\n                              filter_sum);\n    }\n}\n\nint esp_nn_get_conv_scratch_size_esp32p4(const data_dims_t *input_dims,\n                                         const data_dims_t *filter_dims,\n                                         const data_dims_t *output_dims,\n                                         const 
conv_params_t *conv_params)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_ch = input_dims->channels;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_ch = output_dims->channels;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n\n    int new_channels = (in_ch + 7) & ~7;\n\n    int input_scratch = input_wd * input_ht * in_ch;\n    int filter_scratch = filter_wd * filter_ht * in_ch * out_ch;\n\n    int align_buf_size = 32; /* extra buffer for alignment */\n    if ((filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0) &&\n            (stride_wd == 1 && stride_ht == 1)) {\n        if (in_ch < 16) {\n            /* Channel-padding path: filter_sum + padded_filter + padded_input */\n            int filter_sum_sz = out_ch * 4;\n            int padded_filter_sz = 16 * out_ch;\n            int padded_input_sz = 32; /* 16 bytes + alignment */\n            return filter_sum_sz + padded_filter_sz + padded_input_sz + align_buf_size;\n        }\n        int transpose_buf_size = 2 * (8 * new_channels);\n        if (input_wd * input_ht < 8) {\n            transpose_buf_size = 0;\n        }\n        if (in_ch % 8) {\n            input_scratch = input_wd * input_ht * new_channels;\n        } else {\n            input_scratch = 0;\n        }\n        filter_scratch = new_channels * out_ch;\n        return input_scratch + filter_scratch + transpose_buf_size + align_buf_size;\n    } else {\n        new_channels = (in_ch + 15) & ~15;\n        int offset_acc_scratch = out_ch * 4;\n\n        if (pad_wd == 0 && pad_ht == 0 && filter_wd * in_ch >= 16) {\n            /* Direct no-pad path: no input scratch needed */\n            
input_scratch = 0;\n            filter_scratch = filter_wd * filter_ht * new_channels * out_ch;\n            return input_scratch + filter_scratch + align_buf_size + offset_acc_scratch;\n        }\n\n        /* Im2col path: scratch = filter_sum + im2col_buf */\n        if (filter_wd * filter_ht * in_ch >= 16) {\n            int window_len = filter_wd * filter_ht * in_ch;\n            int im2col_scratch = window_len;  /* one window buffer */\n            return offset_acc_scratch + im2col_scratch + align_buf_size;\n        }\n\n        if (pad_wd == 0 && pad_ht == 0) {\n            /* Very small window (< 16 elements total): tiled path */\n            int eff_ch = ((16 + filter_wd - 1) / filter_wd + 15) & ~15;\n            int filt_aligned = filter_wd * filter_ht * eff_ch * out_ch;\n            int tile_input = input_wd * input_ht * eff_ch;\n            return offset_acc_scratch + filt_aligned + tile_input + align_buf_size;\n        }\n\n        /* Padded case: check if tiling is beneficial */\n        int padded_input_wd = input_wd + 2 * pad_wd;\n        int full_input_size = padded_input_wd * (input_ht + 2 * pad_ht) * in_ch;\n\n        if (full_input_size + offset_acc_scratch > L1D_BUDGET) {\n            /* Tiled path: compute tile input size */\n            int eff_ch = in_ch;\n            int filt_aligned = 0;\n            if (filter_wd * in_ch < 16) {\n                eff_ch = ((16 + filter_wd - 1) / filter_wd + 15) & ~15;\n                filt_aligned = filter_wd * filter_ht * eff_ch * out_ch;\n            }\n            int tile_row_bytes = padded_input_wd * eff_ch;\n            int budget_for_input = L1D_BUDGET - offset_acc_scratch - filt_aligned;\n            int tile_T = 1;\n            if (budget_for_input > 0 && filter_ht * tile_row_bytes <= budget_for_input) {\n                tile_T = (budget_for_input - filter_ht * tile_row_bytes)\n                         / (stride_ht * tile_row_bytes) + 1;\n                if (tile_T > (int)(output_dims->height)) 
tile_T = output_dims->height;\n            }\n            int tile_input_rows = (tile_T - 1) * stride_ht + filter_ht + 2 * pad_ht;\n            input_scratch = tile_input_rows * tile_row_bytes;\n            filter_scratch = filt_aligned;\n        } else {\n            /* Monolithic padded path */\n            input_scratch = full_input_size;\n            filter_scratch = filter_wd * filter_ht * new_channels * out_ch;\n        }\n        return input_scratch + filter_scratch + align_buf_size + offset_acc_scratch;\n    }\n    return align_buf_size;\n}\n\nvoid esp_nn_set_conv_scratch_buf_esp32p4(void *buf)\n{\n    // We are going to use the vector extensions\n    asm volatile (\n        \"csrsi 0x7f2, 0b01      \\n\\t\" // enable `esp` vector extension\n        \"li x29, 0b10           \\n\\t\"\n        \"esp.movx.w.cfg x29     \\n\\t\"\n        :\n        :\n        : \"x29\"\n    );\n\n    scratch_buffer = (int16_t *) buf;\n}\n\nvoid esp_nn_conv_s8_esp32p4(const data_dims_t *input_dims,\n                            const int8_t *input,\n                            const data_dims_t *filter_dims,\n                            const int8_t *filter_data,\n                            const int32_t *bias,\n                            const data_dims_t *output_dims,\n                            int8_t *out_data,\n                            const conv_params_t *conv_params,\n                            const quant_data_t *quant_data)\n{\n    if (scratch_buffer == NULL) {\n        printf(\"esp_nn_conv error! 
scratch_buffer not set!\\n\");\n        return;\n    }\n\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n\n    if (filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0 &&\n            stride_wd == 1 && stride_ht == 1) {\n        esp_nn_conv_s8_1x1(input_dims, input, filter_data, bias,\n                           output_dims, out_data, conv_params, quant_data,\n                           scratch_buffer);\n    } else if (pad_wd == 0 && pad_ht == 0 &&\n               filter_wd * input_dims->channels >= 16) {\n        /* No-pad, channels large enough for PIE: use direct padded path */\n        esp_nn_conv_s8_padded(input_dims, input, filter_dims, filter_data, bias,\n                              output_dims, out_data, conv_params, quant_data,\n                              scratch_buffer);\n    } else if (filter_wd * filter_ht * input_dims->channels >= 16) {\n        /* Small in_ch but window_len >= 16: use im2col for zero-waste PIE.\n         * Also handles padded cases naturally. 
*/\n        esp_nn_conv_s8_im2col(input_dims, input, filter_dims, filter_data, bias,\n                              output_dims, out_data, conv_params, quant_data,\n                              scratch_buffer);\n    } else if (pad_wd != 0 || pad_ht != 0) {\n        /* Padded case with very small window: use tiled path */\n        esp_nn_conv_s8_tiled(input_dims, input, filter_dims, filter_data, bias,\n                             output_dims, out_data, conv_params, quant_data,\n                             scratch_buffer);\n    } else {\n        /* No padding and very small window (< 16 elements): fall back to generic opt */\n        esp_nn_conv_s8_opt(input_dims, input, filter_dims, filter_data, bias,\n                           output_dims, out_data, conv_params, quant_data);\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/**\n * Optimizations strategies used:\n * Below optimizations are capable of any size of input/filter:\n *\n * 1. For filter wdxht = 1x1 (Refer esp_nn_conv_s8_mult8_1x1_esp32s3 function)\n *      - For this specific version, the strategy we employ:\n *          > This particular filter has only the channel\n *              dimension and we have `out_ch` number of such filters.\n *          > We take 8 input lines at a time and transpose those.\n *          > Keep loading and multiplying filter values one by one,\n *              to produce 8 outputs in parallel\n *\n * 2. General version: (Refer esp_nn_conv_s8_filter_aligned_input_padded_esp32s3)\n *      - For all other cases:\n *          > Consider `filter_wd * in_ch` as a single row. These many values can\n *              be continuosly loaded from inputs as well.\n *          > multiply accumulate into a single filter output.\n *          > To speed things up further, we pre-calculate\n *              (filter * in_offset + bias term) earlier and add it at the end of filter\n *\n *      About ((filter * in_offset + bias term)) accumulate term:\n *          > The conv operation before requantization is as follows:\n *              for i in filter_size:\n *                  conv_out += (input + input_offset) * filter;\n *               conv_out += bias\n *\n *          > where input_offset is constant term hence, we can see that\n *              this term can be precalculated as:\n *                  for i in filter_size:\n *                      acc_term += input_offset * filter[i];\n *                  acc_term += bias\n *              OR\n *                   for i in filter_size:\n *                      acc_term += filter[i]; // accumulate filter values\n *                  acc_term = acc_term * input_offset + bias\n *\n *\n * In both the above versions we align 
the filter if needed, pad the input with\n *       -input_offset if needed and extend the channels to make those multiple\n *       of 8/16 as per function needs\n *\n * 3. Im2col version: (for small in_ch where filter_wd * in_ch < 16)\n *      - Inspired by ESP32-P4 im2col approach.\n *      - Instead of padding channels (wastes 81% of SIMD lanes for in_ch=3),\n *        flatten the entire filter window into one contiguous vector:\n *          window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27)\n *      - For each output pixel: copy the input window into a scratch buffer,\n *        then use ACCX dot product on the full window. No wasted MACs.\n */\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <esp_nn_defs.h>\n\n#include <common_functions.h>\n\n/* 3x3 optimized path — im2col per pixel, iterate OC with input in cache */\nextern int esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht, int in_channels);\nextern void esp_nn_conv_s8_3x3_opt(const int8_t *input,\n    const uint16_t input_wd, const uint16_t input_ht,\n    const uint16_t in_channels, const int32_t input_offset,\n    const uint16_t stride_wd, const uint16_t stride_ht,\n    const int8_t *filter_data, const int32_t *bias,\n    int8_t *out_data, const uint16_t out_wd, const uint16_t out_ht,\n    const uint16_t out_channels, const int32_t out_offset,\n    const int32_t *out_shift, const int32_t *out_mult,\n    const int32_t activation_min, const int32_t activation_max,\n    void *scratch);\n\n/* ANSI C reference conv for comparison */\nextern void esp_nn_conv_s8_ansi(const data_dims_t *input_dims,\n                                const int8_t *input_data,\n                                const data_dims_t *filter_dims,\n                                const int8_t *filter_data,\n                                const int32_t *bias,\n                                const data_dims_t *output_dims,\n                                int8_t *out_data,\n                        
        const conv_params_t *conv_params,\n                                const quant_data_t *quant_data);\n\n/* 1x1 conv — correct SIMD implementation */\nextern int esp_nn_conv_s8_1x1_scratch_size(int out_channels);\nextern void esp_nn_conv_s8_1x1(const int8_t *input,\n                                const uint16_t input_wd,\n                                const uint16_t input_ht,\n                                const uint16_t in_channels,\n                                const int32_t input_offset,\n                                const int8_t *filter_data,\n                                const int32_t *bias,\n                                int8_t *out_data,\n                                const uint16_t out_channels,\n                                const int32_t out_offset,\n                                const int32_t *out_shift,\n                                const int32_t *out_mult,\n                                const int32_t activation_min,\n                                const int32_t activation_max,\n                                void *scratch);\n\n/* Debug heap checks — enable to find buffer overruns */\n#if CONFIG_IDF_CMAKE\n#include \"esp_heap_caps.h\"\n#define CONV_HEAP_CHECK(tag) do { \\\n    if (!heap_caps_check_integrity_all(false)) { \\\n        printf(\"CONV HEAP CORRUPT: %s\\n\", tag); \\\n    } \\\n} while(0)\n#else\n#define CONV_HEAP_CHECK(tag)\n#endif\n\nstatic int16_t *scratch_buffer = NULL;\n\nextern void esp_nn_conv_s8_mult8_1x1_esp32s3(\n                const int8_t *input_data,\n                const uint16_t input_wd,\n                const uint16_t input_ht,\n                const uint16_t in_channels,\n                const int32_t input_offset,\n                const int8_t *filter_aligned,\n                const int32_t *bias,\n                int8_t *out_data,\n                const uint16_t out_wd,\n                const uint16_t out_ht,\n                const uint16_t out_channels,\n                const int32_t 
out_offset,\n                const int32_t *out_shift,\n                const int32_t *out_mult,\n                const int32_t activation_min,\n                const int32_t activation_max,\n                void *buffer /* scratch buffer */);\n\nextern void esp_nn_conv_s8_filter_aligned_input_padded_esp32s3(\n                const int8_t *input_data,\n                const uint16_t input_wd,\n                const uint16_t input_ht,\n                const uint16_t in_channels,\n                const int32_t input_offset,\n                const uint16_t stride_wd,\n                const uint16_t stride_ht,\n                const int8_t *filter_data,\n                const uint16_t filter_wd,\n                const uint16_t filter_ht,\n                const int32_t *bias,\n                int8_t *out_data,\n                const uint16_t out_wd,\n                const uint16_t out_ht,\n                const uint16_t out_channels,\n                const int32_t out_offset,\n                const int32_t *out_shift,\n                const int32_t *out_mult,\n                const int32_t activation_min,\n                const int32_t activation_max,\n                void *scratch_buffer);\n\n/* Use shared dot product from common — see esp_nn_dot_s8_esp32s3.S */\n\n/**\n * Im2col convolution for small in_ch (filter_wd * in_ch < 16).\n *\n * Instead of padding channels to 16 (wasting 81% MACs for in_ch=3),\n * flatten the entire filter window into one contiguous vector:\n *   window_len = filter_wd * filter_ht * in_ch (e.g., 3*3*3 = 27)\n *\n * For each output pixel: copy the input window into a contiguous scratch\n * buffer, then use ACCX dot product. 
No wasted MACs.\n *\n * Scratch layout: [filter_sum[out_ch] | im2col_buf[window_len_aligned]]\n */\n__attribute__ ((noinline))\nstatic void esp_nn_conv_s8_im2col_s3(\n        const data_dims_t *input_dims,\n        const int8_t *input_data,\n        const data_dims_t *filter_dims,\n        const int8_t *filter_data,\n        const int32_t *bias,\n        const data_dims_t *output_dims,\n        int8_t *out_data,\n        const conv_params_t *conv_params,\n        const quant_data_t *quant_data,\n        void *scratch)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_ch = input_dims->channels;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_ch = output_dims->channels;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    const int32_t window_len = filter_wd * filter_ht * in_ch;\n    /* Align to 16 for SIMD: zero-padded tail doesn't affect dot product */\n    const int32_t window_len_aligned = (window_len + 15) & ~15;\n    const int8_t pad_val = (int8_t)(-input_offset);\n\n    /* Scratch layout (16-byte aligned):\n     * [filter_sum: out_ch * 4]\n     * [aligned_filter: out_ch * window_len_aligned]  -- zero-padded copy\n     * [im2col_buf: window_len_aligned]\n     */\n    int32_t *filter_sum = (int32_t *)scratch;\n    int8_t *aligned_filter = (int8_t *)((uintptr_t)((int8_t *)scratch + 
out_ch * sizeof(int32_t) + 15) & ~15);\n    int8_t *im2col_buf = (int8_t *)((uintptr_t)(aligned_filter + out_ch * window_len_aligned + 15) & ~15);\n\n    /* Pre-compute filter_sum * input_offset AND copy filter with zero-padded tail */\n    const int8_t *fptr = filter_data;\n    int8_t *af_ptr = aligned_filter;\n    for (int32_t oc = 0; oc < out_ch; oc++) {\n        int32_t sum = 0;\n        for (int32_t fi = 0; fi < window_len; fi++) {\n            sum += fptr[fi];\n        }\n        filter_sum[oc] = sum * input_offset;\n        /* Copy filter + zero-pad tail for safe SIMD reads */\n        memcpy(af_ptr, fptr, window_len);\n        memset(af_ptr + window_len, 0, window_len_aligned - window_len);\n        fptr += window_len;\n        af_ptr += window_len_aligned;\n    }\n\n    /* Zero the tail of im2col buffer once (for aligned SIMD reads) */\n    memset(im2col_buf + window_len, 0, window_len_aligned - window_len);\n\n    /* Compute safe interior region where no bounds checking needed.\n     * Interior: all filter taps fall within valid input. 
*/\n    const int32_t row_bytes = filter_wd * in_ch;\n    int32_t safe_y_start = (pad_ht + stride_ht - 1) / stride_ht;\n    int32_t safe_y_end = (input_ht - filter_ht + pad_ht) / stride_ht + 1;\n    int32_t safe_x_start = (pad_wd + stride_wd - 1) / stride_wd;\n    int32_t safe_x_end = (input_wd - filter_wd + pad_wd) / stride_wd + 1;\n    if (safe_y_start > out_ht) safe_y_start = out_ht;\n    if (safe_y_end > out_ht) safe_y_end = out_ht;\n    if (safe_y_end < safe_y_start) safe_y_end = safe_y_start;\n    if (safe_x_start > out_wd) safe_x_start = out_wd;\n    if (safe_x_end > out_wd) safe_x_end = out_wd;\n    if (safe_x_end < safe_x_start) safe_x_end = safe_x_start;\n\n    /* Process each output pixel */\n    int8_t *out_ptr = out_data;\n    for (int32_t out_y = 0; out_y < out_ht; out_y++) {\n        const int32_t base_y = out_y * stride_ht - pad_ht;\n        int is_safe_y = (out_y >= safe_y_start && out_y < safe_y_end);\n\n        for (int32_t out_x = 0; out_x < out_wd; out_x++) {\n            const int32_t base_x = out_x * stride_wd - pad_wd;\n\n            /* Copy input window into contiguous im2col buffer */\n            int8_t *buf = im2col_buf;\n\n            if (is_safe_y && out_x >= safe_x_start && out_x < safe_x_end) {\n                /* FAST PATH: interior pixel — no bounds checking needed.\n                 * All filter taps guaranteed to be within valid input. 
*/\n                for (int32_t fy = 0; fy < filter_ht; fy++) {\n                    const int8_t *src = input_data + ((base_y + fy) * input_wd + base_x) * in_ch;\n                    memcpy(buf, src, row_bytes);\n                    buf += row_bytes;\n                }\n            } else {\n                /* SLOW PATH: edge pixel — per-element bounds checking */\n                for (int32_t fy = 0; fy < filter_ht; fy++) {\n                    int32_t in_y = base_y + fy;\n                    if (in_y >= 0 && in_y < input_ht) {\n                        for (int32_t fx = 0; fx < filter_wd; fx++) {\n                            int32_t in_x = base_x + fx;\n                            if (in_x >= 0 && in_x < input_wd) {\n                                const int8_t *src = input_data + (in_y * input_wd + in_x) * in_ch;\n                                memcpy(buf, src, in_ch);\n                            } else {\n                                memset(buf, pad_val, in_ch);\n                            }\n                            buf += in_ch;\n                        }\n                    } else {\n                        memset(buf, pad_val, row_bytes);\n                        buf += row_bytes;\n                    }\n                }\n            }\n\n            /* Dot product against each output channel's filter (aligned copy) */\n            const int32_t *out_mult_ptr = quant_data->mult;\n            const int32_t *out_shift_ptr = quant_data->shift;\n            const int8_t *filter_ptr = aligned_filter;\n\n            for (int32_t oc = 0; oc < out_ch; oc++) {\n                int32_t conv_out = esp_nn_dot_s8_aligned_esp32s3(im2col_buf, filter_ptr, window_len_aligned);\n                conv_out += filter_sum[oc];\n                if (bias) conv_out += bias[oc];\n                conv_out = esp_nn_requantize(conv_out, *out_mult_ptr++, *out_shift_ptr++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n    
            conv_out = min(conv_out, activation_max);\n                *out_ptr++ = (int8_t) conv_out;\n                filter_ptr += window_len_aligned;\n            }\n        }\n    }\n}\n\nint esp_nn_get_conv_scratch_size_esp32s3(const data_dims_t *input_dims,\n                                         const data_dims_t *filter_dims,\n                                         const data_dims_t *output_dims,\n                                         const conv_params_t *conv_params)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_ch = input_dims->channels;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_ch = output_dims->channels;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n\n    int new_channels = (in_ch + 7) & ~7;\n\n    int input_scratch = input_wd * input_ht * in_ch;\n    int filter_scratch = filter_wd * filter_ht * in_ch * out_ch;\n\n    int align_buf_size = 64; /* alignment (16) + assembly pre/post access margin (48) */\n    if ((filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0) &&\n            (stride_wd == 1 && stride_ht == 1)) {\n        int transpose_buf_size = 2 * (8 * new_channels);\n        if (input_wd * input_ht < 8) {\n            transpose_buf_size = 0;\n        }\n        if (in_ch % 8) {\n            input_scratch = input_wd * input_ht * new_channels;\n        } else {\n            input_scratch = 0;\n        }\n        filter_scratch = new_channels * out_ch;\n        return input_scratch + filter_scratch + transpose_buf_size + align_buf_size;\n    } else {\n        int32_t filter_row_size = filter_wd * in_ch;\n        int32_t window_len = filter_wd * filter_ht * in_ch;\n\n        /* 
Im2col path: filter_wd * in_ch < 16 but window_len >= 16 */\n        if (filter_row_size < 16 && window_len >= 16) {\n            int32_t window_len_aligned = (window_len + 15) & ~15;\n            /* filter_sum + aligned_filter_copy + im2col_buf + alignment padding */\n            int im2col_scratch = out_ch * 4 + 16 + out_ch * window_len_aligned + 16 + window_len_aligned;\n            return im2col_scratch + align_buf_size;\n        }\n\n        new_channels = (in_ch + 15) & ~15;\n        if (pad_wd == 0 && pad_ht == 0) {\n            input_scratch = 0;\n        } else {\n            input_scratch = (input_wd + 2 * pad_wd) * (input_ht + 2 * pad_ht) * in_ch;\n        }\n        filter_scratch = filter_wd * filter_ht * new_channels * out_ch;\n\n        // Account for filter alignment padding (worst case)\n        int32_t aligned_filter_row_size = ((filter_row_size + 15) / 16) * 16;\n        int filter_alignment_scratch = aligned_filter_row_size * filter_ht * out_ch;\n\n        // Account for right/bottom padding even when pad_wd=0, pad_ht=0\n        int pad_right = max(0, (output_dims->width * stride_wd + filter_wd - 1) - input_wd);\n        int pad_bottom = max(0, (output_dims->height * stride_ht + filter_ht - 1) - input_ht);\n        int boundary_padding_scratch = 0;\n        if (pad_right > 0 || pad_bottom > 0) {\n            boundary_padding_scratch = (input_wd + pad_right) * (input_ht + pad_bottom) * in_ch;\n        }\n\n        int offset_acc_scratch = out_ch * 4;\n        return input_scratch + filter_scratch + filter_alignment_scratch + boundary_padding_scratch + align_buf_size + offset_acc_scratch;\n    }\n    return align_buf_size;\n}\n\nvoid esp_nn_set_conv_scratch_buf_esp32s3(void *buf)\n{\n    scratch_buffer = (int16_t *) buf;\n}\n\nvoid esp_nn_conv_s8_esp32s3(const data_dims_t *input_dims,\n                            const int8_t *input,\n                            const data_dims_t *filter_dims,\n                            const int8_t 
*filter_data,\n                            const int32_t *bias,\n                            const data_dims_t *output_dims,\n                            int8_t *out_data,\n                            const conv_params_t *conv_params,\n                            const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_channels = output_dims->channels;\n    const int32_t *out_shift = quant_data->shift;\n    const int32_t *out_mult = quant_data->mult;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */\n    if (channels != filter_dims->channels) {\n        esp_nn_conv_s8_ansi(input_dims, input, filter_dims, filter_data,\n                            bias, output_dims, out_data, conv_params, quant_data);\n        return;\n    }\n\n    int filter_size = filter_wd * filter_ht * channels * out_channels;\n\n    /* 1x1 stride-1 conv */\n    if (filter_wd == 1 && filter_ht == 1 && pad_wd == 0 && pad_ht == 0 &&\n            stride_wd == 1 && stride_ht == 1) {\n        if (channels % 8 == 0) {\n            /* Full asm path — requires mult8 channels + 8-byte aligned filter */\n            
esp_nn_conv_s8_mult8_1x1_esp32s3(input, input_wd, input_ht, channels,\n                               input_offset, filter_data, bias, out_data,\n                               out_wd, out_ht, out_channels, out_offset,\n                               out_shift, out_mult, activation_min, activation_max,\n                               scratch_buffer);\n        } else {\n            /* Fallback: handles any alignment + any channel count */\n            esp_nn_conv_s8_1x1(input, input_wd, input_ht, channels, input_offset,\n                               filter_data, bias, out_data, out_channels, out_offset,\n                               out_shift, out_mult, activation_min, activation_max,\n                               scratch_buffer);\n        }\n        return;\n    }\n\n    if (scratch_buffer == NULL) {\n        printf(\"esp_nn_conv error! scratch_buffer not set!\\n\");\n        return;\n    }\n\n    {\n        int32_t filter_row_size = filter_wd * channels;\n        int32_t window_len = filter_wd * filter_ht * channels;\n\n        /* 3x3 optimized path: im2col per pixel, iterate OC with input in cache.\n         * TODO: fix inline asm priming + performance regression before enabling.\n         * Avoids the 128× input reload of the general aligned asm. 
*/\n#if 0\n        if (esp_nn_conv_s8_3x3_can_use(filter_wd, filter_ht, channels) &&\n                pad_wd == 0 && pad_ht == 0) {\n            esp_nn_conv_s8_3x3_opt(input, input_wd, input_ht, channels,\n                                    input_offset, stride_wd, stride_ht,\n                                    filter_data, bias, out_data,\n                                    out_wd, out_ht, out_channels, out_offset,\n                                    out_shift, out_mult, activation_min, activation_max,\n                                    (void *)scratch_buffer);\n            return;\n        }\n#endif\n\n        /* Im2col path: small in_ch where per-row SIMD is wasteful,\n         * but entire window is large enough for SIMD dot product.\n         * E.g., 3x3 conv with in_ch=3: row=9 (<16), window=27 (>=16). */\n        if (filter_row_size < 16 && window_len >= 16) {\n            esp_nn_conv_s8_im2col_s3(input_dims, input, filter_dims, filter_data,\n                                      bias, output_dims, out_data, conv_params,\n                                      quant_data, scratch_buffer);\n            return;\n        }\n\n        // align the `filter width * channels` to 16 bytes. 
Do zero padding for the same\n        int32_t filter_alignment_padding = 16 - (filter_row_size & 15);\n        int8_t *filter_data_aligned = (int8_t *) filter_data;\n        int8_t *input_padded = (int8_t *) input;\n        int8_t *scratch_data = (int8_t *) scratch_buffer;\n        int new_input_wd = input_wd, new_input_ht = input_ht;\n        if (filter_alignment_padding != 16) {\n            // pad filter_data\n            int32_t new_row_size = filter_wd * channels + filter_alignment_padding;\n            filter_data_aligned = scratch_data;\n            int8_t *row_ptr = filter_data_aligned;\n            const int8_t *filter_data_ptr = filter_data;\n            for (int32_t ch_idx = 0; ch_idx < out_channels; ch_idx++) {\n                for (int32_t row_idx = 0; row_idx < filter_ht; row_idx++) {\n                    memcpy(row_ptr, filter_data_ptr, filter_row_size);\n                    memset(row_ptr + filter_row_size, 0, new_row_size - filter_row_size);\n                    filter_data_ptr += filter_row_size;\n                    row_ptr += new_row_size;\n                }\n            }\n            scratch_data += new_row_size * filter_ht * out_channels;\n            filter_row_size = new_row_size;\n        } else if ((int) filter_data & 15) {\n            filter_data_aligned = scratch_data;\n            memcpy(filter_data_aligned, filter_data, filter_size);\n            scratch_data += filter_size;\n        }\n        // Calculate if right/bottom padding is needed even when pad_wd=0, pad_ht=0\n        // This happens when the filter extends beyond input boundaries at the edges\n        // Formula matches depthwise convolution: (out_wd * stride_wd + filter_wd - 1) - input_wd\n        int32_t pad_right = max(0, (out_wd * stride_wd + filter_wd - 1) - input_wd);\n        int32_t pad_bottom = max(0, (out_ht * stride_ht + filter_ht - 1) - input_ht);\n\n        // Apply padding if explicitly requested (pad_wd/pad_ht) OR if needed for boundary handling\n        if 
(pad_wd != 0 || pad_ht != 0) {\n            // Full padding (top, bottom, left, right) when pad_wd/pad_ht are set\n            input_padded = (int8_t *) scratch_data;\n            esp_nn_aligned_s8_pad_with_value(input, input_padded, input_wd, input_ht, channels,\n                                            -input_offset, pad_wd, pad_ht);\n            new_input_wd = input_wd + 2 * pad_wd;\n            new_input_ht = input_ht + 2 * pad_ht;\n            scratch_data += new_input_wd * new_input_ht * channels;\n        } else if (pad_right > 0 || pad_bottom > 0) {\n            // Only right/bottom padding needed for boundary handling (like depthwise conv)\n            input_padded = (int8_t *) scratch_data;\n            esp_nn_aligned_s8_pad_end_with_value(input, input_padded, input_wd, input_ht, channels,\n                                                -input_offset, (uint16_t)pad_right, (uint16_t)pad_bottom);\n            new_input_wd = input_wd + pad_right;\n            new_input_ht = input_ht + pad_bottom;\n            scratch_data += new_input_wd * new_input_ht * channels;\n        }\n\n\n        int filter_total = filter_wd * filter_ht * channels * out_channels;\n        if (input_offset != 0 && filter_total > 16384) {\n            int32_t *corrections = (int32_t *)scratch_data;\n            int32_t filter_ch_size = filter_wd * filter_ht * channels;\n            const int8_t *f_src = filter_data; // use ORIGINAL (not aligned) filter for sum\n            for (int ch = 0; ch < out_channels; ch++) {\n                int32_t filter_sum = 0;\n                for (int i = 0; i < filter_ch_size; i++) {\n                    filter_sum += f_src[i];\n                }\n                corrections[ch] = filter_sum * input_offset;\n                if (bias) {\n                    corrections[ch] += bias[ch];\n                }\n                f_src += filter_ch_size;\n            }\n            // Pass input_offset=0 to assembly so it skips its pre-computation.\n           
 // Pass scratch_data as \"bias\" pointer — the assembly's bias-copy loop\n            // will read from scratch and write to scratch (identity, no-op).\n            esp_nn_conv_s8_filter_aligned_input_padded_esp32s3(\n                input_padded, new_input_wd, new_input_ht, channels, 0,\n                stride_wd, stride_ht, filter_data_aligned, filter_wd, filter_ht,\n                (const int32_t *)scratch_data, out_data, out_wd, out_ht, out_channels,\n                out_offset, out_shift, out_mult, activation_min, activation_max,\n                scratch_data);\n            CONV_HEAP_CHECK(\"general: after asm (precomp)\");\n        } else {\n            esp_nn_conv_s8_filter_aligned_input_padded_esp32s3(\n                input_padded, new_input_wd, new_input_ht, channels, input_offset,\n                stride_wd, stride_ht, filter_data_aligned, filter_wd, filter_ht,\n                bias, out_data, out_wd, out_ht, out_channels, out_offset,\n                out_shift, out_mult, activation_min, activation_max, scratch_data);\n            CONV_HEAP_CHECK(\"general: after asm (normal)\");\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_opt.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <esp_nn_defs.h>\n#include <esp_nn_ansi_headers.h>\n\n#include <common_functions.h>\n\nint esp_nn_get_conv_scratch_size_opt(const data_dims_t *input_dims,\n                                     const data_dims_t *filter_dims,\n                                     const data_dims_t *output_dims,\n                                     const conv_params_t *conv_params)\n{\n    return 0;\n}\n\nvoid esp_nn_set_conv_scratch_buf_opt(const void *buf)\n{\n\n}\n\n__attribute__ ((noinline))\nstatic void esp_nn_conv_s8_1x1(const data_dims_t *input_dims,\n                               const int8_t *input_data,\n                               const int8_t *filter_data,\n                               const int32_t *bias,\n                               const data_dims_t *output_dims,\n                               int8_t *out_data,\n                               const conv_params_t *conv_params,\n                               const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t in_channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t out_wd = 
output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_channels = output_dims->channels;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    for (int32_t in_row = 0; in_row < out_ht * stride_ht; in_row += stride_ht) {\n        for (int32_t in_col = 0; in_col < out_wd * stride_wd; in_col += stride_wd) {\n            const int32_t *out_mult = quant_data->mult;\n            const int32_t *out_shift = quant_data->shift;\n            const int8_t *filter_ptr = filter_data;\n            const int8_t *input_base_ptr = input_data + (in_row * input_wd + in_col) * in_channels;\n            int32_t out_ch_idx = 0;\n            for (; out_ch_idx < out_channels; out_ch_idx++) {\n                int32_t conv_out = 0;\n\n                const int8_t *input_ptr = input_base_ptr;\n\n                int32_t in_ch_idx = 0;\n                for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) {\n                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                }\n                for (; in_ch_idx < in_channels; in_ch_idx ++) {\n                    conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                }\n                if (bias) {\n                    conv_out += bias[out_ch_idx];\n                }\n                conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_data++ = (int8_t) conv_out;\n            }\n        }\n    }\n}\n\n/**\n * Assumption 1: i/p channels == o/p channels\n * 
Assumption 2: Pointers are valid\n * Assumption 3: dilation width = 1\n */\nvoid esp_nn_conv_s8_opt(const data_dims_t *input_dims,\n                        const int8_t *input_data,\n                        const data_dims_t *filter_dims,\n                        const int8_t *filter_data,\n                        const int32_t *bias,\n                        const data_dims_t *output_dims,\n                        int8_t *out_data,\n                        const conv_params_t *conv_params,\n                        const quant_data_t *quant_data)\n{\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n\n    if (filter_wd == 1 && filter_ht == 1) {\n        esp_nn_conv_s8_1x1(input_dims, input_data, filter_data, bias,\n                           output_dims, out_data, conv_params, quant_data);\n        return;\n    }\n\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t in_channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t out_channels = output_dims->channels;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    /* Grouped conv (filter_ch < input_ch): fall back to ansi which handles it */\n    if (in_channels != filter_dims->channels) {\n        esp_nn_conv_s8_ansi(input_dims, input_data, filter_dims, filter_data,\n                            bias, output_dims, out_data, conv_params, quant_data);\n        return;\n    
}\n\n    int32_t out_ch_idx, out_y, out_x, filter_y_idx, filter_x_idx;\n\n    for (out_y = 0; out_y < out_ht; out_y++) {\n        for (out_x = 0; out_x < out_wd; out_x++) {\n            const int32_t *out_shift = quant_data->shift;\n            const int32_t *out_mult = quant_data->mult;\n            for (out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n                int32_t conv_out = 0;\n\n                const int32_t base_y = stride_ht * out_y - pad_ht;\n                const int32_t base_x = stride_wd * out_x - pad_wd;\n\n                const int32_t filter_y_start = max(0, -base_y);\n                const int32_t filter_x_start = max(0, -base_x);\n\n                const int32_t filter_y_end = min(filter_ht, input_ht - base_y);\n                const int32_t filter_x_end = min(filter_wd, input_wd - base_x);\n\n                for (filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                    for (filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                        const int32_t in_row = base_y + filter_y_idx;\n                        const int32_t in_col = base_x + filter_x_idx;\n\n                        const int8_t *input_ptr = input_data +\n                                        (in_row * input_wd + in_col) * in_channels;\n                        const int8_t *filter_ptr = filter_data +\n                                        out_ch_idx * in_channels * filter_ht * filter_wd +\n                                        (filter_y_idx * filter_wd + filter_x_idx) * in_channels;\n                        int32_t in_ch_idx = 0;\n                        for (; in_ch_idx < in_channels - 3; in_ch_idx += 4) {\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n              
              conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                        }\n                        for (; in_ch_idx < in_channels; in_ch_idx ++) {\n                            conv_out += (*input_ptr++ + input_offset) * *filter_ptr++;\n                        }\n                    }\n                }\n                if (bias) {\n                    conv_out += bias[out_ch_idx];\n                }\n                conv_out = esp_nn_requantize(conv_out, *out_mult++, *out_shift++);\n                conv_out += out_offset;\n                conv_out = max(conv_out, activation_min);\n                conv_out = min(conv_out, activation_max);\n                *out_data++ = (int8_t) conv_out;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_s16_mult4_1x1_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n    .literal    .nudge_val, 1073741824\n\n    # Program Unit: esp_nn_conv_s16_mult4_1x1_esp32s3\n    .type   esp_nn_conv_s16_mult4_1x1_esp32s3, @function\n    .align   4\n    .global esp_nn_conv_s16_mult4_1x1_esp32s3\nesp_nn_conv_s16_mult4_1x1_esp32s3:  # 0xa62\n    # scratch_buf = 0\n    # to_add = 32\n    # gra_spill_temp_139 = 36\n    # gra_spill_temp_140 = 40\n    # gra_spill_temp_141 = 44\n    # gra_spill_temp_155 = 48\n    # gra_spill_temp_156 = 52\n    # gra_spill_temp_144 = 56\n    # gra_spill_temp_145 = 60\n    # gra_spill_temp_146 = 64\n    # gra_spill_temp_147 = 68\n    # gra_spill_temp_148 = 72\n    # gra_spill_temp_149 = 76\n    # gra_spill_temp_150 = 80\n    # gra_spill_temp_151 = 84\n    # gra_spill_temp_152 = 88\n    # gra_spill_temp_153 = 92\n    # lgra_spill_temp_165 = 96\n    # lgra_spill_temp_166 = 100\n    # lgra_spill_temp_167 = 104\n    # lgra_spill_temp_168 = 108\n    # gra_spill_temp_158 = 112\n    # gra_spill_temp_159 = 116\n    # gra_spill_temp_160 = 120\n\n\n // registers:\n // a2: int16_t *input_data\n // a3: uint16_t input_wd\n // a4: uint16_t input_ht\n // a5: uint16_t in_channels\n // a6: int16_t *filter_data\n // a7: int32_t *bias\n\n // on stack:\n // 160: int8_t *out_data\n // 164: uint16_t out_wd\n // 168: uint16_t out_ht\n // 172: uint16_t out_channels\n 
// 176: int32_t out_offset\n // 180: int32_t *out_shift\n // 184: int32_t *out_mult\n // 188: int32_t activation_min\n // 192: int32_t activation_max\n // 196: *buffer /* scratch buffer */\n\n\n    entry   a1,160                      #\n    s32i.n  a2,a1,40                # [0]  gra_spill_temp_140\n    s32i    a6,a1,68                    # [1]  gra_spill_temp_147\n    s32i    a7,a1,116                   # [2]  gra_spill_temp_159\n\n    mul16u  a3,a3,a4                # [3]\n    addi    a10,a1,112                  # [4]\n    addmi   a11,a1,176                  # [5]\n    addmi   a8,a1,176                   # [6]\n    addmi   a9,a1,176                   # [7]\n    addi.n  a9,a9,12                # [8]\n    addi    a8,a8,16                    # [9]\n    ee.vldbc.32 q5,a11              # [10]  id:188 out_offset\n    ee.vldbc.32 q7,a8               # [12]  id:270 activation_max\n    ee.vldbc.32 q6,a9               # [13]  id:269 activation_min\n    blti    a3,4,.Lt_3_6402             # [14]\n\n.LBB3_esp_nn_conv_s16_mult4_1x1_esp32s3:    # 0xa90\n    l32i    a13,a1,160                  # [0]  id:280 out_data+0x0\n    srai    a8,a5,2                     # [1]\n    addi    a10,a3,-3                   # [2]\n    addi    a9,a5,-3                    # [3]\n    movi.n  a12,0                   # [4]\n    slli    a11,a5,2                    # [5]\n    slli    a15,a5,1                    # [6]\n    l16ui   a14,a1,172                  # [7]  id:271 out_channels+0x0\n    s32i.n  a15,a1,36               # [9]  gra_spill_temp_139\n    s32i.n  a11,a1,56               # [10]  gra_spill_temp_144\n    s32i    a12,a1,84                   # [11]  gra_spill_temp_151\n    s32i    a9,a1,52                   # [12]  gra_spill_temp_156\n    s32i.n  a10,a1,60               # [13]  gra_spill_temp_145\n    s32i    a8,a1,88                    # [14]  gra_spill_temp_152\n    movi.n  a10,0                   # [15]\n    l32i    a8,a1,196                   # [16]  id:281 buffer+0x0\n    slli    
a11,a11,1                   # [19]\n    l32i    a15,a1,184                  # [20]  id:192 out_mult+0x0\n    s32i    a11,a1,64                   # [22]  gra_spill_temp_146\n    s32i    a8,a1,112                   # [25]  gra_spill_temp_158\n    s32i    a10,a1,92                   # [26]  gra_spill_temp_153\n    movi.n  a8,0                    # [27]\n    s32i    a10,a1,80                   # [31]  gra_spill_temp_150\n    s32i    a8,a1,76                    # [32]  gra_spill_temp_149\n    slli    a8,a14,1                    # [34]\n    addx2   a9,a14,a14                  # [35]\n    s32i    a9,a1,72                    # [36]  gra_spill_temp_148\n    s32i.n  a8,a1,44                # [37]  gra_spill_temp_141\n    addx4   a14,a14,a15                 # [38]\n    s32i    a14,a1,48                  # [39]  gra_spill_temp_155\n    j   .Lt_3_6914                      # [40]\n\n.Lt_3_8194: # 0xb00\n#<loop> Part of loop body line 305, head labeled .Lt_3_6914\n    l32i.n  a12,a1,60               # [0]  gra_spill_temp_145\n    l32i.n  a9,a1,56                # [1]  gra_spill_temp_144\n    l32i    a8,a1,76                    # [2]  gra_spill_temp_149\n    l32i    a15,a1,64                   # [3]  gra_spill_temp_146\n    l32i    a11,a1,72                   # [4]  gra_spill_temp_148\n    l32i    a14,a1,84                   # [5]  gra_spill_temp_151\n    add.n   a13,a13,a11                 # [6]\n    l32i    a11,a1,80                   # [7]  gra_spill_temp_150\n    add.n   a14,a14,a15                 # [8]\n    add.n   a8,a8,a9                    # [9]\n    s32i    a8,a1,76                    # [10]  gra_spill_temp_149\n    s32i    a14,a1,84                   # [11]  gra_spill_temp_151\n    addi.n  a11,a11,4               # [12]\n    s32i    a11,a1,80                   # [13]  gra_spill_temp_150\n    bge     a11,a12,.Lt_3_6402          # [14]\n\n.Lt_3_6914: # 0xb27\n    l32i    a12,a1,52                  # [0]  gra_spill_temp_156\n    l32i    a4,a1,112                   # [1]  
gra_spill_temp_158\n    blti    a12,1,.Lt_3_7170            # [2]\n\n.LBB6_esp_nn_conv_s16_mult4_1x1_esp32s3:    # 0xb30\n    l32i    a3,a1,88                    # [0]  gra_spill_temp_152\n    l32i.n  a5,a1,40                # [1]  gra_spill_temp_140\n    l32i    a2,a1,84                    # [3]  gra_spill_temp_151\n    add.n   a2,a2,a5                    # [7]\n    l32i.n  a5,a1,36                # [9]  gra_spill_temp_139\n\n    // load and transpose 4 lines of input 4xchannels,\n    loopgtz a3,.transpose_loop_end\n    mov.n   a3,a2                       # [0*II+0]\n    ee.vld.l.64.xp  q0,a3,a5        # [0*II+2]  id:282\n    ee.vld.l.64.xp  q1,a3,a5        # [0*II+3]  id:283\n    ee.vld.l.64.xp  q2,a3,a5        # [0*II+4]  id:284\n    ee.vld.l.64.xp  q3,a3,a5        # [0*II+5]  id:285\n    ee.vzip.16      q0,q1               # [0*II+6]\n    ee.vzip.16      q2,q3               # [0*II+7]\n    ee.vzip.32      q0,q2               # [0*II+8]\n    ee.vst.128.ip   q0,a4,16            # [0*II+9]  id:286\n    ee.vst.128.ip   q2,a4,16            # [0*II+10]  id:287\n    addi.n  a2,a2,8                 # [0*II+1]\n.transpose_loop_end:\n\n.Lt_3_7170: # 0xb7c\n    l32i    a2,a1,68                    # [0]  gra_spill_temp_147\n    l32i    a9,a1,116                   # [1]  gra_spill_temp_159\n    l16ui   a8,a1,172                   # [2]  out_channels\n    s32i    a9,a1,120                   # [3]  gra_spill_temp_160\n    beqz.n  a8,.Lt_3_8194           # [4]\n\n    l32i    a9,a1,180                # [0]  out_shift\n    l32i    a11,a1,184               # [1]  out_mult\n    l32i    a15,a1,72                   # [2]  gra_spill_temp_148\n    l32i.n  a14,a1,44               # [3]  gra_spill_temp_141\n    add.n   a15,a15,a13                 # [4]\n    add.n   a14,a14,a13                 # [5]\n    j   .Lt_3_8706                      # [6]\n\n.Lt_3_10754:    # 0xb9a\n\n    movi.n  a3,0                    # [0]\n\n.Lt_3_10498:    # 0xb9c\n\n// 
esp_nn_multiply_by_quantized_mult_esp32s3\n    ee.zero.q   q0                      # [0]\n    l32i        a5,a1,92                    # [1]  gra_spill_temp_153\n    s32i        a2,a1,96                   # [2]  lgra_spill_temp_165\n    s32i        a11,a1,104                  # [3]  lgra_spill_temp_167\n    s32i        a13,a1,108                  # [4]  lgra_spill_temp_168\n    s32i        a9,a1,100                   # [5]  lgra_spill_temp_166\n\n    movi.n          a13,0                   # [6]\n    max             a12,a12,a13                 # [7]\n    wsr.sar         a12                     # [8]\n    ee.vsl.32       q1,q1                   # [9]\n    ssai            31                          # [10]\n    ee.movi.32.a    q1,a7,0             # [11]\n    ee.movi.32.a    q1,a8,1             # [12]\n    ee.movi.32.a    q1,a6,3             # [13]\n    ee.movi.32.a    q1,a9,2             # [14]\n    mulsh           a12,a4,a9                   # [15]\n    mulsh           a11,a4,a6                   # [16]\n    mulsh           a2,a4,a8                    # [17]\n    mulsh           a13,a7,a4                   # [18]\n    mull            a8,a4,a8                    # [19]\n    mull            a7,a7,a4                    # [20]\n    mull            a6,a4,a6                    # [24]\n\n    add.n           a11,a5,a11                  # [21]\n    add.n           a12,a5,a12                  # [22]\n    add.n           a2,a5,a2                    # [23]\n    add.n           a5,a5,a13                   # [25]\n\n    l32r            a13,.nudge_val\n    mull            a9,a4,a9                    # [27]\n\n    add.n           a6,a13,a6                   # [28]\n    add.n           a9,a13,a9                   # [29]\n    add.n           a10,a13,a7                   # [30]\n    add.n           a8,a13,a8                   # [32]\n\n    saltu           a7,a10,a13                   # [33]\n    add.n           a7,a7,a5                    # [34]\n    saltu           a5,a8,a13           
        # [35]\n    add.n           a5,a5,a2                    # [36]\n    src             a5,a5,a8                    # [37]\n    saltu           a2,a9,a13                   # [38]\n    add.n           a2,a2,a12                   # [40]\n    saltu           a13,a6,a13                  # [41]\n    addi.n          a12,a3,-1               # [42]\n    src             a2,a2,a9                    # [43]\n    ee.movi.32.q    q3,a5,1             # [51]\n    ee.movi.32.q    q3,a2,2             # [54]\n\n    add.n           a13,a13,a11                 # [44]\n    addi            a9,a1,32                    # [45]  to_add\n    movi.n          a11,1                   # [46]\n    src             a7,a7,a10                    # [47]\n    src             a13,a13,a6                  # [48]\n    ee.movi.32.q    q3,a7,0             # [50]\n    ee.movi.32.q    q3,a13,3            # [57]\n\n    addi            a8,a1,112                   # [49]\n\n    l32i            a7,a1,48                   # [52]  gra_spill_temp_155\n    l16ui           a5,a1,172                   # [53]  out_channels\n    ssl             a12                         # [55]\n    sll             a11,a11                     # [56]\n    wsr.sar         a3                      # [58]\n    ee.vcmp.lt.s32  q0,q3,q0        # [59]\n    l32i            a13,a1,108                  # [60]  lgra_spill_temp_168\n    s32i.n          a11,a1,32               # [61]  to_add\n    ee.vldbc.32     q1,a9               # [62]  id:317 to_add\n    add.n           a5,a5,a13                   # [63]\n    l32i            a9,a1,100                   # [64]  lgra_spill_temp_166\n    ee.vadds.s32    q1,q1,q0            # [65]\n    addi.n          a9,a9,4                 # [66]\n    ee.vadds.s32    q1,q3,q1            # [67]\n    ee.vsr.32       q1,q1                   # [69]\n\n# add offset, apply activation and store\n    ee.vadds.s32    q1,q1,q5            # [70]\n    ee.vmin.s32     q1,q1,q7            # [72]\n    ee.vmax.s32     q1,q1,q6   
         # [73]\n    ee.vst.128.ip   q1,a1,0             # [74]  id:320\n    l8ui        a6,a1,0                     # [75]  scratch_buf\n    s8i         a6,a13,0                    # [76]\n    addi.n      a13,a13,1               # [77]\n    l8ui        a2,a1,4                     # [78]  scratch_buf+4\n    s8i         a2,a5,0                     # [79]\n    l8ui        a12,a1,8                    # [80]  scratch_buf+8\n    l32i        a2,a1,96                   # [81]  lgra_spill_temp_165\n    s8i         a12,a14,0                   # [82]\n    addi.n      a14,a14,1               # [83]\n    l8ui        a11,a1,12                   # [84]  scratch_buf+12\n    s8i         a11,a15,0                   # [85]\n    l32i        a11,a1,104                  # [86]  lgra_spill_temp_167\n    addi.n      a15,a15,1               # [87]\n    addi.n      a11,a11,4               # [88]\n    sub         a7,a11,a7                   # [89]\n    beqz        a7,.Lt_3_8194               # [90]\n\n.Lt_3_8706: # 0xc97\n    ee.zero.qacc                    # [0]\n    l32i    a8,a1,52                   # [1]  gra_spill_temp_156\n    l32i    a3,a1,112                   # [2]  gra_spill_temp_158\n    blti    a8,1,.Lt_3_8962             # [3]\n\n    l32i    a4,a1,88                    # [0]  gra_spill_temp_152\n    loopgtz a4,.LBB53_esp_nn_conv_s16_mult4_1x1_esp32s3     # [2]\n\n    ee.vld.l.64.ip          q0,a2,8         # [0*II+0]  id:289\n    ee.vld.l.64.ip          q1,a3,8         # [0*II+1]  id:290\n    ee.vld.l.64.ip          q2,a3,8         # [0*II+2]  id:291\n    ee.vsmulas.s16.qacc     q1,q0,0     # [0*II+3]\n    ee.vld.l.64.ip          q3,a3,8         # [0*II+4]  id:292\n    ee.vsmulas.s16.qacc     q2,q0,1     # [0*II+5]\n    ee.vld.l.64.ip          q4,a3,8         # [0*II+6]  id:293\n    ee.vsmulas.s16.qacc     q3,q0,2     # [0*II+7]\n    ee.vsmulas.s16.qacc     q4,q0,3     # [0*II+8]\n\n.LBB53_esp_nn_conv_s16_mult4_1x1_esp32s3:   # 0xcc4\n\n.Lt_3_8962: # 0xcc4\n\n// extract data:\n 
   mov     a10,a1\n    ee.st.qacc_l.l.128.ip   a10,16      # [0]  id:298\n    ee.st.qacc_l.h.32.ip    a10,-16     # [1]  id:299\n    l8ui    a12,a1,16                   # [2]  scratch_buf+16\n    l8ui    a8,a1,6                     # [3]  scratch_buf+6\n    s8i     a8,a1,3                     # [4]  scratch_buf+3\n    s8i     a12,a1,7                    # [5]  scratch_buf+7\n    l8ui    a8,a1,15                    # [6]  scratch_buf+15\n    l8ui    a12,a1,5                    # [7]  scratch_buf+5\n    s8i     a12,a1,2                    # [8]  scratch_buf+2\n    s8i     a8,a1,6                     # [9]  scratch_buf+6\n    l16ui   a12,a1,10                   # [10]  scratch_buf+10\n    movi.n  a8,16                   # [11]\n    ee.srcmb.s16.qacc   q2,a8,0         # [12]\n    s16i                a12,a1,4                    # [13]  scratch_buf+4\n    ee.vld.l.64.ip      q1,a10,0        # [14]  id:309\n    l32i                a12,a1,116                  # [15]  gra_spill_temp_159, bias\n    ee.vzip.16          q1,q2               # [16]\n\n    beqz.n  a12,.Lt_3_9986          # [17] // skip bias\n // add bias:\n    l32i            a8,a1,120                   # [0]  gra_spill_temp_160\n    ee.vldbc.32.ip  q0,a8,4         # [2]  id:311\n    s32i            a8,a1,120                   # [3]  gra_spill_temp_160\n    ee.vadds.s32    q1,q1,q0            # [4]\n.Lt_3_9986: # 0xd04\n\n    l32i.n  a12,a9,0                # [0]  id:313\n    l32i.n  a4,a11,0                # [1]  id:312\n    bgei    a12,1,.Lt_3_10754           # [2]\n\n    neg     a3,a12                      # [0]\n    j       .Lt_3_10498                     # [1]\n\n.Lt_3_6402: # 0xd11\n    retw.n                          # [0]\n\n    .size   esp_nn_conv_s16_mult4_1x1_esp32s3, . - esp_nn_conv_s16_mult4_1x1_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_s16_mult8_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n    .literal    .LC10_28_153, -2147483648\n    .literal    .LC11_28_154, -1073741823\n    .literal    .LC12_28_155, 2147483647\n    .literal    .LC13_28_156, 1073741824\n\n    # Program Unit: esp_nn_conv_s16_mult8_esp32s3\n    .type   esp_nn_conv_s16_mult8_esp32s3, @function\n    .align   4\n    .global esp_nn_conv_s16_mult8_esp32s3\nesp_nn_conv_s16_mult8_esp32s3:  # 0x6e2\n    # qacc_scratch = 0\n    # gra_spill_temp_96 = 48\n    # gra_spill_temp_97 = 52\n    # gra_spill_temp_98 = 56\n    # gra_spill_temp_99 = 60\n    # gra_spill_temp_100 = 64\n    # gra_spill_temp_101 = 68\n    # gra_spill_temp_102 = 72\n    # gra_spill_temp_103 = 76\n    # gra_spill_temp_104 = 80\n    # gra_spill_temp_105 = 84\n    # gra_spill_temp_106 = 88\n    # gra_spill_temp_107 = 92\n    # gra_spill_temp_108 = 96\n    # gra_spill_temp_109 = 100\n    # gra_spill_temp_110 = 104\n    # gra_spill_temp_111 = 108\n    # gra_spill_temp_112 = 112\n    # gra_spill_temp_113 = 116\n    # gra_spill_temp_114 = 120\n    # gra_spill_temp_115 = 124\n    # gra_spill_temp_116 = 128\n    # gra_spill_temp_117 = 132\n    # gra_spill_temp_118 = 136\n    # gra_spill_temp_119 = 140\n    # gra_spill_temp_120 = 144\n    # gra_spill_temp_121 = 148\n    # gra_spill_temp_122 = 152\n    # gra_spill_temp_123 = 156\n    # gra_spill_temp_124 = 160\n 
   # gra_spill_temp_125 = 164\n    # gra_spill_temp_126 = 168\n    # gra_spill_temp_127 = 172\n    # gra_spill_temp_128 = 176\n    # gra_spill_temp_129 = 180\n    # gra_spill_temp_130 = 184\n    # gra_spill_temp_131 = 188\n    # gra_spill_temp_132 = 192\n    # gra_spill_temp_133 = 196\n    # gra_spill_temp_134 = 200\n    # gra_spill_temp_135 = 204\n    # gra_spill_temp_136 = 208\n    # gra_spill_temp_137 = 212\n\n // registers:\n // a2: const int16_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t in_channels\n // a6: const uint16_t pad_wd\n // a7: const uint16_t pad_ht\n\n // on stack:\n // const uint16_t stride_wd\n // const uint16_t stride_ht\n // const int16_t *filter_data\n // const uint16_t filter_wd\n // const uint16_t filter_ht\n // const int32_t *bias\n // int8_t *out_data\n // const uint16_t out_wd\n // const uint16_t out_ht\n // const uint16_t out_channels\n // const int32_t out_offset\n // const int32_t *out_shift\n // const int32_t *out_mult\n // const int32_t activation_min\n // const int32_t activation_max\n\n\n    entry   a1,256                      #\n    s32i    a2,a1,176                   # [0]  gra_spill_temp_128\n    s32i    a3,a1,192                   # [1]  gra_spill_temp_132\n    s32i.n  a6,a1,60                # [2]  gra_spill_temp_99\n    l16ui   a8,a1,288                   # [3]  id:282 out_ht+0x0\n    s32i    a8,a1,68                    # [4]  gra_spill_temp_101\n    beqz.n  a8,.Lt_2_11778          # [5]\n\n    s32i    a7,a1,76                    # [0]  gra_spill_temp_103\n    s32i    a1,a1,156                   # [1]  gra_spill_temp_123\n    l16ui   a8,a1,272                   # [2]  id:285 filter_ht+0x0\n    neg     a11,a7                      # [3]\n    movi.n  a12,0                   # [4]\n    neg     a14,a6                      # [5]\n    l16ui   a15,a1,268                  # [6]  id:286 filter_wd+0x0\n    l16ui   a9,a1,292                   # [7]  id:283 out_channels+0x0\n    
l32i    a10,a1,304                  # [8]  id:284 out_mult+0x0\n    s32i    a10,a1,88                   # [9]  gra_spill_temp_106\n    s32i    a9,a1,96                    # [10]  gra_spill_temp_108\n    s32i    a15,a1,196                  # [11]  gra_spill_temp_133\n    s32i.n  a14,a1,48               # [12]  gra_spill_temp_96\n    s32i    a12,a1,72                   # [13]  gra_spill_temp_102\n    s32i    a11,a1,80                   # [14]  gra_spill_temp_104\n    s32i.n  a8,a1,52                # [15]  gra_spill_temp_97\n    sub     a13,a3,a14                  # [16]\n    mul16u  a8,a5,a8                # [17]\n    s32i.n  a13,a1,56               # [18]  gra_spill_temp_98\n    sub     a11,a4,a11                  # [19]\n    l32i    a12,a1,276                  # [20]  id:292 bias+0x0\n    s32i    a12,a1,152                  # [21]  gra_spill_temp_122\n    s32i    a11,a1,84                   # [22]  gra_spill_temp_105\n    l32i    a14,a1,308                  # [23]  id:290 activation_min+0x0\n    l32i    a13,a1,312                  # [24]  id:291 activation_max+0x0\n    s32i    a13,a1,144                  # [25]  gra_spill_temp_120\n    mull    a15,a15,a8                  # [26]\n    addx4   a9,a9,a10                   # [27]\n    s32i    a14,a1,140                  # [28]  gra_spill_temp_119\n    l32i    a11,a1,300                  # [29]  id:293 out_shift+0x0\n    s32i    a11,a1,92                   # [30]  gra_spill_temp_107\n    slli    a14,a5,1                    # [31]\n    s32i    a9,a1,124                   # [32]  gra_spill_temp_115\n    s32i    a15,a1,128                  # [33]  gra_spill_temp_116\n    l32i    a8,a1,280                   # [34]  id:288 out_data+0x0\n    movi.n  a10,0                   # [35]\n    s32i    a10,a1,160                  # [36]  gra_spill_temp_124\n    s32i    a8,a1,132                   # [37]  gra_spill_temp_117\n    l32i    a15,a1,296                  # [38]  id:289 out_offset+0x0\n    l32i    a9,a1,264                   # 
[39]  id:287 filter_data+0x0\n    s32i    a9,a1,180                   # [40]  gra_spill_temp_129\n    s32i    a15,a1,136                  # [41]  gra_spill_temp_118\n    l16ui   a8,a1,284                   # [42]  id:296 out_wd+0x0\n    l16ui   a10,a1,256                  # [43]  id:294 stride_wd+0x0\n    s32i    a10,a1,100                  # [44]  gra_spill_temp_109\n    s32i    a8,a1,104                   # [45]  gra_spill_temp_110\n    addi.n  a15,a5,-1               # [46]\n    l16ui   a9,a1,260                   # [47]  id:295 stride_ht+0x0\n    s32i    a9,a1,64                    # [48]  gra_spill_temp_100\n    srai    a15,a15,3                   # [49]\n    j   .Lt_2_12290                     # [50]\n\n.Lt_2_12546:    # 0x788\n    l32i    a8,a1,68                    # [0]  gra_spill_temp_101\n    l32i    a12,a1,80                   # [1]  gra_spill_temp_104\n    l32i    a11,a1,84                   # [2]  gra_spill_temp_105\n    l32i    a10,a1,64                   # [3]  gra_spill_temp_100\n    l32i    a13,a1,72                   # [4]  gra_spill_temp_102\n    l32i    a9,a1,76                    # [5]  gra_spill_temp_103\n    addi.n  a13,a13,1               # [6]\n    s32i    a13,a1,72                   # [7]  gra_spill_temp_102\n    sub     a9,a9,a10                   # [8]\n    sub     a11,a11,a10                 # [9]\n    add.n   a12,a12,a10                 # [10]\n    s32i    a12,a1,80                   # [11]  gra_spill_temp_104\n    s32i    a11,a1,84                   # [12]  gra_spill_temp_105\n    s32i    a9,a1,76                    # [13]  gra_spill_temp_103\n    sub     a13,a13,a8                  # [14]\n    beqz    a13,.Lt_2_11778             # [15]\n\n.Lt_2_12290:    # 0x7b6 // width loop\n    l32i    a13,a1,104                  # [0]  gra_spill_temp_110\n    beqz.n  a13,.Lt_2_12546         # [2]\n\n    l32i    a8,a1,192                   # [0]  gra_spill_temp_132\n    l32i    a9,a1,80                    # [1]  gra_spill_temp_104\n    movi.n  
a11,0                   # [2]\n    l32i    a10,a1,76                   # [3]  gra_spill_temp_103\n    l32i.n  a12,a1,60               # [4]  gra_spill_temp_99\n    l32i.n  a13,a1,56               # [5]  gra_spill_temp_98\n    s32i    a13,a1,116                  # [6]  gra_spill_temp_113\n    s32i    a12,a1,112                  # [7]  gra_spill_temp_112\n    max     a10,a10,a11                 # [8]\n    s32i    a10,a1,148                  # [9]  gra_spill_temp_121\n    add.n   a9,a9,a10                   # [10]\n    l32i.n  a11,a1,48               # [11]  gra_spill_temp_96\n    s32i    a11,a1,184                  # [12]  gra_spill_temp_130\n    mull    a8,a8,a9                    # [13]\n    l32i    a10,a1,84                   # [14]  gra_spill_temp_105\n    s32i    a8,a1,120                   # [15]  gra_spill_temp_114\n    l32i.n  a9,a1,52                # [16]  gra_spill_temp_97\n    movi.n  a8,0                    # [17]\n    s32i    a8,a1,108                   # [18]  gra_spill_temp_111\n    min     a9,a9,a10                   # [19]\n    s32i    a9,a1,204                   # [20]  gra_spill_temp_135\n    j   .Lt_2_13058                     # [21]\n\n.Lt_2_13314:    # 0x7f6\n#<loop> Part of loop body line 186, head labeled .Lt_2_13058\n    l32i    a13,a1,104                  # [0]  gra_spill_temp_110\n    l32i    a11,a1,112                  # [1]  gra_spill_temp_112\n    l32i    a10,a1,184                  # [2]  gra_spill_temp_130\n    l32i    a9,a1,100                   # [3]  gra_spill_temp_109\n    l32i    a12,a1,108                  # [4]  gra_spill_temp_111\n    l32i    a8,a1,116                   # [5]  gra_spill_temp_113\n    addi.n  a12,a12,1               # [6]\n    s32i    a12,a1,108                  # [7]  gra_spill_temp_111\n    sub     a8,a8,a9                    # [8]\n    add.n   a10,a10,a9                  # [9]\n    sub     a11,a11,a9                  # [10]\n    s32i    a11,a1,112                  # [11]  gra_spill_temp_112\n    s32i    
a10,a1,184                  # [12]  gra_spill_temp_130\n    s32i    a8,a1,116                   # [13]  gra_spill_temp_113\n    beq     a12,a13,.Lt_2_12546         # [14]\n\n.Lt_2_13058:    # 0x821 // channel loop\n    l32i    a12,a1,96                   # [0]  gra_spill_temp_108\n    beqz.n  a12,.Lt_2_13314         # [2]\n\n    movi.n  a11,0                   # [0]\n    l32i    a10,a1,112                  # [1]  gra_spill_temp_112\n    l32i    a13,a1,92                   # [2]  gra_spill_temp_107\n    l32i    a8,a1,152                   # [3]  gra_spill_temp_122\n    movi.n  a9,0                    # [4]\n    l32i    a12,a1,88                   # [5]  gra_spill_temp_106\n    s32i    a12,a1,168                  # [6]  gra_spill_temp_126\n    s32i    a9,a1,188                   # [7]  gra_spill_temp_131\n    s32i    a8,a1,164                   # [8]  gra_spill_temp_125\n    s32i    a13,a1,172                  # [9]  gra_spill_temp_127\n    l32i    a8,a1,116                   # [10]  gra_spill_temp_113\n    l32i    a13,a1,196                  # [11]  gra_spill_temp_133\n    max     a10,a10,a11                 # [12]\n    s32i    a10,a1,208                  # [13]  gra_spill_temp_136\n    min     a13,a13,a8                  # [14]\n    s32i    a13,a1,200                  # [15]  gra_spill_temp_134\n    j   .Lt_2_13826                     # [16]\n\n.Lt_2_14082:    # 0x857\n\n// extract data\n    l32i    a4,a1,156                   # [0]  gra_spill_temp_123\n    ee.st.qacc_l.l.128.ip   a4,16       # [2]  id:303\n    ee.st.qacc_l.h.32.ip    a4,0        # [3]  id:304\n    l8ui    a9,a1,15                    # [4]  qacc_scratch+15\n    l16ui   a8,a1,10                    # [5]  qacc_scratch+10\n    l8ui    a12,a1,16                   # [6]  qacc_scratch+16\n    l8ui    a11,a1,6                    # [7]  qacc_scratch+6\n    l8ui    a10,a1,5                    # [8]  qacc_scratch+5\n    s8i     a10,a1,2                    # [9]  qacc_scratch+2\n    s8i     a11,a1,3           
         # [10]  qacc_scratch+3\n    s8i     a12,a1,7                    # [11]  qacc_scratch+7\n    s16i    a8,a1,4                     # [12]  qacc_scratch+4\n    s8i     a9,a1,6                     # [13]  qacc_scratch+6\n\n    ee.st.qacc_h.l.128.ip   a4,16       # [14]  id:314\n    ee.st.qacc_h.h.32.ip    a4,-32      # [15]  id:315\n    l8ui    a13,a1,32                   # [16]  qacc_scratch+32\n    l8ui    a9,a1,21                    # [17]  qacc_scratch+21\n    l8ui    a12,a1,31                   # [18]  qacc_scratch+31\n    l16ui   a11,a1,26                   # [19]  qacc_scratch+26\n    l8ui    a10,a1,22                   # [20]  qacc_scratch+22\n    l16ui   a8,a1,16                    # [21]  qacc_scratch+16\n    s16i    a8,a1,8                     # [22]  qacc_scratch+8\n    s8i     a10,a1,11                   # [23]  qacc_scratch+11\n    s16i    a11,a1,12                   # [24]  qacc_scratch+12\n    s8i     a12,a1,14                   # [25]  qacc_scratch+14\n    s8i     a9,a1,10                    # [26]  qacc_scratch+10\n    s8i     a13,a1,15                   # [27]  qacc_scratch+15\n\n    l32i    a9,a1,152                   # [28]  gra_spill_temp_122, bias\n    movi.n  a13,16                  # [29]\n    ee.srcmb.s16.qacc   q1,a13,0        # [30]\n    ee.vld.128.ip   q0,a4,0             # [31]  id:327\n    s32i            a4,a1,156                   # [32]  gra_spill_temp_123\n    ee.vzip.16      q0,q1               # [33]\n    ee.vadds.s32    q0,q0,q1            # [34]\n    ee.movi.32.a    q0,a12,3            # [35]\n    ee.movi.32.a    q0,a11,2            # [36]\n    ee.movi.32.a    q0,a10,0            # [37]\n    add.n           a11,a11,a12                 # [38]\n    ee.movi.32.a    q0,a12,1            # [39]\n    add.n           a10,a10,a12                 # [40]\n    add.n           a10,a10,a11                 # [41]\n\n    beqz.n  a9,.Lt_2_17154          # [42] // skip bias\n\n    l32i    a13,a1,164                  # [0]  
gra_spill_temp_125\n    l32i.n  a13,a13,0               # [2]  id:329\n    add.n   a10,a10,a13                 # [4]\n.Lt_2_17154:    # 0x8d7\n\n # 259                  conv_out = esp_nn_multiply_by_quantized_mult(conv_out, out_mult[out_ch_idx], out_shift[out_ch_idx]);\n    l32i    a11,a1,172                  # [0]  gra_spill_temp_127\n    l32i    a4,a1,168                   # [1]  gra_spill_temp_126\n    l32i.n  a11,a11,0               # [2]  id:331\n    l32i.n  a4,a4,0                 # [3]  id:330\n\n    blti    a11,1,.LBB26_esp_nn_conv_s16_mult8_esp32s3  # [4]\n    movi.n  a13,0                   # [0]\n    j       .Lt_2_17666                     # [1]\n.LBB26_esp_nn_conv_s16_mult8_esp32s3:   # 0xa4e\n    neg     a13,a11                     # [0]\n.Lt_2_17666:    # 0x8e6\n\n    movi.n  a12,0                   # [0]\n    max     a12,a11,a12                 # [1]\n    movi.n  a11,0                   # [2]\n    ssl     a12                         # [3]\n    sll     a10,a10                     # [4]\n    bne     a10,a4,.Lt_2_20994          # [5]\n\n    l32r    a9,.LC10_28_153             # [0]\n    movi.n  a8,1                    # [1]\n    sub     a9,a10,a9                   # [2]\n    moveqz  a11,a8,a9               # [3]\n\n.Lt_2_20994:    # 0x901\n    extui   a8,a4,31,1                  # [0]\n    extui   a12,a10,31,1                # [1]\n    xor     a12,a12,a8                  # [2]\n    extui   a12,a12,0,8                 # [3]\n\n    beqz.n  a12,.Lt_2_18434         # [4]\n    movi.n  a12,-1                  # [0]\n    l32r    a9,.LC11_28_154             # [1]\n    j       .Lt_2_18178                     # [2]\n\n.Lt_2_18434:    # 0xa54\n    movi.n  a12,0                   # [0]\n    l32r    a9,.LC13_28_156             # [1]\n.Lt_2_18178:    # 0x914\n\n    ssai    31                          # [0]\n    l32r    a8,.LC12_28_155             # [1]\n    mulsh   a6,a4,a10                   # [2]\n    mull    a4,a4,a10                   # [3]\n    add.n   a6,a6,a12 
                  # [4]\n    add.n   a7,a4,a9                    # [5]\n    saltu   a4,a7,a4                    # [6]\n    add.n   a4,a4,a6                    # [7]\n    srai    a6,a4,31                    # [8]\n    and     a6,a6,a8                    # [9]\n    add.n   a7,a6,a7                    # [10]\n    srai    a3,a6,31                    # [11]\n    add.n   a3,a3,a4                    # [12]\n    saltu   a6,a7,a6                    # [13]\n    add.n   a6,a6,a3                    # [14]\n    src     a6,a6,a7                    # [15]\n    extui   a3,a11,0,8                  # [16]\n    movi.n  a7,1                    # [17]\n    ssr     a13                         # [18]\n    movnez  a6,a8,a3                # [19]\n    sra     a8,a6                       # [20]\n\n    addi.n  a3,a8,1                 # [21]\n    ssl     a13                         # [22]\n    sll     a7,a7                       # [23]\n    extui   a4,a8,31,1                  # [24]\n    addi.n  a7,a7,-1                # [25]\n    and     a6,a6,a7                    # [26]\n    srai    a7,a7,1                     # [27]\n    add.n   a4,a4,a7                    # [28]\n    l32i    a7,a1,164                   # [29]  gra_spill_temp_125\n    salt    a4,a4,a6                    # [30]\n    movnez  a8,a3,a4                # [31]\n    l32i    a6,a1,172                   # [32]  gra_spill_temp_127\n    l32i    a4,a1,132                   # [33]  gra_spill_temp_117\n    l32i    a3,a1,160                   # [34]  gra_spill_temp_124\n    addi.n  a7,a7,4                 # [35]\n    s32i    a7,a1,164                   # [36]  gra_spill_temp_125\n    addi.n  a6,a6,4                 # [37]\n    s32i    a6,a1,172                   # [38]  gra_spill_temp_127\n    l32i    a7,a1,136                   # [39]  gra_spill_temp_118\n    l32i    a6,a1,140                   # [40]  gra_spill_temp_119\n    add.n   a4,a3,a4                    # [41]\n    add.n   a7,a7,a8                    # [42]\n    addi.n  a3,a3,1   
              # [43]\n    l32i    a8,a1,128                   # [44]  gra_spill_temp_116\n    max     a6,a6,a7                    # [45]\n    s32i    a3,a1,160                   # [46]  gra_spill_temp_124\n    l32i    a7,a1,188                   # [47]  gra_spill_temp_131\n    l32i    a3,a1,144                   # [48]  gra_spill_temp_120\n    add.n   a7,a7,a8                    # [49]\n    min     a3,a3,a6                    # [50]\n    s8i     a3,a4,0                     # [51]  id:332\n    s32i    a7,a1,188                   # [52]  gra_spill_temp_131\n    l32i    a4,a1,168                   # [53]  gra_spill_temp_126\n    l32i    a6,a1,124                   # [54]  gra_spill_temp_115\n    addi.n  a4,a4,4                 # [55]\n    s32i    a4,a1,168                   # [56]  gra_spill_temp_126\n    sub     a4,a4,a6                    # [57]\n    beqz    a4,.Lt_2_13314              # [58]\n\n.Lt_2_13826:    # 0x9b4\n    ee.zero.qacc                    # [0]\n    l32i    a9,a1,204                   # [1]  gra_spill_temp_135\n    l32i    a8,a1,148                   # [2]  gra_spill_temp_121\n    s32i    a8,a1,212                   # [3]  gra_spill_temp_137\n    bge     a8,a9,.Lt_2_14082           # [4]\n\n.LBB12_esp_nn_conv_s16_mult8_esp32s3:   # 0x9c3\n#<loop> Part of loop body line 187, head labeled .Lt_2_13826\n    l32i    a8,a1,196                   # [0]  gra_spill_temp_133\n    l32i    a7,a1,212                   # [1]  gra_spill_temp_137\n    l32i    a13,a1,200                  # [2]  gra_spill_temp_134\n    mull    a7,a7,a8                    # [3]\n    l32i    a6,a1,120                   # [4]  gra_spill_temp_114\n    add.n   a13,a7,a13                  # [5]\n    j   .Lt_2_14594                     # [6]\n\n.Lt_2_14850:    # 0x9d7\n#<loop> Part of loop body line 201, head labeled .Lt_2_14594\n    l32i    a9,a1,204                   # [0]  gra_spill_temp_135\n    l32i    a10,a1,212                  # [1]  gra_spill_temp_137\n    l32i    a12,a1,192         
         # [2]  gra_spill_temp_132\n    l32i    a11,a1,196                  # [3]  gra_spill_temp_133\n    add.n   a6,a6,a12                   # [4]\n    add.n   a7,a7,a11                   # [5]\n    add.n   a13,a13,a11                 # [6]\n    addi.n  a10,a10,1               # [7]\n    s32i    a10,a1,212                  # [8]  gra_spill_temp_137\n    sub     a9,a9,a10                   # [9]\n    beqz    a9,.Lt_2_14082              # [10]\n\n.Lt_2_14594:    # 0x9f4\n    l32i    a9,a1,200                   # [0]  gra_spill_temp_134\n    l32i    a8,a1,208                   # [1]  gra_spill_temp_136\n    bge     a8,a9,.Lt_2_14850           # [3]\n\n    l32i    a11,a1,176                  # [0]  gra_spill_temp_128\n    l32i    a10,a1,184                  # [1]  gra_spill_temp_130\n    add.n   a12,a7,a8                   # [2]\n    add.n   a10,a10,a8                  # [3]\n    add.n   a10,a6,a10                  # [4]\n    mull    a10,a5,a10                  # [5]\n    mull    a8,a12,a5                   # [6]\n    addx2   a10,a10,a11                 # [7]\n    l32i    a11,a1,188                  # [8]  gra_spill_temp_131\n    add.n   a11,a11,a8                  # [10]\n    l32i    a8,a1,180                   # [11]  gra_spill_temp_129\n    mov.n   a2,a10                      # [12]\n    addx2   a11,a11,a8                  # [13]\n    movi.n  a8,8                    # [14]\n    mov.n   a3,a11                      # [15]\n    j   .Lt_2_15362                     # [16]\n\n.LBB18_esp_nn_conv_s16_mult8_esp32s3:   # 0xa26\n    loopgtz a15,.LBB54_esp_nn_conv_s16_mult8_esp32s3    # [0]\n\n    ee.vmulas.s16.qacc.ld.ip    q0,a2,16,q0,q1  # [0*II+0]  id:300\n    ee.vld.128.ip   q1,a3,16            # [0*II+1]  id:301\n.LBB54_esp_nn_conv_s16_mult8_esp32s3:   # 0xa30\n\n.Lt_2_15618:    # 0xa30\n    ee.vmulas.s16.qacc  q0,q1       # [0]\n    movi.n  a8,8                    # [1]\n    add.n   a10,a10,a14                 # [2]\n    add.n   a11,a11,a14                 # [3]\n    
mov.n   a3,a11                      # [4]\n    mov.n   a2,a10                      # [5]\n    beq     a12,a13,.Lt_2_14850         # [6]\n\n.Lt_2_15362:    # 0xa40\n    ee.vld.128.ip   q1,a3,16            # [0]  id:299\n    ee.vld.128.ip   q0,a2,16            # [1]  id:298\n    addi.n  a12,a12,1               # [2]\n    bltu    a8,a5,.LBB18_esp_nn_conv_s16_mult8_esp32s3  # [3]\n\n    j   .Lt_2_15618                     # [0]\n\n.Lt_2_11778:    # 0xa5c\n    retw.n                          # [0]\n\n    .size   esp_nn_conv_s16_mult8_esp32s3, . - esp_nn_conv_s16_mult8_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_1x1_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * 1x1 convolution for ESP32-S3 using transpose + parallel MAC.\n * Processes 8 spatial positions simultaneously via QACC lanes.\n */\n\n#include <stdint.h>\n#include <string.h>\n#include <esp_nn_defs.h>\n#include <common_functions.h>\n\nint esp_nn_conv_s8_1x1_scratch_size(int out_channels)\n{\n    /* Transpose buffer: 8 channels × 8 positions × 2 bytes = 128 bytes per chunk.\n     * NOTE(review): esp_nn_conv_s8_1x1() keeps every 8-channel group in scratch at\n     * once (in_channels / 8 chunks of 128 bytes each), so this fixed size only\n     * covers a single chunk - confirm that callers size the scratch accordingly. */\n    return 128 + 64; /* transpose + alignment */\n}\n\n/*\n * Transpose 8 spatial positions × 8 channels from int8 to int16 with offset.\n * C fallback for when input address is not 8-byte aligned.\n */\nstatic inline void transpose_8x8_s16_c(const int8_t *input, int stride,\n                                         int32_t input_offset, int16_t *out_buf)\n{\n    for (int ch = 0; ch < 8; ch++) {\n        for (int pos = 0; pos < 8; pos++) {\n            out_buf[ch * 8 + pos] = (int16_t)(input[pos * stride + ch] + input_offset);\n        }\n    }\n}\n\n/*\n * SIMD transpose: 8 positions × 8 channels → channel-major int16 with offset.\n * Uses vzip.8/16/32 chain (same as original .S transpose, verified correct).\n *\n * Input: 8 consecutive spatial positions, each `stride` bytes apart.\n *        Input address MUST be 8-byte aligned.\n * Output: int16 buffer [ch0: pos0..pos7, ch1: pos0..pos7, ...] 
(16-byte aligned)\n */\nstatic inline void transpose_8x8_s16_simd(const int8_t *input, int stride,\n                                            int16_t offset16, int16_t *out_buf)\n{\n    const int8_t *p = input;\n    int16_t *out = out_buf;\n    int16_t *off_ptr = &offset16;\n\n    __asm__ volatile(\n        /* Load input_offset broadcast to all 8 int16 lanes */\n        \"ee.vldbc.16 q5, %[off]\\n\"\n        /* Zero register for sign extension comparisons */\n        \"ee.zero.q q7\\n\"\n\n        /* Load 8 positions × 8 channels into q0-q3 using paired l/h loads.\n         * Each vld.l.64.xp loads 8 bytes (1 position) into low half, advances by stride.\n         * Each vld.h.64.xp loads 8 bytes into high half, advances by stride.\n         * Result: q0=[pos0|pos2], q1=[pos1|pos3], q2=[pos4|pos6], q3=[pos5|pos7] */\n        \"ee.vld.l.64.xp q0, %[p], %[s]\\n\"\n        \"ee.vld.l.64.xp q1, %[p], %[s]\\n\"\n        \"ee.vld.h.64.xp q0, %[p], %[s]\\n\"\n        \"ee.vld.h.64.xp q1, %[p], %[s]\\n\"\n        \"ee.vld.l.64.xp q2, %[p], %[s]\\n\"\n        \"ee.vzip.8 q0, q1\\n\"\n        \"ee.vld.l.64.xp q3, %[p], %[s]\\n\"\n        \"ee.vld.h.64.xp q2, %[p], %[s]\\n\"\n        \"ee.vld.h.64.ip q3, %[p], 0\\n\"\n        \"ee.vzip.16 q0, q1\\n\"\n        \"ee.vzip.8 q2, q3\\n\"\n        \"ee.vzip.16 q2, q3\\n\"\n        \"ee.vzip.32 q0, q2\\n\"\n\n        /* First 4 channels: sign-extend q0→(q0,q6), q2→(q2,q4), add offset, store */\n        \"ee.vcmp.lt.s8 q4, q2, q7\\n\"\n        \"ee.vzip.8 q2, q4\\n\"\n        \"ee.vcmp.lt.s8 q6, q0, q7\\n\"\n        \"ee.vzip.8 q0, q6\\n\"\n        \"ee.vadds.s16 q0, q0, q5\\n\"\n        \"ee.vst.128.ip q0, %[out], 16\\n\"\n        \"ee.vadds.s16 q6, q6, q5\\n\"\n        \"ee.vst.128.ip q6, %[out], 16\\n\"\n        \"ee.vadds.s16 q2, q2, q5\\n\"\n        \"ee.vst.128.ip q2, %[out], 16\\n\"\n        \"ee.vadds.s16 q4, q4, q5\\n\"\n        \"ee.vst.128.ip q4, %[out], 16\\n\"\n\n        /* Last 4 channels: sign-extend q1→(q1,q6), 
q3→(q3,q4), add offset, store */\n        \"ee.vzip.32 q1, q3\\n\"\n        \"ee.vcmp.lt.s8 q4, q3, q7\\n\"\n        \"ee.vzip.8 q3, q4\\n\"\n        \"ee.vcmp.lt.s8 q6, q1, q7\\n\"\n        \"ee.vzip.8 q1, q6\\n\"\n        \"ee.vadds.s16 q1, q1, q5\\n\"\n        \"ee.vst.128.ip q1, %[out], 16\\n\"\n        \"ee.vadds.s16 q6, q6, q5\\n\"\n        \"ee.vst.128.ip q6, %[out], 16\\n\"\n        \"ee.vadds.s16 q3, q3, q5\\n\"\n        \"ee.vst.128.ip q3, %[out], 16\\n\"\n        \"ee.vadds.s16 q4, q4, q5\\n\"\n        \"ee.vst.128.ip q4, %[out], 16\\n\"\n\n        : [p] \"+r\" (p), [out] \"+r\" (out), [off] \"+r\" (off_ptr)\n        : [s] \"r\" (stride)\n        : \"memory\"\n    );\n}\n\n/*\n * MAC 8 filter channels against 8 positions using QACC.\n * data_buf: [ch0: 8 int16, ch1: 8 int16, ...] = 128 bytes, 16-byte aligned\n * filter: 8 int8 values, sign-extended to int16 internally\n * Accumulates into QACC lanes 0-7 (must be zeroed before first call per oc)\n *\n * NOTE: filter pointer may not be 8-byte aligned, so we copy to an aligned\n * local buffer before using ee.vld.l.64.ip (which ignores unaligned address bits).\n */\nstatic inline void mac_8pos_8ch_simd(const int16_t *data_buf, const int8_t *filter)\n{\n    /* Copy filter to aligned buffer — ee.vld.l.64.ip requires 8-byte alignment */\n    int8_t __attribute__((aligned(16))) f_aligned[16];\n    memcpy(f_aligned, filter, 8);\n\n    const int16_t *dp = data_buf;\n    const int8_t *fp = f_aligned;\n    __asm__ volatile(\n        /* Sign-extend filter: load 8 int8 → 8 int16 in q7 */\n        \"ee.zero.q q5\\n\"\n        \"ee.vld.l.64.ip q7, %[f], 0\\n\"\n        /* Pre-load first two data chunks during sign extension */\n        \"ee.vld.128.ip q0, %[d], 16\\n\"\n        \"ee.vld.128.ip q1, %[d], 16\\n\"\n        \"ee.vcmp.lt.s8 q6, q7, q5\\n\"\n        \"ee.vzip.8 q7, q6\\n\"\n\n        /* Pipelined: MAC current + load next in one instruction */\n        \"ee.vsmulas.s16.qacc.ld.incp q2, %[d], q0, q7, 0\\n\"\n  
      \"ee.vsmulas.s16.qacc.ld.incp q3, %[d], q1, q7, 1\\n\"\n        \"ee.vsmulas.s16.qacc.ld.incp q0, %[d], q2, q7, 2\\n\"\n        \"ee.vsmulas.s16.qacc.ld.incp q1, %[d], q3, q7, 3\\n\"\n        \"ee.vsmulas.s16.qacc.ld.incp q2, %[d], q0, q7, 4\\n\"\n        \"ee.vsmulas.s16.qacc.ld.incp q3, %[d], q1, q7, 5\\n\"\n        /* Last two: plain MAC, no more data to load */\n        \"ee.vsmulas.s16.qacc q2, q7, 6\\n\"\n        \"ee.vsmulas.s16.qacc q3, q7, 7\\n\"\n        : [d] \"+r\" (dp), [f] \"+r\" (fp)\n        :\n        : \"memory\"\n    );\n}\n\nvoid esp_nn_conv_s8_1x1(const int8_t *input,\n                         const uint16_t input_wd,\n                         const uint16_t input_ht,\n                         const uint16_t in_channels,\n                         const int32_t input_offset,\n                         const int8_t *filter_data,\n                         const int32_t *bias,\n                         int8_t *out_data,\n                         const uint16_t out_channels,\n                         const int32_t out_offset,\n                         const int32_t *out_shift,\n                         const int32_t *out_mult,\n                         const int32_t activation_min,\n                         const int32_t activation_max,\n                         void *scratch)\n{\n    const int size = input_wd * input_ht;\n    const int ch8 = in_channels / 8;\n\n    /* SIMD transpose requires 8-byte aligned input; check once */\n    const int use_simd_transpose = (in_channels % 8 == 0) &&\n                                    (((uintptr_t)input & 7) == 0);\n    const int16_t offset16 = (int16_t)input_offset;\n\n    /* Use scratch buffer for transpose data — holds ALL channel groups at once.\n     * Layout: [cg0: 8 int16 × 8 pos, cg1: 8 int16 × 8 pos, ...] = ch8 × 128 bytes.\n     * Aligned to 16 bytes for SIMD loads. 
*/\n    int16_t *tbuf = (int16_t *)((uintptr_t)((int8_t *)scratch + 15) & ~15);\n\n    int pos = 0;\n    for (; pos + 7 < size; pos += 8) {\n        const int8_t *in_base = input + pos * in_channels;\n\n        /* Transpose ALL channel groups ONCE per position batch.\n         * This is the key optimization — reuse transposed data across all out_channels. */\n        for (int cg = 0; cg < ch8; cg++) {\n            int16_t *cg_buf = tbuf + cg * 64; /* 64 int16 per channel group */\n            if (use_simd_transpose) {\n                transpose_8x8_s16_simd(in_base + cg * 8, in_channels,\n                                        offset16, cg_buf);\n            } else {\n                transpose_8x8_s16_c(in_base + cg * 8, in_channels,\n                                     input_offset, cg_buf);\n            }\n        }\n        __asm__ volatile(\"\" ::: \"memory\");\n\n        for (int oc = 0; oc < out_channels; oc++) {\n            const int8_t *filt = filter_data + oc * in_channels;\n\n            /* MAC across all channel groups using pre-transposed data */\n            __asm__ volatile(\"ee.zero.qacc\");\n\n            for (int cg = 0; cg < ch8; cg++) {\n                mac_8pos_8ch_simd(tbuf + cg * 64, filt + cg * 8);\n            }\n\n            /* Extract QACC → 8 int32 values */\n            int32_t qacc[8];\n            {\n                int8_t __attribute__((aligned(16))) qraw[24];\n                int8_t *qp = qraw;\n\n                __asm__ volatile(\n                    \"ee.st.qacc_l.l.128.ip %[p], 16\\n\"\n                    \"ee.st.qacc_l.h.32.ip  %[p], -16\\n\"\n                    : [p] \"+r\" (qp) : : \"memory\"\n                );\n                qacc[0] = *(int32_t *)(qraw + 0);\n                qacc[1] = *(int32_t *)(qraw + 5);\n                qacc[2] = *(int32_t *)(qraw + 10);\n                qacc[3] = *(int32_t *)(qraw + 15);\n\n                qp = qraw;\n                __asm__ volatile(\n                    \"ee.st.qacc_h.l.128.ip 
%[p], 16\\n\"\n                    \"ee.st.qacc_h.h.32.ip  %[p], -16\\n\"\n                    : [p] \"+r\" (qp) : : \"memory\"\n                );\n                qacc[4] = *(int32_t *)(qraw + 0);\n                qacc[5] = *(int32_t *)(qraw + 5);\n                qacc[6] = *(int32_t *)(qraw + 10);\n                qacc[7] = *(int32_t *)(qraw + 15);\n            }\n\n            /* Remainder channels (scalar) */\n            for (int c = ch8 * 8; c < in_channels; c++) {\n                int16_t f = (int16_t)filt[c];\n                for (int p = 0; p < 8; p++) {\n                    qacc[p] += ((int32_t)in_base[p * in_channels + c] + input_offset) * f;\n                }\n            }\n\n            /* Bias + requant + store for 8 positions */\n            for (int p = 0; p < 8; p++) {\n                int32_t acc = qacc[p];\n                if (bias) acc += bias[oc];\n                acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]);\n                acc += out_offset;\n                acc = max(acc, activation_min);\n                acc = min(acc, activation_max);\n                out_data[(pos + p) * out_channels + oc] = (int8_t)acc;\n            }\n        }\n    }\n\n    /* Leftover positions (< 8 remaining) */\n    for (; pos < size; pos++) {\n        const int8_t *in_ptr = input + pos * in_channels;\n        for (int oc = 0; oc < out_channels; oc++) {\n            const int8_t *filt = filter_data + oc * in_channels;\n            int32_t acc = 0;\n            int c = 0;\n            for (; c + 2 < in_channels; c += 3) {\n                acc += ((int32_t)in_ptr[c]     + input_offset) * (int32_t)filt[c];\n                acc += ((int32_t)in_ptr[c + 1] + input_offset) * (int32_t)filt[c + 1];\n                acc += ((int32_t)in_ptr[c + 2] + input_offset) * (int32_t)filt[c + 2];\n            }\n            for (; c < in_channels; c++) {\n                acc += ((int32_t)in_ptr[c] + input_offset) * (int32_t)filt[c];\n            }\n      
      if (bias) acc += bias[oc];\n            acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]);\n            acc += out_offset;\n            acc = max(acc, activation_min);\n            acc = min(acc, activation_max);\n            out_data[pos * out_channels + oc] = (int8_t)acc;\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_3x3_opt_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * Optimized 3x3 convolution for ESP32-S3.\n *\n * Key optimization vs the general aligned asm:\n * The general asm reloads input for each output channel (128× per pixel).\n * This version pre-loads the 3x3 input window into scratch (9 rows × in_ch bytes),\n * then iterates output channels with the input in L1 cache.\n *\n * For Conv[11] (26×26×128→12×12×128, 3×3 s2):\n * - Input window: 3 × 3 × 128 = 1,152 bytes (fits in L1)\n * - Filter per OC: 3 × 3 × 128 = 1,152 bytes\n * - Total for all 128 OC: 147,456 bytes (cycles through L1)\n * - Input loaded once vs 128× in the general asm\n */\n\n#include <stdint.h>\n#include <string.h>\n#include <esp_nn_defs.h>\n#include <common_functions.h>\n\n/*\n * Check if a conv can use the optimized 3x3 path.\n * Requirements:\n * - filter_wd == 3 && filter_ht == 3\n * - in_channels >= 16 (SIMD worth it)\n * - in_channels % 16 == 0 (aligned for ee.vld.128)\n */\nint esp_nn_conv_s8_3x3_can_use(int filter_wd, int filter_ht,\n                                int in_channels)\n{\n    return (filter_wd == 3 && filter_ht == 3 &&\n            in_channels >= 16 && (in_channels % 16) == 0);\n}\n\n/*\n * Scratch size for the 3x3 optimized path:\n * - im2col buffer: 3 × 3 × in_channels bytes (input window)\n * - corrections: out_channels × 4 bytes\n */\nint esp_nn_conv_s8_3x3_scratch_size(int in_channels, int out_channels)\n{\n    int im2col = 9 * in_channels;          /* 3×3 input window */\n    int corrections = out_channels * 4;    /* bias + filter_sum * offset */\n    return im2col + corrections + 32;      /* + alignment */\n}\n\n/*\n * 3x3 convolution: im2col per pixel, then dot product per output channel.\n * Uses ACCX dot product (ee.vmulas.s8.accx) for the 3×3×in_ch window.\n */\nvoid esp_nn_conv_s8_3x3_opt(const int8_t *input,\n                             const uint16_t input_wd,\n           
                  const uint16_t input_ht,\n                             const uint16_t in_channels,\n                             const int32_t input_offset,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const int8_t *filter_data,\n                             const int32_t *bias,\n                             int8_t *out_data,\n                             const uint16_t out_wd,\n                             const uint16_t out_ht,\n                             const uint16_t out_channels,\n                             const int32_t out_offset,\n                             const int32_t *out_shift,\n                             const int32_t *out_mult,\n                             const int32_t activation_min,\n                             const int32_t activation_max,\n                             void *scratch)\n{\n    const int window_len = 9 * in_channels; /* 3×3 window */\n    const int window_len_aligned = (window_len + 15) & ~15;\n\n    /* Scratch layout: [im2col_buf | corrections] */\n    int8_t *im2col_buf = (int8_t *)((uintptr_t)((int8_t *)scratch + 15) & ~15);\n    int32_t *corrections = (int32_t *)(im2col_buf + window_len_aligned);\n\n    /* Pre-compute corrections: filter_sum * input_offset + bias */\n    const int8_t *f_ptr = filter_data;\n    for (int oc = 0; oc < out_channels; oc++) {\n        int32_t filter_sum = 0;\n        for (int i = 0; i < window_len; i++) {\n            filter_sum += f_ptr[i];\n        }\n        corrections[oc] = filter_sum * input_offset;\n        if (bias) corrections[oc] += bias[oc];\n        f_ptr += window_len;\n    }\n\n    /* Zero-pad the tail of im2col buffer for aligned SIMD reads */\n    memset(im2col_buf + window_len, 0, window_len_aligned - window_len);\n\n    const int in_row_stride = input_wd * in_channels;\n\n    for (int out_y = 0; out_y < out_ht; out_y++) {\n        for (int out_x = 0; out_x < out_wd; out_x++) 
{\n            /* Phase 1: Build im2col for this output pixel (one-time per pixel) */\n            const int in_y = out_y * stride_ht;\n            const int in_x = out_x * stride_wd;\n            int8_t *dst = im2col_buf;\n            for (int fy = 0; fy < 3; fy++) {\n                const int8_t *src = input + (in_y + fy) * in_row_stride + in_x * in_channels;\n                memcpy(dst, src, 3 * in_channels);\n                dst += 3 * in_channels;\n            }\n\n            /* Phase 2: Dot product against each output channel's filter */\n            const int8_t *filter_ptr = filter_data;\n            for (int oc = 0; oc < out_channels; oc++) {\n                /* ACCX dot product: im2col_buf · filter_ptr */\n                int32_t acc = 0;\n\n                /* Use SIMD dot product via ACCX */\n                const int8_t *a = im2col_buf;\n                const int8_t *b = filter_ptr;\n                int remaining = window_len_aligned;\n\n                __asm__ volatile(\"ee.zero.accx\");\n\n                /* Primed unaligned load for input */\n                __asm__ volatile(\n                    \"ee.ld.128.usar.ip q0, %[a], 16\\n\"\n                    : [a] \"+r\" (a) : : \"memory\"\n                );\n\n                while (remaining >= 32) {\n                    __asm__ volatile(\n                        \"ee.vld.128.ip q4, %[a], 16\\n\"\n                        \"ee.vmulas.s8.accx.ld.ip.qup q3, %[b], 16, q2, q1, q0, q4\\n\"\n                        \"ee.vld.128.ip q2, %[a], 16\\n\"\n                        \"ee.vmulas.s8.accx.ld.ip.qup q1, %[b], 16, q0, q3, q4, q2\\n\"\n                        \"ee.orq q0, q2, q2\\n\"\n                        \"ee.orq q2, q4, q4\\n\"\n                        : [a] \"+r\" (a), [b] \"+r\" (b)\n                        : : \"memory\"\n                    );\n                    remaining -= 32;\n                }\n                if (remaining >= 16) {\n                    __asm__ volatile(\n                    
    \"ee.vmulas.s8.accx.ld.ip q4, %[a], 16, q2, q1\\n\"\n                        \"ee.src.q.ld.ip q1, %[b], 16, q0, q4\\n\"\n                        \"ee.orq q2, q0, q0\\n\"\n                        : [a] \"+r\" (a), [b] \"+r\" (b)\n                        : : \"memory\"\n                    );\n                    remaining -= 16;\n                }\n                __asm__ volatile(\n                    \"ee.vmulas.s8.accx q2, q1\\n\"\n                    \"movi.n %[tmp], 0\\n\"\n                    \"ee.srs.accx %[acc], %[tmp], 0\\n\"\n                    : [acc] \"=r\" (acc), [tmp] \"=r\" (remaining)\n                    : : \"memory\"\n                );\n\n                acc += corrections[oc];\n                acc = esp_nn_multiply_by_quantized_mult(acc, out_mult[oc], out_shift[oc]);\n                acc += out_offset;\n                acc = max(acc, activation_min);\n                acc = min(acc, activation_max);\n                *out_data++ = (int8_t)acc;\n\n                filter_ptr += window_len;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_filter_aligned_input_padded_esp32s3.S",
    "content": "//\n// SPDX-FileCopyrightText: 2023-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n\n//\n// Constraints used by this function are:\n//     1. pad_wd and pad_ht are 0. For versions needing padding we do this\n//        explicitly\n//     2. All the filter rows are aligned to a 16-byte boundary. To make sure\n//        this is indeed the case, for filter rows (filter_wd * channels) not\n//        a multiple of 16, we add zeros to pad them up to the next 16-byte\n//        boundary.\n//\n//     The optimized kernel assumes this and advances over each filter row\n//     using the following size: ((filter_wd * input_ch) + 15) & ~15.\n\n\t.text\n\n.literal_position\n\t.literal .LC1, 1073741824\n\n    # Program Unit: esp_nn_conv_s8_filter_aligned_input_padded_esp32s3\n\t.type\tesp_nn_conv_s8_filter_aligned_input_padded_esp32s3, @function\n\t.align\t4\n\t.global\tesp_nn_conv_s8_filter_aligned_input_padded_esp32s3\n // registers:\n // a2: const int8_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t in_ch\n // a6: const uint16_t input_offset\n // a7: const uint16_t stride_wd\n\n // on stack:\n // const uint16_t stride_ht\t: 80\n // const int8_t *filter_data\t: 84\n // const uint16_t filter_wd\t: 88\n // const uint16_t filter_ht\t: 92\n // const int32_t *bias\t\t\t: 96\n // int8_t *out_data\t\t\t: 100\n // const uint16_t out_wd\t\t: 104\n // const uint16_t out_ht\t\t: 108\n // const uint16_t out_channels\t: 112\n // const int32_t out_offset\t: 116\n // const int32_t *out_shift\t: 120\n // const int32_t *out_mult\t\t: 124\n // const int32_t activation_min: 128\n // const int32_t activation_max: 132\n // void *scratch_buffer: 136\n\nesp_nn_conv_s8_filter_aligned_input_padded_esp32s3:\n\tentry\tsp, 80\n\ts32i.n  a2, sp, 40  \t# input_data\n\tmov\t\ta11, a6\t\t\t# input_offset\n\tl16ui\ta2, sp, 88  \t# filter_wd\n\tl32i\ta8, sp, 100\t\t# out_data\n\tl16ui\ta6, sp, 80\t\t# stride_ht\n\tmov.n\ta15, a5\n\n\tmull\ta4, 
a2, a15\t\t# filter_row_sz\n\ts32i.n\ta8, sp, 24\t\t# out_data_ptr\n\tmovi.n\ta9, 0\n\ts32i.n\ta9, sp, 36      # out_y\n\n\taddi.n\ta4, a4, 15\t\t# to round the size up\n\tsrli\ta2, a4, 4\t\t# (filter_row_sz) >> 4\n\tslli\ta12, a2, 4\t\t# ((filter_row_sz) >> 4) << 4\n\n\tmull\ta4, a6, a3\t\t# stride_ht * input_wd\n\tmull\ta5, a3, a15\t\t# input_wd * in_ch\n\tl32i.n\ta10, sp, 112     # out_ch\n\n\tmull \ta9, a7, a15\t\t# stride_wd * in_ch\n\tmull \ta4, a4, a15\t\t# (stride_ht * input_wd) * in_ch\n\n\tslli\ta3, a10, 2\t\t# out_ch * 4\n\n\ts32i.n\ta3, sp, 32\t\t# out_ch * 4\n\ts32i.n\ta5, sp, 12\t\t# input_wd * in_ch\n\ts32i.n\ta9, sp, 52\t\t# stride_wd * in_ch\n\ts32i\ta4, sp, 56\t\t# (stride_ht * input_wd) * in_ch\n\n\tl32i.n\ta3, sp, 92   \t# filter_ht\n\tl32i\ta13, sp, 136\t# scratch_buf\n\tl32i\ta5, sp, 84\t\t# filter_data\n\tmull    a4, a12, a3\t\t# (filter_wd * filter_ht * in_ch)\n\tsrai\ta4, a4, 1\n\taddx4\ta10, a10, a13   # scratch_buf + 4 * out_ch\n\tl32i\ta3, sp, 96\n\t// Skip filter sum accumulation if input_offset is 0 (common in TFLite)\n\t// In that case, correction = just bias (pre-filled by C wrapper)\n\tbeqz\ta11, .L_skip_acc_loop\n\t// accumulate filter values per channel into scratch buffer\n.L_acc_out_channel_loop:\n\tmovi.n\ta9, 0\t// acc\n\tloop\ta4, .L_acc_filter_size_loop\n\tl8ui\ta14, a5, 0\n\tl8ui\ta7, a5, 1\n\taddi.n\ta5, a5, 2\n\tsext\ta14, a14, 7\n\tsext\ta7, a7, 7\n\tadd\t\ta9, a9, a14\n\tadd\t\ta9, a9, a7\n\t.L_acc_filter_size_loop:\n\n\t// multiply by offset, add bias and store the acc value per channel\n\tmull \ta9, a9, a11\n\tbeqz.n \ta3, .L_skip_bias\n\tl32i\ta8, a3, 0\n\taddi\ta3, a3, 4\t// this will remain 0 if bias not present\n\tadd \ta9, a9, a8\n.L_skip_bias:\n\ts32i\ta9, a13, 0\n\taddi.n \ta13, a13, 4\n\tblt    \ta13, a10, .L_acc_out_channel_loop\n\n\tj\t\t.L_acc_done\n\n.L_skip_acc_loop:\n\t// input_offset == 0: correction = bias only\n\t// Fill scratch_buf with bias values\n\tbeqz.n\ta3, 
.L_skip_acc_zero_bias\n.L_copy_bias_loop:\n\tl32i\ta8, a3, 0\n\ts32i\ta8, a13, 0\n\taddi\ta3, a3, 4\n\taddi.n\ta13, a13, 4\n\tblt\t\ta13, a10, .L_copy_bias_loop\n\tj\t\t.L_acc_done\n\n.L_skip_acc_zero_bias:\n\t// No bias either: zero the scratch buffer\n.L_zero_scratch_loop:\n\tmovi.n\ta8, 0\n\ts32i\ta8, a13, 0\n\taddi.n\ta13, a13, 4\n\tblt\t\ta13, a10, .L_zero_scratch_loop\n\n.L_acc_done:\n\tmovi.n\ta4, 0\t\t\t# 0\n\n.L_height_loop:\n\tl32i.n\ta8, sp, 40  \t# in_row_ptr\n\tmovi.n\ta9, 0\n\tl32i.n\ta10, sp, 104\t# out_wd\n\ts32i.n\ta8, sp, 28  \t# input_ptr\n\ts32i.n\ta9, sp, 44      # out_x\n\n.L_width_loop:\n\tmovi.n\ta9, 0\n\tl32i\ta5, sp, 84\t\t# filter_data\n\ts32i.n\ta9, sp, 20\n\tl32i\ta3, sp, 136\t\t# scratch_buf\n\n.L_out_ch_loop:\n\tmovi.n\ta6, 0\n\tl32i.n\ta9, sp, 28  \t# input_ptr\n\tmov.n\ta10, a6\n\n.L_filter_ht_loop:\n\tadd.n\ta8, a5, a12\n\tmov.n\ta13, a9\n\n\tee.zero.accx\n\tee.ld.128.usar.ip \tq0, a13, 16\n\tee.vld.128.ip \t\tq4, a13, 16\n\tee.vld.128.ip \t\tq1, a5, 16\n\n\tsub             a15, a8, a5         // row_len - 16\n\textui           a14, a15, 4, 1      // if multiple of 16 and not 32\n\tsrai            a15, a15, 5         // multiples of 32\n\tee.src.q.qup \tq2, q0, q4\n\tbeqz\ta15, .L_vector_32_loop_end\n\n\tloop\ta15, .L_vector_32_loop_end\n\n\tee.vld.128.ip \t\t\t\t\tq4, a13, 16\n\tee.vmulas.s8.accx.ld.ip.qup \tq3, a5, 16, q2, q1, q0, q4\n\tee.vld.128.ip \t\t\t\t\tq2, a13, 16\n\tee.vmulas.s8.accx.ld.ip.qup \tq1, a5, 16, q0, q3, q4, q2\n\tee.orq \t\t\t\t\t\t\tq0, q2, q2\n\tee.orq \t\t\t\t\t\t\tq2, q4, q4\n\n.L_vector_32_loop_end:\n\tbeqz\ta14, .L_vector_loop_end\n\tee.vmulas.s8.accx.ld.ip \t\tq4, a13, 16, q2, q1\n\tee.src.q.ld.ip\t\t\t\t\tq1, a5, 16, q0, q4\n\tee.orq \t\t\t\t\t\t\tq2, q0, q0\n\n.L_vector_loop_end:\n\tee.vmulas.s8.accx \tq2, q1\n\taddi\ta13, a13, -16\t// since we incremented by 16 too much\n\tmovi \ta15, 0\n\tee.srs.accx  \ta14, a15, 0\n\n\tmov.n\ta5, a8\n\tadd.n \t\t\ta6, a6, a14\n.L7:\n\tl32i.n\ta8, sp, 12\t\t# 
input_wd * in_ch\n\tl32i.n\ta2, sp, 92   \t# filter_ht\n\taddi.n\ta10, a10, 1\t\t# filter_y_idx\n\tadd.n\ta9, a9, a8\n\tblt\t\ta10, a2, .L_filter_ht_loop\n.L9:\n\tl32i    a7, a3, 0\t\t# load input_offset acc\n\taddi    a3, a3, 4\t\t# increment offset acc ptr\n\tl32i.n\ta8, sp, 20\n\tadd.n\ta6, a6, a7\t\t# add input_offset accumulation\n\n.L_multiply_by_quant_mult:\n\tl32i\ta10, sp, 120\n\tl32i\ta9, sp, 124\n\tadd.n\ta2, a10, a8\n\tl32i.n\ta2, a2, 0\n\tadd.n\ta7, a9, a8\n\tl32i.n\ta7, a7, 0\n\tmax\t\ta8, a2, a4\n\tssl\t\ta8\n\tsll\t\ta6, a6\n\tmull\ta9, a6, a7\n\tl32r\ta10, .LC1\n\tsub\t\ta2, a8, a2\n\tadd.n\ta8, a9, a10\n\tmulsh\ta6, a6, a7\n\tmovi.n\ta7, 1\n\tbltu\ta8, a9, .L13\n\tmovi.n\ta7, 0\n\n.L13:\n\tadd.n\ta6, a7, a6\n\tslli\ta6, a6, 1\n\textui\ta8, a8, 31, 1\n\tor\t\ta6, a6, a8\n\tbeqz.n\ta2, .L_skip_div_by_pow_of_2\n\taddi.n\ta7, a2, -1\n\tmovi.n\ta9, 1\n\textui\ta8, a6, 31, 1\n\tssl\t\ta7\n\tsll\t\ta7, a9\n\tsub\t\ta7, a7, a8\n\tadd.n\ta6, a7, a6\n\tssr\t\ta2\n\tsra\t\ta6, a6\n.L_skip_div_by_pow_of_2:\n\tl32i\ta10, sp, 116\n\tl32i\ta8, sp, 128\n\tadd.n\ta2, a10, a6\n\tl32i\ta9, sp, 132\n\tl32i.n\ta10, sp, 24\t\t# out_data_ptr\n\tmax\t\ta2, a2, a8\n\tmin\t\ta2, a2, a9\n\ts8i\t\ta2, a10, 0\n\tl32i.n\ta2, sp, 20\n\taddi.n\ta10, a10, 1\n\taddi.n\ta2, a2, 4\n\tl32i.n\ta6, sp, 32\n\ts32i.n\ta2, sp, 20\n\ts32i.n\ta10, sp, 24\t\t# out_data_ptr\n\tbne\t\ta6, a2, .L_out_ch_loop\n\n.L4:\n\tl32i.n\ta5, sp, 44      # out_x\n\tl32i.n\ta6, sp, 28  \t# input_ptr (was stored by height loop)\n\tl32i.n\ta8, sp, 52\t\t# stride_wd * in_ch\n\taddi.n\ta5, a5, 1\n\tadd.n\ta6, a6, a8\t\t# input_ptr + stride_wd * in_ch\n\tl32i.n\ta9, sp, 104 \t# out_wd\n\ts32i.n\ta5, sp, 44      # out_x\n\ts32i.n\ta6, sp, 28  \t# input_ptr\n\tbne\t\ta9, a5, .L_width_loop\n\n\tl32i.n\ta10, sp, 36     # out_y\n\tl32i.n\ta2, sp, 40  \t# in_row_ptr\n\tl32i\ta5, sp, 56\t\t# (stride_ht * input_wd) * in_ch\n\tl32i.n\ta6, sp, 108\t\t# out_ht\n\taddi.n\ta10, a10, 1\n\tadd.n\ta2, a2, a5\t\t# 
in_row_ptr\n\ts32i.n\ta10, sp, 36     # out_y\n\ts32i.n\ta2, sp, 40  \t# in_row_ptr\n\tblt\t\ta10, a6, .L_height_loop\n\t// end outer (height) loop\n\tretw.n\n\n\t.size\tesp_nn_conv_s8_filter_aligned_input_padded_esp32s3, .-esp_nn_conv_s8_filter_aligned_input_padded_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_conv_s8_mult8_1x1_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n    .literal  .nudge_val, 1073741824\n\n    # Program Unit: esp_nn_conv_s8_mult8_1x1_esp32s3\n    #\n    # Requirements:\n    #   - in_channels must be a multiple of 8\n    #   - filter_data must be 8-byte aligned (ee.vld.l.64.ip ignores lower 3 address bits)\n    #   - input_data must be 8-byte aligned (ee.vld.l/h.64.xp same alignment requirement)\n    #   - buffer (scratch) must be 16-byte aligned\n    #\n    # If filter is not aligned, use esp_nn_conv_s8_1x1() (C+inline asm) as fallback.\n    #\n    .type   esp_nn_conv_s8_mult8_1x1_esp32s3, @function\n    .align   4\n    .global esp_nn_conv_s8_mult8_1x1_esp32s3\n\nesp_nn_conv_s8_mult8_1x1_esp32s3:  # 0xdbc\n    # scratch_buf = 0   // to store qacc regs need 36 bytes\n    # gra_spill_temp_164 = 36, channel itr, (in_channels - 1) >> 3\n    # gra_spill_temp_165 = 40, i_out\n    # gra_spill_temp_166 = 44, in_channels\n    # gra_spill_temp_167 = 48, in_channels/8 - 1\n    # gra_spill_temp_168 = 52, in_channels-7\n    # gra_spill_temp_169 = 56, input\n    # gra_spill_temp_170 = 60, filter_data\n    # gra_spill_temp_171 = 64, input_offset\n    # gra_spill_temp_172 = 68, input_ptr\n    # gra_spill_temp_173 = 72, bias\n    # gra_spill_temp_174 = 76, in_channels*8\n    # gra_spill_temp_175 = 80, size-7\n    # gra_spill_temp_176 = 84, size\n\n // 
registers:\n // a2: int8_t *input_data\n // a3: uint16_t input_wd\n // a4: uint16_t input_ht\n // a5: uint16_t in_channels\n // a6: int32_t input_offset\n // a7: int16_t *filter_data\n\n // on stack:\n // int32_t *bias           // 160\n // int8_t *out_data        // 164\n // uint16_t out_wd         // 168\n // uint16_t out_ht         // 172\n // uint16_t out_channels   // 176\n // int32_t out_offset      // 180\n // int32_t *out_shift      // 184\n // int32_t *out_mult       // 188\n // int32_t activation_min  // 192\n // int32_t activation_max  // 196\n // void *buffer // tmp buf // 200\n\n    entry   a1,160                      #\n    s32i    a5,a1,44                    # [0]  gra_spill_temp_166, in_channels\n    s32i    a6,a1,64                    # [2]  id:619 input_offset+0x0\n    s32i    a7,a1,60                    # [1]  gra_spill_temp_170, filter_data\n    mul16u  a8,a3,a4                    # [3]  size = input_wd * input_ht;\n    s32i    a2,a1,56                    # [0]  gra_spill_temp_169, input\n    l32i    a4,a1,164                   # [1]  id:624 out_data+0x0\n    mov.n   a3,a1                       # [52]  scratch_buf\n\n    s32i    a8,a1,84                    # [4]  gra_spill_temp_176, size\n    blti    a8,8,.prepare_leftover      # [5] // process remaining lines one by one\n    addi    a9,a8,-7                    # [32]\n    s32i    a9,a1,80                    # [33]  gra_spill_temp_175, size-7\n\n    s32i    a2,a1,68                    # [2]  gra_spill_temp_172 , input_ptr\n    srai    a15,a5,3                    # [7] `in_ch/8` loop_cnt\n    movi.n  a11,0                       # [10]\n    s32i    a11,a1,40                   # [11]  gra_spill_temp_165\n    addi    a15,a15,-1                  # [17]  `in_ch/8` loop_cnt - 1\n    s32i    a15,a1,48                   # [18]  gra_spill_temp_167\n    slli    a9,a5,3                     # [19]  in_channels*8\n    s32i    a9,a1,76                    # [20]  gra_spill_temp_174\n    addi    a15,a5,-7        
           # [31]\n    s32i    a15,a1,52                   # [34]  gra_spill_temp_168\n\n.outer_loop: // for (; i_out < size - 7; i_out += 8) {\n\n    l32i    a10,a1,200                  # [1]  gra_spill_temp_165, buffer\n    l32i.n  a11,a1,44                   # [1]  gra_spill_temp_166, input_channels\n    l32i.n  a8,a1,68                    # [2]  gra_spill_temp_172, input_ptr\n    srai    a9,a11,3                    # [7] `in_ch/8` loop_cnt for transpose loop\n\n    ee.zero.q   q7                      # [0]\n    addi        a12,a1,64               # [6]\n    ee.vldbc.16 q5,a12                  # [0*II+16]  id:638 input_offset\n\n    // load and transpose 8 lines of input 8xchannels,\n    // add input offset and store 16 bit data to tmp buffer\n    loopgtz a9,.transpose_loop_end  # [10]\n    mov.n                   a9,a8\n    ee.vld.l.64.xp          q0,a9,a11\n    ee.vld.l.64.xp          q1,a9,a11\n    ee.vld.h.64.xp          q0,a9,a11\n    ee.vld.h.64.xp          q1,a9,a11\n    ee.vld.l.64.xp          q2,a9,a11\n    ee.vzip.8               q0,q1\n    ee.vld.l.64.xp          q3,a9,a11\n    ee.vld.h.64.xp          q2,a9,a11\n    ee.vld.h.64.ip          q3,a9,0\n    ee.vzip.16              q0,q1\n    ee.vzip.8               q2,q3\n    ee.vzip.16              q2,q3\n    ee.vzip.32              q0,q2\n    ee.vcmp.lt.s8           q4,q2,q7\n    ee.vzip.8               q2,q4\n    ee.vcmp.lt.s8           q6,q0,q7\n    ee.vzip.8               q0,q6\n    ee.vadds.s16            q0,q0,q5\n    ee.vadds.s16.st.incp    q0,a10,q6,q6,q5\n    ee.vadds.s16.st.incp    q6,a10,q2,q2,q5\n    ee.vadds.s16.st.incp    q2,a10,q4,q4,q5\n    ee.vst.128.ip           q4,a10,16\n    ee.vzip.32              q1,q3\n    ee.vcmp.lt.s8           q4,q3,q7\n    ee.vzip.8               q3,q4\n    ee.vcmp.lt.s8           q6,q1,q7\n    ee.vzip.8               q1,q6\n    ee.vadds.s16            q1,q1,q5\n    ee.vadds.s16.st.incp    q1,a10,q6,q6,q5\n    ee.vadds.s16.st.incp    q6,a10,q3,q3,q5\n    
ee.vadds.s16.st.incp    q3,a10,q4,q4,q5\n    ee.vst.128.ip           q4,a10,16\n    addi.n                  a8,a8,8\n.transpose_loop_end:    # 0xeeb\n\n # 468          uint32_t bias_ptr = (uint32_t) bias;\n # 469          uint32_t filter_ptr = (uint32_t) (filter_data);\n # 470          const int32_t *out_mult_ptr = out_mult;\n # 471          const int32_t *out_shift_ptr = out_shift;\n    l32i    a6,a1,184                   # [0]  out_shift\n    l32i    a2,a1,188                   # [1]  out_mult\n    l32i    a5,a1,60                    # [2]  gra_spill_temp_170, filter\n    l32i    a9,a1,160                   # [3]  gra_spill_temp_170, bias\n # 472          for (int32_t out_ch_idx = 0; out_ch_idx < out_channels; out_ch_idx++) {\n    l16ui   a8,a1,176                   # [5]  id:620 out_channels+0x0\n    s32i    a9,a1,72                    # [5]  gra_spill_temp_173\n    blti    a8,1,.outer_ch_loop_end\n\n    movi.n  a7,0\n\n.out_ch_loop:   # 0xf3e\n    l32i    a8,a1,200                   # [4]  gra_spill_temp_165, buffer_ptr\n    ee.zero.qacc                        # [3]\n    ee.zero.q                       q5  #\n    l32i    a10,a1,52                   # [1]  gra_spill_temp_168, in_channels-7\n    l32i    a9,a1,48                    # [1]  gra_spill_temp_167, in_channels/8 - 1\n    # USAR-based filter load for unaligned access\n    ee.ld.128.usar.ip               q7,a5,16\n    ee.ld.128.usar.ip               q6,a5,0\n    addi                            a5,a5,-8     # net advance = 8\n    ee.src.q                        q7,q7,q6\n    ee.vld.128.ip                   q0,a8,16\n    ee.vld.128.ip                   q1,a8,16\n    ee.vcmp.lt.s8                   q6,q7,q5\n    ee.vzip.8                       q7,q6\n\n    ee.vsmulas.s16.qacc.ld.incp     q2,a8,q0,q7,0\n    ee.vsmulas.s16.qacc.ld.incp     q3,a8,q1,q7,1\n    ee.vsmulas.s16.qacc.ld.incp     q0,a8,q2,q7,2\n    ee.vsmulas.s16.qacc.ld.incp     q1,a8,q3,q7,3\n    ee.vsmulas.s16.qacc.ld.incp     q2,a8,q0,q7,4\n    
ee.vsmulas.s16.qacc.ld.incp     q3,a8,q1,q7,5\n    blti    a10,8,.inner_loop_end           # [16]\n\n    loopgtz a9,.inner_loop_end  # [3]\n\n    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,6   # [0*II+0]  id:657\n    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,7   # [0*II+1]  id:658\n    # USAR-based filter load for unaligned access\n    ee.ld.128.usar.ip           q7,a5,16\n    ee.ld.128.usar.ip           q6,a5,0\n    addi                        a5,a5,-8\n    ee.src.q                    q7,q7,q6\n    ee.vcmp.lt.s8               q6,q7,q5\n    ee.vzip.8                   q7,q6\n    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,0   # [0*II+4]  id:660\n    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,1   # [0*II+5]  id:661\n    ee.vsmulas.s16.qacc.ld.incp q0,a8,q2,q7,2   # [0*II+6]  id:662\n    ee.vsmulas.s16.qacc.ld.incp q1,a8,q3,q7,3   # [0*II+7]  id:663\n    ee.vsmulas.s16.qacc.ld.incp q2,a8,q0,q7,4   # [0*II+8]  id:664\n    ee.vsmulas.s16.qacc.ld.incp q3,a8,q1,q7,5   # [0*II+9]  id:665\n.inner_loop_end:    # 0xfaf\n\n    ee.vsmulas.s16.qacc q2,q7,6     # [2]\n    ee.vsmulas.s16.qacc q3,q7,7     # [3]\n\n # store qacc registers and re-arrange data for low 16 bits\n\n    ee.st.qacc_l.l.128.ip   a3,16       # [5]  id:668\n    ee.st.qacc_l.h.32.ip    a3,-16        # [6]  id:669\n    l32i.n     a10, a1, 0\n    l32i.n     a11, a1, 5\n    l32i.n     a12, a1, 10\n    l32i.n     a13, a1, 15\n    ee.movi.32.q    q0, a10, 0\n    ee.movi.32.q    q0, a11, 1\n    ee.movi.32.q    q0, a12, 2\n    ee.movi.32.q    q0, a13, 3\n\n    ee.st.qacc_h.l.128.ip   a3,16       # [5]  id:668\n    ee.st.qacc_h.h.32.ip    a3,-16        # [6]  id:669\n    l32i.n     a10, a1, 0\n    l32i.n     a11, a1, 5\n    l32i.n     a12, a1, 10\n    l32i.n     a13, a1, 15\n    ee.movi.32.q    q4, a10, 0\n    ee.movi.32.q    q4, a11, 1\n    ee.movi.32.q    q4, a12, 2\n    ee.movi.32.q    q4, a13, 3\n\n    l32i                a9,a1,160       # [17]  gra_spill_temp_170, bias\n    l32i                a10,a1,72       # [0]  
gra_spill_temp_173, bias_ptr\n\n # add bias\n    beqz.n          a9,.no_bias\n    ee.vldbc.32.ip  q6,a10,4\n    s32i            a10,a1,72           # [3]  gra_spill_temp_173, bias_ptr\n    ee.vadds.s32    q0,q0,q6            # [4]\n    ee.vadds.s32    q4,q4,q6            # [5]\n.no_bias:   # 0x102e\n\n    l32i.n  a11,a6,0                    # [1]  id:696\n    l32i.n  a10,a2,0                    # [3]  id:695\n    .global esp_nn_multiply_by_quantized_mult_asm_esp32s3\n    call8   esp_nn_multiply_by_quantized_mult_asm_esp32s3   # [4]  esp_nn_multiply_by_quantized_mult_asm_esp32s3\n\n    l32i.n  a10,a2,0                    # [0]  id:697, mult\n    l32i.n  a11,a6,0                    # [2]  id:698, shift\n    mv.qr   q5,q0\n    mv.qr   q0,q4\n    call8   esp_nn_multiply_by_quantized_mult_asm_esp32s3   # [5]  esp_nn_multiply_by_quantized_mult_asm_esp32s3\n\n    addi.n  a6,a6,4                     # out_shift_ptr++\n    addi.n  a2,a2,4                     # out_mult_ptr++\n    addi    a9,a1,180                   # [7]\n    addi    a10,a1,192                  # [5]\n    addi    a8,a1,196                   # [6]\n\n# load broadcast, activation and out_offset\n    ee.vldbc.32     q4,a9               # [14]  id:699 out_offset\n    ee.vldbc.32     q2,a10              # [11]  id:700 activation_min\n    ee.vldbc.32     q3,a8               # [12]  id:701 activation_max\n\n# add offset\n    ee.vadds.s32    q1,q0,q4            # [17]\n    ee.vadds.s32    q0,q5,q4            # [22]\n\n # activation\n    ee.vmin.s32     q1,q1,q3            # [19]\n    ee.vmax.s32     q1,q1,q2            # [21]\n    ee.vmin.s32     q0,q0,q3            # [23]\n    ee.vmax.s32     q0,q0,q2            # [24]\n\n    l16ui           a9,a1,176           # [33]  out_channels\n\n# unzip and store\n    ee.vunzip.16    q0,q1               # [25]\n    ee.vst.128.ip   q0,a3,0             # [26]  id:702, scratch_buf\n\n # a4 = out_data, out_channels = a1+176\n\n    l8ui    a14,a1,0                    # [27]\n    
l8ui    a11,a1,2                    # [30]  scratch_buf+2\n    add     a10,a4,a9\n    s8i     a14,a4,0                    # [28], out_data\n    s8i     a11,a10,0                   # [31], out_data + out_channels\n\n    l8ui    a14,a1,4                    # [32]  scratch_buf+4\n    l8ui    a11,a1,6                    # [37]  scratch_buf+6\n    add     a12,a10,a9\n    add     a10,a12,a9\n    s8i     a14,a12,0                   # [28]\n    s8i     a11,a10,0                   # [31]\n\n    l8ui    a14,a1,8                    # [41]  scratch_buf+8\n    l8ui    a11,a1,10                   # [47]  scratch_buf+10\n    add     a12,a10,a9\n    add     a10,a12,a9\n    s8i     a14,a12,0                   # [28]\n    s8i     a11,a10,0                   # [31]\n\n    l8ui    a14,a1,12                   # [51]  scratch_buf+12\n    l8ui    a11,a1,14                   # [55]  scratch_buf+14\n    add     a12,a10,a9\n    add     a10,a12,a9\n    s8i     a14,a12,0                   # [28]\n    s8i     a11,a10,0                   # [31]\n\n    addi.n  a4,a4,1                     # [29] out_data++;\n    addi.n  a7,a7,1\n    bne     a7,a9,.out_ch_loop\n\n.outer_ch_loop_end:\n\n    subx8   a11,a9,a9                   # (7 * out_channels);\n    l32i    a10,a1,76                   # [1]  gra_spill_temp_174, in_channels * 8\n    l32i    a15,a1,40                   # [4]  gra_spill_temp_165\n    l32i    a9,a1,68                    # [2]  gra_spill_temp_172\n    l32i    a8,a1,80                    # [0]  gra_spill_temp_175, size-7\n    add.n   a4,a4,a11                   # [5] out_data += (7 * out_channels);\n    addi.n  a15,a15,8\n    s32i    a15,a1,40                   # [7]  gra_spill_temp_165\n    add.n   a9,a9,a10                   # [8]\n    s32i    a9,a1,68                    # [9]  gra_spill_temp_172\n    blt     a15,a8,.outer_loop          # [10]\n\n # check if leftover\n    l32i    a15,a1,40\n    l32i    a13,a1,84                   # [1]  gra_spill_temp_176, size\n    l32i    a8,a1,44 
                   # [0]  gra_spill_temp_166, in_channels\n    bge     a15, a13, .return_function  # no leftover\n\n// This block below processes one input channel line at a time.\n.process_leftover:\n    l32i    a15,a1,40                   # [1]  gra_spill_temp_165, i_out\n    l32i    a14,a1,56                   # [2]  gra_spill_temp_169, input\n    mull    a15,a15,a8                  # [3] in_channels * i_out\n    addi.n  a8,a8,-1                    # [4] in_channels - 1\n    add.n   a14,a14,a15                 # [5] input_ptr = in_channels * i_out + input\n    srai    a8,a8,3                     # [6] iterations, (in_channels - 1) >> 3\n    s32i    a8,a1,36                    # [7]  gra_spill_temp_164, iterations\n    s32i    a14,a1,68                   # [8]  gra_spill_temp_172, in_channels * i_out + input\n    addi            a12,a1,64\n    ee.vldbc.16     q4,a12              # [8]  id:716 input_offset\n\n.leftover_outer_loop:\n\n    l32i    a15,a1,184                  # [0]  out_shift\n    l32i    a2,a1,188                   # [1]  out_mult\n    l32i    a8,a1,60                    # [3]  gra_spill_temp_170, filter_data\n    l32i    a5,a1,160                   # [0]  gra_spill_temp_170, bias\n    movi.n  a11,0                       # [2]\n\n.leftover_out_ch_loop:\n\n    ee.zero.qacc                            # [0]\n    ee.zero.q       q3                      # [1]\n    l32i.n          a9,a1,68                # [4]  gra_spill_temp_172, input_ptr\n    l32i            a10,a1,36               # [1]  gra_spill_temp_164, iterations, (in_channels - 1) >> 3\n    ee.vld.l.64.ip          q0,a9,8         # [7]  id:717, input\n    # USAR-based filter load for unaligned access\n    ee.ld.128.usar.ip       q1,a8,16\n    ee.ld.128.usar.ip       q7,a8,0\n    addi                    a8,a8,-8\n    ee.src.q                q1,q1,q7\n    ee.vcmp.lt.s8           q6,q0,q3\n    ee.vcmp.lt.s8           q7,q1,q3\n    ee.vzip.8               q0,q6\n    ee.vzip.8               q1,q7\n   
 ee.vadds.s16            q0,q0,q4  # [11]  id:718, add offset\n\n    loopgtz a10,.leftover_inner_loop_end        # [3]\n\n    ee.vmulas.s16.qacc          q0,q1  # mula(q0,q1)\n    ee.vld.l.64.ip              q0,a9,8         # load 8 input values\n    # USAR-based filter load for unaligned access\n    ee.ld.128.usar.ip           q1,a8,16\n    ee.ld.128.usar.ip           q7,a8,0\n    addi                        a8,a8,-8\n    ee.src.q                    q1,q1,q7\n    ee.vcmp.lt.s8               q2,q0,q3        # sign\n    ee.vcmp.lt.s8               q7,q1,q3\n    ee.vzip.8                   q0,q2           # 16 bit input\n    ee.vzip.8                   q1,q7           # 16 bit filter\n    ee.vadds.s16                q0,q0,q4        # add offset\n.leftover_inner_loop_end:   # 0x1262\n\n# re-arrange data from qacc in 32 bit q registers\n    ee.vmulas.s16.qacc      q0,q1       # [3]\n    ee.st.qacc_l.l.128.ip   a3,16       # [5]  id:722\n    ee.st.qacc_l.h.32.ip    a3,0        # [6]  id:723\n    l8ui    a10,a1,5                    # [11]  scratch_buf+5\n    l8ui    a12,a1,6                    # [10]  scratch_buf+6\n    l16ui   a14,a1,10                   # [8]  scratch_buf+10\n    l8ui    a9,a1,15                    # [7]  scratch_buf+15\n    l8ui    a13,a1,16                   # [9]  scratch_buf+16\n    s8i     a10,a1,2                    # [12]  scratch_buf+2\n    s8i     a12,a1,3                    # [13]  scratch_buf+3\n    s16i    a14,a1,4                    # [15]  scratch_buf+4\n    s8i     a9,a1,6                     # [16]  scratch_buf+6\n    s8i     a13,a1,7                    # [14]  scratch_buf+7\n\n    ee.st.qacc_h.l.128.ip   a3,16       # [17]  id:724\n    ee.st.qacc_h.h.32.ip    a3,-32      # [18]  id:725\n    l16ui   a13,a1,16                   # [30]  scratch_buf+16\n    l8ui    a14,a1,21                   # [23]  scratch_buf+21\n    l8ui    a9,a1,22                    # [22]  scratch_buf+22\n    l16ui   a10,a1,26                   # [21]  
scratch_buf+26\n    s16i    a13,a1,8                    # [31]  scratch_buf+8\n    l8ui    a12,a1,31                   # [20]  scratch_buf+31\n    l8ui    a13,a1,32                   # [19]  scratch_buf+32\n    s8i     a14,a1,10                   # [24]  scratch_buf+10\n    s8i     a9,a1,11                    # [25]  scratch_buf+11\n    s16i    a10,a1,12                   # [26]  scratch_buf+12\n    s8i     a12,a1,14                   # [27]  scratch_buf+14\n    s8i     a13,a1,15                   # [28]  scratch_buf+15\n    movi.n  a12,16\n\n# get data now\n    ee.vld.128.ip       q0,a3,0\n    ee.srcmb.s16.qacc   q1,a12,0\n    ee.vzip.16          q0,q1\n\n    ee.vadds.s32    q0,q0,q1\n    ee.movi.32.a    q0,a10,3\n    ee.movi.32.a    q0,a9,2\n    ee.movi.32.a    q0,a14,0\n    add             a9,a9,a10\n    ee.movi.32.a    q0,a10,1\n    add             a14,a14,a10\n    add             a14,a14,a9\n\n# a14 contains conv_out\n    l32i    a9,a1,160                   # [43]  gra_spill_temp_170, bias ptr\n    l32i.n  a6,a15,0                    # [44]  id:730, shift\n    beqz.n  a9,.leftover_multiply_by_quant_mult             # [45]\n\n# load and add bias\n    l32i.n  a9,a5,0\n    add.n   a14,a14,a9\n\n.leftover_multiply_by_quant_mult:   # 0x12e7\n    l32i.n  a9,a2,0                 # [0]  id:729, mult\n    movi.n  a10,0                   # [1]\n    max     a10,a6,a10              # [2]  left_shift\n    ssl     a10                     # [3]\n    sll     a14,a14                 # [4] (value << left_shift)\n\n    sub     a7,a10,a6               # right_shift\n\n    l32r    a13,.nudge_val\n    mulsh   a12,a9,a14\n    mull    a14,a9,a14\n    ssai    31\n\n    addi.n  a2,a2,4                 # [0] mult\n    addi.n  a15,a15,4               # [1] shift\n    addi.n  a5,a5,4                 # [2] bias\n    addi.n  a11,a11,1               # [3]\n\n    add     a13,a14,a13             # low part\n    saltu   a14,a13,a14\n    add     a9,a12,a14              # high part\n    src     
a12,a9,a13\n\n    blti    a7,1,.leftover_skip_div_by2\n\n    addi.n  a14,a7,-1\n    ssl     a14\n    movi.n  a10,1\n    sll     a10,a10                     # 1 << (exponent - 1)\n    extui   a14,a12,31,1\n    ssr     a7\n    sub     a10,a10,a14                 # 1 << (exponent - 1) - (val < 0)\n    add     a12,a12,a10                 # val += to_add\n    sra     a12,a12\n\n.leftover_skip_div_by2:\n    l32i    a10,a1,180                  # [26]  id:733 out_offset+0x0\n    l32i    a9,a1,192                   # [29]  id:732 activation_min+0x0\n    l16ui   a13,a1,176                  # [5]  id:620 out_channels+0x0\n    l32i    a14,a1,196                  # [31]  id:731 activation_max+0x0\n\n// add offset, apply activation and store\n    add.n   a10,a10,a12\n    max     a9,a9,a10\n    min     a14,a14,a9\n    s8i     a14,a4,0\n    addi.n  a4,a4,1\n\n    bne     a11,a13,.leftover_out_ch_loop\n\n    l32i    a15,a1,44                   # [0]  gra_spill_temp_166, in_channels\n    l32i    a14,a1,68                   # [1]  gra_spill_temp_172, input_ptr\n    l32i    a13,a1,40                   # [2]  gra_spill_temp_165, i_out\n    l32i    a12,a1,84                   # [3]  gra_spill_temp_176, size\n    addi.n  a13,a13,1                   # [4]\n    s32i    a13,a1,40                   # [5]  gra_spill_temp_165, i_out\n    add     a14,a14,a15                 # [7]  input_ptr += in_channels\n    s32i    a14,a1,68                   # [8]  gra_spill_temp_172, input_ptr\n    blt     a13,a12,.leftover_outer_loop\n\n.return_function:\n    retw.n              # [9]\n\n.prepare_leftover:\n    l32i    a8,a1,44                    # [0]  gra_spill_temp_166, in_channels\n    movi.n  a15,0\n    s32i    a15,a1,40                   # [7]  gra_spill_temp_165, i_out\n    j   .process_leftover\n\n    .size   esp_nn_conv_s8_mult8_1x1_esp32s3, . - esp_nn_conv_s8_mult8_1x1_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <esp_nn_defs.h>\n#include <common_functions.h>\n\nint esp_nn_get_depthwise_conv_scratch_size_ansi(const data_dims_t *input_dims,\n                                                const data_dims_t *filter_dims,\n                                                const data_dims_t *output_dims,\n                                                const dw_conv_params_t *conv_params)\n{\n    return 0;\n}\n\nvoid esp_nn_set_depthwise_conv_scratch_buf_ansi(const void *buf)\n{\n\n}\n\nvoid esp_nn_depthwise_conv_s8_ansi(const data_dims_t *input_dims,\n                                   const int8_t *input_data,\n                                   const data_dims_t *filter_dims,\n                                   const int8_t *filter_data,\n                                   const int32_t *bias,\n                                   const data_dims_t *output_dims,\n                                   int8_t *out_data,\n                                   const dw_conv_params_t *conv_params,\n                                   const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    
const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const int32_t *out_shift = quant_data->shift;\n    const int32_t *out_mult = quant_data->mult;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n    const uint16_t ch_mult = conv_params->ch_mult;\n\n    int out_idx = 0;\n    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop\n                for (int ch_mult_idx = 0; ch_mult_idx < ch_mult; ch_mult_idx++) {\n                    int32_t result = 0;\n                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;\n\n                    /* Select filter so as the point doesn't lie outside block */\n                    int filter_y_start = max(0, -base_y);\n                    int filter_x_start = max(0, -base_x);\n                    int filter_y_end = min(filter_ht, input_ht - base_y);\n                    int filter_x_end = min(filter_wd, input_wd - base_x);\n\n                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                        const int32_t idx_y = base_y + filter_y_idx;\n                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                            const int32_t idx_x = base_x + 
filter_x_idx;\n                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;\n                            int32_t input_val = input_data[input_index] + input_offset;\n                            int32_t filter_val = filter_data[filter_index];\n                            result += input_val * filter_val;\n                        }\n                    }\n                    if (bias) {\n                        result += bias[out_ch_idx];\n                    }\n                    result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]);\n                    result += out_offset;\n                    result = max(result, activation_min);\n                    result = min(result, activation_max);\n\n                    out_data[out_idx++] = result;\n                }\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <esp_nn_defs.h>\n#include <common_functions.h>\n#include <stdlib.h>\n\n/* Note: esp_nn_requant_2x_esp32p4.S exists but inline ESP_NN_REQUANT_2X macro\n * from common_functions.h is used instead (avoids function call overhead). */\n\n/* External fallback */\nvoid esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,\n                                   const int8_t *input_data,\n                                   const data_dims_t *filter_dims,\n                                   const int8_t *filter_data,\n                                   const int32_t *bias,\n                                   const data_dims_t *output_dims,\n                                   int8_t *out_data,\n                                   const dw_conv_params_t *conv_params,\n                                   const quant_data_t *quant_data);\n\nint esp_nn_get_depthwise_conv_scratch_size_esp32p4(const data_dims_t *input_dims,\n                                                    const data_dims_t *filter_dims,\n                                                    const data_dims_t *output_dims,\n                                                    const dw_conv_params_t *conv_params)\n{\n    return 0;\n}\n\nvoid esp_nn_set_depthwise_conv_scratch_buf_esp32p4(const void *buf)\n{\n    (void) buf;\n}\n\n/* PIE-optimized ch_mult=1 path (dispatched for channels >= 8) using QACC per-lane MAC.\n * When channels <= 256, pre-computes combined_offset[ch] = filter_sum[ch] * input_offset + bias[ch],\n * where filter_sum[ch] is the sum of filter[ch] across all filter positions.\n * For non-edge output positions: result[ch] = QACC_MAC + combined_offset[ch]\n * For edge positions: QACC_MAC runs over the clipped filter window and the filter sum is\n * recomputed for that window before input_offset and bias are applied.\n * Trailing channels that do not fill a block of at least 8 are computed in scalar code. 
*/\n__attribute__ ((noinline))\nstatic void depthwise_conv_s8_ch1_pie(const data_dims_t *input_dims,\n                                       const int8_t *input_data,\n                                       const data_dims_t *filter_dims,\n                                       const int8_t *filter_data,\n                                       const int32_t *bias,\n                                       const data_dims_t *output_dims,\n                                       int8_t *out_data,\n                                       const dw_conv_params_t *conv_params,\n                                       const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    /* Enable PIE */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    /* Set up activation min/max vectors for PIE clamp */\n    {\n        int8_t act_min_val = (int8_t) activation_min;\n        int8_t act_max_val = (int8_t) activation_max;\n        asm volatile (\n            \"mv     x30, %0             \\n\\t\"\n            \"esp.vldbc.8.ip q4, x30, 0  
\\n\\t\"\n            \"mv     x30, %1             \\n\\t\"\n            \"esp.vldbc.8.ip q5, x30, 0  \\n\\t\"\n            :: \"r\"(&act_min_val), \"r\"(&act_max_val)\n            : \"x30\"\n        );\n    }\n\n    /* Pre-compute combined offset: filter_sum * input_offset + bias per channel.\n     * This fuses two additions per channel into one pre-computed value.\n     * Constant for the entire layer - computed once.\n     * NOTE: when channels > 256 combined_offset stays NULL, and the QACC block path\n     * below then adds neither the input_offset term nor the bias. */\n    int32_t combined_offset_buf[256]; /* support up to 256 channels on stack */\n    int32_t *combined_offset = NULL;\n    if (channels <= 256) {\n        combined_offset = combined_offset_buf;\n        for (int ch = 0; ch < channels; ch++) {\n            int32_t s = 0;\n            if (input_offset != 0) {\n                for (int fy = 0; fy < filter_ht; fy++) {\n                    for (int fx = 0; fx < filter_wd; fx++) {\n                        s += filter_data[(fy * filter_wd + fx) * channels + ch];\n                    }\n                }\n                s *= input_offset;\n            }\n            combined_offset[ch] = s + (bias ? 
bias[ch] : 0);\n        }\n    }\n\n    int out_idx = 0;\n    for (int out_y = 0; out_y < out_ht; out_y++) {\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) {\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n\n            const int32_t *out_shift = quant_data->shift;\n            const int32_t *out_mult = quant_data->mult;\n\n            int filter_y_start = max(0, -base_y);\n            int filter_x_start = max(0, -base_x);\n            int filter_y_end = min(filter_ht, input_ht - base_y);\n            int filter_x_end = min(filter_wd, input_wd - base_x);\n\n            /* Check if this is a non-edge position (full filter window) */\n            int is_full_window = (filter_y_start == 0 && filter_x_start == 0 &&\n                                  filter_y_end == filter_ht && filter_x_end == filter_wd);\n\n            /* Process 16 channels at a time using QACC.\n             * Inline helper macro for QACC MAC across filter window. 
*/\n            #define QACC_MAC_WINDOW(ch_off) do { \\\n                asm volatile (\"esp.zero.qacc \\n\\t\"); \\\n                for (int _fy = filter_y_start; _fy < filter_y_end; _fy++) { \\\n                    const int32_t _iy = base_y + _fy; \\\n                    const int8_t *_ip = input_data + (_iy * input_wd + base_x + filter_x_start) * channels + (ch_off); \\\n                    const int8_t *_fp = filter_data + (_fy * filter_wd + filter_x_start) * channels + (ch_off); \\\n                    int _fc = filter_x_end - filter_x_start; \\\n                    asm volatile ( \\\n                        \"mv     x30, %[ip]               \\n\\t\" \\\n                        \"mv     x31, %[fp]               \\n\\t\" \\\n                        \"mv     s7,  %[cnt]              \\n\\t\" \\\n                        \"1:                              \\n\\t\" \\\n                        \"esp.vld.128.ip  q0, x30, 0      \\n\\t\" \\\n                        \"esp.vld.128.ip  q1, x31, 0      \\n\\t\" \\\n                        \"esp.vmulas.s8.qacc q0, q1       \\n\\t\" \\\n                        \"add    x30, x30, %[stride]      \\n\\t\" \\\n                        \"add    x31, x31, %[stride]      \\n\\t\" \\\n                        \"addi   s7, s7, -1               \\n\\t\" \\\n                        \"bnez   s7, 1b                   \\n\\t\" \\\n                        : \\\n                        : [ip] \"r\"(_ip), [fp] \"r\"(_fp), \\\n                          [cnt] \"r\"(_fc), [stride] \"r\"((int32_t)channels) \\\n                        : \"x30\", \"x31\", \"s7\" \\\n                    ); \\\n                } \\\n            } while(0)\n\n            #define QACC_EXTRACT(dst) do { \\\n                asm volatile ( \\\n                    \"mv                      x30, %0     \\n\\t\" \\\n                    \"esp.st.qacc.l.l.128.ip  x30, 16     \\n\\t\" \\\n                    \"esp.st.qacc.l.h.128.ip  x30, 16     \\n\\t\" \\\n                   
 \"esp.st.qacc.h.l.128.ip  x30, 16     \\n\\t\" \\\n                    \"esp.st.qacc.h.h.128.ip  x30, 0      \\n\\t\" \\\n                    :: \"r\"(dst) \\\n                    : \"x30\", \"memory\" \\\n                ); \\\n            } while(0)\n\n            int ch_idx = 0;\n\n            /* Process 16-channel blocks, then partial block if remainder >= 8 */\n            while (ch_idx < channels) {\n                int block_ch = (ch_idx + 16 <= channels) ? 16 :\n                               (channels - ch_idx >= 8) ? (channels - ch_idx) : 0;\n                if (block_ch == 0) break;  /* remaining < 8, handle scalar below */\n\n                QACC_MAC_WINDOW(ch_idx);\n\n                /* Extract per-lane results (only first block_ch are valid) */\n                int32_t result[16] __attribute__((aligned(16)));\n                QACC_EXTRACT(result);\n\n                /* Add fused offset (filter_sum * input_offset + bias) + requantize */\n                if (combined_offset) {\n                    if (is_full_window) {\n                        for (int k = 0; k < block_ch; k++) {\n                            result[k] += combined_offset[ch_idx + k];\n                        }\n                    } else {\n                        for (int k = 0; k < block_ch; k++) {\n                            int32_t fsum = 0;\n                            if (input_offset != 0) {\n                                for (int fy = filter_y_start; fy < filter_y_end; fy++) {\n                                    for (int fx = filter_x_start; fx < filter_x_end; fx++) {\n                                        fsum += filter_data[(fy * filter_wd + fx) * channels + ch_idx + k];\n                                    }\n                                }\n                                fsum *= input_offset;\n                            }\n                            result[k] += fsum + (bias ? 
bias[ch_idx + k] : 0);\n                        }\n                    }\n                }\n\n                /* Per-channel requantize */\n                {\n                    const int32_t *mp = out_mult + ch_idx;\n                    const int32_t *sp = out_shift + ch_idx;\n                    int rq_count = block_ch & ~1;  /* round down to even for 2-wide */\n\n                    for (int k = 0; k < rq_count; k += 2) {\n                        int32_t r0 = result[k]; int32_t r1 = result[k+1];\n\n                        int32_t m0 = mp[k], s0 = sp[k];\n                        int32_t m1 = mp[k+1], s1 = sp[k+1];\n\n                        /* 2-wide interleaved requant via inline asm macro.\n                         * Macro handles left_shift internally - do NOT pre-shift. */\n                        int32_t h0, h1;\n                        ESP_NN_REQUANT_2X(r0, r1, m0, m1, s0, s1, h0, h1);\n\n                        h0 += out_offset; h1 += out_offset;\n                        out_data[out_idx++] = (int8_t)max(activation_min, min(h0, activation_max));\n                        out_data[out_idx++] = (int8_t)max(activation_min, min(h1, activation_max));\n                    }\n                    /* Handle odd remaining channel in block */\n                    if (block_ch & 1) {\n                        int k = rq_count;\n                        int32_t r = result[k];\n                        r = esp_nn_requantize(r, mp[k], sp[k]);\n                        r += out_offset;\n                        out_data[out_idx++] = (int8_t)max(activation_min, min(r, activation_max));\n                    }\n                }\n                ch_idx += block_ch;\n            }\n\n            /* Remaining channels < 8: scalar */\n            for (; ch_idx < channels; ch_idx++) {\n                int32_t result = 0;\n                for (int fy = filter_y_start; fy < filter_y_end; fy++) {\n                    const int32_t idx_y = base_y + fy;\n                    for (int fx = 
filter_x_start; fx < filter_x_end; fx++) {\n                        const int32_t idx_x = base_x + fx;\n                        result += (input_data[(idx_y * input_wd + idx_x) * channels + ch_idx] + input_offset)\n                                  * filter_data[(fy * filter_wd + fx) * channels + ch_idx];\n                    }\n                }\n                if (bias) result += bias[ch_idx];\n                result = esp_nn_requantize(result, out_mult[ch_idx], out_shift[ch_idx]);\n                result += out_offset;\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n                out_data[out_idx++] = (int8_t) result;\n            }\n        }\n    }\n}\n\nvoid esp_nn_depthwise_conv_s8_esp32p4(const data_dims_t *input_dims,\n                                       const int8_t *input_data,\n                                       const data_dims_t *filter_dims,\n                                       const int8_t *filter_data,\n                                       const int32_t *bias,\n                                       const data_dims_t *output_dims,\n                                       int8_t *out_data,\n                                       const dw_conv_params_t *conv_params,\n                                       const quant_data_t *quant_data)\n{\n    const uint16_t ch_mult = conv_params->ch_mult;\n    const uint16_t channels = input_dims->channels;\n\n    if (ch_mult == 1 && channels >= 8) {\n        depthwise_conv_s8_ch1_pie(input_dims, input_data, filter_dims, filter_data,\n                                   bias, output_dims, out_data, conv_params, quant_data);\n        return;\n    }\n\n    /* Fall back to generic optimized */\n    esp_nn_depthwise_conv_s8_opt(input_dims, input_data, filter_dims, filter_data,\n                                  bias, output_dims, out_data, conv_params, quant_data);\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_opt.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <esp_nn_defs.h>\n#include <common_functions.h>\n\nint esp_nn_get_depthwise_conv_scratch_size_opt(const data_dims_t *input_dims,\n                                               const data_dims_t *filter_dims,\n                                               const data_dims_t *output_dims,\n                                               const dw_conv_params_t *conv_params)\n{\n    return 0;\n}\n\nvoid esp_nn_set_depthwise_conv_scratch_buf_opt(const void *buf)\n{\n\n}\n\n/* common channel multiplier == 1 case */\n__attribute__ ((noinline))\nstatic void esp_nn_depthwise_conv_s8_ch_mult_1(const data_dims_t *input_dims,\n                                               const int8_t *input_data,\n                                               const data_dims_t *filter_dims,\n                                               const int8_t *filter_data,\n                                               const int32_t *bias,\n                                               const data_dims_t *output_dims,\n                                               int8_t *out_data,\n                                               const dw_conv_params_t *conv_params,\n                                               const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n   
 const uint16_t channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    int out_idx = 0;\n    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n\n            const int32_t *out_shift = quant_data->shift;\n            const int32_t *out_mult = quant_data->mult;\n\n            /* Select filter so as the point doesn't lie outside block */\n            int filter_y_start = max(0, -base_y);\n            int filter_x_start = max(0, -base_x);\n            int filter_y_end = min(filter_ht, input_ht - base_y);\n            int filter_x_end = min(filter_wd, input_wd - base_x);\n\n            int ch_idx = 0;\n            for (; ch_idx < channels - 3; ch_idx += 4) {//channel_loop\n                int32_t result0 = 0;\n                int32_t result1 = 0;\n                int32_t result2 = 0;\n                int32_t result3 = 0;\n\n                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                    const int32_t idx_y = base_y + filter_y_idx;\n                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n              
          const int32_t idx_x = base_x + filter_x_idx;\n                        int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                        int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx;\n                        int32_t input_val0 = input_data[input_index + 0] + input_offset;\n                        int32_t input_val1 = input_data[input_index + 1] + input_offset;\n                        int32_t input_val2 = input_data[input_index + 2] + input_offset;\n                        int32_t input_val3 = input_data[input_index + 3] + input_offset;\n                        int32_t filter_val0 = filter_data[filter_index + 0];\n                        int32_t filter_val1 = filter_data[filter_index + 1];\n                        int32_t filter_val2 = filter_data[filter_index + 2];\n                        int32_t filter_val3 = filter_data[filter_index + 3];\n                        result0 += input_val0 * filter_val0;\n                        result1 += input_val1 * filter_val1;\n                        result2 += input_val2 * filter_val2;\n                        result3 += input_val3 * filter_val3;\n                    }\n                }\n                if (bias) {\n                    result0 += bias[ch_idx + 0];\n                    result1 += bias[ch_idx + 1];\n                    result2 += bias[ch_idx + 2];\n                    result3 += bias[ch_idx + 3];\n                }\n                result0 = esp_nn_requantize(result0, *out_mult++, *out_shift++);\n                result1 = esp_nn_requantize(result1, *out_mult++, *out_shift++);\n                result2 = esp_nn_requantize(result2, *out_mult++, *out_shift++);\n                result3 = esp_nn_requantize(result3, *out_mult++, *out_shift++);\n\n                result0 += out_offset;\n                result1 += out_offset;\n                result2 += out_offset;\n                result3 += out_offset;\n\n                result0 = 
max(result0, activation_min);\n                result1 = max(result1, activation_min);\n                result2 = max(result2, activation_min);\n                result3 = max(result3, activation_min);\n\n                result0 = min(result0, activation_max);\n                result1 = min(result1, activation_max);\n                result2 = min(result2, activation_max);\n                result3 = min(result3, activation_max);\n\n                out_data[out_idx++] = result0;\n                out_data[out_idx++] = result1;\n                out_data[out_idx++] = result2;\n                out_data[out_idx++] = result3;\n            }\n            for (; ch_idx < channels; ch_idx++) {//channel_loop\n                int32_t result = 0;\n\n                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                    const int32_t idx_y = base_y + filter_y_idx;\n                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                        const int32_t idx_x = base_x + filter_x_idx;\n                        int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                        int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels) + ch_idx;\n                        int32_t input_val = input_data[input_index] + input_offset;\n                        int32_t filter_val = filter_data[filter_index];\n                        result += input_val * filter_val;\n                    }\n                }\n                if (bias) {\n                    result += bias[ch_idx];\n                }\n                result = esp_nn_requantize(result, *out_mult++, *out_shift++);\n                result += out_offset;\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n\n                out_data[out_idx++] = result;\n            }\n        }\n    }\n}\n\nvoid 
esp_nn_depthwise_conv_s8_opt(const data_dims_t *input_dims,\n                                  const int8_t *input_data,\n                                  const data_dims_t *filter_dims,\n                                  const int8_t *filter_data,\n                                  const int32_t *bias,\n                                  const data_dims_t *output_dims,\n                                  int8_t *out_data,\n                                  const dw_conv_params_t *conv_params,\n                                  const quant_data_t *quant_data)\n{\n    const uint16_t ch_mult = conv_params->ch_mult;\n    if (ch_mult == 1) {\n        esp_nn_depthwise_conv_s8_ch_mult_1(input_dims, input_data, filter_dims, filter_data,\n                                           bias, output_dims, out_data, conv_params, quant_data);\n        return;\n    }\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n\n    int out_idx = 0;\n    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n\n           
 const int32_t *out_shift = quant_data->shift;\n            const int32_t *out_mult = quant_data->mult;\n\n            /* Select filter so as the point doesn't lie outside block */\n            int filter_y_start = max(0, -base_y);\n            int filter_x_start = max(0, -base_x);\n            int filter_y_end = min(filter_ht, input_ht - base_y);\n            int filter_x_end = min(filter_wd, input_wd - base_x);\n\n            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop\n                int ch_mult_idx = 0;\n                for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) {\n                    int32_t result0 = 0;\n                    int32_t result1 = 0;\n                    int32_t result2 = 0;\n                    int32_t result3 = 0;\n                    const int out_ch_idx =  ch_idx * ch_mult + ch_mult_idx;\n\n                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                        const int32_t idx_y = base_y + filter_y_idx;\n                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                            const int32_t idx_x = base_x + filter_x_idx;\n                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;\n                            int32_t input_val = input_data[input_index] + input_offset;\n                            int32_t filter_val0 = filter_data[filter_index + 0];\n                            int32_t filter_val1 = filter_data[filter_index + 1];\n                            int32_t filter_val2 = filter_data[filter_index + 2];\n                            int32_t filter_val3 = filter_data[filter_index + 3];\n                            result0 += input_val * filter_val0;\n                            result1 += input_val * filter_val1;\n    
                        result2 += input_val * filter_val2;\n                            result3 += input_val * filter_val3;\n                        }\n                    }\n                    if (bias) {\n                        result0 += bias[out_ch_idx + 0];\n                        result1 += bias[out_ch_idx + 1];\n                        result2 += bias[out_ch_idx + 2];\n                        result3 += bias[out_ch_idx + 3];\n                    }\n                    result0 = esp_nn_requantize(result0, *out_mult++, *out_shift++);\n                    result1 = esp_nn_requantize(result1, *out_mult++, *out_shift++);\n                    result2 = esp_nn_requantize(result2, *out_mult++, *out_shift++);\n                    result3 = esp_nn_requantize(result3, *out_mult++, *out_shift++);\n\n                    result0 += out_offset;\n                    result1 += out_offset;\n                    result2 += out_offset;\n                    result3 += out_offset;\n\n                    result0 = max(result0, activation_min);\n                    result1 = max(result1, activation_min);\n                    result2 = max(result2, activation_min);\n                    result3 = max(result3, activation_min);\n                    result0 = min(result0, activation_max);\n                    result1 = min(result1, activation_max);\n                    result2 = min(result2, activation_max);\n                    result3 = min(result3, activation_max);\n\n                    out_data[out_idx++] = result0;\n                    out_data[out_idx++] = result1;\n                    out_data[out_idx++] = result2;\n                    out_data[out_idx++] = result3;\n                }\n                for (; ch_mult_idx < ch_mult; ch_mult_idx++) {\n                    int32_t result = 0;\n                    const int out_ch_idx =  ch_idx * ch_mult + ch_mult_idx;\n\n                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n   
                     const int32_t idx_y = base_y + filter_y_idx;\n                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                            const int32_t idx_x = base_x + filter_x_idx;\n                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;\n                            int32_t input_val = input_data[input_index] + input_offset;\n                            int32_t filter_val = filter_data[filter_index];\n                            result += input_val * filter_val;\n                        }\n                    }\n                    if (bias) {\n                        result += bias[out_ch_idx];\n                    }\n                    result = esp_nn_requantize(result, *out_mult++, *out_shift++);\n                    result += out_offset;\n                    result = max(result, activation_min);\n                    result = min(result, activation_max);\n\n                    out_data[out_idx++] = result;\n                }\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n    # Program Unit: esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3\n    .type   esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3\n\nesp_nn_depthwise_conv_s16_mult1_3x3_esp32s3:    # 0x776\n    # qacc_scratch = 0\n    # gra_spill_temp_35 = 48\n    # gra_spill_temp_36 = 52\n    # gra_spill_temp_37 = 56\n    # gra_spill_temp_38 = 60\n    # gra_spill_temp_39 = 64\n    # gra_spill_temp_40 = 68\n    # gra_spill_temp_41 = 72\n    # gra_spill_temp_42 = 76\n    # gra_spill_temp_43 = 80\n    # gra_spill_temp_44 = 84\n    # gra_spill_temp_45 = 88\n    # gra_spill_temp_46 = 92\n    # gra_spill_temp_47 = 96\n    # gra_spill_temp_48 = 100\n    # gra_spill_temp_49 = 104\n    # gra_spill_temp_50 = 108\n    # gra_spill_temp_51 = 112\n    # gra_spill_temp_52 = 116\n    # gra_spill_temp_53 = 120\n    # gra_spill_temp_54 = 124\n    # gra_spill_temp_55 = 128\n    # gra_spill_temp_56 = 132\n    # gra_spill_temp_57 = 136\n    # gra_spill_temp_58 = 140\n    # gra_spill_temp_59 = 144\n    # gra_spill_temp_60 = 148\n    # gra_spill_temp_61 = 152\n    # gra_spill_temp_62 = 156\n    # gra_spill_temp_63 = 160\n    # gra_spill_temp_64 = 164\n    # gra_spill_temp_65 = 168\n    # gra_spill_temp_66 = 176\n    # gra_spill_temp_67 = 192\n    # 
gra_spill_temp_68 = 208\n    # gra_spill_temp_69 = 224\n    # gra_spill_temp_70 = 240\n\n // registers:\n // a2: const int16_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t channels\n // a6: const uint16_t pad_wd\n // a7: const uint16_t pad_ht\n\n // on stack\n // const uint16_t stride_wd\n // const uint16_t stride_ht\n // const int16_t *filter_data\n // const int32_t *bias\n // int8_t *out_data\n // const uint16_t out_wd\n // const uint16_t out_ht\n // const int32_t out_offset\n // const int32_t *out_shift\n // const int32_t *out_mult\n // const int32_t activation_min\n // const int32_t activation_max\n\n    entry   a1,288                      #\n    s32i    a2,a1,104                   # [0]  gra_spill_temp_49\n    s32i    a3,a1,112                   # [1]  gra_spill_temp_51\n    s32i    a5,a1,116                   # [2]  gra_spill_temp_52\n    s32i.n  a6,a1,56                # [3]  gra_spill_temp_37\n    addi    a14,a1,112                  # [4]\n    addmi   a11,a1,256                  # [5]\n    addmi   a13,a1,256                  # [6]\n    addmi   a15,a1,256                  # [7]\n    l32i    a9,a1,304                   # [8]  id:251 out_data+0x0\n    l16ui   a8,a1,312                   # [9]  id:252 out_ht+0x0\n    s32i    a8,a1,64                    # [10]  gra_spill_temp_39\n    s32i    a9,a1,156                   # [11]  gra_spill_temp_62\n    addi    a15,a15,60                  # [12]\n    addi    a13,a13,72                  # [13]\n    addi    a11,a11,76                  # [14]\n    ee.vldbc.32 q0,a11              # [15]  id:250 activation_max\n    ee.vldbc.32 q1,a13              # [16]  id:249 activation_min\n    ee.vldbc.32 q2,a15              # [17]  id:248 out_offset\n    st.qr   q2,a14,80                   # [18]  gra_spill_temp_67-112\n    st.qr   q1,a14,96                   # [19]  gra_spill_temp_68-112\n    st.qr   q0,a14,112                  # [20]  gra_spill_temp_69-112\n    beqz.n  
a8,.Lt_5_7426           # [21]\n\n.LBB3_esp_nn_depthwise_conv_s16_mult1_3x3:  # 0x7b9\n    s32i    a1,a1,160                   # [0]  gra_spill_temp_63\n    s32i    a7,a1,72                    # [1]  gra_spill_temp_41\n    mul16u  a6,a3,a5                # [2]\n    l32i    a14,a1,296                  # [3]  id:254 filter_data+0x0\n    l32i    a15,a1,300                  # [4]  id:253 bias+0x0\n    l16ui   a9,a1,308                   # [5]  id:259 out_wd+0x0\n    l16ui   a13,a1,288                  # [6]  id:255 stride_wd+0x0\n    neg     a8,a7                       # [7]\n    l16ui   a10,a1,292                  # [8]  id:258 stride_ht+0x0\n    l32i    a11,a1,324                  # [9]  id:257 out_mult+0x0\n    l32i    a12,a1,320                  # [10]  id:256 out_shift+0x0\n    s32i    a12,a1,84                   # [11]  gra_spill_temp_44\n    s32i    a11,a1,88                   # [12]  gra_spill_temp_45\n    s32i.n  a10,a1,60               # [13]  gra_spill_temp_38\n    s32i    a8,a1,124                   # [14]  gra_spill_temp_54\n    s32i    a13,a1,80                   # [15]  gra_spill_temp_43\n    s32i    a9,a1,92                    # [16]  gra_spill_temp_46\n    s32i    a15,a1,140                  # [17]  gra_spill_temp_58\n    s32i    a14,a1,108                  # [18]  gra_spill_temp_50\n    slli    a6,a6,1                     # [19]\n    movi.n  a14,16                  # [20]\n    extui   a15,a15,0,4                 # [21]\n    addi    a9,a5,-7                    # [22]\n    movi.n  a13,0                   # [23]\n    sub     a8,a4,a8                    # [24]\n    addx2   a7,a5,a5                    # [25]\n    slli    a7,a7,1                     # [26]\n    slli    a4,a5,1                     # [27]\n    s32i    a13,a1,68                   # [28]  gra_spill_temp_40\n    s32i    a9,a1,144                   # [29]  gra_spill_temp_59\n    s32i    a15,a1,132                  # [30]  gra_spill_temp_56\n    l32i.n  a9,a1,56                # [31]  
gra_spill_temp_37\n    s32i    a8,a1,76                    # [32]  gra_spill_temp_42\n    neg     a9,a9                       # [33]\n    s32i.n  a9,a1,48                # [34]  gra_spill_temp_35\n    sub     a8,a3,a9                    # [35]\n    s32i.n  a8,a1,52                # [36]  gra_spill_temp_36\n\n.Lt_5_7938: # 0x822\n    l32i    a10,a1,92                   # [0]  gra_spill_temp_46\n    beqz.n  a10,.Lt_5_8194          # [2]\n\n.LBB6_esp_nn_depthwise_conv_s16_mult1_3x3:  # 0x827\n    l32i.n  a5,a1,52                # [0]  gra_spill_temp_36\n    l32i    a11,a1,76                   # [1]  gra_spill_temp_42\n    movi.n  a13,0                   # [2]\n    l32i    a12,a1,72                   # [3]  gra_spill_temp_41\n    movi.n  a15,0                   # [4]\n    l32i.n  a8,a1,48                # [5]  gra_spill_temp_35\n    l32i.n  a9,a1,56                # [6]  gra_spill_temp_37\n    s32i    a9,a1,100                   # [7]  gra_spill_temp_48\n    s32i    a8,a1,128                   # [8]  gra_spill_temp_55\n    s32i    a15,a1,96                   # [9]  gra_spill_temp_47\n    max     a12,a12,a13                 # [10]\n    s32i    a12,a1,152                  # [11]  gra_spill_temp_61\n    movi.n  a13,3                   # [12]\n    min     a11,a11,a13                 # [13]\n    s32i    a11,a1,136                  # [14]  gra_spill_temp_57\n    sub     a11,a11,a12                 # [15]\n    s32i    a11,a1,120                  # [16]  gra_spill_temp_53\n\n.Lt_5_8706: # 0x854\n    l32i    a2,a1,84                    # [0]  gra_spill_temp_44\n    l32i    a10,a1,144                  # [1]  gra_spill_temp_59\n    l32i    a11,a1,140                  # [2]  gra_spill_temp_58\n    l32i    a12,a1,88                   # [3]  gra_spill_temp_45\n    s32i    a12,a1,168                  # [4]  gra_spill_temp_65\n    s32i    a11,a1,148                  # [5]  gra_spill_temp_60\n    blti    a10,1,.Lt_5_8962            # [6]\n\n    movi.n  a8,0                    # [0]\n   
 movi.n  a13,0                   # [1]\n    l32i    a3,a1,100                   # [2]  gra_spill_temp_48\n    s32i    a13,a1,164                  # [3]  gra_spill_temp_64\n    max     a3,a3,a8                    # [4]\n\n.Lt_5_9474: # 0x876\n    l32i    a10,a1,136                  # [0]  gra_spill_temp_57\n    l32i    a9,a1,152                   # [1]  gra_spill_temp_61\n    ee.zero.qacc                    # [2]\n    bge     a9,a10,.Lt_5_9730           # [3]\n\n.LBB12_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x882\n    l32i    a12,a1,128                  # [0]  gra_spill_temp_55\n    l32i    a15,a1,112                  # [1]  gra_spill_temp_51\n    l32i    a10,a1,116                  # [2]  gra_spill_temp_52\n    l32i    a13,a1,124                  # [3]  gra_spill_temp_54\n    mull    a11,a9,a10                  # [4]\n    add.n   a13,a13,a9                  # [5]\n    mull    a13,a13,a15                 # [6]\n    addx2   a11,a11,a11                 # [7]\n    l32i    a9,a1,164                   # [8]  gra_spill_temp_64\n    add.n   a12,a12,a13                 # [9]\n    mull    a10,a10,a12                 # [10]\n    add.n   a11,a9,a11                  # [11]\n    l32i    a12,a1,108                  # [12]  gra_spill_temp_50\n    add.n   a9,a9,a10                   # [13]\n    l32i    a10,a1,104                  # [14]  gra_spill_temp_49\n    addx2   a11,a11,a12                 # [15]\n    l32i    a12,a1,120                  # [16]  gra_spill_temp_53\n    addx2   a9,a9,a10                   # [17]\n    loopgtz a12,.LBB32_esp_nn_depthwise_conv_s16_mult1_3x3  # [18]\n\n    mov.n   a13,a9                      # [0]\n    mov.n   a12,a11                     # [1]\n    mov.n   a9,a11                      # [2]\n    mov.n   a11,a13                     # [3]\n\n    beqz.n  a3,.Lt_5_10498          # [4] if (filter_x_start)\n\n    add.n   a11,a4,a13                  # [0]\n    add.n   a9,a4,a12                   # [1]\n.Lt_5_10498:    # 0x8c5\n\n    ee.vld.128.xp   
q0,a11,a4           # [0]  id:261\n    ee.vld.128.xp   q1,a9,a4            # [1]  id:262\n\n    bnez.n  a3,.Lt_5_11010          # [2] if (filter_x_start)\n\n    ee.vmulas.s16.qacc  q0,q1       # [0]\n    ee.vld.128.xp   q0,a11,a4           # [1]  id:264\n    ee.vld.128.xp   q1,a9,a4            # [2]  id:265\n.Lt_5_11010:    # 0x8d6\n\n    ee.vmulas.s16.qacc  q0,q1       # [0]\n    ee.vld.128.xp   q0,a11,a4           # [1]  id:267\n    ee.vld.128.xp   q1,a9,a4            # [2]  id:268\n    add.n   a9,a6,a13                   # [3]\n\n    blti    a5,3,.Lt_5_11522            # [4] if (filter_x_end)\n    ee.vmulas.s16.qacc  q0,q1       # [0]\n.Lt_5_11522:    # 0x8e7\n\n    add.n   a11,a7,a12                  # [0]\n\n.LBB32_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x8eb\n\n.Lt_5_9730: # 0x8eb\n // extract data\n    l32i    a9,a1,160                   # [0]  gra_spill_temp_63\n    ee.st.qacc_l.l.128.ip   a9,16       # [2]  id:270\n    ee.st.qacc_l.h.32.ip    a9,0        # [3]  id:271\n    l8ui    a11,a1,15                   # [4]  qacc_scratch+15\n    l16ui   a10,a1,10                   # [5]  qacc_scratch+10\n    l8ui    a15,a1,16                   # [6]  qacc_scratch+16\n    l8ui    a13,a1,6                    # [7]  qacc_scratch+6\n    l8ui    a12,a1,5                    # [8]  qacc_scratch+5\n    s8i     a12,a1,2                    # [9]  qacc_scratch+2\n    s8i     a13,a1,3                    # [10]  qacc_scratch+3\n    s8i     a15,a1,7                    # [11]  qacc_scratch+7\n    s16i    a10,a1,4                    # [12]  qacc_scratch+4\n    s8i     a11,a1,6                    # [13]  qacc_scratch+6\n\n    ee.st.qacc_h.l.128.ip   a9,16       # [14]  id:281\n    ee.st.qacc_h.h.32.ip    a9,-32      # [15]  id:282\n    ee.srcmb.s16.qacc   q1,a14,0        # [16]\n    l8ui    a15,a1,31                   # [17]  qacc_scratch+31\n    l8ui    a8,a1,32                    # [18]  qacc_scratch+32\n    l16ui   a13,a1,26                   # [19]  qacc_scratch+26\n    l8ui    
a12,a1,22                   # [20]  qacc_scratch+22\n    l8ui    a11,a1,21                   # [21]  qacc_scratch+21\n    l16ui   a10,a1,16                   # [22]  qacc_scratch+16\n    s16i    a10,a1,8                    # [23]  qacc_scratch+8\n    s8i     a11,a1,10                   # [24]  qacc_scratch+10\n    s8i     a12,a1,11                   # [25]  qacc_scratch+11\n    s16i    a13,a1,12                   # [26]  qacc_scratch+12\n    s8i     a8,a1,15                    # [27]  qacc_scratch+15\n    s8i     a15,a1,14                   # [28]  qacc_scratch+14\n\n\n    l32i    a8,a1,140                   # [29]  gra_spill_temp_58 , bias\n    ee.vld.128.ip   q0,a9,0             # [30]  id:294\n    s32i    a9,a1,160                   # [31]  gra_spill_temp_63\n    ee.vzip.16  q0,q1               # [32]\n    beqz.n  a8,.Lt_5_12290          # [33] // skip bias\n\n    addi    a8,a1,112                   # [0]\n    l32i    a10,a1,132                  # [1]  gra_spill_temp_56\n    l32i    a9,a1,148                   # [2]  gra_spill_temp_60\n    wur.sar_byte    a10                 # [3]\n    ee.vld.128.ip   q4,a9,16            # [4]  id:297\n    ee.vld.128.ip   q7,a9,16            # [5]  id:298\n    ee.vld.128.ip   q5,a9,0             # [6]  id:299\n    s32i    a9,a1,148                   # [7]  gra_spill_temp_60\n    ee.src.q.qup    q6,q4,q7            # [8]\n    ee.vadds.s32    q0,q0,q6            # [9]\n    ee.src.q.qup    q3,q4,q5            # [10]\n    ee.vadds.s32    q1,q1,q3            # [11]\n    st.qr   q1,a8,64                    # [12]  gra_spill_temp_66-112\n\n.Lt_5_12290:    # 0x974\n    addi    a11,a1,112                  # [0]\n\n # 287                  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);\n    l32i    a10,a1,168                  # [1]  gra_spill_temp_65\n    st.qr   q1,a11,64                   # [2]  gra_spill_temp_66-112\n    mov.n   a11,a2                      # [3]\n    call8   
esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [4]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n # 288                  out_mult_ptr += 4;\n # 289                  out_shift_ptr += 4;\n # 290\n # 291                  q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);\n    l32i    a10,a1,168                  # [0]  gra_spill_temp_65\n    addmi   a12,a1,256                  # [1]\n    addi    a11,a1,112                  # [2]\n    st.qr   q0,a12,-16                  # [3]  gra_spill_temp_70-256\n    ld.qr   q0,a11,64                   # [4]  gra_spill_temp_66-112\n    addi    a10,a10,16                  # [5]\n    addi    a11,a2,16                   # [6]\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [7]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n.LBB25_esp_nn_depthwise_conv_s16_mult1_3x3: # 0x99a\n#<loop> Part of loop body line 216, head labeled .Lt_5_9474\n    movi.n  a14,16                  # [0]\n # 292                  out_mult_ptr += 4;\n # 293                  out_shift_ptr += 4;\n    addi    a2,a2,32                    # [1]\n    l32i    a15,a1,144                  # [2]  gra_spill_temp_59\n    l32i    a9,a1,156                   # [3]  gra_spill_temp_62\n    l32i    a8,a1,168                   # [4]  gra_spill_temp_65\n    addmi   a12,a1,256                  # [5]\n    addi    a13,a1,112                  # [6]\n    ld.qr   q3,a13,112                  # [7]  gra_spill_temp_69-112\n    ld.qr   q1,a13,80                   # [8]  gra_spill_temp_67-112\n    ld.qr   q2,a12,-16                  # [9]  gra_spill_temp_70-256\n    addi    a8,a8,32                    # [10]\n    s32i    a8,a1,168                   # [11]  gra_spill_temp_65\n    ee.vadds.s32    q2,q2,q1            # [12]\n    ee.vadds.s32    q1,q0,q1            # [13]\n    ee.vmin.s32 q0,q2,q3            # [14]\n    ee.vmin.s32 q1,q1,q3            # [15]\n    ld.qr   \tq2,a13,96                   # [16]  gra_spill_temp_68-112\n   
 l32i    \ta13,a1,164                  # [17]  gra_spill_temp_64\n    ee.vmax.s32 q1,q1,q2            # [18]\n    ee.vmax.s32 q0,q0,q2            # [19]\n    addi.n  \ta13,a13,8               # [20]\n    s32i    \ta13,a1,164                  # [21]  gra_spill_temp_64\n    ee.vunzip.16    q0,q1               # [22]\n    ee.vunzip.8 \tq0,q1               # [23]\n    ee.vst.l.64.ip  q0,a9,8         # [24]  id:302\n    s32i    \ta9,a1,156                   # [25]  gra_spill_temp_62\n    blt     \ta13,a15,.Lt_5_9474          # [26]\n\n.Lt_5_8962: # 0x9e9\n#<loop> Part of loop body line 203, head labeled .Lt_5_8706\n    l32i    a8,a1,92                    # [0]  gra_spill_temp_46\n    l32i    a11,a1,100                  # [1]  gra_spill_temp_48\n    l32i    a10,a1,128                  # [2]  gra_spill_temp_55\n    l32i    a9,a1,80                    # [3]  gra_spill_temp_43\n    l32i    a15,a1,96                   # [4]  gra_spill_temp_47\n    sub     a5,a5,a9                    # [5]\n    addi.n  a15,a15,1               # [6]\n    s32i    a15,a1,96                   # [7]  gra_spill_temp_47\n    add.n   a10,a10,a9                  # [8]\n    sub     a11,a11,a9                  # [9]\n    s32i    a11,a1,100                  # [10]  gra_spill_temp_48\n    s32i    a10,a1,128                  # [11]  gra_spill_temp_55\n    sub     a15,a15,a8                  # [12]\n    bnez    a15,.Lt_5_8706              # [13]\n\n.Lt_5_8194: # 0xa11\n#<loop> Part of loop body line 201, head labeled .Lt_5_7938\n    l32i    a13,a1,64                   # [0]  gra_spill_temp_39\n    l32i    a10,a1,72                   # [1]  gra_spill_temp_41\n    l32i    a9,a1,124                   # [2]  gra_spill_temp_54\n    l32i.n  a8,a1,60                # [3]  gra_spill_temp_38\n    l32i    a12,a1,68                   # [4]  gra_spill_temp_40\n    l32i    a15,a1,76                   # [5]  gra_spill_temp_42\n    addi.n  a12,a12,1               # [6]\n    s32i    a12,a1,68                   # [7]  
gra_spill_temp_40\n    sub     a15,a15,a8                  # [8]\n    add.n   a9,a9,a8                    # [9]\n    sub     a10,a10,a8                  # [10]\n    s32i    a10,a1,72                   # [11]  gra_spill_temp_41\n    s32i    a9,a1,124                   # [12]  gra_spill_temp_54\n    s32i    a15,a1,76                   # [13]  gra_spill_temp_42\n    sub     a12,a12,a13                 # [14]\n    bnez    a12,.Lt_5_7938              # [15]\n\n.Lt_5_7426: # 0xa3e\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n    # Program Unit: esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3\n    .type   esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3\n\nesp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3: # 0xa42\n    # qacc_scratch = 0\n    # gra_spill_temp_71 = 48\n    # gra_spill_temp_72 = 52\n    # gra_spill_temp_73 = 56\n    # gra_spill_temp_74 = 60\n    # gra_spill_temp_75 = 64\n    # gra_spill_temp_76 = 68\n    # gra_spill_temp_77 = 72\n    # gra_spill_temp_78 = 76\n    # gra_spill_temp_79 = 80\n    # gra_spill_temp_80 = 84\n    # gra_spill_temp_81 = 88\n    # gra_spill_temp_82 = 92\n    # gra_spill_temp_83 = 96\n    # gra_spill_temp_84 = 100\n    # gra_spill_temp_85 = 104\n    # gra_spill_temp_86 = 108\n    # gra_spill_temp_87 = 112\n    # gra_spill_temp_88 = 116\n    # gra_spill_temp_89 = 120\n    # gra_spill_temp_90 = 124\n    # gra_spill_temp_91 = 128\n    # gra_spill_temp_92 = 132\n    # gra_spill_temp_93 = 136\n    # gra_spill_temp_94 = 140\n    # gra_spill_temp_95 = 144\n    # gra_spill_temp_96 = 160\n    # gra_spill_temp_97 = 176\n    # gra_spill_temp_98 = 192\n    # gra_spill_temp_99 = 208\n    # gra_spill_temp_100 = 224\n    # gra_spill_temp_101 = 240\n    # gra_spill_temp_102 = 244\n    # 
gra_spill_temp_103 = 248\n\n // registers:\n // a2: const int16_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t channels\n // a6: const uint16_t stride_wd\n // a7: const uint16_t stride_ht\n\n // on stack:\n // const int16_t *filter_data\n // const int32_t *bias\n // int8_t *out_data\n // const uint16_t out_wd\n // const uint16_t out_ht\n // const int32_t out_offset\n // const int32_t *out_shift\n // const int32_t *out_mult\n // const int32_t activation_min\n // const int32_t activation_max\n\n    entry   a1,288                      #\n    s32i    a2,a1,120                   # [0]  gra_spill_temp_89\n    s32i.n  a3,a1,48                # [1]  gra_spill_temp_71\n    s32i    a5,a1,76                    # [2]  gra_spill_temp_78\n    s32i    a6,a1,84                    # [3]  gra_spill_temp_80\n    s32i.n  a7,a1,60                # [4]  gra_spill_temp_74\n    l32i    a12,a1,296                  # [5]  id:241 out_data+0x0\n    addi    a14,a1,112                  # [6]\n    addmi   a10,a1,256                  # [7]\n    addmi   a13,a1,256                  # [8]\n    addmi   a15,a1,256                  # [9]\n\n // height loop\n    l16ui   a8,a1,304                   # [10]  id:242 out_ht+0x0\n    s32i.n  a8,a1,56                # [11]  gra_spill_temp_73\n    addi    a15,a15,52                  # [12]\n    addi    a13,a13,64                  # [13]\n    addi    a10,a10,68                  # [14]\n    ee.vldbc.32 q0,a10              # [15]  id:240 activation_max\n    ee.vldbc.32 q1,a13              # [16]  id:239 activation_min\n    ee.vldbc.32 q2,a15              # [17]  id:238 out_offset\n    st.qr   q2,a14,64                   # [18]  gra_spill_temp_97-112\n    st.qr   q1,a14,80                   # [19]  gra_spill_temp_98-112\n    st.qr   q0,a14,96                   # [20]  gra_spill_temp_99-112\n    beqz.n  a8,.Lt_6_6914           # [21]\n\n.LBB3_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad:   # 0xa83\n    s32i    
a1,a1,144                   # [0]  gra_spill_temp_95\n    mul16u  a7,a3,a5                # [1]\n    s32i    a4,a1,72                    # [2]  gra_spill_temp_77\n    addi    a9,a5,-7                    # [3]\n    l16ui   a11,a1,300                  # [4]  id:247 out_wd+0x0\n    l32i    a10,a1,292                  # [5]  id:243 bias+0x0\n    l32i    a15,a1,288                  # [6]  id:244 filter_data+0x0\n    l32i    a13,a1,316                  # [7]  id:246 out_mult+0x0\n    l32i    a14,a1,312                  # [8]  id:245 out_shift+0x0\n    s32i    a14,a1,88                   # [9]  gra_spill_temp_81\n    s32i    a13,a1,92                   # [10]  gra_spill_temp_82\n    s32i    a15,a1,124                  # [11]  gra_spill_temp_90\n    s32i    a10,a1,116                  # [12]  gra_spill_temp_88\n    s32i    a11,a1,96                   # [13]  gra_spill_temp_83\n    s32i    a9,a1,136                   # [14]  gra_spill_temp_93\n    addx2   a4,a5,a5                    # [15]\n    slli    a4,a4,1                     # [16]\n    slli    a7,a7,1                     # [17]\n    l32i.n  a9,a1,60                # [18]  gra_spill_temp_74\n    movi.n  a11,0                   # [19]\n    extui   a10,a10,0,4                 # [20]\n    movi.n  a15,0                   # [21]\n    slli    a5,a5,1                     # [22]\n    s32i    a15,a1,68                   # [23]  gra_spill_temp_76\n    s32i    a10,a1,112                  # [24]  gra_spill_temp_87\n    s32i    a11,a1,64                   # [25]  gra_spill_temp_75\n    mul16u  a8,a3,a9                # [26]\n    movi.n  a11,0                   # [27]\n    s32i    a11,a1,80                   # [28]  gra_spill_temp_79\n    s32i.n  a8,a1,52                # [29]  gra_spill_temp_72\n\n.Lt_6_7426: # 0xad8 // width_loop\n    l32i    a8,a1,96                    # [0]  gra_spill_temp_83\n    beqz.n  a8,.Lt_6_7682           # [2]\n\n    movi.n  a11,3                   # [0]\n    l32i    a10,a1,72                   # [1]  
gra_spill_temp_77\n    movi.n  a9,0                    # [2]\n    movi.n  a13,0                   # [3]\n    l32i.n  a14,a1,48               # [4]  gra_spill_temp_71\n    s32i    a14,a1,108                  # [5]  gra_spill_temp_86\n    s32i    a13,a1,104                  # [6]  gra_spill_temp_85\n    s32i    a9,a1,100                   # [7]  gra_spill_temp_84\n    min a10,a10,a11                 # [8]\n    s32i    a10,a1,128                  # [9]  gra_spill_temp_91\n\n.Lt_6_8194: # 0xaf7\n    l32i    a2,a1,88                    # [0]  gra_spill_temp_81\n    l32i    a6,a1,92                    # [1]  gra_spill_temp_82\n    l32i    a8,a1,116                   # [2]  gra_spill_temp_88\n\n// channel loop\n    l32i    a15,a1,136                  # [3]  gra_spill_temp_93\n    s32i    a8,a1,140                   # [4]  gra_spill_temp_94\n    blti    a15,1,.Lt_6_8450            # [5]\n\n    movi.n  a11,0                   # [0]\n    movi.n  a10,0                   # [1]\n    l32i    a9,a1,76                    # [2]  gra_spill_temp_78\n    l32i    a14,a1,80                   # [3]  gra_spill_temp_79\n    movi.n  a8,3                    # [4]\n    l32i    a3,a1,108                   # [5]  gra_spill_temp_86\n    l32i    a13,a1,104                  # [6]  gra_spill_temp_85\n    min a3,a3,a8                    # [7]\n    add.n   a13,a13,a14                 # [8]\n    mull    a9,a9,a13                   # [9]\n    s32i    a9,a1,132                   # [10]  gra_spill_temp_92\n\n.Lt_6_8962: # 0xb26\n    ee.zero.qacc                    # [0]\n    l32i    a9,a1,132                   # [1]  gra_spill_temp_92\n    l32i    a13,a1,120                  # [2]  gra_spill_temp_89\n    add.n   a9,a9,a10                   # [3]\n    addx2   a9,a9,a13                   # [4]\n    l32i    a13,a1,124                  # [5]  gra_spill_temp_90\n    l32i    a14,a1,128                  # [6]  gra_spill_temp_91\n    add.n   a13,a11,a13                 # [7]\n    loopgtz 
a14,.LBB30_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad   # [8]\n\n.Lt_6_9730: # 0xb3f\n#<loop> Loop body line 360, nesting depth: 4, estimated iterations: 100\n    mov.n   a14,a13                     # [0]\n    mov.n   a15,a9                      # [1]\n    ee.vld.128.xp   q0,a15,a5           # [2]  id:249\n    ee.vld.128.xp   q1,a14,a5           # [3]  id:250\n    add.n   a9,a9,a7                    # [4]\n    beqi    a3,2,.LBB15_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad  # [5]\n\n.Lt_6_9986: # 0xb4e\n    beqi    a3,3,.LBB17_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad  # [0]\n\n.Lt_6_10498:    # 0xb51\n    add.n   a13,a13,a4                  # [0]\n    ee.vmulas.s16.qacc  q0,q1       # [1]\n\n.LBB30_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad:  # 0xb58\n\n // extract data\n    l32i    a15,a1,144                  # [0]  gra_spill_temp_95\n    ee.st.qacc_l.l.128.ip   a15,16      # [2]  id:258\n    ee.st.qacc_l.h.32.ip    a15,0       # [3]  id:259\n    l8ui    a14,a1,15                   # [4]  qacc_scratch+15\n    l8ui    a13,a1,16                   # [5]  qacc_scratch+16\n    l8ui    a8,a1,5                     # [6]  qacc_scratch+5\n    l8ui    a9,a1,6                     # [7]  qacc_scratch+6\n    s8i     a9,a1,3                     # [8]  qacc_scratch+3\n    s8i     a8,a1,2                     # [9]  qacc_scratch+2\n    s8i     a13,a1,7                    # [10]  qacc_scratch+7\n    s8i     a14,a1,6                    # [11]  qacc_scratch+6\n    l16ui   a13,a1,10                   # [12]  qacc_scratch+10\n    s16i    a13,a1,4                    # [13]  qacc_scratch+4\n    ee.st.qacc_h.l.128.ip   a15,16      # [14]  id:269\n    ee.st.qacc_h.h.32.ip    a15,-32     # [15]  id:270\n    l8ui    a9,a1,32                    # [16]  qacc_scratch+32\n    l8ui    a13,a1,22                   # [17]  qacc_scratch+22\n    l8ui    a8,a1,31                    # [18]  qacc_scratch+31\n    l16ui   a14,a1,26                   # [19]  qacc_scratch+26\n    s16i    a14,a1,12           
        # [20]  qacc_scratch+12\n    s8i     a8,a1,14                    # [21]  qacc_scratch+14\n    s8i     a13,a1,11                   # [22]  qacc_scratch+11\n    s8i     a9,a1,15                    # [23]  qacc_scratch+15\n\n    l32i    a13,a1,116                  # [24]  gra_spill_temp_88\n    l8ui    a9,a1,21                    # [25]  qacc_scratch+21\n    l16ui   a8,a1,16                    # [26]  qacc_scratch+16\n    movi.n  a14,16                  # [27]\n    ee.srcmb.s16.qacc   q1,a14,0        # [28]\n    s16i    a8,a1,8                     # [29]  qacc_scratch+8\n    s8i     a9,a1,10                    # [30]  qacc_scratch+10\n    ee.vld.128.ip   q0,a15,0            # [31]  id:282\n    s32i    a15,a1,144                  # [32]  gra_spill_temp_95\n    ee.vzip.16  q0,q1               # [33]\n\n    bnez.n  a13,.LBB20_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad   # [34]\n\n    s32i    a12,a1,240                  # [0]  gra_spill_temp_101\n    s32i    a11,a1,244                  # [1]  gra_spill_temp_102\n    s32i    a10,a1,248                  # [2]  gra_spill_temp_103\n    addi    a14,a1,112                  # [3]\n    st.qr   q1,a14,48                   # [4]  gra_spill_temp_96-112\n    j   .Lt_6_11266                     # [5]\n\n.LBB15_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad:  # 0xbce\n#<loop> Part of loop body line 360, head labeled .Lt_6_9730\n    ee.vmulas.s16.qacc.ld.xp    q0,a15,a5,q0,q1     # [0]  id:251\n    ee.vld.128.xp   q1,a14,a5           # [1]  id:252\n    bnei    a3,3,.Lt_6_10498            # [2]\n\n.LBB17_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad:  # 0xbd8\n    ee.vmulas.s16.qacc.ld.xp    q3,a15,a5,q0,q1     # [0]  id:253\n    ee.vld.128.xp   q4,a14,a5           # [1]  id:254\n    ee.vld.128.xp   q1,a14,a5           # [2]  id:256\n    ee.vmulas.s16.qacc.ld.xp    q0,a15,a5,q3,q4     # [3]  id:255\n    j   .Lt_6_10498                     # [4]\n\n.LBB20_esp_nn_depthwise_conv_s16_mult1_3x3_no_pad:  # 0xbe9\n#<loop> Part of loop body 
line 358, head labeled .Lt_6_8962\n    s32i    a12,a1,240                  # [0]  gra_spill_temp_101\n    s32i    a11,a1,244                  # [1]  gra_spill_temp_102\n    s32i    a10,a1,248                  # [2]  gra_spill_temp_103\n    addi    a15,a1,112                  # [3]\n    l32i    a9,a1,112                   # [4]  gra_spill_temp_87\n    l32i    a8,a1,140                   # [5]  gra_spill_temp_94\n    wur.sar_byte    a9                  # [6]\n    ee.vld.128.ip   q6,a8,16            # [7]  id:285\n    ee.vld.128.ip   q3,a8,16            # [8]  id:286\n    ee.vld.128.ip   q7,a8,0             # [9]  id:287\n    s32i    a8,a1,140                   # [10]  gra_spill_temp_94\n    ee.src.q.qup    q2,q6,q3            # [11]\n    ee.vadds.s32    q0,q0,q2            # [12]\n    ee.src.q.qup    q5,q6,q7            # [13]\n    ee.vadds.s32    q1,q1,q5            # [14]\n    st.qr           q1,a15,48                   # [15]  gra_spill_temp_96-112\n\n.Lt_6_11266:    # 0xc19\n # 423                  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);\n    mov.n   a10,a6                      # [0]\n    mov.n   a11,a2                      # [1]\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [2]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    addi    a11,a1,112                  # [0]\n    addi    a10,a6,16                   # [1]\n    st.qr   q0,a11,112                  # [2]  gra_spill_temp_100-112\n    ld.qr   q0,a11,48                   # [3]  gra_spill_temp_96-112\n    addi    a11,a2,16                   # [4]\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [5]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    addi    a6,a6,32                    # [0]\n    addi    a2,a2,32                    # [1]\n\n    l32i    a13,a1,136                  # [2]  gra_spill_temp_93\n    l32i    a12,a1,240                  # [3]  gra_spill_temp_101\n    l32i    a10,a1,248                  # [4]  
gra_spill_temp_103\n    l32i    a11,a1,244                  # [5]  gra_spill_temp_102\n    addi    a9,a1,112                   # [6]\n    ld.qr   q6,a9,80                    # [7]  gra_spill_temp_98-112\n    ld.qr   q7,a9,96                    # [8]  gra_spill_temp_99-112\n    ld.qr   q5,a9,64                    # [9]  gra_spill_temp_97-112\n    ld.qr   q4,a9,112                   # [10]  gra_spill_temp_100-112\n    addi    a11,a11,16                  # [11]\n    addi.n  a10,a10,8               # [12]\n    ee.vadds.s32    q4,q4,q5            # [13]\n    ee.vadds.s32    q5,q0,q5            # [14]\n    ee.vmin.s32     q4,q4,q7            # [15]\n    ee.vmax.s32     q4,q4,q6            # [16]\n    ee.vmin.s32     q5,q5,q7            # [17]\n    ee.vmax.s32     q5,q5,q6            # [18]\n    ee.vunzip.16    q4,q5               # [19]\n    ee.vunzip.8     q4,q5               # [20]\n    ee.vst.l.64.ip  q4,a12,8        # [21]  id:290\n    blt         a10,a13,.Lt_6_8962          # [22]\n\n.Lt_6_8450: # 0xc76\n#<loop> Part of loop body line 348, head labeled .Lt_6_8194\n    l32i    a11,a1,96                   # [0]  gra_spill_temp_83\n    l32i    a15,a1,104                  # [1]  gra_spill_temp_85\n    l32i    a14,a1,84                   # [2]  gra_spill_temp_80\n    l32i    a10,a1,100                  # [3]  gra_spill_temp_84\n    l32i    a13,a1,108                  # [4]  gra_spill_temp_86\n    addi.n  a10,a10,1               # [5]\n    s32i    a10,a1,100                  # [6]  gra_spill_temp_84\n    sub     a13,a13,a14                 # [7]\n    add.n   a15,a15,a14                 # [8]\n    s32i    a15,a1,104                  # [9]  gra_spill_temp_85\n    s32i    a13,a1,108                  # [10]  gra_spill_temp_86\n    sub     a10,a10,a11                 # [11]\n    bnez    a10,.Lt_6_8194              # [12]\n\n.Lt_6_7682: # 0xc9b\n    l32i.n  a9,a1,56                # [0]  gra_spill_temp_73\n    l32i    a15,a1,64                   # [1]  gra_spill_temp_75\n    
l32i.n  a14,a1,52               # [2]  gra_spill_temp_72\n    l32i    a13,a1,80                   # [3]  gra_spill_temp_79\n    l32i.n  a11,a1,60               # [4]  gra_spill_temp_74\n    l32i    a8,a1,68                    # [5]  gra_spill_temp_76\n    l32i    a10,a1,72                   # [6]  gra_spill_temp_77\n    addi.n  a8,a8,1                 # [7]\n    s32i    a8,a1,68                    # [8]  gra_spill_temp_76\n    sub     a10,a10,a11                 # [9]\n    add.n   a13,a13,a14                 # [10]\n    add.n   a15,a15,a11                 # [11]\n    s32i    a15,a1,64                   # [12]  gra_spill_temp_75\n    s32i    a13,a1,80                   # [13]  gra_spill_temp_79\n    s32i    a10,a1,72                   # [14]  gra_spill_temp_77\n    sub     a8,a8,a9                    # [15]\n    bnez    a8,.Lt_6_7426               # [16]\n\n.Lt_6_6914: # 0xcc8\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult1_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n    # Program Unit: esp_nn_depthwise_conv_s16_mult1_esp32s3\n    .type   esp_nn_depthwise_conv_s16_mult1_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s16_mult1_esp32s3\n\nesp_nn_depthwise_conv_s16_mult1_esp32s3:    # 0x4c8\n    # scratch_buf = 0\n    # gra_spill_temp_2 = 48\n    # gra_spill_temp_22 = 52\n    # gra_spill_temp_4 = 56\n    # gra_spill_temp_23 = 60\n    # gra_spill_temp_24 = 64\n    # gra_spill_temp_7 = 68\n    # gra_spill_temp_26 = 72\n    # gra_spill_temp_27 = 76\n    # gra_spill_temp_28 = 80\n    # gra_spill_temp_29 = 84\n    # gra_spill_temp_12 = 88\n    # gra_spill_temp_13 = 92\n    # gra_spill_temp_14 = 96\n    # gra_spill_temp_15 = 100\n    # gra_spill_temp_21 = 104\n    # gra_spill_temp_17 = 108\n    # gra_spill_temp_18 = 112\n    # gra_spill_temp_20 = 116\n    # gra_spill_temp_30 = 0\n    # gra_spill_temp_34 = 16\n\n // in registers:\n // a2: *input_data\n // a3: input_wd\n // a4: input_ht\n // a5: channels\n // a6: pad_wd\n // a7: pad_ht\n\n // on stack:\n // stride_wd\n // stride_ht\n // *filter_data\n // filter_wd\n // filter_ht\n // *bias\n // *out_data\n // out_wd\n // out_ht\n // out_offset\n // *out_shift\n // *out_mult\n // activation_min\n // activation_max\n\n    entry   a1,160                      #\n    l32i    a9,a1,184            
       # [7]  id:237 out_data+0x0\n    l16ui   a8,a1,192                   # [8]  id:238 out_ht+0x0\n    s32i    a2,a1,52                    # [0]  gra_spill_temp_22\n    s32i.n  a4,a1,56                # [1]  gra_spill_temp_4\n    s32i    a5,a1,60                    # [2]  gra_spill_temp_23\n    s32i    a9,a1,112                   # [10]  gra_spill_temp_18\n    beqz.n  a8,.Lt_4_7170           # [20]\n\n.LBB3_esp_nn_depthwise_conv_s16_mult1:  # 0x508\n    l16ui   a4,a1,172                   # [0]  id:240 filter_wd+0x0\n    neg     a13,a7                      # [2]\n    neg     a12,a6                      # [3]\n    sext    a12,a12,15                  # [16]\n    sext    a13,a13,15                  # [17]\n    s32i    a13,a1,92                   # [18]  gra_spill_temp_13\n    s32i.n  a12,a1,48               # [19]  gra_spill_temp_2\n    movi.n  a8,0                    # [20]\n    slli    a9,a5,1                     # [21]\n    addi    a10,a5,-7                   # [22]\n    s32i    a10,a1,100                  # [23]  gra_spill_temp_15\n    s32i    a9,a1,64                    # [24]  gra_spill_temp_24\n    s32i    a8,a1,68                    # [25]  gra_spill_temp_7\n    j   .Lt_4_7682                      # [30]\n\n.Lt_4_7938: # 0x561\n    l32i    a15,a1,192                  # [0]  out_ht\n    l32i.n  a9,a1,164                   # [1]  stride_ht\n    l32i    a14,a1,68                   # [2]  gra_spill_temp_7\n    l32i    a8,a1,92                    # [3]  gra_spill_temp_13\n    addi.n  a14,a14,1               # [4]\n    s32i    a14,a1,68                   # [5]  gra_spill_temp_7\n    add.n   a9,a8,a9                    # [6]\n    sub     a14,a14,a15                 # [7]\n    sext    a8,a9,15                    # [8]\n    s32i    a8,a1,92                    # [9]  gra_spill_temp_13\n    beqz    a14,.Lt_4_7170              # [10]\n\n.Lt_4_7682: # 0x57f\n#<loop> Loop body line 59, nesting depth: 1, estimated iterations: 100\n #  60          const int16_t base_y = 
(out_y * stride_ht) - pad_ht;\n #  61          for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n    l32i    a10,a1,188                  # [0]  out_width\n    beqz.n  a10,.Lt_4_7938          # [2]\n\n.LBB6_esp_nn_depthwise_conv_s16_mult1:  # 0x584\n#<loop> Part of loop body line 59, head labeled .Lt_4_7682\n    movi.n  a14,0                   # [0]\n    l32i.n  a7,a1,176                   # [1]  filter_ht\n    l32i    a13,a1,92                   # [2]  gra_spill_temp_13\n    l32i.n  a8,a1,56                # [3]  gra_spill_temp_4\n    movi.n  a11,0                   # [4]\n    l32i.n  a12,a1,48               # [5]  gra_spill_temp_2\n    s32i    a12,a1,84                   # [6]  gra_spill_temp_29\n    s32i    a11,a1,88                   # [7]  gra_spill_temp_12\n    sub     a8,a8,a13                   # [8]\n    min     a7,a7,a8                    # [9]\n    neg     a13,a13                     # [10]\n    max     a13,a13,a14                 # [11]\n    s32i    a13,a1,96                   # [12]  gra_spill_temp_14\n    j   .Lt_4_8450                      # [13]\n\n.Lt_4_8706: # 0x5a9\n#<loop> Part of loop body line 61, head labeled .Lt_4_8450\n    l32i    a10,a1,188                  # [0]  out_width\n    l32i    a12,a1,160                  # [1]  stride_wd\n    l32i    a9,a1,88                    # [2]  gra_spill_temp_12\n    l32i    a11,a1,84                   # [3]  gra_spill_temp_29\n    addi.n  a9,a9,1                 # [4]\n    s32i    a9,a1,88                    # [5]  gra_spill_temp_12\n    add.n   a12,a11,a12                 # [6]\n    sext    a11,a12,15                  # [7]\n    s32i    a11,a1,84                   # [8]  gra_spill_temp_29\n    beq     a9,a10,.Lt_4_7938           # [9]\n\n.Lt_4_8450: # 0x5c5\n#<loop> Loop body line 61, nesting depth: 2, estimated iterations: 100\n #  69              uint32_t bias_ptr = (uint32_t) bias;\n #  70              const int32_t *out_mult_ptr = out_mult;\n #  71              const int32_t *out_shift_ptr 
= out_shift;\n #  72\n #  73              for (int ch_idx = 0; ch_idx < channels - 7; ch_idx += 8) {//channel_loop\n    l32i    a13,a1,100                  # [0]  gra_spill_temp_15\n    l32i    a14,a1,180                  # [1]  bias\n    l32i    a15,a1,204                  # [2]  out_mult\n    l32i    a8,a1,200                   # [3]  out_shift\n    s32i    a8,a1,104                   # [4]  gra_spill_temp_21\n    s32i    a15,a1,116                  # [5]  gra_spill_temp_20\n    s32i    a14,a1,108                  # [6]  gra_spill_temp_17\n    blti    a13,1,.Lt_4_8706            # [7]\n\n.LBB9_esp_nn_depthwise_conv_s16_mult1:  # 0x5dd\n#<loop> Part of loop body line 61, head labeled .Lt_4_8450\n    movi.n  a2,0                    # [0]\n    l32i    a5,a1,84                    # [1]  gra_spill_temp_29\n    movi.n  a8,0                    # [2]\n    neg     a6,a5                       # [3]\n    max     a6,a6,a8                    # [4]\n    sub     a5,a3,a5                    # [5]\n    min     a5,a4,a5                    # [6]\n    sub     a9,a5,a6                    # [7]\n    s32i    a9,a1,72                    # [8]  gra_spill_temp_26\n    j   .Lt_4_9218                      # [9]\n\n.Lt_4_9474: # 0x5f9\n\n// extract data\n    mov     a11,a1\n    ee.st.qacc_l.l.128.ip   a11,16      # [2]  id:252\n    ee.st.qacc_l.h.32.ip    a11,0       # [3]  id:253\n    l8ui    a12,a1,15                   # [4]  scratch_buf+15\n    l16ui   a10,a1,10                   # [5]  scratch_buf+10\n    l8ui    a13,a1,5                    # [6]  scratch_buf+5\n    l8ui    a14,a1,6                    # [7]  scratch_buf+6\n    l8ui    a15,a1,16                   # [8]  scratch_buf+16\n    s8i     a13,a1,2                    # [11]  scratch_buf+2\n    s8i     a14,a1,3                    # [10]  scratch_buf+3\n    s8i     a15,a1,7                    # [9]  scratch_buf+7\n    s16i    a10,a1,4                    # [12]  scratch_buf+4\n    s8i     a12,a1,6                    # [13]  
scratch_buf+6\n\n    movi.n  a10,16                  # [14]\n    ee.st.qacc_h.l.128.ip   a11,16      # [15]  id:263\n    ee.st.qacc_h.h.32.ip    a11,-32     # [16]  id:264\n    ee.srcmb.s16.qacc       q1,a10,0        # [17]\n    l8ui    a8,a1,31                    # [18]  scratch_buf+31\n    l8ui    a9,a1,32                    # [19]  scratch_buf+32\n    l16ui   a12,a1,16                   # [20]  scratch_buf+16\n    l8ui    a13,a1,21                   # [21]  scratch_buf+21\n    l8ui    a14,a1,22                   # [22]  scratch_buf+22\n    l16ui   a15,a1,26                   # [23]  scratch_buf+26\n    s8i     a13,a1,10                   # [26]  scratch_buf+10\n    s8i     a14,a1,11                   # [25]  scratch_buf+11\n    s16i    a15,a1,12                   # [24]  scratch_buf+12\n    s16i    a12,a1,8                    # [27]  scratch_buf+8\n    s8i     a9,a1,15                    # [28]  scratch_buf+15\n    s8i     a8,a1,14                    # [29]  scratch_buf+14\n\n    l32i            a9,a1,180                   # [30]  bias\n    ee.vld.128.ip   q0,a11,0            # [31]  id:164\n    ee.vzip.16      q0,q1               # [33]\n    beqz.n          a9,.Lt_4_11522          # [34] // skip bias\n\n// add bias\n    l32i    a9,a1,108                   # [0]  gra_spill_temp_17\n    addi    a8,a1,112                   # [1]\n    extui   a10,a9,0,4                  # [2]\n    wur.sar_byte    a10                 # [3]\n    ee.vld.128.ip   q4,a9,16            # [4]  id:279\n    ee.vld.128.ip   q7,a9,16            # [5]  id:168\n    ee.vld.128.ip   q5,a9,0             # [6]  id:281\n    s32i    a9,a1,108                   # [7]  gra_spill_temp_17\n    ee.src.q    q4,q4,q7            # [8]\n    ee.src.q    q7,q7,q5            # [10]\n    ee.vadds.s32    q0,q0,q4            # [9]\n    ee.vadds.s32    q1,q1,q7            # [11]\n    st.qr   q1,a1,0                 # [12]  gra_spill_temp_30-112\n\n.Lt_4_11522:    # 0x684\n\n// apply quantisation: 
esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);\n\n    l32i    a10,a1,116                  # [1]  gra_spill_temp_20\n    l32i    a11,a1,104                  # [3]  gra_spill_temp_21\n    st.qr   q1,a1,0                 # [2]  gra_spill_temp_30-112\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [4]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    l32i    a10,a1,116                  # [2]  gra_spill_temp_20\n    l32i    a11,a1,104                  # [0]  gra_spill_temp_21\n    st.qr   q0,a1,16                # [3]  gra_spill_temp_34-112\n    ld.qr   q0,a1,0                 # [4]  gra_spill_temp_30-112\n    addi    a10,a10,16                  # [5] // out_mult_ptr += 4\n    addi    a11,a11,16                  # [6] // out_shift_ptr += 4\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [7]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n// add offset, apply activation and store\n    l32i    a13,a1,100                  # [0]  gra_spill_temp_15\n    addi.n  a2,a2,8                 # [1]\n    l32i    a8,a1,112                   # [2]  gra_spill_temp_18\n    l32i    a15,a1,116                  # [3]  gra_spill_temp_20\n    l32i    a14,a1,104                  # [4]  gra_spill_temp_21\n\n    addi        a12,a1,212\n    ee.vldbc.32 q3,a12              # [14]  id:236 activation_max\n    addi        a12,a1,196\n    ee.vldbc.32 q1,a12              # [16]  id:234 out_offset\n    addi    a12,a1,208\n\n    ld.qr   q2,a1,16                # [8]  gra_spill_temp_34-112\n\n    addi    a14,a14,32                  # [9]\n    addi    a15,a15,32                  # [10]\n    s32i    a15,a1,116                  # [11]  gra_spill_temp_20\n    ee.vadds.s32    q2,q2,q1            # [12]\n    s32i            a14,a1,104                  # [13]  gra_spill_temp_21\n    ee.vadds.s32    q1,q0,q1            # [14]\n    ee.vmin.s32     q0,q2,q3            # [15]\n    ee.vldbc.32     q2,a12              # [16]  id:234 
activation_min\n    ee.vmin.s32     q1,q1,q3            # [17]\n    ee.vmax.s32     q1,q1,q2            # [18]\n    ee.vmax.s32     q0,q0,q2            # [19]\n    ee.vunzip.16    q0,q1               # [20]\n    ee.vunzip.8     q0,q1               # [21]\n    ee.vst.l.64.ip  q0,a8,8         # [22]  id:172\n    s32i    a8,a1,112                   # [23]  gra_spill_temp_18\n    bge     a2,a13,.Lt_4_8706           # [24]\n\n.Lt_4_9218: # 0x6f5\n    ee.zero.qacc                    # [0]\n    l32i    a13,a1,96                   # [1]  gra_spill_temp_14\n    s32i    a13,a1,80                   # [2]  gra_spill_temp_28\n    bge     a13,a7,.Lt_4_9474           # [3]\n\n.LBB12_esp_nn_depthwise_conv_s16_mult1: # 0x701 // channel_loop\n    mull    a15,a13,a4                  # [0]\n    l32i    a14,a1,92                   # [1]  gra_spill_temp_13\n    add.n   a8,a15,a5                   # [2]\n    add.n   a14,a14,a13                 # [3]\n    mull    a14,a3,a14                  # [4]\n    s32i    a8,a1,76                    # [5]  gra_spill_temp_27\n    bge     a6,a5,.Lt_4_10242           # [6]\n\n.LBB15_esp_nn_depthwise_conv_s16_mult1: # 0x714\n    l32i    a12,a1,64                   # [0]  gra_spill_temp_24\n    l32i    a9,a1,168                   # [1]  filter_data\n    l32i    a10,a1,60                   # [2]  gra_spill_temp_23\n    l32i    a11,a1,84                   # [3]  gra_spill_temp_29\n    add.n   a8,a15,a6                   # [4]\n    add.n   a11,a11,a6                  # [5]\n    mull    a8,a8,a10                   # [6]\n    add.n   a11,a14,a11                 # [7]\n    mull    a10,a10,a11                 # [8]\n    add.n   a8,a2,a8                    # [9]\n    l32i    a11,a1,52                   # [10]  gra_spill_temp_22\n    addx2   a8,a8,a9                    # [11]\n    add.n   a10,a2,a10                  # [12]\n    l32i    a9,a1,72                    # [13]  gra_spill_temp_26\n    addx2   a10,a10,a11                 # [14]\n    loopgtz 
a9,.LBB41_esp_nn_depthwise_conv_s16_mult1   # [15]\n// innermost loop\n    ee.vld.128.xp   q0,a10,a12          # [0*II+3]  id:249\n    ee.vld.128.xp   q1,a8,a12           # [0*II+4]  id:250\n    ee.vmulas.s16.qacc  q0,q1       # [0*II+6]\n.LBB41_esp_nn_depthwise_conv_s16_mult1: # 0x750\n\n.Lt_4_10242:    # 0x750\n    add.n   a14,a14,a3                  # [0]\n    add.n   a15,a15,a4                  # [1]\n    l32i    a9,a1,80                    # [2]  gra_spill_temp_28\n    l32i    a10,a1,76                   # [3]  gra_spill_temp_27\n    addi.n  a9,a9,1                 # [4]\n    add.n   a10,a10,a4                  # [5]\n    s32i    a10,a1,76                   # [6]  gra_spill_temp_27\n    s32i    a9,a1,80                    # [7]  gra_spill_temp_28\n    sub     a9,a7,a9                    # [8]\n    beqz    a9,.Lt_4_9474               # [9]\n\n    blt a6,a5,.LBB15_esp_nn_depthwise_conv_s16_mult1    # [0]\n\n    j   .Lt_4_10242                     # [0]\n\n.Lt_4_7170: # 0x770\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s16_mult1_esp32s3, . - esp_nn_depthwise_conv_s16_mult1_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult4_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n    # Program Unit: esp_nn_depthwise_conv_s16_mult4_esp32s3\n    .type   esp_nn_depthwise_conv_s16_mult4_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s16_mult4_esp32s3\n\nesp_nn_depthwise_conv_s16_mult4_esp32s3:    # 0x17c8\n    # qacc_scratch = 0\n    # gra_spill_temp_220 = 32\n    # gra_spill_temp_221 = 36\n    # gra_spill_temp_222 = 40\n    # gra_spill_temp_223 = 44\n    # gra_spill_temp_224 = 48\n    # gra_spill_temp_225 = 52\n    # gra_spill_temp_226 = 56\n    # gra_spill_temp_227 = 60\n    # gra_spill_temp_228 = 64\n    # gra_spill_temp_229 = 68\n    # gra_spill_temp_230 = 72\n    # gra_spill_temp_231 = 76\n    # gra_spill_temp_232 = 80\n    # gra_spill_temp_233 = 84\n    # gra_spill_temp_234 = 88\n    # gra_spill_temp_235 = 92\n    # gra_spill_temp_236 = 96\n    # gra_spill_temp_237 = 100\n    # gra_spill_temp_238 = 104\n    # gra_spill_temp_239 = 108\n    # gra_spill_temp_240 = 112\n    # gra_spill_temp_241 = 116\n    # gra_spill_temp_242 = 120\n    # gra_spill_temp_243 = 124\n    # gra_spill_temp_244 = 128\n    # gra_spill_temp_245 = 132\n    # gra_spill_temp_246 = 136\n    # gra_spill_temp_247 = 140\n    # gra_spill_temp_248 = 144\n    # gra_spill_temp_249 = 148\n    # gra_spill_temp_250 = 152\n    # gra_spill_temp_251 = 156\n    # gra_spill_temp_252 = 
160\n    # gra_spill_temp_253 = 164\n    # gra_spill_temp_254 = 168\n    # gra_spill_temp_255 = 172\n    # gra_spill_temp_256 = 176\n    # gra_spill_temp_257 = 192\n    # gra_spill_temp_258 = 208\n    # gra_spill_temp_259 = 224\n    # gra_spill_temp_260 = 240\n\n // registers:\n // a2: const int16_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t channels\n // a6: const uint16_t pad_wd\n // a7: const uint16_t pad_ht\n\n // on stack:\n // const uint16_t stride_wd\n // const uint16_t stride_ht\n // const uint16_t ch_mult\n // const int16_t *filter_data\n // const uint16_t filter_wd\n // const uint16_t filter_ht\n // const int32_t *bias\n // int8_t *out_data\n // const uint16_t out_wd\n // const uint16_t out_ht\n // const int32_t out_offset\n // const int32_t *out_shift\n // const int32_t *out_mult\n // const int32_t activation_min\n // const int32_t activation_max\n\n\n    entry   a1,288                      #\n    s32i    a2,a1,136                   # [0]  gra_spill_temp_246\n    s32i.n  a4,a1,40                # [1]  gra_spill_temp_222\n    s32i    a5,a1,164                   # [2]  gra_spill_temp_253\n    addi    a12,a1,112                  # [3]\n    addmi   a10,a1,256                  # [4]\n    addmi   a11,a1,256                  # [5]\n    addmi   a13,a1,256                  # [6]\n    l16ui   a8,a1,324                   # [7]  id:216 out_ht+0x0\n    s32i.n  a8,a1,48                # [8]  gra_spill_temp_224\n    addi    a13,a13,72                  # [9]\n    addi    a11,a11,88                  # [10]\n    addi    a10,a10,84                  # [11]\n    ee.vldbc.32 q0,a10              # [12]  id:215 activation_min\n    ee.vldbc.32 q1,a11              # [13]  id:214 activation_max\n    ee.vldbc.32 q2,a13              # [14]  id:213 out_offset\n    st.qr       q2,a12,80                   # [15]  gra_spill_temp_257-112\n    st.qr       q1,a12,96                   # [16]  gra_spill_temp_258-112\n    st.qr       
q0,a12,112                  # [17]  gra_spill_temp_259-112\n    beqz.n  a8,.Lt_10_8450          # [18]\n\n    s32i    a1,a1,112                   # [0]  gra_spill_temp_240\n    neg     a15,a6                      # [1]\n    neg     a4,a7                       # [2]\n    addmi   a8,a1,256                   # [3]\n    movi.n  a9,0                    # [4]\n    movi.n  a11,0                   # [5]\n    slli    a14,a5,1                    # [6]\n    l16ui   a13,a1,296                  # [7]  id:217 ch_mult+0x0\n    l16ui   a10,a1,308                  # [8]  id:227 filter_ht+0x0\n    s32i.n  a10,a1,36               # [9]  gra_spill_temp_221\n    s32i    a13,a1,76                   # [10]  gra_spill_temp_231\n    s32i    a14,a1,148                  # [11]  gra_spill_temp_249\n    s32i.n  a11,a1,52               # [12]  gra_spill_temp_225\n    s32i    a9,a1,116                   # [13]  gra_spill_temp_241\n    st.qr   q4,a8,-16                   # [14]  gra_spill_temp_260-256\n    sext    a4,a4,15                    # [15]\n    sext    a15,a15,15                  # [16]\n    s32i.n  a15,a1,32               # [17]  gra_spill_temp_220\n    mul16u  a12,a5,a13              # [18]\n    s32i    a4,a1,92                    # [19]  gra_spill_temp_235\n    l16ui   a8,a1,320                   # [20]  id:229 out_wd+0x0\n    l16ui   a9,a1,292                   # [21]  id:228 stride_ht+0x0\n    l32i    a11,a1,336                  # [22]  id:226 out_mult+0x0\n    s32i    a11,a1,64                   # [23]  gra_spill_temp_228\n    s32i.n  a9,a1,44                # [24]  gra_spill_temp_223\n    s32i    a8,a1,68                    # [25]  gra_spill_temp_229\n    l32i    a4,a1,300                   # [26]  id:218 filter_data+0x0\n    s32i    a12,a1,140                  # [27]  gra_spill_temp_247\n    l32i    a15,a1,316                  # [28]  id:219 out_data+0x0\n    s32i    a15,a1,96                   # [29]  gra_spill_temp_236\n    slli    a12,a12,1                   # [30]\n    s32i   
 a4,a1,152                   # [31]  gra_spill_temp_250\n    addi    a14,a13,-3                  # [32]\n    l16ui   a4,a1,304                   # [33]  id:223 filter_wd+0x0\n    s32i    a14,a1,108                  # [34]  gra_spill_temp_239\n    s32i    a12,a1,144                  # [35]  gra_spill_temp_248\n    slli    a13,a13,2                   # [36]\n    s32i    a13,a1,80                   # [37]  gra_spill_temp_232\n    l32i    a12,a1,332                  # [38]  id:225 out_shift+0x0\n    l32i    a14,a1,312                  # [39]  id:222 bias+0x0\n    s32i    a14,a1,104                  # [40]  gra_spill_temp_238\n    s32i.n  a12,a1,60               # [41]  gra_spill_temp_227\n    l16ui   a13,a1,288                  # [42]  id:224 stride_wd+0x0\n    s32i.n  a13,a1,56               # [43]  gra_spill_temp_226\n    j   .Lt_10_8962                     # [44]\n\n.Lt_10_9218:    # 0x1880\n    l32i.n  a9,a1,48                # [0]  gra_spill_temp_224\n    l32i.n  a11,a1,44               # [1]  gra_spill_temp_223\n    l32i.n  a8,a1,52                # [2]  gra_spill_temp_225\n    l32i    a10,a1,92                   # [3]  gra_spill_temp_235\n    addi.n  a8,a8,1                 # [4]\n    s32i.n  a8,a1,52                # [5]  gra_spill_temp_225\n    add.n   a11,a10,a11                 # [6]\n    sub     a8,a8,a9                    # [7]\n    sext    a10,a11,15                  # [8]\n    s32i    a10,a1,92                   # [9]  gra_spill_temp_235\n    beqz    a8,.Lt_10_8450              # [10]\n\n.Lt_10_8962:    # 0x189b\n#<loop> Loop body line 1223, nesting depth: 1, estimated iterations: 100\n #1224          const int16_t base_y = (out_y * stride_ht) - pad_ht;\n #1225          for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n    l32i    a12,a1,68                   # [0]  gra_spill_temp_229\n    beqz.n  a12,.Lt_10_9218         # [2]\n\n.LBB6_esp_nn_depthwise_conv_s16_mult4:  # 0x18a0\n    l32i.n  a7,a1,36                # [0]  gra_spill_temp_221\n    
movi.n  a11,0                   # [1]\n    l32i.n  a8,a1,40                # [2]  gra_spill_temp_222\n    l32i    a9,a1,92                    # [3]  gra_spill_temp_235\n    movi.n  a13,0                   # [4]\n    l32i.n  a14,a1,32               # [5]  gra_spill_temp_220\n    s32i    a14,a1,160                  # [6]  gra_spill_temp_252\n    s32i    a13,a1,72                   # [7]  gra_spill_temp_230\n    neg     a10,a9                      # [8]\n    sub     a8,a8,a9                    # [9]\n    max     a10,a10,a11                 # [10]\n    s32i    a10,a1,100                  # [11]  gra_spill_temp_237\n    min     a7,a7,a8                    # [12]\n    j   .Lt_10_9730                     # [13]\n\n.Lt_10_9986:    # 0x18c5\n    l32i    a13,a1,68                   # [0]  gra_spill_temp_229\n    l32i.n  a15,a1,56               # [1]  gra_spill_temp_226\n    l32i    a12,a1,72                   # [2]  gra_spill_temp_230\n    l32i    a14,a1,160                  # [3]  gra_spill_temp_252\n    addi.n  a12,a12,1               # [4]\n    s32i    a12,a1,72                   # [5]  gra_spill_temp_230\n    add.n   a15,a14,a15                 # [6]\n    sext    a14,a15,15                  # [7]\n    s32i    a14,a1,160                  # [8]  gra_spill_temp_252\n    beq a12,a13,.Lt_10_9218         # [9]\n\n.Lt_10_9730:    # 0x18e0\n    l32i    a8,a1,164                   # [0]  gra_spill_temp_253\n    l32i    a9,a1,64                    # [1]  gra_spill_temp_228\n    l32i.n  a10,a1,60               # [2]  gra_spill_temp_227\n    s32i    a10,a1,132                  # [3]  gra_spill_temp_245\n    s32i    a9,a1,128                   # [4]  gra_spill_temp_244\n    beqz.n  a8,.Lt_10_9986          # [5]\n\n    movi.n  a8,0                    # [0]\n    l32i    a5,a1,160                   # [1]  gra_spill_temp_252\n    movi.n  a12,0                   # [2]\n    movi.n  a13,0                   # [3]\n    movi.n  a14,0                   # [4]\n    s32i    a14,a1,84               
    # [5]  gra_spill_temp_233\n    s32i    a13,a1,88                   # [6]  gra_spill_temp_234\n    s32i    a12,a1,176                  # [7]  gra_spill_temp_256\n    neg     a6,a5                       # [8]\n    max     a6,a6,a8                    # [9]\n    sub     a5,a3,a5                    # [10]\n    min     a5,a4,a5                    # [11]\n    sub     a11,a5,a6                   # [12]\n    s32i    a11,a1,156                  # [13]  gra_spill_temp_251\n    j   .Lt_10_10498                    # [14]\n\n.Lt_10_10754:   # 0x1919\n    l32i    a10,a1,164                  # [0]  gra_spill_temp_253\n    l32i    a14,a1,76                   # [1]  gra_spill_temp_231\n    l32i    a13,a1,84                   # [2]  gra_spill_temp_233\n    l32i    a12,a1,80                   # [3]  gra_spill_temp_232\n    l32i    a9,a1,176                   # [4]  gra_spill_temp_256\n    l32i    a11,a1,88                   # [5]  gra_spill_temp_234\n    addi.n  a9,a9,1                 # [6]\n    s32i    a9,a1,176                   # [7]  gra_spill_temp_256\n    add.n   a11,a11,a12                 # [8]\n    add.n   a13,a13,a14                 # [9]\n    s32i    a13,a1,84                   # [10]  gra_spill_temp_233\n    s32i    a11,a1,88                   # [11]  gra_spill_temp_234\n    beq     a9,a10,.Lt_10_9986          # [12]\n\n.Lt_10_10498:   # 0x193d\n    l32i    a15,a1,108                  # [0]  gra_spill_temp_239\n    blti    a15,1,.Lt_10_10754          # [2]\n\n    l32i    a2,a1,84                    # [0]  gra_spill_temp_233\n    l32i    a10,a1,104                  # [1]  gra_spill_temp_238\n    l32i    a9,a1,88                    # [2]  gra_spill_temp_234\n    movi.n  a8,0                    # [3]\n    s32i    a8,a1,120                   # [4]  gra_spill_temp_242\n    add.n   a9,a9,a10                   # [5]\n    s32i    a9,a1,124                   # [6]  gra_spill_temp_243\n    j   .Lt_10_11266                    # [7]\n\n.Lt_10_11522:   # 0x1959\n    addmi   
a12,a1,256                  # [0]\n    l32i    a14,a1,112                  # [1]  gra_spill_temp_240\n    movi.n  a13,16                  # [2]\n    ee.st.qacc_l.l.128.ip   a14,16      # [3]  id:234\n    ee.st.qacc_l.h.32.ip    a14,-16     # [4]  id:235\n    ee.srcmb.s16.qacc   q5,a13,0        # [5]\n    l16ui   a15,a1,10                   # [6]  qacc_scratch+10\n    l8ui    a8,a1,15                    # [7]  qacc_scratch+15\n    l8ui    a9,a1,5                     # [8]  qacc_scratch+5\n    l8ui    a11,a1,16                   # [9]  qacc_scratch+16\n    l8ui    a10,a1,6                    # [10]  qacc_scratch+6\n    s8i     a10,a1,3                    # [11]  qacc_scratch+3\n    s8i     a11,a1,7                    # [12]  qacc_scratch+7\n    s8i     a9,a1,2                     # [13]  qacc_scratch+2\n\n    l32i    a11,a1,104                  # [14]  gra_spill_temp_238\n    s8i     a8,a1,6                     # [15]  qacc_scratch+6\n    s16i    a15,a1,4                    # [16]  qacc_scratch+4\n    ee.vld.l.64.ip  q0,a14,0        # [17]  id:245\n    s32i    a14,a1,112                  # [18]  gra_spill_temp_240\n    ee.vzip.16  q0,q5               # [19]\n    st.qr   q5,a12,-16                  # [20]  gra_spill_temp_260-256\n\n    beqz.n  a11,.Lt_10_13570        # [21] // skip_bias\n\n // add bias\n    l32i    a13,a1,124                  # [0]  gra_spill_temp_243\n    extui   a12,a13,0,4                 # [2]\n    ee.vld.128.ip   q7,a13,16           # [3]  id:248\n    ee.vld.128.ip   q1,a13,0            # [4]  id:249\n    wur.sar_byte    a12                 # [5]\n    ee.src.q.qup    q6,q7,q1            # [6]\n    ee.vadds.s32    q0,q0,q6            # [7]\n\n.Lt_10_13570:   # 0x19ae\n #1287                      q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);\n    l32i    a10,a1,128                  # [0]  gra_spill_temp_244\n    l32i    a11,a1,132                  # [1]  gra_spill_temp_245\n    call8   
esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [2]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    addi.n  a2,a2,4                 # [0]\n    l32i    a13,a1,96                   # [1]  gra_spill_temp_236\n    l32i    a11,a1,128                  # [2]  gra_spill_temp_244\n    l32i    a10,a1,132                  # [3]  gra_spill_temp_245\n    addi    a8,a1,112                   # [4]\n    ld.qr   q1,a8,96                    # [5]  gra_spill_temp_258-112\n    ld.qr   q2,a8,80                    # [6]  gra_spill_temp_257-112\n    addi    a10,a10,16                  # [7]\n    addi    a11,a11,16                  # [8]\n    s32i    a11,a1,128                  # [9]  gra_spill_temp_244\n    ee.vadds.s32    q0,q0,q2            # [10]\n    s32i    a10,a1,132                  # [11]  gra_spill_temp_245\n    ee.vmin.s32 \tq0,q0,q1            # [12]\n    ld.qr   \t\tq1,a8,112                   # [13]  gra_spill_temp_259-112\n    l32i    \t\ta8,a1,116                   # [14]  gra_spill_temp_241\n    ee.vmax.s32 \tq0,q0,q1            # [15]\n    ee.movi.32.a    q0,a14,2            # [16]\n    ee.movi.32.a    q0,a15,1            # [17]\n    ee.movi.32.a    q0,a9,0             # [18]\n    add.n   \t\ta13,a8,a13                  # [19]\n    ee.movi.32.a    q0,a12,3            # [20]\n    addi.n  a8,a8,4                 # [21]\n    s8i \ta12,a13,3                   # [22]  id:254\n    s32i    a8,a1,116                   # [23]  gra_spill_temp_241\n    s8i \ta9,a13,0                    # [24]  id:251\n    s8i \ta15,a13,1                   # [25]  id:252\n    s8i \ta14,a13,2                   # [26]  id:253\n    l32i    a15,a1,108                  # [27]  gra_spill_temp_239\n    l32i    a14,a1,120                  # [28]  gra_spill_temp_242\n    l32i    a9,a1,124                   # [29]  gra_spill_temp_243\n    addi.n  a14,a14,4               # [30]\n    addi    a9,a9,16                    # [31]\n    s32i    a9,a1,124                   # [32]  gra_spill_temp_243\n    
s32i    a14,a1,120                  # [33]  gra_spill_temp_242\n    bge a14,a15,.Lt_10_10754        # [34]\n\n.Lt_10_11266:   # 0x1a1c\n#<loop> Loop body line 1230, nesting depth: 4, estimated iterations: 100\n    ee.zero.qacc                    # [0]\n    l32i    a9,a1,100                   # [1]  gra_spill_temp_237\n    s32i    a9,a1,172                   # [2]  gra_spill_temp_255\n    bge     a9,a7,.Lt_10_11522          # [3]\n\n    mull    a15,a9,a4                   # [0]\n    l32i    a14,a1,92                   # [1]  gra_spill_temp_235\n    add.n   a11,a15,a5                  # [2]\n    add.n   a14,a14,a9                  # [3]\n    mull    a14,a3,a14                  # [4]\n    s32i    a11,a1,168                  # [5]  gra_spill_temp_254\n    bge     a6,a5,.Lt_10_12290          # [6]\n\n.LBB18_esp_nn_depthwise_conv_s16_mult4: # 0x1a3b\n    l32i    a10,a1,176                  # [0]  gra_spill_temp_256\n    l32i    a11,a1,164                  # [1]  gra_spill_temp_253\n    l32i    a12,a1,160                  # [2]  gra_spill_temp_252\n    add.n   a9,a15,a6                   # [3]\n    l32i    a8,a1,140                   # [4]  gra_spill_temp_247\n    addmi   a13,a1,256                  # [5]\n    ld.qr   q1,a13,-16                  # [6]  gra_spill_temp_260-256\n    mull    a8,a8,a9                    # [7]\n    add.n   a12,a12,a6                  # [8]\n    l32i    a9,a1,152                   # [9]  gra_spill_temp_250\n    add.n   a12,a14,a12                 # [10]\n    mull    a11,a11,a12                 # [11]\n    add.n   a8,a2,a8                    # [12]\n    l32i    a12,a1,148                  # [13]  gra_spill_temp_249\n    addx2   a8,a8,a9                    # [14]\n    add.n   a10,a10,a11                 # [15]\n    l32i    a11,a1,136                  # [16]  gra_spill_temp_246\n    l32i    a9,a1,156                   # [17]  gra_spill_temp_251\n    addx2   a10,a10,a11                 # [18]\n    l32i    a11,a1,144                  # [19]  
gra_spill_temp_248\n    loopgtz a9,.LBB45_esp_nn_depthwise_conv_s16_mult4   # [20]\n\n    mov.n   a9,a8                       # [0*II+0]\n    ee.vldbc.16 q0,a10              # [0*II+1]  id:232\n    add.n   a10,a10,a12                 # [0*II+2]\n    ee.vld.l.64.ip  q1,a9,0         # [0*II+3]  id:231\n    add.n   a8,a8,a11                   # [0*II+4]\n    ee.vmulas.s16.qacc  q0,q1       # [0*II+5]\n.LBB45_esp_nn_depthwise_conv_s16_mult4: # 0x1a84\n\n    addmi   a10,a1,256                  # [0]\n    st.qr   q1,a10,-16                  # [1]  gra_spill_temp_260-256\n\n.Lt_10_12290:   # 0x1a8a\n    add.n   a14,a14,a3                  # [0]\n    add.n   a15,a15,a4                  # [1]\n    l32i    a11,a1,172                  # [2]  gra_spill_temp_255\n    l32i    a12,a1,168                  # [3]  gra_spill_temp_254\n    addi.n  a11,a11,1               # [4]\n    add.n   a12,a12,a4                  # [5]\n    s32i    a12,a1,168                  # [6]  gra_spill_temp_254\n    s32i    a11,a1,172                  # [7]  gra_spill_temp_255\n    sub     a11,a7,a11                  # [8]\n    beqz    a11,.Lt_10_11522            # [9]\n\n    blt     a6,a5,.LBB18_esp_nn_depthwise_conv_s16_mult4    # [0]\n\n    j   .Lt_10_12290                    # [0]\n\n.Lt_10_8450:    # 0x1aaa\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s16_mult4_esp32s3, . - esp_nn_depthwise_conv_s16_mult4_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n    # Program Unit: esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3\n    .type   esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3\n\nesp_nn_depthwise_conv_s16_mult8_3x3_esp32s3:    # 0x11b3\n    # qacc_scratch = 0\n    # gra_spill_temp_142 = 48\n    # gra_spill_temp_143 = 52\n    # gra_spill_temp_144 = 56\n    # gra_spill_temp_145 = 60\n    # gra_spill_temp_146 = 64\n    # gra_spill_temp_147 = 68\n    # gra_spill_temp_148 = 72\n    # gra_spill_temp_149 = 76\n    # gra_spill_temp_150 = 80\n    # gra_spill_temp_151 = 84\n    # gra_spill_temp_152 = 88\n    # gra_spill_temp_153 = 92\n    # gra_spill_temp_154 = 96\n    # gra_spill_temp_155 = 100\n    # gra_spill_temp_156 = 104\n    # gra_spill_temp_157 = 108\n    # gra_spill_temp_158 = 112\n    # gra_spill_temp_159 = 116\n    # gra_spill_temp_160 = 120\n    # gra_spill_temp_161 = 124\n    # gra_spill_temp_162 = 128\n    # gra_spill_temp_163 = 132\n    # gra_spill_temp_164 = 136\n    # gra_spill_temp_165 = 140\n    # gra_spill_temp_166 = 144\n    # gra_spill_temp_167 = 148\n    # gra_spill_temp_168 = 152\n    # gra_spill_temp_169 = 156\n    # gra_spill_temp_170 = 160\n    # gra_spill_temp_171 = 164\n    # gra_spill_temp_172 = 168\n    # gra_spill_temp_173 = 172\n    # 
gra_spill_temp_174 = 176\n    # gra_spill_temp_175 = 180\n    # gra_spill_temp_176 = 184\n    # gra_spill_temp_177 = 188\n    # gra_spill_temp_178 = 192\n    # gra_spill_temp_179 = 208\n    # gra_spill_temp_180 = 224\n    # gra_spill_temp_181 = 240\n    # gra_spill_temp_182 = 256\n\n // registers:\n // a2: const int16_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t channels\n // a6: const uint16_t pad_wd\n // a7: const uint16_t pad_ht\n\n // on stack:\n // const uint16_t stride_wd\n // const uint16_t stride_ht\n // const uint16_t ch_mult\n // const int16_t *filter_data\n // const int32_t *bias\n // int8_t *out_data\n // const uint16_t out_wd\n // const uint16_t out_ht\n // const int32_t out_offset\n // const int32_t *out_shift\n // const int32_t *out_mult\n // const int32_t activation_min\n // const int32_t activation_max\n\n    entry   a1,304                      #\n    s32i    a2,a1,116                   # [0]  gra_spill_temp_159\n    s32i    a3,a1,120                   # [1]  gra_spill_temp_160\n    s32i    a5,a1,144                   # [2]  gra_spill_temp_166\n    s32i.n  a6,a1,60                # [3]  gra_spill_temp_145\n\n    addmi   a9,a1,256                   # [4]\n    addi    a12,a1,112                  # [5]\n    addmi   a10,a1,256                  # [6]\n    addmi   a11,a1,256                  # [7]\n    addmi   a13,a1,256                  # [8]\n\n // height loop\n    l16ui   a8,a1,332                   # [9]  id:261 out_ht+0x0\n    l32i    a14,a1,324                  # [10]  id:257 out_data+0x0\n    s32i    a14,a1,176                  # [11]  gra_spill_temp_174\n    s32i    a8,a1,68                    # [12]  gra_spill_temp_147\n    addi    a13,a13,80                  # [13]\n    addi    a11,a11,96                  # [14]\n    addi    a10,a10,92                  # [15]\n    ee.vldbc.32 q0,a10              # [16]  id:260 activation_min\n    ee.vldbc.32 q1,a11              # [17]  id:259 activation_max\n    
ee.vldbc.32 q2,a13              # [18]  id:258 out_offset\n    st.qr   \tq2,a12,96                   # [19]  gra_spill_temp_179-112\n    st.qr   \tq1,a12,112                  # [20]  gra_spill_temp_180-112\n    st.qr   \tq0,a9,-16                   # [21]  gra_spill_temp_181-256\n    beqz.n  a8,.Lt_8_8194           # [22]\n\n.LBB3_esp_nn_depthwise_conv_s16_mult8_3x3:  # 0x11f9\n    s32i    a1,a1,180                   # [0]  gra_spill_temp_175\n    mul16u  a6,a3,a5                # [1]\n    s32i    a7,a1,76                    # [2]  gra_spill_temp_149\n    l32i    a9,a1,316                   # [3]  id:264 filter_data+0x0\n    l32i    a15,a1,320                  # [4]  id:262 bias+0x0\n    l16ui   a10,a1,312                  # [5]  id:263 ch_mult+0x0\n    slli    a11,a5,1                    # [6]\n    l16ui   a12,a1,308                  # [7]  id:268 stride_ht+0x0\n    l32i    a13,a1,344                  # [8]  id:267 out_mult+0x0\n    l32i    a14,a1,340                  # [9]  id:266 out_shift+0x0\n    s32i    a14,a1,88                   # [10]  gra_spill_temp_152\n    s32i    a13,a1,92                   # [11]  gra_spill_temp_153\n    s32i    a12,a1,64                   # [12]  gra_spill_temp_146\n    s32i    a11,a1,124                  # [13]  gra_spill_temp_161\n    s32i    a10,a1,108                  # [14]  gra_spill_temp_157\n    s32i    a15,a1,160                  # [15]  gra_spill_temp_170\n    s32i    a9,a1,128                   # [16]  gra_spill_temp_162\n    neg     a7,a7                       # [17]\n    slli    a6,a6,1                     # [18]\n    s32i    a7,a1,136                   # [19]  gra_spill_temp_164\n    movi.n  a9,0                    # [20]\n    extui   a15,a15,0,4                 # [21]\n    s32i    a15,a1,152                  # [22]  gra_spill_temp_168\n    s32i    a9,a1,72                    # [23]  gra_spill_temp_148\n    sub     a7,a4,a7                    # [24]\n    l32i.n  a9,a1,60                # [25]  gra_spill_temp_145\n    
s32i    a7,a1,80                    # [26]  gra_spill_temp_150\n    l16ui   a4,a1,328                   # [27]  id:269 out_wd+0x0\n    s32i    a4,a1,96                    # [28]  gra_spill_temp_154\n    l16ui   a7,a1,304                   # [29]  id:265 stride_wd+0x0\n    s32i    a7,a1,84                    # [30]  gra_spill_temp_151\n    mul16u  a4,a5,a10               # [31]\n    neg     a9,a9                       # [32]\n    s32i.n  a9,a1,52                # [33]  gra_spill_temp_143\n    sub     a8,a3,a9                    # [34]\n    addi    a10,a10,-7                  # [35]\n    s32i    a10,a1,164                  # [36]  gra_spill_temp_171\n    s32i.n  a8,a1,56                # [37]  gra_spill_temp_144\n    addx2   a7,a4,a4                    # [38]\n    slli    a7,a7,1                     # [39]\n    j       .Lt_8_8706                      # [40]\n\n.Lt_8_8962: # 0x1270\n#<loop> Part of loop body line 933, head labeled .Lt_8_8706\n    l32i    a10,a1,68                   # [0]  gra_spill_temp_147\n    l32i    a14,a1,76                   # [1]  gra_spill_temp_149\n    l32i    a13,a1,136                  # [2]  gra_spill_temp_164\n    l32i    a12,a1,64                   # [3]  gra_spill_temp_146\n    l32i    a9,a1,72                    # [4]  gra_spill_temp_148\n    l32i    a11,a1,80                   # [5]  gra_spill_temp_150\n    addi.n  a9,a9,1                 # [6]\n    s32i    a9,a1,72                    # [7]  gra_spill_temp_148\n    sub     a11,a11,a12                 # [8]\n    add.n   a13,a13,a12                 # [9]\n    sub     a14,a14,a12                 # [10]\n    s32i    a14,a1,76                   # [11]  gra_spill_temp_149\n    s32i    a13,a1,136                  # [12]  gra_spill_temp_164\n    s32i    a11,a1,80                   # [13]  gra_spill_temp_150\n    sub     a9,a9,a10                   # [14]\n    beqz    a9,.Lt_8_8194               # [15]\n\n.Lt_8_8706: # 0x129e\n#<loop> Loop body line 933, nesting depth: 1, estimated iterations: 
100\n # 934          const int32_t base_y = (out_y * stride_ht) - pad_ht;\n # 935          for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n    l32i    a15,a1,96                   # [0]  gra_spill_temp_154\n    beqz.n  a15,.Lt_8_8962          # [2]\n\n.LBB6_esp_nn_depthwise_conv_s16_mult8_3x3:  # 0x12a3\n#<loop> Part of loop body line 933, head labeled .Lt_8_8706\n    l32i.n  a3,a1,56                # [0]  gra_spill_temp_144\n    l32i    a8,a1,80                    # [1]  gra_spill_temp_150\n    movi.n  a10,0                   # [2]\n    l32i    a9,a1,76                    # [3]  gra_spill_temp_149\n    movi.n  a11,0                   # [4]\n    l32i.n  a12,a1,52               # [5]  gra_spill_temp_143\n    l32i.n  a13,a1,60               # [6]  gra_spill_temp_145\n    s32i    a13,a1,104                  # [7]  gra_spill_temp_156\n    s32i    a12,a1,140                  # [8]  gra_spill_temp_165\n    s32i    a11,a1,100                  # [9]  gra_spill_temp_155\n    max     a9,a9,a10                   # [10]\n    movi.n  a10,3                   # [11]\n    s32i    a9,a1,172                   # [12]  gra_spill_temp_173\n    min     a8,a8,a10                   # [13]\n    s32i    a8,a1,156                   # [14]  gra_spill_temp_169\n    sub     a8,a8,a9                    # [15]\n    s32i    a8,a1,132                   # [16]  gra_spill_temp_163\n    j       .Lt_8_9474                      # [17]\n\n.Lt_8_9730: # 0x12d3\n#<loop> Part of loop body line 935, head labeled .Lt_8_9474\n    l32i    a15,a1,96                   # [0]  gra_spill_temp_154\n    l32i    a10,a1,104                  # [1]  gra_spill_temp_156\n    l32i    a9,a1,140                   # [2]  gra_spill_temp_165\n    l32i    a8,a1,84                    # [3]  gra_spill_temp_151\n    l32i    a14,a1,100                  # [4]  gra_spill_temp_155\n    sub     a3,a3,a8                    # [5]\n    addi.n  a14,a14,1               # [6]\n    s32i    a14,a1,100                  # [7]  
gra_spill_temp_155\n    add.n   a9,a9,a8                    # [8]\n    sub     a10,a10,a8                  # [9]\n    s32i    a10,a1,104                  # [10]  gra_spill_temp_156\n    s32i    a9,a1,140                   # [11]  gra_spill_temp_165\n    beq     a14,a15,.Lt_8_8962          # [12]\n\n.Lt_8_9474: # 0x12f8\n # 936              const int32_t base_x = (out_x * stride_wd) - pad_wd;\n # 937              const int32_t *out_mult_ptr = out_mult;\n # 938              const int32_t *out_shift_ptr = out_shift;\n    l32i    a2,a1,88                    # [0]  gra_spill_temp_152\n    l32i    a10,a1,92                   # [1]  gra_spill_temp_153\n # 939              uint32_t bias_ptr = (uint32_t) (bias);\n    l32i    a12,a1,160                  # [2]  gra_spill_temp_170\n # 940\n # 941              for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop\n    l32i    a11,a1,144                  # [3]  gra_spill_temp_166\n    s32i    a12,a1,168                  # [4]  gra_spill_temp_172\n    beqz.n  a11,.Lt_8_9730          # [5]\n\n.LBB9_esp_nn_depthwise_conv_s16_mult8_3x3:  # 0x1309\n#<loop> Part of loop body line 935, head labeled .Lt_8_9474\n    movi.n  a8,0                    # [0]\n    l32i    a5,a1,104                   # [1]  gra_spill_temp_156\n    movi.n  a13,0                   # [2]\n    movi.n  a9,0                    # [3]\n    s32i    a9,a1,112                   # [4]  gra_spill_temp_158\n    s32i    a13,a1,148                  # [5]  gra_spill_temp_167\n    max     a5,a5,a8                    # [6]\n    j       .Lt_8_10242                     # [7]\n\n.Lt_8_10498:    # 0x131e\n#<loop> Part of loop body line 941, head labeled .Lt_8_10242\n    l32i    a12,a1,144                  # [0]  gra_spill_temp_166\n    l32i    a14,a1,108                  # [1]  gra_spill_temp_157\n    l32i    a11,a1,148                  # [2]  gra_spill_temp_167\n    l32i    a13,a1,112                  # [3]  gra_spill_temp_158\n    addi.n  a11,a11,1               # [4]\n 
   s32i    a11,a1,148                  # [5]  gra_spill_temp_167\n    add.n   a13,a13,a14                 # [6]\n    s32i    a13,a1,112                  # [7]  gra_spill_temp_158\n    beq     a11,a12,.Lt_8_9730          # [8]\n\n.Lt_8_10242:    # 0x1337\n # 942                  for (int ch_mult_idx = 0; ch_mult_idx < ch_mult - 7; ch_mult_idx += 8) {\n    l32i    a15,a1,164                  # [0]  gra_spill_temp_171\n    blti    a15,1,.Lt_8_10498           # [2]\n\n    movi.n  a8,0                    # [0]\n    l32i    a9,a1,112                   # [1]  gra_spill_temp_158\n    s32i    a9,a1,188                   # [2]  gra_spill_temp_177\n    s32i    a8,a1,184                   # [3]  gra_spill_temp_176\n    j   .Lt_8_11010                     # [4]\n\n.LBB23_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x134b\n    s32i.n  a10,a1,48               # [0]  gra_spill_temp_142\n    addi    a11,a1,112                  # [1]\n    l32i    a13,a1,152                  # [2]  gra_spill_temp_168\n    l32i    a12,a1,168                  # [3]  gra_spill_temp_172\n    wur.sar_byte    a13                 # [4]\n    ee.vld.128.ip   q4,a12,16           # [5]  id:307\n    ee.vld.128.ip   q7,a12,16           # [6]  id:308\n    ee.vld.128.ip   q5,a12,0            # [7]  id:309\n    s32i    a12,a1,168                  # [8]  gra_spill_temp_172\n    ee.src.q.qup    q6,q4,q7            # [9]\n    ee.vadds.s32    q0,q0,q6            # [10]\n    ee.src.q.qup    q3,q4,q5            # [11]\n    ee.vadds.s32    q1,q1,q3            # [12]\n    st.qr   q1,a11,80                   # [13]  gra_spill_temp_178-112\n\n.Lt_8_13314:    # 0x1374\n #1025  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);\n    l32i.n  a10,a1,48               # [0]  gra_spill_temp_142\n    mov.n   a11,a2                      # [1]\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n #1026                      out_mult_ptr += 4;\n #1027                      out_shift_ptr += 4;\n 
#1028\n #1029   q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);\n    l32i.n  a10,a1,48               # [0]  gra_spill_temp_142\n    addmi   a12,a1,256                  # [1]\n    addi    a11,a1,112                  # [2]\n    st.qr   q0,a12,0                    # [3]  gra_spill_temp_182-256\n    ld.qr   q0,a11,80                   # [4]  gra_spill_temp_178-112\n    addi    a10,a10,16                  # [5]\n    addi    a11,a2,16                   # [6]\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n#<loop> Part of loop body line 942, head labeled .Lt_8_11010\n #1030                      out_mult_ptr += 4;\n #1031                      out_shift_ptr += 4;\n    addi    a2,a2,32                    # [0]\n    l32i    a14,a1,164                  # [1]  gra_spill_temp_171\n\n    l32i    a8,a1,176                   # [2]  gra_spill_temp_174\n    l32i    a15,a1,188                  # [3]  gra_spill_temp_177\n    l32i    a13,a1,184                  # [4]  gra_spill_temp_176\n    l32i.n  a10,a1,48               # [5]  gra_spill_temp_142\n    addmi   a11,a1,256                  # [6]\n    addi    a12,a1,112                  # [7]\n    ld.qr   q3,a12,112                  # [8]  gra_spill_temp_180-112\n    ld.qr   q1,a12,96                   # [9]  gra_spill_temp_179-112\n    ld.qr   q2,a11,0                    # [10]  gra_spill_temp_182-256\n    addi    a10,a10,32                  # [11]\n    addi.n  a13,a13,8               # [12]\n    addi.n  a15,a15,8               # [13]\n    s32i    a15,a1,188                  # [14]  gra_spill_temp_177\n    ee.vadds.s32    q2,q2,q1            # [15]\n    s32i    a13,a1,184                  # [16]  gra_spill_temp_176\n    ee.vadds.s32    q1,q0,q1            # [17]\n    ee.vmin.s32     q0,q2,q3            # [18]\n    ld.qr           q2,a11,-16                  # [19]  gra_spill_temp_181-256\n    ee.vmin.s32     q1,q1,q3            # [20]\n    ee.vmax.s32     q1,q1,q2            # [21]\n   
 ee.vmax.s32     q0,q0,q2            # [22]\n    ee.vunzip.16    q0,q1               # [23]\n    ee.vunzip.8     q0,q1               # [24]\n    ee.vst.l.64.ip  q0,a8,8         # [25]  id:312\n    s32i    a8,a1,176                   # [26]  gra_spill_temp_174\n    bge     a13,a14,.Lt_8_10498         # [27]\n\n.Lt_8_11010:    # 0x13e3\n#<loop> Loop body line 942, nesting depth: 4, estimated iterations: 100\n    l32i    a14,a1,156                  # [0]  gra_spill_temp_169\n    l32i    a13,a1,172                  # [1]  gra_spill_temp_173\n    ee.zero.qacc                    # [2]\n    bge     a13,a14,.Lt_8_11266         # [3]\n\n.LBB15_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x13ef\n#<loop> Part of loop body line 942, head labeled .Lt_8_11010\n    l32i    a12,a1,124                  # [0]  gra_spill_temp_161\n    l32i    a8,a1,140                   # [1]  gra_spill_temp_165\n    l32i    a11,a1,120                  # [2]  gra_spill_temp_160\n    l32i    a14,a1,188                  # [3]  gra_spill_temp_177\n    l32i    a9,a1,136                   # [4]  gra_spill_temp_164\n    mull    a15,a4,a13                  # [5]\n    add.n   a9,a9,a13                   # [6]\n    addx2   a15,a15,a15                 # [7]\n    l32i    a13,a1,148                  # [8]  gra_spill_temp_167\n    add.n   a14,a14,a15                 # [9]\n    mull    a9,a9,a11                   # [10]\n    l32i    a15,a1,144                  # [11]  gra_spill_temp_166\n    add.n   a8,a8,a9                    # [12]\n    mull    a15,a15,a8                  # [13]\n    l32i    a8,a1,128                   # [14]  gra_spill_temp_162\n    add.n   a13,a13,a15                 # [15]\n    l32i    a15,a1,116                  # [16]  gra_spill_temp_159\n    addx2   a14,a14,a8                  # [17]\n    addx2   a13,a13,a15                 # [18]\n    add.n   a11,a12,a13                 # [19]\n    l32i    a15,a1,132                  # [20]  gra_spill_temp_163\n    add.n   a12,a12,a11                 # [21]\n  
  loopgtz a15,.LBB34_esp_nn_depthwise_conv_s16_mult8_3x3  # [22]\n\n.Lt_8_11778:    # 0x142e\n    mov.n   a15,a14                     # [0]\n    mov.n   a9,a14                      # [1]\n    bnez.n  a5,.Lt_8_12034          # [2]\n\n    ee.vldbc.16     q3,a13              # [0]  id:271\n    mov.n           a9,a14                      # [1]\n    ee.vld.128.ip       q4,a9,0             # [2]  id:272\n    ee.vmulas.s16.qacc  q3,q4       # [4]\n\n.Lt_8_12034:    # 0x143f\n    ee.vldbc.16     q5,a11              # [0]  id:274\n    addx2           a9,a4,a9                    # [1]\n    ee.vld.128.ip   q6,a9,0             # [2]  id:275\n    add.n           a13,a13,a6                  # [3]\n    ee.vmulas.s16.qacc  q5,q6       # [4]\n    blti    a3,3,.Lt_8_12546            # [5]\n\n    ee.vldbc.16     q7,a12              # [0]  id:277\n    addx2           a14,a4,a9                   # [1]\n    ee.vld.128.ip   q0,a14,0            # [2]  id:278\n    ee.vmulas.s16.qacc  q7,q0       # [4]\n\n.Lt_8_12546:    # 0x145c\n#<loop> Part of loop body line 953, head labeled .Lt_8_11778\n    add.n   a11,a11,a6                  # [0]\n    add.n   a12,a12,a6                  # [1]\n    add.n   a14,a7,a15                  # [2]\n\n.LBB34_esp_nn_depthwise_conv_s16_mult8_3x3: # 0x1464\n.Lt_8_11266:    # 0x1464\n\n    l32i    a8,a1,180                   # [0]  gra_spill_temp_175\n    ee.st.qacc_l.l.128.ip   a8,16       # [2]  id:280\n    ee.st.qacc_l.h.32.ip    a8,0        # [3]  id:281\n    l16ui   a9,a1,10                    # [4]  qacc_scratch+10\n    l8ui    a11,a1,15                   # [5]  qacc_scratch+15\n    l8ui    a12,a1,5                    # [6]  qacc_scratch+5\n    l8ui    a13,a1,6                    # [7]  qacc_scratch+6\n    l8ui    a14,a1,16                   # [8]  qacc_scratch+16\n    s8i     a14,a1,7                    # [9]  qacc_scratch+7\n    s8i     a13,a1,3                    # [10]  qacc_scratch+3\n    s8i     a12,a1,2                    # [11]  qacc_scratch+2\n    
s8i     a11,a1,6                    # [12]  qacc_scratch+6\n    s16i    a9,a1,4                     # [13]  qacc_scratch+4\n    ee.st.qacc_h.l.128.ip   a8,16       # [14]  id:291\n    ee.st.qacc_h.h.32.ip    a8,-32      # [15]  id:292\n    l16ui   a9,a1,16                    # [16]  qacc_scratch+16\n    l8ui    a15,a1,32                   # [17]  qacc_scratch+32\n    l8ui    a12,a1,22                   # [18]  qacc_scratch+22\n    l8ui    a11,a1,21                   # [19]  qacc_scratch+21\n    l8ui    a14,a1,31                   # [20]  qacc_scratch+31\n    l16ui   a13,a1,26                   # [21]  qacc_scratch+26\n    s16i    a13,a1,12                   # [22]  qacc_scratch+12\n    s8i \ta14,a1,14                   # [23]  qacc_scratch+14\n    s8i \ta11,a1,10                   # [24]  qacc_scratch+10\n    s8i \ta12,a1,11                   # [25]  qacc_scratch+11\n    s8i \ta15,a1,15                   # [26]  qacc_scratch+15\n    s16i    a9,a1,8                     # [27]  qacc_scratch+8\n    l32i    a15,a1,160                  # [28]  gra_spill_temp_170\n    movi.n  a9,16                   # [29]\n    ee.srcmb.s16.qacc   q1,a9,0         # [30]\n    ee.vld.128.ip   q0,a8,0             # [31]  id:304\n    s32i    a8,a1,180                   # [32]  gra_spill_temp_175\n    ee.vzip.16  q0,q1               # [33]\n    bnez.n  a15,.LBB23_esp_nn_depthwise_conv_s16_mult8_3x3  # [34]\n\n    s32i.n  a10,a1,48               # [0]  gra_spill_temp_142\n    addi    a15,a1,112                  # [1]\n    st.qr   q1,a15,80                   # [2]  gra_spill_temp_178-112\n    j   .Lt_8_13314                     # [3]\n\n.Lt_8_8194: # 0x14d3\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3, . - esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s16_mult8_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n    # Program Unit: esp_nn_depthwise_conv_s16_mult8_esp32s3\n    .type   esp_nn_depthwise_conv_s16_mult8_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s16_mult8_esp32s3\n\nesp_nn_depthwise_conv_s16_mult8_esp32s3:    # 0x14d7\n    # qacc_scratch = 0\n    # gra_spill_temp_183 = 48\n    # gra_spill_temp_184 = 52\n    # gra_spill_temp_185 = 56\n    # gra_spill_temp_186 = 60\n    # gra_spill_temp_187 = 64\n    # gra_spill_temp_188 = 68\n    # gra_spill_temp_189 = 72\n    # gra_spill_temp_190 = 76\n    # gra_spill_temp_191 = 80\n    # gra_spill_temp_192 = 84\n    # gra_spill_temp_193 = 88\n    # gra_spill_temp_194 = 92\n    # gra_spill_temp_195 = 96\n    # gra_spill_temp_196 = 100\n    # gra_spill_temp_197 = 104\n    # gra_spill_temp_198 = 108\n    # gra_spill_temp_199 = 112\n    # gra_spill_temp_200 = 116\n    # gra_spill_temp_201 = 120\n    # gra_spill_temp_202 = 124\n    # gra_spill_temp_203 = 128\n    # gra_spill_temp_204 = 132\n    # gra_spill_temp_205 = 136\n    # gra_spill_temp_206 = 140\n    # gra_spill_temp_207 = 144\n    # gra_spill_temp_208 = 148\n    # gra_spill_temp_209 = 152\n    # gra_spill_temp_210 = 156\n    # gra_spill_temp_211 = 160\n    # gra_spill_temp_212 = 164\n    # gra_spill_temp_213 = 168\n    # gra_spill_temp_214 = 172\n    # gra_spill_temp_215 
= 176\n    # gra_spill_temp_216 = 180\n    # gra_spill_temp_217 = 184\n    # gra_spill_temp_218 = 192\n    # gra_spill_temp_219 = 208\n\n // registers:\n // a2: const int16_t *input_data\n // a3: const uint16_t input_wd\n // a4: const uint16_t input_ht\n // a5: const uint16_t channels\n // a6: const uint16_t pad_wd\n // a7: const uint16_t pad_ht\n\n // on stack:\n // const uint16_t stride_wd\n // const uint16_t stride_ht\n // const uint16_t ch_mult\n // const int16_t *filter_data\n // const uint16_t filter_wd\n // const uint16_t filter_ht\n // const int32_t *bias\n // int8_t *out_data\n // const uint16_t out_wd\n // const uint16_t out_ht\n // const int32_t out_offset\n // const int32_t *out_shift\n // const int32_t *out_mult\n // const int32_t activation_min\n // const int32_t activation_max\n\n    entry   a1,256                      #\n    s32i    a2,a1,144                   # [0]  gra_spill_temp_207\n    s32i.n  a4,a1,56                # [1]  gra_spill_temp_185\n    s32i    a5,a1,172                   # [2]  gra_spill_temp_214\n    l32i    a9,a1,284                   # [3]  id:241 out_data+0x0\n\n    l16ui   a8,a1,292                   # [4]  id:242 out_ht+0x0\n    s32i    a8,a1,64                    # [5]  gra_spill_temp_187\n    s32i    a9,a1,124                   # [6]  gra_spill_temp_202\n    beqz.n  a8,.Lt_9_8450           # [7]\n\n    s32i    a1,a1,128                   # [0]  gra_spill_temp_203\n    neg     a13,a7                      # [1]\n    movi.n  a4,0                    # [2]\n    neg     a12,a6                      # [3]\n    l32i    a9,a1,280                   # [4]  id:243 bias+0x0\n    slli    a11,a5,1                    # [5]\n    l16ui   a10,a1,264                  # [6]  id:244 ch_mult+0x0\n    l32i    a14,a1,268                  # [7]  id:245 filter_data+0x0\n    s32i    a14,a1,160                  # [8]  gra_spill_temp_211\n    s32i    a10,a1,92                   # [9]  gra_spill_temp_194\n    s32i    a11,a1,156                  # [10]  
gra_spill_temp_210\n    s32i    a9,a1,112                   # [11]  gra_spill_temp_199\n    sext    a12,a12,15                  # [12]\n    s32i    a4,a1,68                    # [13]  gra_spill_temp_188\n    sext    a13,a13,15                  # [14]\n    l16ui   a4,a1,272                   # [15]  id:246 filter_wd+0x0\n    s32i    a13,a1,100                  # [16]  gra_spill_temp_196\n    s32i.n  a12,a1,48               # [17]  gra_spill_temp_183\n    mul16u  a8,a5,a10               # [18]\n    extui   a9,a9,0,4                   # [19]\n    l32i    a11,a1,304                  # [20]  id:249 out_mult+0x0\n    s32i    a11,a1,80                   # [21]  gra_spill_temp_191\n    s32i    a9,a1,104                   # [22]  gra_spill_temp_197\n    s32i    a8,a1,148                   # [23]  gra_spill_temp_208\n    addi    a10,a10,-7                  # [24]\n    l32i    a12,a1,300                  # [25]  id:248 out_shift+0x0\n    l16ui   a13,a1,256                  # [26]  id:247 stride_wd+0x0\n    s32i    a13,a1,72                   # [27]  gra_spill_temp_189\n    s32i    a12,a1,76                   # [28]  gra_spill_temp_190\n    s32i    a10,a1,116                  # [29]  gra_spill_temp_200\n    slli    a8,a8,1                     # [30]\n    l16ui   a9,a1,260                   # [31]  id:251 stride_ht+0x0\n    s32i.n  a9,a1,60                # [32]  gra_spill_temp_186\n    s32i    a8,a1,152                   # [33]  gra_spill_temp_209\n    l16ui   a10,a1,276                  # [34]  id:250 filter_ht+0x0\n    s32i.n  a10,a1,52               # [35]  gra_spill_temp_184\n    l16ui   a8,a1,288                   # [36]  id:252 out_wd+0x0\n    s32i    a8,a1,84                    # [37]  gra_spill_temp_192\n    j       .Lt_9_8962                      # [38]\n\n.Lt_9_9218: # 0x1561\n#<loop> Part of loop body line 1083, head labeled .Lt_9_8962\n    l32i    a15,a1,64                   # [0]  gra_spill_temp_187\n    l32i.n  a9,a1,60                # [1]  gra_spill_temp_186\n  
  l32i    a14,a1,68                   # [2]  gra_spill_temp_188\n    l32i    a8,a1,100                   # [3]  gra_spill_temp_196\n    addi.n  a14,a14,1               # [4]\n    s32i    a14,a1,68                   # [5]  gra_spill_temp_188\n    add.n   a9,a8,a9                    # [6]\n    sub     a14,a14,a15                 # [7]\n    sext    a8,a9,15                    # [8]\n    s32i    a8,a1,100                   # [9]  gra_spill_temp_196\n    beqz    a14,.Lt_9_8450              # [10]\n\n.Lt_9_8962: # 0x157f\n    l32i    a10,a1,84                   # [0]  gra_spill_temp_192\n    beqz.n  a10,.Lt_9_9218          # [2]\n\n    l32i.n  a7,a1,52                # [0]  gra_spill_temp_184\n    movi.n  a11,0                   # [1]\n    l32i.n  a8,a1,56                # [2]  gra_spill_temp_185\n    l32i    a9,a1,100                   # [3]  gra_spill_temp_196\n    l32i.n  a12,a1,48               # [4]  gra_spill_temp_183\n    s32i    a12,a1,168                  # [5]  gra_spill_temp_213\n    neg     a10,a9                      # [6]\n    sub     a8,a8,a9                    # [7]\n    max     a10,a10,a11                 # [8]\n    s32i    a10,a1,108                  # [9]  gra_spill_temp_198\n    min     a7,a7,a8                    # [10]\n    movi.n  a11,0                   # [11]\n    s32i    a11,a1,88                   # [12]  gra_spill_temp_193\n    j       .Lt_9_9730                      # [13]\n\n.Lt_9_9986: # 0x15a9\n#<loop> Part of loop body line 1085, head labeled .Lt_9_9730\n    l32i    a13,a1,84                   # [0]  gra_spill_temp_192\n    l32i    a15,a1,72                   # [1]  gra_spill_temp_189\n    l32i    a12,a1,88                   # [2]  gra_spill_temp_193\n    l32i    a14,a1,168                  # [3]  gra_spill_temp_213\n    addi.n  a12,a12,1               # [4]\n    s32i    a12,a1,88                   # [5]  gra_spill_temp_193\n    add.n   a15,a14,a15                 # [6]\n    sext    a14,a15,15                  # [7]\n    s32i    
a14,a1,168                  # [8]  gra_spill_temp_213\n    beq     a12,a13,.Lt_9_9218          # [9]\n\n.Lt_9_9730: # 0x15c5\n#<loop> Loop body line 1085, nesting depth: 2, estimated iterations: 100\n #1086              const int16_t base_x = (out_x * stride_wd) - pad_wd;\n #1087              const int32_t *out_mult_ptr = out_mult;\n #1088              const int32_t *out_shift_ptr = out_shift;\n #1089              uint32_t bias_ptr = (uint32_t) (bias);\n #1090              for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop\n    l32i    a8,a1,172                   # [0]  gra_spill_temp_214\n    l32i    a9,a1,80                    # [1]  gra_spill_temp_191\n    l32i    a10,a1,76                   # [2]  gra_spill_temp_190\n    l32i    a11,a1,112                  # [3]  gra_spill_temp_199\n    s32i    a11,a1,120                  # [4]  gra_spill_temp_201\n    s32i    a10,a1,140                  # [5]  gra_spill_temp_206\n    s32i    a9,a1,136                   # [6]  gra_spill_temp_205\n    beqz.n  a8,.Lt_9_9986           # [7]\n\n.LBB9_esp_nn_depthwise_conv_s16_mult8:  # 0x15dc\n#<loop> Part of loop body line 1085, head labeled .Lt_9_9730\n    movi.n  a8,0                    # [0]\n    l32i    a5,a1,168                   # [1]  gra_spill_temp_213\n    movi.n  a13,0                   # [2]\n    movi.n  a14,0                   # [3]\n    s32i    a14,a1,96                   # [4]  gra_spill_temp_195\n    s32i    a13,a1,184                  # [5]  gra_spill_temp_217\n    neg     a6,a5                       # [6]\n    max     a6,a6,a8                    # [7]\n    sub     a5,a3,a5                    # [8]\n    min     a5,a4,a5                    # [9]\n    sub     a12,a5,a6                   # [10]\n    s32i    a12,a1,164                  # [11]  gra_spill_temp_212\n    j       .Lt_9_10498                     # [12]\n\n.Lt_9_10754:    # 0x1600\n#<loop> Part of loop body line 1090, head labeled .Lt_9_10498\n    l32i    a10,a1,172                  # [0]  
gra_spill_temp_214\n    l32i    a12,a1,92                   # [1]  gra_spill_temp_194\n    l32i    a9,a1,184                   # [2]  gra_spill_temp_217\n    l32i    a11,a1,96                   # [3]  gra_spill_temp_195\n    addi.n  a9,a9,1                 # [4]\n    s32i    a9,a1,184                   # [5]  gra_spill_temp_217\n    add.n   a11,a11,a12                 # [6]\n    s32i    a11,a1,96                   # [7]  gra_spill_temp_195\n    beq     a9,a10,.Lt_9_9986           # [8]\n\n.Lt_9_10498:    # 0x1619\n#<loop> Loop body line 1090, nesting depth: 3, estimated iterations: 100\n #1091                  for (int ch_mult_idx = 0; ch_mult_idx < ch_mult - 7; ch_mult_idx += 8) {\n    l32i    a13,a1,116                  # [0]  gra_spill_temp_200\n    blti    a13,1,.Lt_9_10754           # [2]\n\n.LBB12_esp_nn_depthwise_conv_s16_mult8: # 0x161f\n#<loop> Part of loop body line 1090, head labeled .Lt_9_10498\n    l32i    a2,a1,96                    # [0]  gra_spill_temp_195\n    movi.n  a14,0                   # [1]\n    s32i    a14,a1,132                  # [2]  gra_spill_temp_204\n    j       .Lt_9_11266                     # [3]\n\n.Lt_9_11522:    # 0x162a\n    l32i    a9,a1,128                   # [0]  gra_spill_temp_203\n    ee.st.qacc_l.l.128.ip   a9,16       # [2]  id:257\n    ee.st.qacc_l.h.32.ip    a9,0        # [3]  id:258\n    l8ui    a10,a1,15                   # [4]  qacc_scratch+15\n    l16ui   a8,a1,10                    # [5]  qacc_scratch+10\n    l8ui    a13,a1,16                   # [6]  qacc_scratch+16\n    l8ui    a12,a1,6                    # [7]  qacc_scratch+6\n    l8ui    a11,a1,5                    # [8]  qacc_scratch+5\n    s8i     a11,a1,2                    # [9]  qacc_scratch+2\n    s8i     a12,a1,3                    # [10]  qacc_scratch+3\n    s8i     a13,a1,7                    # [11]  qacc_scratch+7\n    s16i    a8,a1,4                     # [12]  qacc_scratch+4\n    s8i     a10,a1,6                    # [13]  qacc_scratch+6\n\n    
movi.n  a8,16                   # [14]\n    ee.st.qacc_h.l.128.ip   a9,16       # [15]  id:268\n    ee.st.qacc_h.h.32.ip    a9,-32      # [16]  id:269\n    ee.srcmb.s16.qacc   q1,a8,0         # [17]\n    l16ui   a13,a1,26                   # [18]  qacc_scratch+26\n    l8ui    a15,a1,32                   # [19]  qacc_scratch+32\n    l8ui    a12,a1,22                   # [20]  qacc_scratch+22\n    l8ui    a11,a1,21                   # [21]  qacc_scratch+21\n    l16ui   a10,a1,16                   # [22]  qacc_scratch+16\n    l8ui    a14,a1,31                   # [23]  qacc_scratch+31\n    s8i     a14,a1,14                   # [24]  qacc_scratch+14\n    s16i    a10,a1,8                    # [25]  qacc_scratch+8\n    s8i     a11,a1,10                   # [26]  qacc_scratch+10\n    s8i     a12,a1,11                   # [27]  qacc_scratch+11\n    s8i     a15,a1,15                   # [28]  qacc_scratch+15\n    s16i    a13,a1,12                   # [29]  qacc_scratch+12\n #1138                      EE_VZIP_16(q0, q1); /* 4x32 */\n #1139\n #1140                      if (bias) {\n    l32i            a15,a1,112                  # [30]  gra_spill_temp_199\n    ee.vld.128.ip   q0,a9,0             # [31]  id:281\n    s32i            a9,a1,128                   # [32]  gra_spill_temp_203\n    ee.vzip.16      q0,q1               # [33]\n    beqz.n          a15,.Lt_9_13570         # [34]\n\n.LBB23_esp_nn_depthwise_conv_s16_mult8: # 0x168e\n#<loop> Part of loop body line 1091, head labeled .Lt_9_11266\n    addi            a14,a1,112                  # [0]\n    l32i            a8,a1,104                   # [1]  gra_spill_temp_197\n    l32i            a15,a1,120                  # [2]  gra_spill_temp_201\n    wur.sar_byte    a8                  # [3]\n    ee.vld.128.ip   q3,a15,16           # [4]  id:284\n    ee.vld.128.ip   q6,a15,16           # [5]  id:285\n    ee.vld.128.ip   q4,a15,0            # [6]  id:286\n    s32i            a15,a1,120                  # [7]  
gra_spill_temp_201\n    ee.src.q.qup    q5,q3,q6            # [8]\n    ee.vadds.s32    q0,q0,q5            # [9]\n    ee.src.q.qup    q2,q3,q4            # [10]\n    ee.vadds.s32    q1,q1,q2            # [11]\n    st.qr           q1,a14,96                   # [12]  gra_spill_temp_219-112\n\n.Lt_9_13570:    # 0x16b5\n #1158  q0 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q0, out_mult_ptr, out_shift_ptr);\n    l32i    a10,a1,136                  # [0]  gra_spill_temp_205\n    l32i    a11,a1,140                  # [1]  gra_spill_temp_206\n    addi    a9,a1,112                   # [2]\n    st.qr   q1,a9,96                    # [3]  gra_spill_temp_219-112\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n #1159                      out_mult_ptr += 4;\n #1160                      out_shift_ptr += 4;\n #1161\n #1162  q1 = esp_nn_multiply_by_quantized_mult_ver1_esp32s3(q1, out_mult_ptr, out_shift_ptr);\n    l32i    a11,a1,140                  # [0]  gra_spill_temp_206\n    addi    a12,a1,112                  # [1]\n    l32i    a10,a1,136                  # [2]  gra_spill_temp_205\n    st.qr   q0,a12,80                   # [3]  gra_spill_temp_218-112\n    ld.qr   q0,a12,96                   # [4]  gra_spill_temp_219-112\n    addi    a10,a10,16                  # [5]\n    addi    a11,a11,16                  # [6]\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    addi.n  a2,a2,8                 # [0]\n    l32i    a14,a1,116                  # [1]  gra_spill_temp_200\n    l32i    a15,a1,124                  # [2]  gra_spill_temp_202\n    l32i    a13,a1,132                  # [3]  gra_spill_temp_204\n    l32i    a10,a1,140                  # [4]  gra_spill_temp_206\n    l32i    a11,a1,136                  # [5]  gra_spill_temp_205\n    addmi   a9,a1,256                   # [6]\n    addi    a8,a1,112                   # [7]\n    ld.qr   q7,a8,80                    # [8]  gra_spill_temp_218-112\n    addi    a9,a9,56                    # [9]\n    
ee.vldbc.32 q2,a9               # [10]  id:290 activation_max\n    addi    a11,a11,32                  # [11]\n    addi    a10,a10,32                  # [12]\n    addi.n  a13,a13,8               # [13]\n    s32i    a13,a1,132                  # [14]  gra_spill_temp_204\n    s32i    a10,a1,140                  # [15]  gra_spill_temp_206\n    s32i    a11,a1,136                  # [16]  gra_spill_temp_205\n    addmi   a10,a1,256                  # [17]\n    addmi   a11,a1,256                  # [18]\n    addi    a11,a11,52                  # [19]\n    addi    a10,a10,40                  # [20]\n    ee.vldbc.32     q3,a10              # [21]  id:289 out_offset\n    ee.vldbc.32     q1,a11              # [22]  id:291 activation_min\n    ee.vadds.s32    q0,q0,q3            # [23]\n    ee.vadds.s32    q7,q7,q3            # [24]\n    ee.vmin.s32     q7,q7,q2            # [25]\n    ee.vmin.s32     q0,q0,q2            # [26]\n    ee.vmax.s32     q0,q0,q1            # [27]\n    ee.vmax.s32     q7,q7,q1            # [28]\n    ee.vunzip.16    q7,q0               # [29]\n    ee.vunzip.8     q7,q0               # [30]\n    ee.vst.l.64.ip  q7,a15,8        # [31]  id:292\n    s32i            a15,a1,124                  # [32]  gra_spill_temp_202\n    bge             a13,a14,.Lt_9_10754         # [33]\n\n.Lt_9_11266:    # 0x1740\n\n    ee.zero.qacc                    # [0]\n    l32i    a12,a1,108                  # [1]  gra_spill_temp_198\n    s32i    a12,a1,180                  # [2]  gra_spill_temp_216\n    bge a12,a7,.Lt_9_11522          # [3]\n\n    mull    a15,a12,a4                  # [0]\n    l32i    a14,a1,100                  # [1]  gra_spill_temp_196\n    add.n   a8,a15,a5                   # [2]\n    add.n   a14,a14,a12                 # [3]\n    mull    a14,a3,a14                  # [4]\n    s32i    a8,a1,176                   # [5]  gra_spill_temp_215\n    bge     a6,a5,.Lt_9_12290           # [6]\n\n.LBB18_esp_nn_depthwise_conv_s16_mult8: # 0x175f\n#<loop> Part of loop 
body line 1091, head labeled .Lt_9_11266\n    l32i    a10,a1,184                  # [0]  gra_spill_temp_217\n    l32i    a11,a1,172                  # [1]  gra_spill_temp_214\n    l32i    a12,a1,168                  # [2]  gra_spill_temp_213\n    l32i    a8,a1,148                   # [3]  gra_spill_temp_208\n    add.n   a9,a15,a6                   # [4]\n    mull    a8,a8,a9                    # [5]\n    add.n   a12,a12,a6                  # [6]\n    l32i    a9,a1,160                   # [7]  gra_spill_temp_211\n    add.n   a12,a14,a12                 # [8]\n    mull    a11,a11,a12                 # [9]\n    add.n   a8,a2,a8                    # [10]\n    l32i    a12,a1,156                  # [11]  gra_spill_temp_210\n    addx2   a8,a8,a9                    # [12]\n    add.n   a10,a10,a11                 # [13]\n    l32i    a11,a1,144                  # [14]  gra_spill_temp_207\n    l32i    a9,a1,164                   # [15]  gra_spill_temp_212\n    addx2   a10,a10,a11                 # [16]\n    l32i    a11,a1,152                  # [17]  gra_spill_temp_209\n    loopgtz a9,.LBB45_esp_nn_depthwise_conv_s16_mult8   # [18]\n\n    mov.n           a9,a8                       # [0*II+0]\n    ee.vldbc.16     q0,a10              # [0*II+1]  id:255\n    ee.vld.128.ip   q1,a9,0             # [0*II+2]  id:254\n    add.n           a10,a10,a12                 # [0*II+3]\n    add.n           a8,a8,a11                   # [0*II+4]\n    ee.vmulas.s16.qacc  q0,q1       # [0*II+5]\n\n.LBB45_esp_nn_depthwise_conv_s16_mult8: # 0x17a2\n\n.Lt_9_12290:    # 0x17a2\n\n    add.n   a14,a14,a3                  # [0]\n    add.n   a15,a15,a4                  # [1]\n    l32i    a10,a1,180                  # [2]  gra_spill_temp_216\n    l32i    a11,a1,176                  # [3]  gra_spill_temp_215\n    addi.n  a10,a10,1               # [4]\n    add.n   a11,a11,a4                  # [5]\n    s32i    a11,a1,176                  # [6]  gra_spill_temp_215\n    s32i    a10,a1,180                  # 
[7]  gra_spill_temp_216\n    sub     a10,a7,a10                  # [8]\n    beqz    a10,.Lt_9_11522             # [9]\n\n.Lt_9_12034:    # 0x17bc\n    blt     a6,a5,.LBB18_esp_nn_depthwise_conv_s16_mult8    # [0]\n\n    j       .Lt_9_12290                     # [0]\n\n.Lt_9_8450: # 0x17c2\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s16_mult8_esp32s3, . - esp_nn_depthwise_conv_s16_mult8_esp32s3\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s8_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdio.h>\n#include <esp_nn_defs.h>\n\n#include <common_functions.h>\n\nstatic int16_t *scratch_buffer = NULL;\n\nextern void esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(const int16_t *input_data,\n                                                        const uint16_t input_wd,\n                                                        const uint16_t input_ht,\n                                                        const uint16_t channels,\n                                                        const uint16_t pad_wd,\n                                                        const uint16_t pad_ht,\n                                                        const uint16_t stride_wd,\n                                                        const uint16_t stride_ht,\n                                                        const uint16_t ch_mult,\n                                                        const int16_t *filter_data,\n                                                        const int32_t *bias,\n                                                        int8_t *out_data,\n                                                        const uint16_t out_wd,\n                                                        const uint16_t out_ht,\n                                                        const int32_t out_offset,\n                                                        const int32_t *out_shift,\n                                                        const int32_t *out_mult,\n                                                        const int32_t activation_min,\n                                                        const int32_t activation_max);\n\nextern void esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(const int8_t *input_data,\n                                                              const uint16_t 
input_wd,\n                                                              const uint16_t input_ht,\n                                                              const uint16_t channels,\n                                                              const int32_t input_offset,\n                                                              const uint16_t stride_wd,\n                                                              const uint16_t stride_ht,\n                                                              const int8_t *filter_data,\n                                                              const int32_t *bias,\n                                                              int8_t *out_data,\n                                                              const uint16_t out_wd,\n                                                              const uint16_t out_ht,\n                                                              const int32_t out_offset,\n                                                              const int32_t *out_shift,\n                                                              const int32_t *out_mult,\n                                                              const int32_t activation_min,\n                                                              const int32_t activation_max);\n\nextern void esp_nn_depthwise_conv_s16_mult1_3x3_no_pad_esp32s3(const int16_t *input_data,\n                                                               const uint16_t input_wd,\n                                                               const uint16_t input_ht,\n                                                               const uint16_t channels,\n                                                               const uint16_t stride_wd,\n                                                               const uint16_t stride_ht,\n                                                               const int16_t *filter_data,\n                                
                               const int32_t *bias,\n                                                               int8_t *out_data,\n                                                               const uint16_t out_wd,\n                                                               const uint16_t out_ht,\n                                                               const int32_t out_offset,\n                                                               const int32_t *out_shift,\n                                                               const int32_t *out_mult,\n                                                               const int32_t activation_min,\n                                                               const int32_t activation_max);\n\nextern void esp_nn_depthwise_conv_s16_mult8_esp32s3(const int16_t *input_data,\n                                                    const uint16_t input_wd,\n                                                    const uint16_t input_ht,\n                                                    const uint16_t channels,\n                                                    const uint16_t pad_wd,\n                                                    const uint16_t pad_ht,\n                                                    const uint16_t stride_wd,\n                                                    const uint16_t stride_ht,\n                                                    const uint16_t ch_mult,\n                                                    const int16_t *filter_data,\n                                                    const uint16_t filter_wd,\n                                                    const uint16_t filter_ht,\n                                                    const int32_t *bias,\n                                                    int8_t *out_data,\n                                                    const uint16_t out_wd,\n                                                    const uint16_t 
out_ht,\n                                                    const int32_t out_offset,\n                                                    const int32_t *out_shift,\n                                                    const int32_t *out_mult,\n                                                    const int32_t activation_min,\n                                                    const int32_t activation_max);\n\nextern void esp_nn_depthwise_conv_s16_mult4_esp32s3(const int16_t *input_data,\n                                                    const uint16_t input_wd,\n                                                    const uint16_t input_ht,\n                                                    const uint16_t channels,\n                                                    const uint16_t pad_wd,\n                                                    const uint16_t pad_ht,\n                                                    const uint16_t stride_wd,\n                                                    const uint16_t stride_ht,\n                                                    const uint16_t ch_mult,\n                                                    const int16_t *filter_data,\n                                                    const uint16_t filter_wd,\n                                                    const uint16_t filter_ht,\n                                                    const int32_t *bias,\n                                                    int8_t *out_data,\n                                                    const uint16_t out_wd,\n                                                    const uint16_t out_ht,\n                                                    const int32_t out_offset,\n                                                    const int32_t *out_shift,\n                                                    const int32_t *out_mult,\n                                                    const int32_t activation_min,\n                              
                      const int32_t activation_max);\n\nextern void esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(const int16_t *input_data,\n                                                        const uint16_t input_wd,\n                                                        const uint16_t input_ht,\n                                                        const uint16_t channels,\n                                                        const uint16_t pad_wd,\n                                                        const uint16_t pad_ht,\n                                                        const uint16_t stride_wd,\n                                                        const uint16_t stride_ht,\n                                                        const int16_t *filter_data,\n                                                        const int32_t *bias,\n                                                        int8_t *out_data,\n                                                        const uint16_t out_wd,\n                                                        const uint16_t out_ht,\n                                                        const int32_t out_offset,\n                                                        const int32_t *out_shift,\n                                                        const int32_t *out_mult,\n                                                        const int32_t activation_min,\n                                                        const int32_t activation_max);\n\nextern void esp_nn_depthwise_conv_s16_mult1_esp32s3(const int16_t *input_data,\n                                                    const uint16_t input_wd,\n                                                    const uint16_t input_ht,\n                                                    const uint16_t channels,\n                                                    const uint16_t pad_wd,\n                                                    const uint16_t 
pad_ht,\n                                                    const uint16_t stride_wd,\n                                                    const uint16_t stride_ht,\n                                                    const int16_t *filter_data,\n                                                    const uint16_t filter_wd,\n                                                    const uint16_t filter_ht,\n                                                    const int32_t *bias,\n                                                    int8_t *out_data,\n                                                    const uint16_t out_wd,\n                                                    const uint16_t out_ht,\n                                                    const int32_t out_offset,\n                                                    const int32_t *out_shift,\n                                                    const int32_t *out_mult,\n                                                    const int32_t activation_min,\n                                                    const int32_t activation_max);\n\nextern void esp_nn_s8_to_s16_esp32s3(const int8_t *src, int16_t *dst, const int size);\n\nextern void esp_nn_aligned_s8_to_s16_with_offset_esp32s3(const int8_t *src, int16_t *dst,\n                                                         const int size, const int32_t offset);\n\nstatic void esp_nn_depthwise_conv_s8_unrolled(const int8_t *input_data,\n                                              const uint16_t input_wd,\n                                              const uint16_t input_ht,\n                                              const uint16_t channels,\n                                              const int32_t input_offset,\n                                              const uint16_t pad_wd,\n                                              const uint16_t pad_ht,\n                                              const uint16_t stride_wd,\n                                   
           const uint16_t stride_ht,\n                                              const uint16_t ch_mult,\n                                              const int8_t *filter_data,\n                                              const uint16_t filter_wd,\n                                              const uint16_t filter_ht,\n                                              const int32_t *bias,\n                                              int8_t *out_data,\n                                              const uint16_t out_wd,\n                                              const uint16_t out_ht,\n                                              const int32_t out_offset,\n                                              const int32_t *out_shift,\n                                              const int32_t *out_mult,\n                                              const int32_t activation_min,\n                                              const int32_t activation_max)\n{\n    int out_idx = 0;\n    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop\n                int ch_mult_idx = 0;\n                for (; ch_mult_idx < ch_mult - 3; ch_mult_idx += 4) {\n                    int32_t result0 = 0, result1 = 0, result2 = 0, result3 = 0;\n                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;\n\n                    /* Select filter so as the point doesn't lie outside block */\n                    int filter_y_start = max(0, -base_y);\n                    int filter_x_start = max(0, -base_x);\n                    int filter_y_end = min(filter_ht, input_ht - base_y);\n                    int filter_x_end = min(filter_wd, input_wd - base_x);\n\n                    for (int 
filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                        const int32_t idx_y = base_y + filter_y_idx;\n                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                            const int32_t idx_x = base_x + filter_x_idx;\n                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;\n                            int32_t input_val = input_data[input_index] + input_offset;\n                            int32_t filter_val0 = filter_data[filter_index + 0];\n                            int32_t filter_val1 = filter_data[filter_index + 1];\n                            int32_t filter_val2 = filter_data[filter_index + 2];\n                            int32_t filter_val3 = filter_data[filter_index + 3];\n                            result0 += input_val * filter_val0;\n                            result1 += input_val * filter_val1;\n                            result2 += input_val * filter_val2;\n                            result3 += input_val * filter_val3;\n                        }\n                    }\n                    if (bias) {\n                        result0 += bias[out_ch_idx + 0];\n                        result1 += bias[out_ch_idx + 1];\n                        result2 += bias[out_ch_idx + 2];\n                        result3 += bias[out_ch_idx + 3];\n                    }\n                    result0 = esp_nn_multiply_by_quantized_mult(result0,\n                                out_mult[out_ch_idx + 0], out_shift[out_ch_idx + 0]);\n                    result1 = esp_nn_multiply_by_quantized_mult(result1,\n                                out_mult[out_ch_idx + 1], out_shift[out_ch_idx + 1]);\n                    result2 = esp_nn_multiply_by_quantized_mult(result2,\n                        
        out_mult[out_ch_idx + 2], out_shift[out_ch_idx + 2]);\n                    result3 = esp_nn_multiply_by_quantized_mult(result3,\n                                out_mult[out_ch_idx + 3], out_shift[out_ch_idx + 3]);\n\n                    result0 += out_offset;\n                    result1 += out_offset;\n                    result2 += out_offset;\n                    result3 += out_offset;\n\n                    result0 = max(result0, activation_min);\n                    result1 = max(result1, activation_min);\n                    result2 = max(result2, activation_min);\n                    result3 = max(result3, activation_min);\n\n                    result0 = min(result0, activation_max);\n                    result1 = min(result1, activation_max);\n                    result2 = min(result2, activation_max);\n                    result3 = min(result3, activation_max);\n\n                    out_data[out_idx++] = result0;\n                    out_data[out_idx++] = result1;\n                    out_data[out_idx++] = result2;\n                    out_data[out_idx++] = result3;\n                }\n\n                /* left-over */\n                for (; ch_mult_idx < ch_mult; ch_mult_idx++) {\n                    int32_t result = 0;\n                    const int out_ch_idx = ch_mult_idx + ch_idx * ch_mult;\n\n                    /* Select filter so as the point doesn't lie outside block */\n                    int filter_y_start = max(0, -base_y);\n                    int filter_x_start = max(0, -base_x);\n                    int filter_y_end = min(filter_ht, input_ht - base_y);\n                    int filter_x_end = min(filter_wd, input_wd - base_x);\n\n                    for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                        const int32_t idx_y = base_y + filter_y_idx;\n                        for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                
            const int32_t idx_x = base_x + filter_x_idx;\n                            int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                            int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * (channels * ch_mult) + out_ch_idx;\n                            int32_t input_val = input_data[input_index] + input_offset;\n                            int32_t filter_val = filter_data[filter_index];\n                            result += input_val * filter_val;\n                        }\n                    }\n                    if (bias) {\n                        result += bias[out_ch_idx];\n                    }\n                    result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_ch_idx], out_shift[out_ch_idx]);\n                    result += out_offset;\n                    result = max(result, activation_min);\n                    result = min(result, activation_max);\n\n                    out_data[out_idx++] = result;\n                }\n            }\n        }\n    }\n}\n\nvoid esp_nn_depthwise_conv_s8_ch_mult1(const int8_t *input_data,\n                                       const uint16_t input_wd,\n                                       const uint16_t input_ht,\n                                       const uint16_t channels,\n                                       const int32_t input_offset,\n                                       const uint16_t pad_wd,\n                                       const uint16_t pad_ht,\n                                       const uint16_t stride_wd,\n                                       const uint16_t stride_ht,\n                                       const int8_t *filter_data,\n                                       const uint16_t filter_wd,\n                                       const uint16_t filter_ht,\n                                       const int32_t *bias,\n                                       int8_t *out_data,\n                              
         const uint16_t out_wd,\n                                       const uint16_t out_ht,\n                                       const int32_t out_offset,\n                                       const int32_t *out_shift,\n                                       const int32_t *out_mult,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max)\n{\n    int out_idx = 0;\n    for (int out_y = 0; out_y < out_ht; out_y++) { //height loop\n        const int16_t base_y = (out_y * stride_ht) - pad_ht;\n        for (int out_x = 0; out_x < out_wd; out_x++) { //width_loop\n            const int16_t base_x = (out_x * stride_wd) - pad_wd;\n            for (int ch_idx = 0; ch_idx < channels; ch_idx++) {//channel_loop\n                int32_t result = 0;\n                /* Select filter so as the point doesn't lie outside block */\n                int filter_y_start = max(0, -base_y);\n                int filter_x_start = max(0, -base_x);\n                int filter_y_end = min(filter_ht, input_ht - base_y);\n                int filter_x_end = min(filter_wd, input_wd - base_x);\n\n                for (int filter_y_idx = filter_y_start; filter_y_idx < filter_y_end; filter_y_idx++) {\n                    const int32_t idx_y = base_y + filter_y_idx;\n                    for (int filter_x_idx = filter_x_start; filter_x_idx < filter_x_end; filter_x_idx++) {\n                        const int32_t idx_x = base_x + filter_x_idx;\n                        int32_t input_index = (idx_y * input_wd + idx_x) * channels + ch_idx;\n                        int32_t filter_index = (filter_y_idx * filter_wd + filter_x_idx) * channels + ch_idx;\n                        int32_t input_val = input_data[input_index] + input_offset;\n                        int32_t filter_val = filter_data[filter_index];\n                        result += input_val * filter_val;\n                    }\n                }\n                
if (bias) {\n                    result += bias[ch_idx];\n                }\n                result = esp_nn_multiply_by_quantized_mult(result, out_mult[ch_idx], out_shift[ch_idx]);\n                result += out_offset;\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n\n                out_data[out_idx++] = result;\n            }\n        }\n    }\n}\n\nint esp_nn_get_depthwise_conv_scratch_size_esp32s3(const data_dims_t *input_dims,\n                                                   const data_dims_t *filter_dims,\n                                                   const data_dims_t *output_dims,\n                                                   const dw_conv_params_t *conv_params)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t channels = input_dims->channels;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t ch_mult = conv_params->ch_mult;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n\n    int filter_size = filter_wd * filter_ht * channels * ch_mult;\n    int pad_width = 0, pad_height = 0;\n\n    if ((ch_mult == 1) && (channels % 8 == 0)) {\n        if(filter_wd == 3 && filter_ht == 3) {\n            if (channels % 16 == 0) {\n                if (pad_wd || pad_ht) {\n                    pad_width = pad_wd * 2;\n                    pad_height = pad_ht * 2;\n                } else {\n                    pad_width = (out_wd * stride_wd + filter_wd - 1) - input_wd;\n                    pad_height = (out_ht * stride_ht + filter_ht - 1) - input_ht;\n                }\n    
            if (pad_width || pad_height) {\n                    int full_input = (input_wd + pad_width) * (input_ht + pad_height) * channels;\n                    if (full_input <= 40 * 1024) {\n                        return filter_size + full_input + 16;\n                    } else {\n                        /* Tiled: only need filter + strip buffer (filter_ht rows) */\n                        int strip = (input_wd + pad_width) * filter_ht * channels;\n                        return filter_size + strip + 16;\n                    }\n                } else {\n                    return filter_size + 16;\n                }\n            } else if (channels >= 12) {\n                /* ch % 8 == 0, not % 16, ch >= 12: pad channels to 16, s8 path + compaction */\n                int new_ch = (channels + 15) & ~15;\n                int new_filter_size = 9 * new_ch;\n                int total_pad_wd = pad_wd * 2 + max(0, (out_wd * stride_wd + 2) - input_wd);\n                int total_pad_ht = pad_ht * 2 + max(0, (out_ht * stride_ht + 2) - input_ht);\n                int new_input_size = (input_wd + total_pad_wd) * (input_ht + total_pad_ht) * new_ch;\n                int out_buf_size = out_wd * out_ht * new_ch;\n                return new_filter_size + new_input_size + out_buf_size + 64;\n            } else {\n                /* ch=8: s16 path is more efficient (no channel padding overhead) */\n                int input_s = input_wd * input_ht * channels;\n                return  2 * (filter_size + input_s) + 32;\n            }\n        } else {\n            int input_size = input_wd * input_ht * channels;\n            int total_s16 = 2 * (filter_size + input_size);\n            if (total_s16 <= 48 * 1024) {\n                return total_s16 + 32;\n            } else {\n                /* Tiled: only need filter_s16 + tile buffer (filter_ht rows of input s16) */\n                int tile_rows = filter_ht;\n                int tile_s16 = 2 * input_wd * tile_rows * 
channels;\n                return 2 * filter_size + tile_s16 + 32;\n            }\n        }\n    } else if ((ch_mult == 1) && (channels > 3)) {\n        // ch_mult=1, channels>3 case: pad channels to multiple of 8 for mult1\n        int padded_channels = (channels + 7) & ~7;\n        int padded_input_size = input_wd * input_ht * padded_channels;\n        int padded_filter_size = filter_wd * filter_ht * padded_channels;\n\n        // Calculate actual memory layout with 16-byte alignments (matching usage)\n        size_t filter_bytes = padded_filter_size * sizeof(int16_t);\n        size_t input_start = (filter_bytes + 15) & ~15;\n        size_t input_bytes = padded_input_size * sizeof(int16_t);\n        size_t out_start = (input_start + input_bytes + 15) & ~15;\n        size_t out_bytes = out_wd * out_ht * padded_channels * sizeof(int8_t);\n        size_t bias_start = (out_start + out_bytes + 15) & ~15;\n        size_t bias_bytes = padded_channels * sizeof(int32_t);\n        size_t shift_bytes = padded_channels * sizeof(int32_t);\n        size_t mult_bytes = padded_channels * sizeof(int32_t);\n        size_t total_size = bias_start + bias_bytes + shift_bytes + mult_bytes;\n\n        return total_size + 16; // 16 for margin\n    } else if (ch_mult % 4 == 0) {\n        int input_size = input_wd * input_ht * channels;\n        return  2 * (filter_size + input_size) + 32; // 32 for alignment\n    }\n\n    // Default fallback\n    return 32;\n}\n\nvoid esp_nn_set_depthwise_conv_scratch_buf_esp32s3(void *buf)\n{\n    scratch_buffer = (int16_t *) buf;\n}\n\n/**\n * ESP32-S3 optimized depthwise convolution implementation.\n *\n * This function dispatches to various optimized implementations based on:\n * - Channel multiplier (ch_mult)\n * - Number of channels\n * - Filter dimensions\n * - Padding requirements\n *\n * For cases that don't have direct optimized implementations, the function\n * uses data padding techniques to leverage existing optimized functions:\n * - 
ch_mult % 4 != 0 (and ch_mult != 1): Not padded; falls back to esp_nn_depthwise_conv_s8_opt\n * - ch_mult == 1, channels % 8 != 0: If channels > 3, pad channels to next multiple of 8 and use the\n *   s16 mult1 function; otherwise fall back to esp_nn_depthwise_conv_s8_opt\n *\n * Assumption 1: i/p channels == o/p channels\n * Assumption 2: Pointers are valid\n * Assumption 3: dilation width = 1\n */\n\n#include \"esp_nn_generic_opt.h\"\n\nvoid esp_nn_depthwise_conv_s8_esp32s3(const data_dims_t *input_dims,\n                                      const int8_t *input_data,\n                                      const data_dims_t *filter_dims,\n                                      const int8_t *filter_data,\n                                      const int32_t *bias,\n                                      const data_dims_t *output_dims,\n                                      int8_t *out_data,\n                                      const dw_conv_params_t *conv_params,\n                                      const quant_data_t *quant_data)\n{\n    const uint16_t input_wd = input_dims->width;\n    const uint16_t input_ht = input_dims->height;\n    const uint16_t channels = input_dims->channels;\n    const int32_t input_offset = conv_params->in_offset;\n    const int32_t out_offset = conv_params->out_offset;\n    const uint16_t pad_wd = conv_params->padding.width;\n    const uint16_t pad_ht = conv_params->padding.height;\n    const uint16_t stride_wd = conv_params->stride.width;\n    const uint16_t stride_ht = conv_params->stride.height;\n    const uint16_t filter_wd = filter_dims->width;\n    const uint16_t filter_ht = filter_dims->height;\n    const uint16_t out_wd = output_dims->width;\n    const uint16_t out_ht = output_dims->height;\n    const int32_t *out_shift = quant_data->shift;\n    const int32_t *out_mult = quant_data->mult;\n    const int32_t activation_min = conv_params->activation.min;\n    const int32_t activation_max = conv_params->activation.max;\n    const uint16_t ch_mult = conv_params->ch_mult;\n\n    int filter_size = filter_wd * filter_ht * channels * ch_mult;\n    
int align_len = 16 - (filter_size & 15);\n    int input_size = input_wd * input_ht * channels;\n    int16_t *filter_data16 = scratch_buffer;\n    int16_t *input_data16 = scratch_buffer + filter_size + align_len;\n    if (scratch_buffer == NULL) {\n        printf(\"esp_nn_depthwise_conv error! scratch_buffer not set!\\n\");\n        return;\n    }\n\n    if ((ch_mult == 1) && (channels % 8 == 0)) {\n        if ((filter_wd == 3) && (filter_ht == 3)) {\n            if ((channels % 16 == 0) && (pad_wd == 1) && (pad_ht == 1)) {\n                /* process in 8 bits with s8 padded assembly */\n                int8_t *filter_aligned = (int8_t *) scratch_buffer;\n                int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len;\n                memcpy(filter_aligned, filter_data, filter_size);\n\n                int padded_input_size = (input_wd + 2*pad_wd) * (input_ht + 2*pad_ht) * channels;\n                if (padded_input_size <= 40 * 1024) {\n                    /* Small enough — full padding, single assembly call */\n                    esp_nn_aligned_s8_pad_with_value(input_data, input_padded, input_wd, input_ht, channels,\n                                                     -input_offset, pad_wd, pad_ht);\n                    esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + 2 * pad_wd,\n                                                                      input_ht + 2 * pad_ht, channels, input_offset,\n                                                                      stride_wd, stride_ht, filter_aligned, bias,\n                                                                      out_data, out_wd, out_ht, out_offset, out_shift,\n                                                                      out_mult, activation_min, activation_max);\n                } else {\n                    /* Large input: row-tiled processing to reduce cache pressure.\n                     * Pad and process a strip of output rows 
at a time. */\n                    int padded_wd = input_wd + 2 * pad_wd;\n                    int8_t pad_val = (int8_t)(-input_offset);\n\n                    for (int out_y = 0; out_y < out_ht; out_y++) {\n                        int in_y_start = out_y * stride_ht; /* in padded coords (pad_ht already accounted) */\n                        /* Pad filter_ht rows of input into scratch */\n                        int8_t *tile = input_padded;\n                        for (int fy = 0; fy < filter_ht; fy++) {\n                            int src_y = in_y_start + fy - pad_ht; /* original input row */\n                            if (src_y < 0 || src_y >= input_ht) {\n                                /* Padding row */\n                                memset(tile, pad_val, padded_wd * channels);\n                            } else {\n                                /* Left pad */\n                                memset(tile, pad_val, pad_wd * channels);\n                                /* Copy input row */\n                                memcpy(tile + pad_wd * channels,\n                                       input_data + src_y * input_wd * channels,\n                                       input_wd * channels);\n                                /* Right pad */\n                                memset(tile + (pad_wd + input_wd) * channels, pad_val, pad_wd * channels);\n                            }\n                            tile += padded_wd * channels;\n                        }\n                        /* Process one output row */\n                        esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(\n                            input_padded, padded_wd, filter_ht, channels, input_offset,\n                            stride_wd, 1, filter_aligned, bias,\n                            out_data + out_y * out_wd * channels,\n                            out_wd, 1, out_offset, out_shift,\n                            out_mult, activation_min, activation_max);\n                
    }\n                }\n            } else if ((channels % 16 == 0) && (pad_wd == 0) && (pad_ht == 0)) {\n                /* process in 8 bits */\n                int8_t *filter_aligned = (int8_t *) scratch_buffer;\n                int8_t *input_padded = (int8_t *) scratch_buffer + filter_size + align_len;\n\n                // check if we need to pad additionally\n                int pad_right = (out_wd * stride_wd + filter_wd - 1) - input_wd;\n                int pad_bottom = (out_ht * stride_ht + filter_ht - 1) - input_ht;\n                if (pad_right || pad_bottom) { // pad right and bottom\n                    esp_nn_aligned_s8_pad_end_with_value(input_data, input_padded, input_wd, input_ht,\n                                                         channels, -input_offset, pad_right, pad_bottom);\n                } else {\n                    input_padded = (int8_t *) input_data;\n                }\n                memcpy(filter_aligned, filter_data, filter_size);\n                esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(input_padded, input_wd + pad_right,\n                                                                  input_ht + pad_bottom, channels, input_offset,\n                                                                  stride_wd, stride_ht, filter_aligned, bias,\n                                                                  out_data, out_wd, out_ht, out_offset, out_shift,\n                                                                  out_mult, activation_min, activation_max);\n            } else if (channels >= 12) {\n                /* channels % 8 == 0, not % 16, channels >= 12: pad to 16 is worthwhile\n                 * (overhead <= 33%). 
For ch=8, padding to 16 doubles data — use s16 instead */\n                int new_ch = (channels + 15) & ~15;\n                int8_t pad_val = (int8_t)(-input_offset);\n\n                /* Pad filter: 3x3 x new_ch */\n                int new_filter_size = 9 * new_ch;\n                int8_t *filter_padded = (int8_t *) scratch_buffer;\n                memset(filter_padded, 0, new_filter_size);\n                for (int f = 0; f < 9; f++) {\n                    memcpy(filter_padded + f * new_ch, filter_data + f * channels, channels);\n                }\n\n                /* Pad input: (input_wd + 2*pad) x (input_ht + 2*pad) x new_ch */\n                int new_input_wd = input_wd + 2 * pad_wd;\n                int new_input_ht = input_ht + 2 * pad_ht;\n                int pad_right = max(0, (out_wd * stride_wd + 3 - 1) - (input_wd + 2 * pad_wd));\n                int pad_bottom = max(0, (out_ht * stride_ht + 3 - 1) - (input_ht + 2 * pad_ht));\n                new_input_wd += pad_right;\n                new_input_ht += pad_bottom;\n\n                int8_t *input_padded = filter_padded + new_filter_size + 16;\n                int padded_input_total = new_input_wd * new_input_ht * new_ch;\n                /* Fill entire padded input with pad_val first */\n                memset(input_padded, pad_val, padded_input_total);\n                /* Copy actual input data into correct positions */\n                for (int y = 0; y < input_ht; y++) {\n                    for (int x = 0; x < input_wd; x++) {\n                        int dst_y = y + pad_ht;\n                        int dst_x = x + pad_wd;\n                        memcpy(input_padded + (dst_y * new_input_wd + dst_x) * new_ch,\n                               input_data + (y * input_wd + x) * channels, channels);\n                    }\n                }\n\n                /* Padded output buffer */\n                int8_t *out_padded = input_padded + padded_input_total;\n\n                /* Pad quant arrays 
*/\n                int32_t shift_pad[new_ch], mult_pad[new_ch], bias_pad[new_ch];\n                memcpy(shift_pad, out_shift, channels * sizeof(int32_t));\n                memcpy(mult_pad, out_mult, channels * sizeof(int32_t));\n                memset(shift_pad + channels, 0, (new_ch - channels) * sizeof(int32_t));\n                memset(mult_pad + channels, 0, (new_ch - channels) * sizeof(int32_t));\n                if (bias) {\n                    memcpy(bias_pad, bias, channels * sizeof(int32_t));\n                    memset(bias_pad + channels, 0, (new_ch - channels) * sizeof(int32_t));\n                }\n\n                esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3(\n                    input_padded, new_input_wd, new_input_ht, new_ch, input_offset,\n                    stride_wd, stride_ht, filter_padded,\n                    bias ? bias_pad : NULL, out_padded,\n                    out_wd, out_ht, out_offset, shift_pad, mult_pad,\n                    activation_min, activation_max);\n\n                /* Compact output: strip padding channels */\n                for (int pos = 0; pos < out_wd * out_ht; pos++) {\n                    memcpy(out_data + pos * channels,\n                           out_padded + pos * new_ch, channels);\n                }\n            } else {\n                /* ch < 12 (e.g., ch=8), 3x3: use s16 mult1 3x3 path */\n                esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);\n                esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);\n                esp_nn_depthwise_conv_s16_mult1_3x3_esp32s3(input_data16, input_wd, input_ht, channels,\n                                                            pad_wd, pad_ht, stride_wd, stride_ht, filter_data16,\n                                                            bias, out_data, out_wd, out_ht, out_offset, out_shift,\n                                                            out_mult, 
activation_min, activation_max);\n            }\n        } else { // all other ch_mult == 1, channels % 8 == 0\n            /* Tiled s16 processing: convert filter once, process input in row strips\n             * to keep working set within DCache (64KB) */\n            esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);\n\n            /* Check if full conversion fits comfortably in cache */\n            int total_s16_size = 2 * (filter_size + input_size);\n            if (total_s16_size <= 48 * 1024) {\n                /* Small enough — full conversion is fine */\n                esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);\n                esp_nn_depthwise_conv_s16_mult1_esp32s3(input_data16, input_wd, input_ht, channels,\n                                                        pad_wd, pad_ht, stride_wd, stride_ht, filter_data16,\n                                                        filter_wd, filter_ht, bias, out_data, out_wd, out_ht, out_offset, out_shift,\n                                                        out_mult, activation_min, activation_max);\n            } else {\n                /* Large input: process in row tiles to reduce cache pressure.\n                 * Convert only the input rows needed for each output row strip. 
*/\n                int16_t *tile_buf = input_data16; /* reuse scratch for tile */\n\n                for (int out_row = 0; out_row < out_ht; out_row++) {\n                    int in_row_start = out_row * stride_ht - pad_ht;\n                    int in_row_end = in_row_start + filter_ht;\n\n                    /* Fill tile: pad rows that are outside input bounds */\n                    int16_t *dst = tile_buf;\n                    for (int r = in_row_start; r < in_row_end; r++) {\n                        if (r < 0 || r >= input_ht) {\n                            /* Padding row: fill with input_offset */\n                            for (int i = 0; i < input_wd * channels; i++) {\n                                dst[i] = (int16_t)input_offset;\n                            }\n                        } else {\n                            /* Valid row: convert s8 to s16 with offset */\n                            const int8_t *src = input_data + r * input_wd * channels;\n                            for (int i = 0; i < input_wd * channels; i++) {\n                                dst[i] = (int16_t)src[i] + (int16_t)input_offset;\n                            }\n                        }\n                        dst += input_wd * channels;\n                    }\n\n                    /* Process one output row */\n                    esp_nn_depthwise_conv_s16_mult1_esp32s3(tile_buf, input_wd, filter_ht, channels,\n                                                            pad_wd, 0, stride_wd, 1, filter_data16,\n                                                            filter_wd, filter_ht, bias,\n                                                            out_data + out_row * out_wd * channels,\n                                                            out_wd, 1, out_offset, out_shift,\n                                                            out_mult, activation_min, activation_max);\n                }\n            }\n        }\n    } else if ((ch_mult == 1) && 
(channels > 3)) {\n        // For ch_mult=1, pad channels to multiple of 8 for optimized mult1 function\n        int padded_channels = (channels + 7) & ~7; // Round up to multiple of 8\n        int padded_input_size = input_wd * input_ht * padded_channels;\n        int padded_filter_size = filter_wd * filter_ht * padded_channels;\n\n        // Use scratch buffer for padded data (ensure 16-byte alignment for SIMD)\n        int16_t *padded_filter_data16 = (int16_t*)scratch_buffer;\n        size_t input_start = (size_t)(padded_filter_data16 + padded_filter_size);\n        int16_t *padded_input_data16 = (int16_t*)((input_start + 15) & ~15);\n        size_t out_start = (size_t)(padded_input_data16 + padded_input_size);\n        int8_t *padded_out_data = (int8_t*)((out_start + 15) & ~15);\n\n        // Create padded parameter arrays\n        size_t bias_start = (size_t)(padded_out_data + out_wd * out_ht * padded_channels);\n        int32_t *padded_bias = (int32_t*)((bias_start + 15) & ~15);\n        int32_t *padded_shift = padded_bias + padded_channels;\n        int32_t *padded_mult = padded_shift + padded_channels;\n\n        // Initialize padded parameters - copy valid values, set padded ones to safe defaults\n        memset(padded_bias, 0, padded_channels * sizeof(int32_t));\n        memset(padded_shift, 0, padded_channels * sizeof(int32_t));\n        memset(padded_mult, 0, padded_channels * sizeof(int32_t));\n\n        if (bias) {\n            memcpy(padded_bias, bias, channels * sizeof(int32_t));\n        }\n        if (out_shift) {\n            memcpy(padded_shift, out_shift, channels * sizeof(int32_t));\n        }\n        if (out_mult) {\n            memcpy(padded_mult, out_mult, channels * sizeof(int32_t));\n        }\n\n        // Convert filter data to padded layout (zero out extra channels)\n        memset(padded_filter_data16, 0, padded_filter_size * sizeof(int16_t));\n        for (int c = 0; c < channels; c++) {\n            for (int fy = 0; fy < filter_ht; 
fy++) {\n                for (int fx = 0; fx < filter_wd; fx++) {\n                    int orig_idx = (fy * filter_wd + fx) * channels + c;\n                    int padded_idx = (fy * filter_wd + fx) * padded_channels + c;\n                    padded_filter_data16[padded_idx] = (int16_t) filter_data[orig_idx];\n                }\n            }\n        }\n\n        // Convert input data to padded layout (zero out extra channels, apply offset)\n        memset(padded_input_data16, 0, padded_input_size * sizeof(int16_t));\n        for (int h = 0; h < input_ht; h++) {\n            for (int w = 0; w < input_wd; w++) {\n                for (int c = 0; c < channels; c++) {\n                    int orig_idx = (h * input_wd + w) * channels + c;\n                    int padded_idx = (h * input_wd + w) * padded_channels + c;\n                    padded_input_data16[padded_idx] = (int16_t) input_data[orig_idx] + input_offset;\n                }\n            }\n        }\n\n        // Call mult1 with padded data\n        esp_nn_depthwise_conv_s16_mult1_esp32s3(padded_input_data16, input_wd, input_ht, padded_channels,\n                                                pad_wd, pad_ht, stride_wd, stride_ht, padded_filter_data16,\n                                                filter_wd, filter_ht, padded_bias, padded_out_data, out_wd, out_ht, out_offset, padded_shift,\n                                                padded_mult, activation_min, activation_max);\n\n        // Copy back only valid channels\n        for (int h = 0; h < out_ht; h++) {\n            for (int w = 0; w < out_wd; w++) {\n                for (int c = 0; c < channels; c++) {\n                    int out_idx = (h * out_wd + w) * channels + c;\n                    int padded_idx = (h * out_wd + w) * padded_channels + c;\n                    out_data[out_idx] = padded_out_data[padded_idx];\n                }\n            }\n        }\n    } else if (ch_mult % 8 == 0) {\n        // Channel multiplier is optimized 
multiple - use direct s16 functions\n        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);\n        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);\n        if (filter_wd == 3 && filter_ht == 3) {\n            esp_nn_depthwise_conv_s16_mult8_3x3_esp32s3(input_data16, input_wd, input_ht, channels,\n                                                        pad_wd, pad_ht, stride_wd, stride_ht, ch_mult,\n                                                        filter_data16, bias,\n                                                        out_data, out_wd, out_ht, out_offset, out_shift,\n                                                        out_mult, activation_min, activation_max);\n        } else {\n            esp_nn_depthwise_conv_s16_mult8_esp32s3(input_data16, input_wd, input_ht, channels,\n                                                    pad_wd, pad_ht, stride_wd, stride_ht, ch_mult,\n                                                    filter_data16, filter_wd, filter_ht, bias,\n                                                    out_data, out_wd, out_ht, out_offset, out_shift,\n                                                    out_mult, activation_min, activation_max);\n        }\n    } else if (ch_mult % 4 == 0) {\n        esp_nn_s8_to_s16_esp32s3(filter_data, filter_data16, filter_size);\n        esp_nn_aligned_s8_to_s16_with_offset_esp32s3(input_data, input_data16, input_size, input_offset);\n        esp_nn_depthwise_conv_s16_mult4_esp32s3(input_data16, input_wd, input_ht, channels,\n                                                pad_wd, pad_ht, stride_wd, stride_ht, ch_mult,\n                                                filter_data16, filter_wd, filter_ht, bias,\n                                                out_data, out_wd, out_ht, out_offset, out_shift,\n                                                out_mult, activation_min, activation_max);\n    } else {\n        
esp_nn_depthwise_conv_s8_opt(input_dims, input_data, filter_dims, filter_data, bias,\n                                     output_dims, out_data, conv_params, quant_data);\n    }\n}\n"
  },
  {
    "path": "src/convolution/esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .literal_position\n\n// processes multiple of 16 channels\n// already padded version. no additional padding needed\n// simply keep sliding filter window by stride_size\n\n    # Program Unit: esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3\n    .type   esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3, @function\n    .align   4\n    .global esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3\n\nesp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3:  # 0xccc\n    # qacc_scratch = 0\n    # gra_spill_temp_103 = 40 // stride_wd*channels\n    # gra_spill_temp_104 = 44 // bias_align\n    # gra_spill_temp_107 = 48 // input_offset\n    # gra_spill_temp_105 = 52 // out_mult_ptr\n    # gra_spill_temp_106 = 56 // out_shift_ptr\n    # gra_spill_temp_108 = 60 // ch_idx\n    # gra_spill_temp_109 = 64 // out_ch\n    # gra_spill_temp_110 = 68 // bias_ptr\n    # gra_spill_temp_111 = 72 // 2 * (input_wd * channels)\n    # gra_spill_temp_112 = 76 // input_data\n    # gra_spill_temp_118 = 96\n    # gra_spill_temp_119 = 100\n    # gra_spill_temp_120 = 104\n    # gra_spill_temp_121 = 108\n    # gra_spill_temp_113 = 112 // input_wd * channels\n    # gra_spill_temp_114 = 116 // input_wd\n    # gra_spill_temp_130 = 120\n\n    # gra_spill_temp_141 = 0\n    # gra_spill_temp_120 = 16\n    # gra_spill_temp_137 = 80\n\n// offset+bias factor\n   
 # gra_spill_temp_134 = 128 //256-128\n    # gra_spill_temp_135 = 144 //256-112\n    # gra_spill_temp_133 = 160 //256-96\n    # gra_spill_temp_132 = 176 //256-80\n\n\n // registers:\n // a2: input_data\n // a3: input_wd\n // a4: input_ht\n // a5: channels\n // a6: input_offset\n // a7: stride_wd\n\n // on stack:\n\n // 320: stride_ht\n // 324: filter_data\n // 328: *bias\n // 332: *out_data\n // 336: out_wd\n // 340: out_ht\n // 344: out_offset\n // 348: *out_shift\n // 352: *out_mult\n // 356: activation_min\n // 360: activation_max\n\n    entry   a1,320                      #\n    mul16u  a7,a7,a5\n    s32i    a3,a1,116                   # [0]  gra_spill_temp_114, input_wd\n    s32i    a6,a1,48                    # [1]  gra_spill_temp_107, input_offset\n    s32i    a7,a1,40                    # gra_spill_temp_103, stride_wd*channels\n\n    addi    a8,a5,-15                   # [2]\n    s32i    a2,a1,76                    # [3]  gra_spill_temp_112, input_data\n    l32i    a9,a1,328                   # [4]  id:664 bias+0x0\n    mov.n   a2,a5                       # [5]\n    s32i    a8,a1,64                    # [7]  gra_spill_temp_109\n    s32i    a9,a1,68                    # [8]  gra_spill_temp_110, bias_ptr\n    blti    a8,1,.Lt_7_4610             # [9]\n\n    l32i    a12,a1,348                  # [4]  id:666 out_shift+0x0\n    mul16u  a15,a3,a5               # [1]\n    movi.n  a9,0                    # [13]\n    s32i    a12,a1,56                   # [9]  gra_spill_temp_106 // out_shift_ptr\n    s32i    a9,a1,60                    # [14]  gra_spill_temp_108, ch_idx\n    s32i    a15,a1,112                  # [12]  gra_spill_temp_113, input_wd*channels\n    l32i    a9,a1,352                   # [24]  id:665 out_mult+0x0\n    slli    a15,a15,1                   # [15]\n    s32i    a15,a1,72                   # [23]  gra_spill_temp_111, 2 * (input_wd * channels)\n    s32i    a9,a1,52                    # [25]  gra_spill_temp_105, out_mult_ptr\n\n// outer most out_ch 
loop\n.Lt_7_5122: # 0xd57\n    l32i            a13,a1,324                  # [1]  filter_data\n    l32i            a6,a1,60                    # [2]  gra_spill_temp_108, ch_idx\n    l32i            a9,a1,48                    # [0]  gra_spill_temp_107, input_offset\n    ee.zero.q       q2                      # [3]\n    add.n           a13,a6,a13                  # [4]\n    s32i            a13,a1,108                  # [5]  gra_spill_temp_121\n\n// multiply accumulate filter points\n    ee.vld.128.xp   q1,a13,a2           # [6]  id:673\n    ee.vld.128.xp   q3,a13,a2           # [7]  id:674\n    ee.vcmp.lt.s8   q0,q1,q2            # [8]\n    ee.vcmp.lt.s8   q4,q3,q2            # [9]\n    ee.vzip.8       q1,q0                   # [10]\n    ee.vzip.8       q3,q4                   # [11]\n    ee.vadds.s16    q0,q0,q4            # [12]\n    ee.vld.128.xp   q4,a13,a2           # [13]  id:675\n    ee.vadds.s16    q1,q1,q3            # [14]\n    ee.vcmp.lt.s8   q3,q4,q2            # [15]\n    ee.vzip.8       q4,q3                   # [16]\n    ee.vadds.s16    q1,q1,q4            # [17]\n    ee.vld.128.xp   q4,a13,a2           # [18]  id:676\n    ee.vadds.s16    q0,q0,q3            # [19]\n    ee.vcmp.lt.s8   q3,q4,q2            # [20]\n    ee.vzip.8       q4,q3                   # [21]\n    ee.vadds.s16    q0,q0,q3            # [22]\n    ee.vld.128.xp   q3,a13,a2           # [23]  id:677\n    ee.vadds.s16    q1,q1,q4            # [24]\n    ee.vcmp.lt.s8   q4,q3,q2            # [25]\n    ee.vzip.8       q3,q4                   # [26]\n    ee.vadds.s16    q1,q1,q3            # [27]\n    ee.vld.128.xp   q3,a13,a2           # [28]  id:678\n    ee.vadds.s16    q0,q0,q4            # [29]\n    ee.vcmp.lt.s8   q4,q3,q2            # [30]\n    ee.vzip.8       q3,q4                   # [31]\n    ee.vadds.s16    q0,q0,q4            # [32]\n    ee.vld.128.xp   q4,a13,a2           # [33]  id:679\n    ee.vadds.s16    q1,q1,q3            # [34]\n    ee.vcmp.lt.s8   q3,q4,q2            # 
[35]\n    ee.vzip.8       q4,q3                   # [36]\n    ee.vadds.s16    q1,q1,q4            # [37]\n    ee.vld.128.xp   q4,a13,a2           # [38]  id:680\n    ee.vadds.s16    q0,q0,q3            # [39]\n    ee.vcmp.lt.s8   q3,q4,q2            # [40]\n    ee.vzip.8       q4,q3                   # [41]\n    ee.vadds.s16    q0,q0,q3            # [42]\n    ee.vld.128.xp   q3,a13,a2           # [44]  id:681\n    ee.vadds.s16    q1,q1,q4            # [43]\n    ee.vcmp.lt.s8   q2,q3,q2            # [47]\n    ee.vzip.8       q3,q2                   # [48]\n    ee.vadds.s16    q0,q0,q2            # [49]\n    ee.vadds.s16    q1,q1,q3            # [50]\n\n    ee.movi.32.a    q1,a15,1            # [51]\n    ee.movi.32.a    q1,a8,3             # [52]\n    ee.movi.32.a    q0,a10,3            # [54]\n    ee.movi.32.a    q0,a13,1            # [55]\n    srai            a11,a10,16                  # [56]\n    srai            a12,a8,16                   # [57]\n    mull            a12,a9,a12                  # [58]\n    mull            a11,a9,a11                  # [59]\n    sext            a8,a8,15                    # [328]\n    sext            a10,a10,15                  # [61]\n    srai            a14,a13,16                  # [62]\n    mull            a14,a9,a14                  # [63]\n    mull            a10,a9,a10                  # [64]\n    mull            a8,a9,a8                    # [65]\n    sext            a13,a13,15                  # [66]\n    mull            a13,a9,a13                  # [67]\n    ee.movi.32.q    q3,a11,3            # [68]\n    ee.movi.32.q    q4,a12,3            # [69]\n    ee.movi.32.q    q4,a8,2             # [70]\n    ee.movi.32.q    q3,a10,2            # [71]\n    ee.movi.32.a    q1,a11,2            # [72]\n    srai            a12,a11,16                  # [74]\n    srai            a8,a15,16                   # [75]\n    mull            a8,a9,a8                    # [76]\n    mull            a12,a9,a12                  # [77]\n    sext   
         a15,a15,15                  # [78]\n    sext            a11,a11,15                  # [79]\n    mull            a11,a9,a11                  # [80]\n    mull            a15,a9,a15                  # [81]\n    ee.movi.32.q    q4,a12,1            # [82]\n    ee.movi.32.q    q1,a8,3             # [83]\n    ee.movi.32.q    q1,a15,2            # [84]\n    ee.movi.32.q    q4,a11,0            # [85]\n    ee.movi.32.a    q0,a15,2            # [86]\n    ee.movi.32.q    q0,a14,3            # [88]\n    ee.movi.32.q    q0,a13,2            # [91]\n    srai            a8,a15,16                   # [89]\n    mull            a8,a9,a8                    # [90]\n    sext            a15,a15,15                  # [92]\n    mull            a15,a9,a15                  # [93]\n # 526  MUL_IN_OFFSET_EXPAND(q_sum2, 0, q_sum2, 0);\n    ee.movi.32.a    q0,a11,0            # [94]\n    srai            a13,a11,16                  # [95]\n    ee.movi.32.q    q3,a8,1             # [96]\n    ee.movi.32.q    q3,a15,0            # [100]\n    sext            a11,a11,15                  # [97]\n    mull            a13,a9,a13                  # [98]\n    l32i            a8,a1,332                   # [99]\n    ee.movi.32.a    q1,a10,0            # [103]\n    ee.movi.32.q    q0,a13,1            # [100]\n    srai            a12,a10,16                  # [105]\n    sext            a10,a10,15                  # [106]\n    mull            a12,a9,a12                  # [107]\n    mull            a10,a9,a10                  # [108]\n    mull            a9,a9,a11                   # [109]\n    ee.movi.32.q    q1,a12,1            # [110]\n    ee.movi.32.q    q1,a10,0            # [111]\n\n    l32i            a11,a1,328      // load bias\n    add.n           a6,a6,a8                    # [102]\n    ee.movi.32.q    q0,a9,0             # [113]\n    beqz.n          a11,.Lt_7_5378          # [114]\n\n// add bias\n    l32i            a8,a1,68                    # [0]  gra_spill_temp_110, bias_ptr\n    extui    
       a11,a11,0,4                 # [2] // bias_align\n    wur.sar_byte    a11                 # [4]\n    ee.vld.128.ip   q5,a8,16            # [5]  id:683\n    ee.vld.128.ip   q6,a8,16            # [6]  id:684\n    ee.vld.128.ip   q7,a8,16            # [7]  id:685\n    addmi           a10,a1,256                  # [2]\n    ee.src.q.ld.ip  q2,a8,16,q5,q6              # [9]\n    ee.vadds.s32    q1,q1,q5            # [12]\n    ee.src.q.ld.ip  q5,a8,0,q6,q7               # [13]\n    s32i            a8,a1,68                    # [11]  gra_spill_temp_110, bias_ptr\n    ee.vadds.s32    q4,q4,q6            # [18]\n    ee.src.q        q7,q7,q2                # [9]\n    ee.src.q        q2,q2,q5                # [13]\n    ee.vadds.s32    q0,q0,q7            # [12]\n    ee.vadds.s32    q3,q3,q2            # [12]\n.Lt_7_5378: # 0xeef\n\n// store offset+bias factor (q1,q4,q0,q3)\n    st.qr           q4,a10,-112                  # [17]  gra_spill_temp_135-256\n    st.qr           q3,a10,-128                  # [21]  gra_spill_temp_134-256\n    st.qr           q1,a10,-96                  # [7]  gra_spill_temp_133-256\n    st.qr           q0,a10,-80                  # [8]  gra_spill_temp_132-256\n\n// prepare height loop\n    movi.n  a15,0                   # [1]\n    movi.n  a8,0                    # [2]\n    movi.n  a9,0                    # [3]\n    s32i    a9,a1,100                   # [4]  gra_spill_temp_119\n    s32i    a8,a1,104                   # [5]  gra_spill_temp_120\n    s32i    a15,a1,96                  # [6]  gra_spill_temp_118\n\n// height loop\n.Lt_7_6402: # 0xf0c\n    l32i    a4,a1,104                   # [2]  gra_spill_temp_120 // out_y * (input_wd * stride_ht) * channels)\n    l32i    a8,a1,100                   # [3]  gra_spill_temp_119 // initialised to 0 before height loop\n    l32i    a5,a1,76                    # [1]  gra_spill_temp_112, input_data\n    l32i    a3,a1,60                    # [0]  gra_spill_temp_108, ch_idx\n    l32i    a7,a1,112           
        # [1]  gra_spill_temp_113, input_wd*channels\n    l32i    a10,a1,336                  # [0]  out_wd\n    add.n   a4,a4,a5                    # [4] // input_data + (out_y * stride_ht) * input_wd * channels\n    mov.n   a5,a8                       # [5] // index\n    add.n   a3,a3,a4                    # [6] // input_row0\n    l32i    a4,a1,72                    # [9]  gra_spill_temp_111, 2 * (input_wd * channels)\n    add.n   a7,a7,a3                    # [7] // input_row1 = (input_wd * channels)\n    add.n   a8,a8,a10                   # [8]\n    s32i    a8,a1,120                   # [10]  gra_spill_temp_130\n    add.n   a4,a4,a3                    # [11] // input_row2\n\n// width loop\n.Lt_7_7170: # 0xf32\n    l32i                    a9,a1,108                   # [3]  gra_spill_temp_121, filter_ptr\n    ee.zero.qacc                    # [2]\n    mov.n                   a12,a3                      # [4]\n    mov.n                   a11,a7                      # [1]\n    mov.n                   a10,a4                      # [0]\n    ee.vld.128.xp           q0,a12,a2           # [5]  id:693\n    ee.vld.128.xp           q6,a12,a2           # [6]  id:695\n    ee.vld.128.xp           q1,a9,a2            # [7]  id:694\n    ee.vld.128.xp           q7,a9,a2            # [8]  id:696\n    ee.vld.128.xp           q5,a9,a2            # [9]  id:698\n    ee.vld.128.xp           q3,a9,a2            # [10]  id:700\n    ee.vmulas.s8.qacc.ld.xp q4,a12,a2,q0,q1     # [11]  id:697\n    ee.vmulas.s8.qacc.ld.xp q2,a11,a2,q6,q7     # [13]  id:699\n    ee.vld.128.xp           q1,a9,a2            # [14]  id:702\n    ee.vmulas.s8.qacc.ld.xp q0,a11,a2,q4,q5     # [15]  id:701\n    ee.vmulas.s8.qacc.ld.xp q6,a11,a2,q2,q3     # [16]  id:703\n    ee.vld.128.xp           q7,a9,a2            # [17]  id:704\n    ee.vld.128.xp           q3,a9,a2            # [18]  id:706\n    ee.vmulas.s8.qacc.ld.xp q0,a10,a2,q0,q1     # [19]  id:705\n    ee.vmulas.s8.qacc.ld.xp q1,a10,a2,q6,q7     # [20]  
id:707\n    ee.vmulas.s8.qacc.ld.xp q4,a10,a2,q0,q3     # [21]  id:709\n    ee.vld.128.xp           q6,a9,a2            # [22]  id:708\n    ee.vld.128.xp           q5,a9,a2            # [23]  id:710\n    ee.vmulas.s8.qacc       q1,q6           # [24]\n    ee.vmulas.s8.qacc       q4,q5           # [25]\n\n // extract data\n    mov     a12,a1      //// scratch\n    ee.st.qacc_l.l.128.ip   a12,16      # [27]  id:713\n    ee.st.qacc_l.h.32.ip    a12,-16     # [28]  id:714\n\n    l32i.n  a9,a1,8                 # [29]  qacc_scratch+8\n    l32i.n  a11,a1,4                # [30]  qacc_scratch+4\n    l32i.n  a15,a1,0                # [31]  qacc_scratch\n    slli    a14,a11,24                  # [32]\n    sext    a8,a15,19                   # [33]\n    slli    a10,a9,16                   # [34]\n    slli    a13,a11,4                   # [35]\n    extui   a9,a9,16,16                 # [36]\n    srai    a13,a13,12                  # [37]\n    extui   a15,a15,20,12               # [39]\n    srai    a14,a14,12                  # [40]\n    srai    a10,a10,12                  # [41]\n    extui   a11,a11,28,4                # [42]\n    or      a10,a10,a11                 # [43]\n    or      a14,a14,a15                 # [44]\n\n// insert to q0\n    ee.movi.32.q    q0,a8,0             # [38]\n    ee.movi.32.q    q0,a14,1            # [45]\n    ee.movi.32.q    q0,a13,2            # [48]\n    ee.movi.32.q    q0,a10,3            # [49]\n\n    l32i.n  a11,a1,16               # [46]  qacc_scratch+16\n    l32i.n  a14,a1,12               # [47]  qacc_scratch+12\n    slli    a13,a11,20                  # [50]\n\n    ee.st.qacc_h.l.128.ip   a12,16      # [51]  id:720\n    ee.st.qacc_h.h.32.ip    a12,-16     # [55]  id:721\n    srai    a11,a11,12                  # [52]\n    srai    a13,a13,12                  # [53]\n    slli    a8,a14,28                   # [54]\n    slli    a15,a14,8                   # [56]\n    srai    a15,a15,12                  # [57]\n    srai    a8,a8,12             
       # [59]\n\n    l32i.n          a12,a1,8                # [328]  qacc_scratch+8\n    or              a8,a8,a9                    # [61]\n    extui           a14,a14,24,8                # [62]\n    l32i.n          a9,a1,0                 # [63]  qacc_scratch\n    or              a13,a13,a14                 # [64]\n//insert to q3\n    ee.movi.32.q    q3,a8,0             # [65]\n    ee.movi.32.q    q3,a15,1            # [67]\n    ee.movi.32.q    q3,a13,2            # [69]\n    ee.movi.32.q    q3,a11,3            # [70]\n\n    l32i.n          a14,a1,4                # [66]  qacc_scratch+4\n    sext            a10,a9,19                   # [68]\n    extui           a9,a9,20,12                 # [72]\n    slli            a13,a12,16                  # [73]\n    slli            a8,a14,24                   # [74]\n    extui           a12,a12,16,16               # [75]\n    srai            a13,a13,12                  # [76]\n    srai            a8,a8,12                    # [77]\n    slli            a15,a14,4                   # [78]\n    srai            a15,a15,12                  # [79]\n    or              a8,a8,a9                    # [80]\n    extui           a14,a14,28,4                # [81]\n    l32i.n          a9,a1,12                # [82]  qacc_scratch+12\n    or              a13,a13,a14                 # [83]\n// insert to q1\n    ee.movi.32.q    q1,a10,0            # [71]\n    ee.movi.32.q    q1,a8,1             # [84]\n    ee.movi.32.q    q1,a15,2            # [85]\n    ee.movi.32.q    q1,a13,3            # [88]\n\n// load in_offset+bias factor\n    addmi           a14,a1,256                  # [86]\n    ld.qr           q7,a14,-128                  # [87]  gra_spill_temp_134-256\n    ld.qr           q4,a14,-112                  # [89]  gra_spill_temp_135-256\n    l32i.n          a15,a1,16               # [90]  qacc_scratch+16\n    ld.qr           q2,a14,-96                  # [91]  gra_spill_temp_133-256\n    slli            a11,a9,28                   # 
[92]\n    slli            a10,a9,8                    # [93]\n    srai            a10,a10,12                  # [94]\n    srai            a11,a11,12                  # [95]\n    extui           a9,a9,24,8                  # [96]\n    or              a11,a11,a12                 # [97]\n    ee.vadds.s32    q0,q0,q2            # [98]\n    slli            a8,a15,20                   # [99]\n    ee.vadds.s32    q3,q3,q4            # [100]\n    st.qr           q3,a1,80                # [101]  gra_spill_temp_137-256\n    srai            a15,a15,12                  # [102]\n    ld.qr           q2,a14,-80                  # [103]  gra_spill_temp_132-256\n    srai            a8,a8,12                    # [105]\n    or              a8,a8,a9                    # [108]\n\n// insert to q6\n    ee.movi.32.q    q6,a11,0            # [100]\n    ee.movi.32.q    q6,a10,1            # [107]\n    ee.movi.32.q    q6,a8,2             # [112]\n    ee.movi.32.q    q6,a15,3            # [113]\n\n    ee.vadds.s32    q1,q1,q2            # [110]\n    ee.vadds.s32    q6,q6,q7            # [114]\n    st.qr           q1,a1,16                   # [111]  gra_spill_temp_120\n    s32i.n          a7,a1,32                # [0] // tmp\n    s32i.n          a6,a1,36                # [106] // tmp\n    l32i            a7,a1,52                # [109]  gra_spill_temp_105, out_mult_ptr\n    l32i            a6,a1,56                # [106]  gra_spill_temp_106, out_shift_ptr\n    addi.n          a10,a7,0\n    addi.n          a11,a6,0\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [116]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    mv.qr       q5,q0\n    ld.qr       q0,a1,80                # [4]  gra_spill_temp_137-256\n    addi.n      a10,a7,16\n    addi.n      a11,a6,16\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [5]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    mv.qr       q4,q0\n    ld.qr       q0,a1,16                   # [5]  gra_spill_temp_120\n    
addi.n      a10,a7,32\n    addi.n      a11,a6,32\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [6]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n    st.qr       q0,a1,0                 # [3]  gra_spill_temp_141\n    mv.qr       q0,q6\n    addi.n      a10,a7,48\n    addi.n      a11,a6,48\n    call8   esp_nn_multiply_by_quantized_mult_ver1_esp32s3     # [6]  esp_nn_multiply_by_quantized_mult_ver1_esp32s3\n\n\n    l32i.n  a6,a1,36                # [106]  // tmp\n    l32i.n  a7,a1,32                # [0]  // tmp\n    l32i    a15,a1,40                   # gra_spill_temp_103, stride_wd * channels\n    l32i    a11,a1,120                  # [3]  gra_spill_temp_130\n\n    add.n   a3,a3,a15                   # [0]\n    add.n   a4,a4,a15                   # [1]\n    add.n   a7,a7,a15                   # [2]\n    addi.n  a5,a5,1                 # [4]\n\n // add offset, apply activation and store\n    addmi   a13,a1,256                  # [8]\n    ld.qr   q3,a1,0                 # [10]  gra_spill_temp_141\n    mv.qr   q2,q5\n    addi    a8,a13,88                   # [14]\n    addi    a9,a13,100                  # [15]\n    addi    a15,a13,104                 # [13]\n    ee.vldbc.32     q6,a9               # [17]  id:723 activation_min\n    ee.vldbc.32     q1,a8               # [18]  id:722 out_offset\n    ee.vldbc.32     q7,a15              # [19]  id:724 activation_max\n    ee.vadds.s32    q4,q4,q1            # [20]\n    ee.vadds.s32    q2,q2,q1            # [21]\n    ee.vadds.s32    q5,q0,q1            # [22]\n    ee.vadds.s32    q3,q3,q1            # [23]\n    ee.vmin.s32     q3,q3,q7            # [24]\n    ee.vmin.s32     q5,q5,q7            # [25]\n    ee.vmin.s32     q2,q2,q7            # [26]\n    ee.vmin.s32     q4,q4,q7            # [27]\n    ee.vmax.s32     q4,q4,q6            # [28]\n    ee.vmax.s32     q2,q2,q6            # [29]\n    ee.vmax.s32     q5,q5,q6            # [30]\n    ee.vmax.s32     q3,q3,q6            # [31]\n    ee.vunzip.16  
  q3,q5               # [32]\n    ee.vunzip.16    q2,q4               # [33]\n    ee.vunzip.8     q2,q3               # [34]\n    ee.vst.128.xp   q2,a6,a2            # [35]  id:725\n    bne             a5,a11,.Lt_7_7170               # [36]\n\n.Lt_7_6658: # 0x112f\n#<loop> Part of loop body line 548, head labeled .Lt_7_6402\n    l32i    a15,a1,112                  # [3]  gra_spill_temp_113, input_wd*channels\n    l32i    a10,a1,320                  # stride_ht (stack argument at offset 320)\n    l32i    a13,a1,340                  # [0]  // out_ht\n    l32i    a9,a1,116                   # [1]  gra_spill_temp_114, input_wd\n    l32i    a12,a1,96                  # [4]  gra_spill_temp_118\n    mull    a15,a10,a15                 # // (input_wd * stride_ht) * channels\n    l32i    a14,a1,104                  # [5]  gra_spill_temp_120\n    l32i    a8,a1,100                   # [2]  gra_spill_temp_119\n\n    addi.n  a12,a12,1               # [6]\n    s32i    a12,a1,96                  # [7]  gra_spill_temp_118\n    add.n   a14,a14,a15                 # [8]\n    add.n   a8,a8,a9                    # [9]\n    s32i    a8,a1,100                   # [10]  gra_spill_temp_119\n    s32i    a14,a1,104                  # [11]  gra_spill_temp_120, (input_wd * stride_ht) * channels\n    bne     a12,a13,.Lt_7_6402              # [13] // iterate over height loop\n\n#<loop> Part of loop body line 348, head labeled .Lt_7_5122\n    l32i    a11,a1,56                   # [6]  gra_spill_temp_106 // out_shift_ptr\n    l32i    a15,a1,52                   # [2]  gra_spill_temp_105, out_mult_ptr\n    l32i    a10,a1,60                   # [24]  gra_spill_temp_108, ch_idx\n    addi    a11,a11,64                  # [8]\n    addi    a15,a15,64                  # [13]\n    s32i    a11,a1,56                   # [23]  gra_spill_temp_106\n    s32i    a15,a1,52                   # [18]  gra_spill_temp_105, out_mult_ptr\n    l32i    a11,a1,64                   # [25]  gra_spill_temp_109\n    addi    a10,a10,16               
   # [26]\n    s32i    a10,a1,60                   # [27]  gra_spill_temp_108, ch_idx\n    blt     a10,a11,.Lt_7_5122          # [28] // iterate over outer most out_ch loop\n\n.Lt_7_4610: # 0x11ad\n    retw.n                          # [0]\n\n    .size   esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3, . - esp_nn_depthwise_conv_s8_mult1_3x3_padded_esp32s3\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fc_s8_mac16_esp32s3.S",
    "content": "//\n// SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n//\n// s8 dot product for FC with 2x loop unrolling and QUP for unaligned filter.\n// Pattern adapted from esp-dsp dspi_dotprod_s8_aes3.S.\n// Input must be 16-byte aligned. Filter can be unaligned.\n//\n\n    .text\n    .align  4\n\n    .type   esp_nn_fc_s8_mac16_esp32s3, @function\n    .align  4\n    .global esp_nn_fc_s8_mac16_esp32s3\n\n// a2: input_data (16-byte aligned)\n// a3: filter_data (may be unaligned)\n// a4: row_len_div16 (>= 1)\n// Returns: int32_t dot product in a2\n\nesp_nn_fc_s8_mac16_esp32s3:\n    entry   a1, 32\n\n    ee.zero.accx\n    beqz    a4, .Ldone\n\n    // Prime: first unaligned filter load (sets SAR_BYTE)\n    ee.ld.128.usar.ip   q0, a3, 16         // filter chunk 0\n\n    // Check if we can do 2x unrolled (need >= 2 iterations)\n    srai    a5, a4, 1                       // a5 = row_len_div16 / 2\n    beqz    a5, .Lsingle\n\n    // Load first input + filter pair for unrolled loop\n    ee.vld.128.ip       q1, a2, 16         // input[0]\n    ee.ld.128.usar.ip   q2, a3, 16         // filter chunk 1\n\n    // 2x unrolled main loop: 2 MACs per iteration\n    loopgtz a5, .Lloop2_end\n\n    ee.src.q.qup        q4, q0, q2         // align filter[i]\n    ee.vld.128.ip       q3, a2, 16         // input[i+1]\n    ee.vmulas.s8.accx   q4, q1             // MAC filter[i] * input[i]\n    ee.ld.128.usar.ip   q0, a3, 16         // filter chunk[i+2]\n    ee.src.q.qup        q5, q2, q0         // align filter[i+1]\n    ee.vld.128.ip       q1, a2, 16         // input[i+2] (primed for next)\n    ee.vmulas.s8.accx   q5, q3             // MAC filter[i+1] * input[i+1]\n    ee.ld.128.usar.ip   q2, a3, 16         // filter chunk[i+3]\n\n.Lloop2_end:\n\n    // Check if there's a remaining single iteration\n    bbci    a4, 0, .Llast_qup              // if row_len_div16 is even, skip single\n\n.Lsingle:\n    // Single 
iteration: load input, QUP filter, MAC\n    ee.vld.128.ip       q1, a2, 16         // input\n    ee.ld.128.usar.ip   q2, a3, 16         // next filter chunk\n    ee.src.q.qup        q4, q0, q2         // align filter\n    ee.vmulas.s8.accx   q4, q1             // MAC\n    j                   .Ldone_mac\n\n.Llast_qup:\n    // After 2x loop: need to back up pointers since we loaded one extra pair\n    addi    a2, a2, -16\n    addi    a3, a3, -16\n\n.Ldone_mac:\n.Ldone:\n    // 2-cycle gap before ACCX read\n    movi.n  a3, 0\n    nop\n    ee.srs.accx a2, a3, 0\n\n    retw.n\n\n    .size   esp_nn_fc_s8_mac16_esp32s3, . - esp_nn_fc_s8_mac16_esp32s3\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n\n#include <common_functions.h>\n\nvoid esp_nn_fully_connected_s8_ansi(const int8_t *input_data,\n                                    const int32_t input_offset,\n                                    const uint16_t row_len,\n                                    const int8_t *filter_data,\n                                    const int32_t filter_offset,\n                                    const int32_t *bias,\n                                    int8_t *out_data,\n                                    const uint16_t out_channels,\n                                    const int32_t out_offset,\n                                    const int32_t out_shift,\n                                    const int32_t out_mult,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max)\n{\n    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {\n        int32_t result = 0;\n        for (int32_t data_idx = 0; data_idx < row_len; data_idx++) {\n            int32_t filter_index = row_len * out_c + data_idx;\n            int32_t input_val = input_data[data_idx];\n            int32_t filter_val = filter_data[filter_index];\n            result += (filter_val + filter_offset) * (input_val + input_offset);\n        }\n        if (bias) {\n            
result += bias[out_c];\n        }\n        result = esp_nn_multiply_by_quantized_mult(result, out_mult, out_shift);\n        result += out_offset;\n        result = max(result, activation_min);\n        result = min(result, activation_max);\n        out_data[out_c] = (int8_t) result;\n    }\n}\n\nvoid esp_nn_fully_connected_per_ch_s8_ansi(const int8_t *input_data,\n                                    const int32_t input_offset,\n                                    const uint16_t row_len,\n                                    const int8_t *filter_data,\n                                    const int32_t filter_offset,\n                                    const int32_t *bias,\n                                    int8_t *out_data,\n                                    const uint16_t out_channels,\n                                    const int32_t out_offset,\n                                    const int32_t* out_shift,\n                                    const int32_t* out_mult,\n                                    const int32_t activation_min,\n                                    const int32_t activation_max)\n{\n    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {\n        int32_t result = 0;\n        for (int32_t data_idx = 0; data_idx < row_len; data_idx++) {\n            int32_t filter_index = row_len * out_c + data_idx;\n            int32_t input_val = input_data[data_idx];\n            int32_t filter_val = filter_data[filter_index];\n            result += (filter_val + filter_offset) * (input_val + input_offset);\n        }\n        if (bias) {\n            result += bias[out_c];\n        }\n        result = esp_nn_multiply_by_quantized_mult(result, out_mult[out_c], out_shift[out_c]);\n        result += out_offset;\n        result = max(result, activation_min);\n        result = min(result, activation_max);\n        out_data[out_c] = (int8_t) result;\n    }\n}\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * FC multi-path dispatcher for ESP32-S3.\n * - Pre-computes offset corrections per channel in C\n * - Dispatches to s8 MAC assembly (aligned, large row_len) or s16 assembly (fallback)\n */\n\n#include <stdint.h>\n#include <stddef.h>\n#include <string.h>\n#include <common_functions.h>\n\n/* Original s16 assembly (renamed) */\nextern void esp_nn_fc_s16_esp32s3(const int8_t *input_data,\n                                   const int32_t input_offset,\n                                   const uint16_t row_len,\n                                   const int8_t *filter_data,\n                                   const int32_t filter_offset,\n                                   const int32_t *bias,\n                                   int8_t *out_data,\n                                   const uint16_t out_channels,\n                                   const int32_t out_offset,\n                                   const int32_t out_shift,\n                                   const int32_t out_mult,\n                                   const int32_t activation_min,\n                                   const int32_t activation_max);\n\nextern void esp_nn_fc_per_ch_s16_esp32s3(const int8_t *input_data,\n                                          const int32_t input_offset,\n                                          const uint16_t row_len,\n                                          const int8_t *filter_data,\n                                          const int32_t filter_offset,\n                                          const int32_t *bias,\n                                          int8_t *out_data,\n                                          const uint16_t out_channels,\n                                          const int32_t out_offset,\n                                          const int32_t *out_shift,\n                                      
    const int32_t *out_mult,\n                                          const int32_t activation_min,\n                                          const int32_t activation_max);\n\n/* Shared s8 dot product from common — handles unaligned filter via USAR+QUP */\nextern int32_t esp_nn_dot_s8_unaligned_esp32s3(const int8_t *a,\n                                                const int8_t *b,\n                                                int32_t len_div16);\n\nvoid esp_nn_fully_connected_s8_esp32s3(const int8_t *input_data,\n                                       const int32_t input_offset,\n                                       const uint16_t row_len,\n                                       const int8_t *filter_data,\n                                       const int32_t filter_offset,\n                                       const int32_t *bias,\n                                       int8_t *out_data,\n                                       const uint16_t out_channels,\n                                       const int32_t out_offset,\n                                       const int32_t out_shift,\n                                       const int32_t out_mult,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max)\n{\n    /* Quick check: s8 fast path only for 16-byte-aligned input, row_len >= 16, no filter_offset */\n    if (__builtin_expect(filter_offset != 0 || row_len < 16\n        || ((uintptr_t)input_data & 15), 0)) {\n        /* Fallback to original s16 assembly — tail call, no extra overhead */\n        esp_nn_fc_s16_esp32s3(input_data, input_offset, row_len, filter_data,\n                              filter_offset, bias, out_data, out_channels,\n                              out_offset, out_shift, out_mult,\n                              activation_min, activation_max);\n        return;\n    }\n    {\n        int32_t row_len_div16 = row_len >> 4;\n\n        /* Pre-compute per-channel 
corrections once */\n        int32_t corrections[out_channels];\n        for (int ch = 0; ch < out_channels; ch++) {\n            const int8_t *f_ptr = filter_data + ch * row_len;\n            int32_t corr = 0;\n            if (input_offset != 0) {\n                int32_t filter_sum = 0;\n                for (int i = 0; i < row_len; i++) {\n                    filter_sum += f_ptr[i];\n                }\n                corr = filter_sum * input_offset;\n            }\n            if (bias) {\n                corr += bias[ch];\n            }\n            corrections[ch] = corr;\n        }\n\n        int32_t row_len_rem = row_len & 15;\n        int32_t simd_bytes = row_len_div16 << 4;\n\n        for (int ch = 0; ch < out_channels; ch++) {\n            const int8_t *f_ptr = filter_data + ch * row_len;\n            int32_t acc = esp_nn_dot_s8_unaligned_esp32s3(input_data, f_ptr, row_len_div16);\n\n            /* Scalar remainder for non-multiple-of-16 row_len */\n            for (int i = 0; i < row_len_rem; i++) {\n                acc += (int32_t)input_data[simd_bytes + i] * (int32_t)f_ptr[simd_bytes + i];\n            }\n\n            acc += corrections[ch];\n\n            acc = esp_nn_multiply_by_quantized_mult(acc, out_mult, out_shift);\n            acc += out_offset;\n            acc = max(acc, activation_min);\n            acc = min(acc, activation_max);\n            out_data[ch] = (int8_t)acc;\n        }\n    }\n}\n\nvoid esp_nn_fully_connected_per_ch_s8_esp32s3(const int8_t *input_data,\n                                       const int32_t input_offset,\n                                       const uint16_t row_len,\n                                       const int8_t *filter_data,\n                                       const int32_t filter_offset,\n                                       const int32_t *bias,\n                                       int8_t *out_data,\n                                       const uint16_t out_channels,\n                           
            const int32_t out_offset,\n                                       const int32_t *out_shift,\n                                       const int32_t *out_mult,\n                                       const int32_t activation_min,\n                                       const int32_t activation_max)\n{\n    if (__builtin_expect(filter_offset != 0 || row_len < 16\n        || ((uintptr_t)input_data & 15), 0)) {\n        esp_nn_fc_per_ch_s16_esp32s3(input_data, input_offset, row_len, filter_data,\n                                     filter_offset, bias, out_data, out_channels,\n                                     out_offset, out_shift, out_mult,\n                                     activation_min, activation_max);\n        return;\n    }\n    {\n        int32_t row_len_div16 = row_len >> 4;\n\n        /* Pre-compute per-channel corrections once */\n        int32_t corrections[out_channels];\n        for (int ch = 0; ch < out_channels; ch++) {\n            const int8_t *f_ptr = filter_data + ch * row_len;\n            int32_t corr = 0;\n            if (input_offset != 0) {\n                int32_t filter_sum = 0;\n                for (int i = 0; i < row_len; i++) {\n                    filter_sum += f_ptr[i];\n                }\n                corr = filter_sum * input_offset;\n            }\n            if (bias) {\n                corr += bias[ch];\n            }\n            corrections[ch] = corr;\n        }\n\n        int32_t row_len_rem = row_len & 15;\n        int32_t simd_bytes = row_len_div16 << 4;\n\n        for (int ch = 0; ch < out_channels; ch++) {\n            const int8_t *f_ptr = filter_data + ch * row_len;\n            int32_t acc = esp_nn_dot_s8_unaligned_esp32s3(input_data, f_ptr, row_len_div16);\n\n            for (int i = 0; i < row_len_rem; i++) {\n                acc += (int32_t)input_data[simd_bytes + i] * (int32_t)f_ptr[simd_bytes + i];\n            }\n\n            acc += corrections[ch];\n\n            acc = 
esp_nn_multiply_by_quantized_mult(acc, out_mult[ch], out_shift[ch]);\n            acc += out_offset;\n            acc = max(acc, activation_min);\n            acc = min(acc, activation_max);\n            out_data[ch] = (int8_t)acc;\n        }\n    }\n}\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_per_ch_s8_esp32s3.S",
    "content": "//\n// SPDX-FileCopyrightText: 2025-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n    .text\n    .align  4\n    .literal_position\n    .literal    .LC3_26_101, 1073741824 // nudge (1 << 30)\n\n    # Program Unit: esp_nn_fc_per_ch_s16_esp32s3\n    .type   esp_nn_fc_per_ch_s16_esp32s3, @function\n    .align   4\n    .global esp_nn_fc_per_ch_s16_esp32s3\n\n// a2: input_data\n// a3: input_offset\n// a4: row_len\n// a5: filter_data\n// a6: filter_offset\n// a7: bias\n// on stack: out_data\n// on stack: out_channels\n// on stack: out_offset\n// on stack: out_shift\n// on stack: out_mult\n// on stack: activation_min\n// on stack: activation_max\n\nesp_nn_fc_per_ch_s16_esp32s3:  # 0x4\n    # qacc_scratch = 0\n    // 40, filter_offset\n    // 44, input_offset\n    # gra_spill_temp_7 = 48\n    # gra_spill_temp_2 = 60\n    # gra_spill_temp_3 = 64\n    # gra_spill_temp_4 = 68\n    # gra_spill_temp_5 = 72\n    # gra_spill_temp_6 = 76\n    # gra_spill_temp_8 = 80\n    # gra_spill_temp_9 = 84\n\n    entry   a1,112                      #\n    s32i.n  a5,a1,60                # [0]  gra_spill_temp_2, filter_data\n    s32i    a7,a1,48                    # [1]  gra_spill_temp_7, bias\n    s32i    a6,a1,40                    # [2]  id:252 filter_offset+0x0\n    s32i    a3,a1,44                    # [3]  id:251 input_offset+0x0\n    mov.n   a13,a2                      # [5]\n    mov.n   a12,a4                      # [6]\n\n // out_channel loop\n    l16ui       a2,a1,116                   # [7]  id:255 out_channels+0x0\n    addi        a4,a1,40                # [8]\n    addi        a8,a1,44                # [9]\n    ee.vldbc.16 q5,a8               # [10]  id:253 input_offset\n    ee.vldbc.16 q6,a4               # [12]  id:254 filter_offset\n    beqz.n      a2,.Lt_0_7938           # [13]\n\n    ee.zero.q   q7                      # [0]\n    srai        a11,a12,3                   # [2]\n    l32i        a8,a1,112                   
# [6]  id:259 out_data+0x0\n    addi        a9,a12,-7                   # [7]\n    s32i        a9,a1,76                    # [8]  gra_spill_temp_6\n    s32i        a8,a1,72                    # [9]  gra_spill_temp_5\n    s32i        a11,a1,64                   # [14]  gra_spill_temp_3\n    slli        a11,a11,3                   # [16]\n    s32i        a11,a1,68                   # [18]  gra_spill_temp_4\n    movi.n      a15,0                   # [17]\n    mov.n       a14,a7                      # [15]\n    mov.n       a11,a5                      # [31]\n    l32i        a10,a1,124  # out_shift\n    l32i        a2,a1,128  # out_mult\n    s32i        a10,a1,80                   # gra_spill_temp_8\n    s32i        a2,a1,84                   # gra_spill_temp_9\n    movi.n      a10,0                   # [32]\n    mov.n       a2,a11                      # [33]\n\n.Lt_0_8450: # 0x12b\n    l32i            a9,a1,76                    # [2]  gra_spill_temp_6\n    extui           a5,a11,0,3                  # [34]\n    ee.zero.accx\n    slli            a5,a5,1                     # [3]\n    bgei            a9,0,.LBB6_esp_nn_fc_per_ch_s16_esp32s3            # [9]\n\n    mov.n           a5,a10                      # [6]\n    movi.n  a2,0                    # [0]\n    j       .Lt_0_8706                      # [1]\n\n.LBB6_esp_nn_fc_per_ch_s16_esp32s3:    # 0x147\n    wur.sar_byte    a5                  # [5]\n    ee.vld.l.64.ip  q4,a2,8         # [4]  id:267\n    l32i            a4,a1,64                    # [0]  gra_spill_temp_3\n    mov.n           a3,a13                      # [1]\n    addx8           a5,a4,a10                   # [2]\n    ee.vcmp.lt.s8   q2,q4,q7            # [7]\n    ee.vzip.8       q4,q2                   # [8]\n    loopgtz a4,.LBB45_esp_nn_fc_per_ch_s16_esp32s3     # [3]\n\n    ee.vld.l.64.ip      q0,a2,8         # [0*II+0]  id:268\n    ee.vld.l.64.ip      q1,a3,8         # [0*II+1]  id:270\n    ee.vcmp.lt.s8       q2,q0,q7            # [0*II+2]\n    
ee.vcmp.lt.s8       q3,q1,q7            # [0*II+3]\n    ee.vzip.8           q0,q2                   # [0*II+4]\n    ee.vzip.8           q1,q3                   # [0*II+5]\n    ee.vadds.s16        q1,q1,q5            # [0*II+6]\n    ee.src.q.qup        q2,q4,q0            # [0*II+7]\n    ee.vadds.s16        q2,q2,q6            # [0*II+8]\n    ee.vmulas.s16.accx  q1,q2       # [0*II+9]\n\n.LBB45_esp_nn_fc_per_ch_s16_esp32s3:   # 0x170\n    l32i    a2,a1,68                    # [0]  gra_spill_temp_4\n\n.Lt_0_8706: # 0x173\n\tmovi a9, 0\n\tee.srs.accx  a6, a9, 0\n\n    bge             a2,a12,.Lt_0_9730           # [38]\n\n// prepare remaining loop\n    l32i    a8,a1,44                    # [0]  id:251 input_offset+0x0\n    l32i    a7,a1,40                    # [1]  id:252 filter_offset+0x0\n    sub     a3,a12,a2                   # [2]\n    l32i.n  a4,a1,60                # [3]  gra_spill_temp_2\n    add.n   a2,a2,a13                   # [4]\n    add.n   a4,a4,a5                    # [5]\n    loopgtz a3,.LBB60_esp_nn_fc_per_ch_s16_esp32s3     # [6]\n\n// remaining c loop\n    l8ui    a3,a2,0                     # [0*II+0]  id:299\n    l8ui    a5,a4,0                     # [0*II+1]  id:300\n    sext    a3,a3,7                     # [0*II+2]\n    sext    a5,a5,7                     # [0*II+3]\n    add.n   a5,a5,a7                    # [0*II+5]\n    add.n   a3,a3,a8                    # [0*II+6]\n    mull    a3,a3,a5                    # [0*II+7]\n    addi.n  a2,a2,1                 # [0*II+8]\n    addi.n  a4,a4,1                 # [0*II+4]\n    add.n   a6,a6,a3                    # [0*II+9]\n\n.LBB60_esp_nn_fc_per_ch_s16_esp32s3:   # 0x20f\n\n// add bias\n.Lt_0_9730: # 0x20f\n    l32i    a8,a1,48                    # [0]  gra_spill_temp_7, bias\n    beqz.n  a8,.Lt_0_10754          # [2], skip_bias\n\n    l32i.n  a9,a14,0                # [0]  id:301\n    add.n   a6,a6,a9                    # [2]\n\n// apply quantization\n.Lt_0_10754:    # 0x218\n    movi        a4,0\n    
l32i        a5,a1,80                  # [25]  id:256 gra_spill_temp_8, out_shift+0x0\n    l32i        a5,a5,0\n    max         a2,a5,a4                 // left_shift\n    sub         a5,a2,a5                 // right_shift\n\n    ssl     a2                          # [3]\n    sll     a6,a6                       # [5] // x * (1 << left_shift)\n\n    l32i    a4,a1,84                   # [2]  gra_spill_temp_9 //out_mult\n    l32r    a3,.LC3_26_101              # [0]\n\n    add.n   a10,a10,a12                 # [0]\n    addi.n  a14,a14,4               # [1]\n\n    l32i    a4,a4,0\n    add.n   a11,a11,a12                 # [6]\n\n// multiply add nudge and pick high32\n    ssai    31\n    mulsh   a7,a4,a6                    # [4]\n    mull    a4,a4,a6                    # [5]\n\n    mov.n   a2,a11                      # [27]\n    add     a4,a4,a3\n    saltu   a8,a4,a3\n    add.n   a7,a7,a8\n    src     a3,a7,a4\n\n// divide_by_power_of2_step\n    blti    a5,1,.skip_divide_by2\n    movi.n  a8,1                    # [28]\n    addi    a4,a5,-1\n    ssl     a4          // load left_shift\n    sll     a8,a8       // to_add factor ( 1 << (exponent - 1))\n    extui   a6,a3,31,1                  # [33]\n    sub     a8,a8,a6        // modified to_add factor ( 1 << (exponent - 1) - (val < 0))\n    add     a3,a3,a8    // val + to_add\n    ssr     a5                          # [29] //load right_shift\n    sra     a3,a3                       # [31]\n.skip_divide_by2:\n\n    l32i    a8,a1,120                   # [41]  out_offset\n    l32i    a7,a1,132                   # [44] // activation_min\n    l32i    a4,a1,136                   # [45] // activation_max\n\n    add.n   a8,a8,a3                    # [46] // add out_offset\n    l32i    a6,a1,72                    # [47]  gra_spill_temp_5\n    l32i.n  a3,a1,116                   # [48]  out_channels\n    max     a7,a7,a8                    # [49]\n    add.n   a6,a15,a6                   # [50]\n    min     a4,a4,a7                    
# [51]\n    addi.n  a15,a15,1               # [52]\n\n    l32i        a7,a1,84                # gra_spill_temp_9\n    l32i        a8,a1,80                # gra_spill_temp_8\n\n    s8i     a4,a6,0                     # store output\n\n    addi.n      a7,a7,4                 # increment mult pointer\n    addi.n      a8,a8,4                 # increment shift pointer\n\n    s32i        a7,a1,84                # gra_spill_temp_9\n    s32i        a8,a1,80                # gra_spill_temp_8\n\n    bne     a3,a15,.Lt_0_8450               # [55]\n\n.Lt_0_7938: # 0x25c\n    retw.n                          # [0]\n\n    .size   esp_nn_fc_per_ch_s16_esp32s3, . - esp_nn_fc_per_ch_s16_esp32s3\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\n/**\n * Fully connected layer for s8 using ESP32-P4 PIE SIMD.\n *\n * Uses esp.vmulas.s8.xacc.ld.ip for fused 16-wide s8 MAC + load.\n * Pre-computes filter_sum * input_offset (like conv) so PIE path\n * works even with non-zero input_offset.\n *\n * Inner loop is software-pipelined:\n *   iteration N: MAC(q0,q1) + load_next_input(q0)\n *                load_next_filter(q1)     <- hides MAC latency\n *                counter_update           <- independent of above\n */\n\n/* Core dot product: PIE-accelerated when row_len >= 16 */\nstatic inline __attribute__((always_inline))\nint32_t fc_dot_s8_pie(const int8_t *input, const int8_t *filter, int32_t row_len)\n{\n    int32_t result = 0;\n    int32_t idx = 0;\n\n    if (row_len >= 32) {\n        /* Double-pumped: process 32 elements per iteration\n         * Uses q0/q1 for first pair, q2/q3 for second pair */\n        asm volatile (\n            \"esp.zero.xacc                          \\n\\t\"\n            \"mv     x30, %[in]                      \\n\\t\"\n            \"mv     x31, %[flt]                     \\n\\t\"\n            \"li     %[idx], 32                      \\n\\t\"\n            \"addi   s7, %[len], -31                 \\n\\t\"\n\n            /* Prime the pipeline: load first 32 bytes */\n            \"esp.vld.128.ip  q0, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q2, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"esp.vld.128.ip  q3, x31, 16            \\n\\t\"\n            \"j      2f                              \\n\\t\"\n\n            \"1:                                     \\n\\t\"\n            /* MAC pair 1 + load next input[0:16] */\n            \"esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \\n\\t\"\n            /* Load next 
filter[0:16] while MAC settles */\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            /* MAC pair 2 + load next input[16:32] */\n            \"esp.vmulas.s8.xacc.ld.ip q2, x30, 16, q2, q3 \\n\\t\"\n            /* Load next filter[16:32] - interleaved with counter */\n            \"esp.vld.128.ip  q3, x31, 16            \\n\\t\"\n            \"addi   %[idx], %[idx], 32              \\n\\t\"\n\n            \"2:                                     \\n\\t\"\n            \"blt    %[idx], s7, 1b                  \\n\\t\"\n\n            /* Drain pipeline: final two MACs */\n            \"esp.vmulas.s8.xacc  q0, q1             \\n\\t\"\n            \"esp.vmulas.s8.xacc  q2, q3             \\n\\t\"\n\n            /* Handle 16-element remainder if any (idx+16 <= row_len) */\n            \"addi   s7, %[len], -15                 \\n\\t\"\n            \"bge    %[idx], s7, 3f                  \\n\\t\"\n            \"esp.vld.128.ip  q0, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"esp.vmulas.s8.xacc  q0, q1             \\n\\t\"\n            \"addi   %[idx], %[idx], 16              \\n\\t\"\n            \"3:                                     \\n\\t\"\n\n            \"esp.movx.r.xacc.l   x30                \\n\\t\"\n            \"mv     %[res], x30                     \\n\\t\"\n            : [idx] \"+r\"(idx), [res] \"=r\"(result)\n            : [in] \"r\"(input), [flt] \"r\"(filter), [len] \"r\"(row_len)\n            : \"x30\", \"x31\", \"s7\"\n        );\n    } else if (row_len >= 16) {\n        /* Single-pumped for 16-31 element rows */\n        asm volatile (\n            \"esp.zero.xacc                          \\n\\t\"\n            \"mv     x30, %[in]                      \\n\\t\"\n            \"mv     x31, %[flt]                     \\n\\t\"\n            \"li     %[idx], 16                      \\n\\t\"\n            \"addi   s7, %[len], -15                 \\n\\t\"\n            
\"esp.vld.128.ip  q0, x30, 16            \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"j      5f                              \\n\\t\"\n            \"4:                                     \\n\\t\"\n            \"esp.vmulas.s8.xacc.ld.ip q0, x30, 16, q0, q1 \\n\\t\"\n            \"esp.vld.128.ip  q1, x31, 16            \\n\\t\"\n            \"addi   %[idx], %[idx], 16              \\n\\t\"\n            \"5:                                     \\n\\t\"\n            \"blt    %[idx], s7, 4b                  \\n\\t\"\n            \"esp.vmulas.s8.xacc  q0, q1             \\n\\t\"\n            \"esp.movx.r.xacc.l   x30                \\n\\t\"\n            \"mv     %[res], x30                     \\n\\t\"\n            : [idx] \"+r\"(idx), [res] \"=r\"(result)\n            : [in] \"r\"(input), [flt] \"r\"(filter), [len] \"r\"(row_len)\n            : \"x30\", \"x31\", \"s7\"\n        );\n    }\n\n    /* Scalar remainder */\n    for (; idx < row_len; idx++) {\n        result += (int32_t)input[idx] * (int32_t)filter[idx];\n    }\n\n    return result;\n}\n\nvoid esp_nn_fully_connected_s8_esp32p4(const int8_t *input_data,\n                                        const int32_t input_offset,\n                                        const uint16_t row_len,\n                                        const int8_t *filter_data,\n                                        const int32_t filter_offset,\n                                        const int32_t *bias,\n                                        int8_t *out_data,\n                                        const uint16_t out_channels,\n                                        const int32_t out_offset,\n                                        const int32_t out_shift,\n                                        const int32_t out_mult,\n                                        const int32_t activation_min,\n                                        const int32_t activation_max)\n{\n    /* Enable PIE once 
for all channels */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {\n        const int8_t *filter_row = filter_data + (int32_t)row_len * out_c;\n\n        int32_t result;\n        if (input_offset == 0 && filter_offset == 0) {\n            /* Fast PIE path: pure s8 dot product */\n            result = fc_dot_s8_pie(input_data, filter_row, row_len);\n        } else {\n            /* Scalar path with offsets */\n            result = 0;\n            for (int32_t i = 0; i < row_len; i++) {\n                result += ((int32_t)input_data[i] + input_offset) *\n                          ((int32_t)filter_row[i] + filter_offset);\n            }\n        }\n\n        if (bias) {\n            result += bias[out_c];\n        }\n        result = esp_nn_requantize(result, out_mult, out_shift);\n        result += out_offset;\n        result = max(result, activation_min);\n        result = min(result, activation_max);\n        out_data[out_c] = (int8_t) result;\n    }\n}\n\nvoid esp_nn_fully_connected_per_ch_s8_esp32p4(const int8_t *input_data,\n                                        const int32_t input_offset,\n                                        const uint16_t row_len,\n                                        const int8_t *filter_data,\n                                        const int32_t filter_offset,\n                                        const int32_t *bias,\n                                        int8_t *out_data,\n                                        const uint16_t out_channels,\n                                        const int32_t out_offset,\n                                        const int32_t *out_shift,\n                                        const int32_t *out_mult,\n                                        const int32_t activation_min,\n                 
                       const int32_t activation_max)\n{\n    /* Enable PIE once for all channels */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    for (int32_t out_c = 0; out_c < out_channels; ++out_c) {\n        const int8_t *filter_row = filter_data + (int32_t)row_len * out_c;\n\n        int32_t result;\n        if (input_offset == 0 && filter_offset == 0) {\n            result = fc_dot_s8_pie(input_data, filter_row, row_len);\n        } else {\n            result = 0;\n            for (int32_t i = 0; i < row_len; i++) {\n                result += ((int32_t)input_data[i] + input_offset) *\n                          ((int32_t)filter_row[i] + filter_offset);\n            }\n        }\n\n        if (bias) {\n            result += bias[out_c];\n        }\n        result = esp_nn_requantize(result, out_mult[out_c], out_shift[out_c]);\n        result += out_offset;\n        result = max(result, activation_min);\n        result = min(result, activation_max);\n        out_data[out_c] = (int8_t) result;\n    }\n}\n"
  },
  {
    "path": "src/fully_connected/esp_nn_fully_connected_s8_esp32s3.S",
    "content": "//\n// SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n    .text\n    .align  4\n    .literal_position\n    .literal    .LC3_26_101, 1073741824 // nudge (1 << 30)\n\n    # Program Unit: esp_nn_fc_s16_esp32s3\n    .type   esp_nn_fc_s16_esp32s3, @function\n    .align   4\n    .global esp_nn_fc_s16_esp32s3\n\n// a2: input_data\n// a3: input_offset\n// a4: row_len\n// a5: filter_data\n// a6: filter_offset\n// a7: bias\n// on stack: out_data\n// on stack: out_channels\n// on stack: out_offset\n// on stack: out_shift\n// on stack: out_mult\n// on stack: activation_min\n// on stack: activation_max\n\nesp_nn_fc_s16_esp32s3:  # 0x4\n    # qacc_scratch = 0\n    // 40, filter_offset\n    // 44, input_offset\n    # gra_spill_temp_7 = 48\n    # gra_spill_temp_0 = 52\n    # gra_spill_temp_1 = 56\n    # gra_spill_temp_2 = 60\n    # gra_spill_temp_3 = 64\n    # gra_spill_temp_4 = 68\n    # gra_spill_temp_5 = 72\n    # gra_spill_temp_6 = 76\n\n    entry   a1,112                      #\n    s32i.n  a5,a1,60                # [0]  gra_spill_temp_2, filter_data\n    s32i    a7,a1,48                    # [1]  gra_spill_temp_7, bias\n    s32i    a6,a1,40                    # [2]  id:252 filter_offset+0x0\n    s32i    a3,a1,44                    # [3]  id:251 input_offset+0x0\n    mov.n   a13,a2                      # [5]\n    mov.n   a12,a4                      # [6]\n\n // out_channel loop\n    l16ui       a2,a1,116                   # [7]  id:255 out_channels+0x0\n    addi        a4,a1,40                # [8]\n    addi        a8,a1,44                # [9]\n    ee.vldbc.16 q5,a8               # [10]  id:253 input_offset\n    ee.vldbc.16 q6,a4               # [12]  id:254 filter_offset\n    beqz.n      a2,.Lt_0_7938           # [13]\n\n    ee.zero.q   q7                      # [0]\n    srai        a11,a12,3                   # [2]\n    l32i        a10,a1,128                  # [5]  id:257 
out_mult+0x0\n    l32i        a8,a1,112                   # [6]  id:259 out_data+0x0\n    addi        a9,a12,-7                   # [7]\n    s32i        a9,a1,76                    # [8]  gra_spill_temp_6\n    s32i        a8,a1,72                    # [9]  gra_spill_temp_5\n    s32i        a11,a1,64                   # [14]  gra_spill_temp_3\n    slli        a11,a11,3                   # [16]\n    s32i        a11,a1,68                   # [18]  gra_spill_temp_4\n    l32i        a10,a1,124                  # [25]  id:256 out_shift+0x0\n    movi.n      a15,0                   # [17]\n    mov.n       a14,a7                      # [15]\n    max         a11,a10,a15                 # [29]\n    s32i        a11,a1,52                   # [30]  gra_spill_temp_0 // left_shift\n    sub         a10,a11,a10                 #  // right_shift\n    s32i.n      a10,a1,56                   # [28]  gra_spill_temp_1 // right_shift\n    mov.n       a11,a5                      # [31]\n    movi.n      a10,0                   # [32]\n    mov.n       a2,a11                      # [33]\n\n.Lt_0_8450: # 0x12b\n\n    l32i            a9,a1,76                    # [2]  gra_spill_temp_6\n    extui           a5,a11,0,3                  # [34]\n    ee.zero.accx\n    slli            a5,a5,1                     # [3]\n    bgei            a9,0,.LBB6_esp_nn_fc_s16_esp32s3            # [9]\n\n    mov.n           a5,a10                      # [6]\n    movi.n  a2,0                    # [0]\n    j       .Lt_0_8706                      # [1]\n\n.LBB6_esp_nn_fc_s16_esp32s3:    # 0x147\n    wur.sar_byte    a5                  # [5]\n    ee.vld.l.64.ip  q4,a2,8         # [4]  id:267\n    l32i            a4,a1,64                    # [0]  gra_spill_temp_3\n    mov.n           a3,a13                      # [1]\n    addx8           a5,a4,a10                   # [2]\n    ee.vcmp.lt.s8   q2,q4,q7            # [7]\n    ee.vzip.8       q4,q2                   # [8]\n    loopgtz a4,.LBB45_esp_nn_fc_s16_esp32s3     # 
[3]\n\n    ee.vld.l.64.ip      q0,a2,8         # [0*II+0]  id:268\n    ee.vld.l.64.ip      q1,a3,8         # [0*II+1]  id:270\n    ee.vcmp.lt.s8       q2,q0,q7            # [0*II+2]\n    ee.vcmp.lt.s8       q3,q1,q7            # [0*II+3]\n    ee.vzip.8           q0,q2                   # [0*II+4]\n    ee.vzip.8           q1,q3                   # [0*II+5]\n    ee.vadds.s16        q1,q1,q5            # [0*II+6]\n    ee.src.q.qup        q2,q4,q0            # [0*II+7]\n    ee.vadds.s16        q2,q2,q6            # [0*II+8]\n    ee.vmulas.s16.accx  q1,q2       # [0*II+9]\n\n.LBB45_esp_nn_fc_s16_esp32s3:   # 0x170\n    l32i    a2,a1,68                    # [0]  gra_spill_temp_4\n\n.Lt_0_8706: # 0x173\n\tmovi a9, 0\n\tee.srs.accx  a6, a9, 0\n\n    bge             a2,a12,.Lt_0_9730           # [38]\n\n// prepare remaining loop\n    l32i    a8,a1,44                    # [0]  id:251 input_offset+0x0\n    l32i    a7,a1,40                    # [1]  id:252 filter_offset+0x0\n    sub     a3,a12,a2                   # [2]\n    l32i.n  a4,a1,60                # [3]  gra_spill_temp_2\n    add.n   a2,a2,a13                   # [4]\n    add.n   a4,a4,a5                    # [5]\n    loopgtz a3,.LBB60_esp_nn_fc_s16_esp32s3     # [6]\n\n// remaining c loop\n    l8ui    a3,a2,0                     # [0*II+0]  id:299\n    l8ui    a5,a4,0                     # [0*II+1]  id:300\n    sext    a3,a3,7                     # [0*II+2]\n    sext    a5,a5,7                     # [0*II+3]\n    add.n   a5,a5,a7                    # [0*II+5]\n    add.n   a3,a3,a8                    # [0*II+6]\n    mull    a3,a3,a5                    # [0*II+7]\n    addi.n  a2,a2,1                 # [0*II+8]\n    addi.n  a4,a4,1                 # [0*II+4]\n    add.n   a6,a6,a3                    # [0*II+9]\n\n.LBB60_esp_nn_fc_s16_esp32s3:   # 0x20f\n\n// add bias\n.Lt_0_9730: # 0x20f\n    l32i    a8,a1,48                    # [0]  gra_spill_temp_7, bias\n    beqz.n  a8,.Lt_0_10754          # [2], skip_bias\n\n    
l32i.n  a9,a14,0                # [0]  id:301\n    add.n   a6,a6,a9                    # [2]\n\n// apply quantization\n.Lt_0_10754:    # 0x218\n    l32i    a2,a1,52                    # [1]  gra_spill_temp_0 // left_shift\n    l32i    a5,a1,56                    # [2]  gra_spill_temp_1 // right_shift\n    ssl     a2                          # [3]\n    sll     a6,a6                       # [5] // x * (1 << left_shift)\n\n    l32r    a3,.LC3_26_101              # [0]\n\n    add.n   a10,a10,a12                 # [0]\n    addi.n  a14,a14,4               # [1]\n\n    l32i    a4,a1,128                   # [2]  gra_spill_temp_10 //out_mult\n    add.n   a11,a11,a12                 # [6]\n\n// multiply add nudge and pick high32\n    ssai    31\n    mulsh   a7,a4,a6                    # [4]\n    mull    a4,a4,a6                    # [5]\n\n    mov.n   a2,a11                      # [27]\n    add     a4,a4,a3\n    saltu   a8,a4,a3\n    add.n   a7,a7,a8\n    src     a3,a7,a4\n\n// divide_by_power_of2_step\n    blti    a5,1,.skip_divide_by2\n    movi.n  a8,1                    # [28]\n    addi    a4,a5,-1\n    ssl     a4          // load left_shift\n    sll     a8,a8       // to_add factor ( 1 << (exponent - 1))\n    extui   a6,a3,31,1                  # [33]\n    sub     a8,a8,a6        // modified to_add factor ( 1 << (exponent - 1) - (val < 0))\n    add     a3,a3,a8    // val + to_add\n    ssr     a5                          # [29] //load right_shift\n    sra     a3,a3                       # [31]\n.skip_divide_by2:\n\n    l32i    a8,a1,120                   # [41]  out_offset\n    l32i    a7,a1,132                   # [44] // activation_min\n    l32i    a4,a1,136                   # [45] // activation_max\n\n    add.n   a8,a8,a3                    # [46] // add out_offset\n    l32i    a6,a1,72                    # [47]  gra_spill_temp_5\n    l32i.n  a3,a1,116                   # [48]  out_channels\n    max     a7,a7,a8                    # [49]\n    add.n   a6,a15,a6         
          # [50]\n    min     a4,a4,a7                    # [51]\n    addi.n  a15,a15,1               # [52]\n    s8i     a4,a6,0                     # [53]  id:302\n    bne     a3,a15,.Lt_0_8450               # [55]\n\n.Lt_0_7938: # 0x25c\n    retw.n                          # [0]\n\n    .size   esp_nn_fc_s16_esp32s3, . - esp_nn_fc_s16_esp32s3\n"
  },
  {
    "path": "src/logistic/esp_nn_logistic_ansi.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <math.h>\n\n/*\n * Table-driven int8 logistic (sigmoid) for quantized inference.\n *\n * An int8 input can take only 256 distinct values, so the sigmoid of each\n * one is computed once by the prepare function and kept in a 256-byte table.\n * Evaluation is then a single table read per element.\n *\n * Output quantization is fixed: scale = 1/256, zero_point = -128.\n * This matches TFLite's convention for int8 logistic output.\n */\n\n/* Size in bytes of the table: one int8 output per possible int8 input. */\nint32_t esp_nn_get_logistic_s8_scratch_size_ansi(void)\n{\n    return 256;\n}\n\n/*\n * Fill the 256-entry table read by esp_nn_logistic_s8_ansi().\n *\n * lut              - destination table, 256 entries\n * input_zero_point - zero point of the quantized input\n * input_scale      - scale of the quantized input\n *\n * Slot idx holds the result for the int8 value whose (uint8_t) cast is idx,\n * e.g. slot 0 is input 0 and slot 128 is input -128.\n * Every slot stores clamp(round(sigmoid(x) * 256) - 128, -128, 127), i.e. the\n * sigmoid quantized with scale = 1/256 and zero_point = -128.\n */\nvoid esp_nn_logistic_s8_prepare_ansi(int8_t *lut,\n                                      int32_t input_zero_point,\n                                      float input_scale)\n{\n    int idx = 0;\n    while (idx < 256) {\n        const int8_t q_in = (int8_t)idx;\n        const float real_in = (q_in - input_zero_point) * input_scale;\n        const float sig = 1.0f / (1.0f + expf(-real_in));\n\n        /* Quantize, then saturate to the int8 range. */\n        int32_t q_out = (int32_t)roundf(sig * 256.0f) - 128;\n        if (q_out > 127) {\n            q_out = 127;\n        } else if (q_out < -128) {\n            q_out = -128;\n        }\n\n        lut[idx] = (int8_t)q_out;\n        idx++;\n    }\n}\n\n/*\n * Apply the logistic to `size` int8 values by table lookup: each input byte,\n * cast to uint8_t, selects the slot of `lut` that is copied to the output.\n */\nvoid esp_nn_logistic_s8_ansi(const int8_t *input, int8_t *output,\n                              int32_t size, const int8_t *lut)\n{\n    for (int n = 0; n < size; n++) {\n        const uint8_t slot = (uint8_t)input[n];\n        output[n] = lut[slot];\n    }\n}\n"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n\n#include <common_functions.h>\n\n/**\n * Reference (plain C) average pooling for int8 data.\n *\n * Input and output are indexed as [row][column][channel], with `channels`\n * interleaved values per pixel.\n *\n * @param input           input buffer (input_ht x input_wd x channels)\n * @param input_wd        input width\n * @param input_ht        input height\n * @param output          output buffer (output_ht x output_wd x channels)\n * @param output_wd       output width\n * @param output_ht       output height\n * @param stride_wd       horizontal step of the window per output column\n * @param stride_ht       vertical step of the window per output row\n * @param filter_wd       window width\n * @param filter_ht       window height\n * @param pad_wd          the first window starts at column -pad_wd\n * @param pad_ht          the first window starts at row -pad_ht\n * @param activation_min  lower clamp applied to every output value\n * @param activation_max  upper clamp applied to every output value\n * @param channels        number of channels\n *\n * Only taps that fall inside the input are summed, and the divisor is the\n * number of such taps (padding is not counted). The quotient is rounded to\n * nearest, ties away from zero. A window that does not overlap the input at\n * all has no taps; its average is taken as 0 (then clamped) instead of\n * dividing by zero.\n */\nvoid esp_nn_avg_pool_s8_ansi(const int8_t *input,\n                             const uint16_t input_wd,\n                             const uint16_t input_ht,\n                             int8_t *output,\n                             const uint16_t output_wd,\n                             const uint16_t output_ht,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const uint16_t filter_wd,\n                             const uint16_t filter_ht,\n                             const uint16_t pad_wd,\n                             const uint16_t pad_ht,\n                             const int32_t activation_min,\n                             const int32_t activation_max,\n                             const uint16_t channels)\n{\n    int32_t base_y = -pad_ht;\n    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {\n        /* Rows of the window that fall inside the input; the same for every\n         * column and channel of this output row. */\n        const int32_t filter_y_start = max(0, -base_y);\n        const int32_t filter_y_end = min(filter_ht, input_ht - base_y);\n\n        int32_t base_x = -pad_wd;\n        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {\n            /* Columns of the window that fall inside the input; the same\n             * for every channel of this output pixel. */\n            const int32_t filter_x_start = max(0, -base_x);\n            const int32_t filter_x_end = min(filter_wd, input_wd - base_x);\n\n            for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) {\n                int32_t result = 0;\n                int32_t filter_cnt = 0;\n\n                for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) {\n                    for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) {\n                        int32_t in_x_idx = base_x + filter_x;\n                        int32_t in_y_idx = base_y + filter_y;\n                        int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx;\n                        result += input[input_index];\n                        filter_cnt++;\n                    }\n                }\n\n                /* Rounded average. With no taps (window entirely outside the\n                 * input) result is still 0 and must not be divided by 0. */\n                if (filter_cnt > 0) {\n                    result = result > 0 ? (result + filter_cnt / 2) / filter_cnt\n                                        : (result - filter_cnt / 2) / filter_cnt;\n                }\n\n                /* Activation function */\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n\n                int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx;\n                output[output_index] = (int8_t) result;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <common_functions.h>\n\n/**\n * Average pooling for s8 using ESP32-P4 PIE SIMD.\n *\n * Uses QACC per-lane accumulation: multiply 16 input channels by a\n * vector of 1s, accumulate per-lane across filter window.\n * Extract 16 × int32 sums via esp.st.qacc.{l,h}.{l,h}.128.ip.\n * Then divide, clamp, and store.\n *\n * Input and output are indexed as [row][column][channel]. Channels are\n * handled 16 at a time by the vector path; the remaining (channels % 16)\n * channels go through a scalar loop.\n *\n * Only taps that fall inside the input are summed and the divisor is the\n * number of such taps. The quotient is rounded to nearest (ties away from\n * zero) and clamped to [activation_min, activation_max]. A window that does\n * not overlap the input at all has no taps; its average is taken as 0\n * (then clamped) rather than dividing by zero.\n */\nvoid esp_nn_avg_pool_s8_esp32p4(const int8_t *input,\n                                 const uint16_t input_wd,\n                                 const uint16_t input_ht,\n                                 int8_t *output,\n                                 const uint16_t output_wd,\n                                 const uint16_t output_ht,\n                                 const uint16_t stride_wd,\n                                 const uint16_t stride_ht,\n                                 const uint16_t filter_wd,\n                                 const uint16_t filter_ht,\n                                 const uint16_t pad_wd,\n                                 const uint16_t pad_ht,\n                                 const int32_t activation_min,\n                                 const int32_t activation_max,\n                                 const uint16_t channels)\n{\n    /* Enable PIE */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    /* Broadcast 1 into q7 for \"multiply by 1\" accumulation trick.\n     * The instruction reads one_val through the pointer, which the compiler\n     * cannot see from an \"r\" operand alone; the \"memory\" clobber keeps the\n     * initializing store from being dropped or moved after the asm. */\n    const int8_t one_val = 1;\n    asm volatile (\n        \"mv     x30, %0             \\n\\t\"\n        \"esp.vldbc.8.ip q7, x30, 0  \\n\\t\"\n        :: \"r\"(&one_val) : \"x30\", \"memory\"\n    );\n\n    const int32_t ch_16 = channels >> 4;\n\n    /* Output for a window with no taps inside the input: 0, clamped. */\n    int32_t empty_result = max(0, activation_min);\n    empty_result = min(empty_result, activation_max);\n\n    int32_t base_y = -pad_ht;\n    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {\n        int32_t base_x = -pad_wd;\n        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {\n            int32_t filter_y_start = max(0, -base_y);\n            int32_t filter_x_start = max(0, -base_x);\n            int32_t filter_y_end = min(filter_ht, input_ht - base_y);\n            int32_t filter_x_end = min(filter_wd, input_wd - base_x);\n            int32_t fy_count = filter_y_end - filter_y_start;\n            int32_t fx_count = filter_x_end - filter_x_start;\n\n            int8_t *out_ptr = output + (out_y * output_wd + out_x) * channels;\n\n            /* Empty window: the asm loop below decrements its counter before\n             * testing it, so it must never be entered with fx_count <= 0,\n             * and the average would divide by zero. */\n            if (fy_count <= 0 || fx_count <= 0) {\n                for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) {\n                    out_ptr[ch_idx] = (int8_t) empty_result;\n                }\n                continue;\n            }\n\n            int32_t filter_cnt = fy_count * fx_count;\n            int32_t half_cnt = filter_cnt >> 1;\n\n            /* Process 16 channels at a time using QACC per-lane accumulation */\n            int32_t ch_offset = 0;\n            for (int32_t ch_blk = 0; ch_blk < ch_16; ch_blk++, ch_offset += 16) {\n\n                /* Clear per-lane accumulators */\n                asm volatile (\"esp.zero.qacc \\n\\t\");\n\n                /* Accumulate via QACC with stride-based fx loop */\n                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {\n                    int32_t in_y = base_y + fy;\n                    /* NOTE(review): row_ptr is only 16-byte aligned when the\n                     * input base and `channels` are multiples of 16; confirm\n                     * esp.vld.128.ip is safe for the other cases. */\n                    const int8_t *row_ptr = input + (in_y * input_wd + base_x + filter_x_start) * channels + ch_offset;\n\n                    /* The loads through x30 are not visible to the compiler,\n                     * hence the \"memory\" clobber. */\n                    asm volatile (\n                        \"mv     x30, %[ptr]              \\n\\t\"\n                        \"mv     s7,  %[cnt]              \\n\\t\"\n                        \"1:                              \\n\\t\"\n                        \"esp.vld.128.ip  q0, x30, 0      \\n\\t\"\n                        \"esp.vmulas.s8.qacc q0, q7       \\n\\t\"\n                        \"add    x30, x30, %[stride]      \\n\\t\"\n                        \"addi   s7, s7, -1               \\n\\t\"\n                        \"bnez   s7, 1b                   \\n\\t\"\n                        :\n                        : [ptr] \"r\"(row_ptr), [cnt] \"r\"(fx_count),\n                          [stride] \"r\"((int32_t)channels)\n                        : \"x30\", \"s7\", \"memory\"\n                    );\n                }\n\n                /* Extract 16 per-lane int32 sums from QACC:\n                 * qacc has 4 quadrants, each 128 bits = 4 × int32 */\n                int32_t sums[16] __attribute__((aligned(16)));\n                asm volatile (\n                    \"mv                      x30, %0     \\n\\t\"\n                    \"esp.st.qacc.l.l.128.ip  x30, 16     \\n\\t\"  /* lanes 0-3 */\n                    \"esp.st.qacc.l.h.128.ip  x30, 16     \\n\\t\"  /* lanes 4-7 */\n                    \"esp.st.qacc.h.l.128.ip  x30, 16     \\n\\t\"  /* lanes 8-11 */\n                    \"esp.st.qacc.h.h.128.ip  x30, 0      \\n\\t\"  /* lanes 12-15 */\n                    :: \"r\"(sums)\n                    : \"x30\", \"memory\"\n                );\n\n                /* Rounded division and activation clamp */\n                for (int k = 0; k < 16; k++) {\n                    int32_t s = sums[k];\n                    int32_t result = s > 0 ? (s + half_cnt) / filter_cnt\n                                           : (s - half_cnt) / filter_cnt;\n                    result = max(result, activation_min);\n                    result = min(result, activation_max);\n                    out_ptr[ch_offset + k] = (int8_t) result;\n                }\n            }\n\n            /* Handle remaining channels scalar */\n            for (int32_t ch_idx = ch_offset; ch_idx < channels; ch_idx++) {\n                int32_t result = 0;\n                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {\n                    for (int32_t fx = filter_x_start; fx < filter_x_end; fx++) {\n                        int32_t in_y = base_y + fy;\n                        int32_t in_x = base_x + fx;\n                        result += input[(in_y * input_wd + in_x) * channels + ch_idx];\n                    }\n                }\n                /* filter_cnt equals the number of taps just summed */\n                result = result > 0 ? (result + half_cnt) / filter_cnt\n                                    : (result - half_cnt) / filter_cnt;\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n                out_ptr[ch_idx] = (int8_t) result;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_s8_esp32s3.S",
    "content": "//\n// SPDX-FileCopyrightText: 2021-2026 Espressif Systems (Shanghai) CO LTD\n//\n// SPDX-License-Identifier: Apache-2.0\n//\n\n    .text\n    .align  4\n    .literal_position\n\n    # Program Unit: esp_nn_avg_pool_s8_esp32s3_asm\n    .type   esp_nn_avg_pool_s8_esp32s3_asm, @function\n    .align   4\n    .global esp_nn_avg_pool_s8_esp32s3_asm\n\n// no of channels must be multiple of 4.\n\n// a2: input\n// a3: input_wd\n// a4: input_ht\n// a5: output\n// a6: output_wd\n// a7: output_ht\n// on stack: stride_wd\n// on stack: stride_ht\n// on stack: filter_wd\n// on stack: filter_ht\n// on stack: pad_wd\n// on stack: pad_ht\n// on stack: activation_min\n// on stack: activation_max\n// on stack: channels\n\nesp_nn_avg_pool_s8_esp32s3_asm: # 0x4\n    # activation_min = 0\n    # activation_max = 4\n    # gra_spill_temp_0 = 8\n    # gra_spill_temp_1 = 12\n    # gra_spill_temp_2 = 16\n    # gra_spill_temp_3 = 20\n    # gra_spill_temp_4 = 24\n    # gra_spill_temp_5 = 28\n    # gra_spill_temp_6 = 32\n    # gra_spill_temp_7 = 36\n    # gra_spill_temp_8 = 40\n    # gra_spill_temp_9 = 44\n    # gra_spill_temp_10 = 48\n    # gra_spill_temp_11 = 52\n    # gra_spill_temp_12 = 56\n    # gra_spill_temp_13 = 60\n    # gra_spill_temp_14 = 64\n    # gra_spill_temp_15 = 68\n    # gra_spill_temp_16 = 72\n    # gra_spill_temp_17 = 76\n    # gra_spill_temp_18 = 80\n    # gra_spill_temp_19 = 84\n    # gra_spill_temp_20 = 88\n    # gra_spill_temp_21 = 92\n    # gra_spill_temp_22 = 96\n    # gra_spill_temp_23 = 100\n    # gra_spill_temp_24 = 104\n    # gra_spill_temp_25 = 108\n    # gra_spill_temp_26 = 112\n    # gra_spill_temp_27 = 116\n    # gra_spill_temp_28 = 120\n    # gra_spill_temp_29 = 124\n    # gra_spill_temp_30 = 128\n    # gra_spill_temp_31 = 132\n    # gra_spill_temp_32 = 136\n    # gra_spill_temp_33 = 140\n    # gra_spill_temp_34 = 144\n    # gra_spill_temp_35 = 148\n    # gra_spill_temp_36 = 152\n    # gra_spill_temp_37 = 156\n    # gra_spill_temp_38 = 160\n    # 
gra_spill_temp_39 = 164\n    # gra_spill_temp_40 = 168\n    # gra_spill_temp_41 = 172\n    # gra_spill_temp_43 = 180\n\n    entry   a1,240                      #\n    mov.n   a11,a3                      # [0]\n    mov.n   a12,a2                      # [1]\n    s32i    a5,a1,136                   # [4]  gra_spill_temp_30\n    s32i    a6,a1,128                   # [3]  gra_spill_temp_32\n\n    l16ui   a5,a1,272                   # [5]  id:663 channels+0x0\n    s32i    a7,a1,72                    # [6]  gra_spill_temp_16\n\n    l32i        a9,a1,264                   # [1]  id:664 activation_min+0x0\n    l32i        a10,a1,268                  # [2]  id:666 activation_max+0x0\n    s32i.n      a9,a1,0                 # [4]  activation_min\n    s32i.n      a10,a1,4                # [3]  activation_max\n    addi.n      a8,a1,4                 # [0]  activation_max\n    ee.vldbc.32 q7,a1               # [5]  id:668 activation_min\n    ee.vldbc.32 q6,a8               # [6]  id:669 activation_max\n    ee.zero.q   q4                      # [0]\n\n    extui   a10,a5,0,3                  # [7]\n    beqz.n  a10,.LBB3_esp_nn_avg_pool_s8_esp32s3_asm    # [8], if (channels % 8 == 0)\n\n    extui   a13,a5,0,2                  # [0]\n    beqz.n  a13,.LBB52_esp_nn_avg_pool_s8_esp32s3_asm   # [1], if (channels % 4 == 0)\n\n// exit\n.Lt_0_44546:    # 0x1e9\n    retw.n                          # [0]\n\n.LBB3_esp_nn_avg_pool_s8_esp32s3_asm:   # 0x1eb // if (channels % 8 == 0)\n\n    l16ui   a7,a1,256                   # [1]  id:671 pad_wd+0x0\n    l16ui   a10,a1,260                  # [5]  id:670 pad_ht+0x0\n    l32i    a15,a1,72                   # [12]  gra_spill_temp_16\n    movi.n  a14,0                   # [13]\n    movi.n  a8,0                    # [14]\n    neg     a10,a10                     # [15]\n    s32i    a10,a1,56                   # [16]  gra_spill_temp_12\n    s32i    a8,a1,44                    # [17]  gra_spill_temp_9\n    s32i.n  a14,a1,20               # [18]  
gra_spill_temp_3\n    sub     a9,a4,a10                   # [19]\n    s32i    a9,a1,40                    # [20]  gra_spill_temp_8\n    mul16u  a15,a15,a5              # [21]\n    neg     a13,a7                      # [22]\n    s32i    a13,a1,104                  # [23]  gra_spill_temp_24\n    s32i.n  a15,a1,16               # [24]  gra_spill_temp_2\n    sub     a13,a3,a13                  # [25]\n    s32i.n  a13,a1,12               # [26]  gra_spill_temp_1\n    j       .Lt_0_28162                     # [27]\n\n.Lt_0_28418:    # 0x24e\n#<loop> Part of loop body line 44, head labeled .Lt_0_28162\n    l32i    a15,a1,260                  # [0]  pad_ht\n    l32i    a14,a1,56                   # [1]  gra_spill_temp_12\n    l32i.n  a9,a1,16                # [2]  gra_spill_temp_2\n    l32i    a13,a1,244                  # [3]  stride_ht\n    l32i    a10,a1,40                   # [4]  gra_spill_temp_8\n    l32i    a8,a1,44                    # [5]  gra_spill_temp_9\n    sub     a10,a10,a13                 # [6]\n    add.n   a8,a8,a9                    # [7]\n    add.n   a14,a14,a13                 # [8]\n    sub     a15,a15,a13                 # [9]\n    s32i    a15,a1,260                  # [10]  pad_ht\n    s32i    a14,a1,56                   # [11]  gra_spill_temp_12\n    s32i    a8,a1,44                    # [12]  gra_spill_temp_9\n    s32i    a10,a1,40                   # [13]  gra_spill_temp_8\n    l32i.n  a8,a1,20                # [14]  gra_spill_temp_3\n    l32i    a9,a1,72                    # [15]  gra_spill_temp_16\n    addi.n  a8,a8,1                 # [16]\n    s32i.n  a8,a1,20                # [17]  gra_spill_temp_3\n    beq a8,a9,.Lt_0_44546           # [18]\n\n.Lt_0_28162:    # 0x281\n    l32i    a10,a1,128                  # [0]  gra_spill_temp_32\n    beqz.n  a10,.Lt_0_28418         # [2]\n\n.LBB7_esp_nn_avg_pool_s8_esp32s3_asm:   # 0x286\n#<loop> Part of loop body line 44, head labeled .Lt_0_28162\n    s32i    a7,a1,112                   # [0]  
gra_spill_temp_26\n    movi.n  a10,0                   # [1]\n    l32i    a9,a1,260                   # [2]  pad_ht\n    l32i.n  a6,a1,12                # [3]  gra_spill_temp_1\n    l32i    a8,a1,44                    # [4]  gra_spill_temp_9\n    movi.n  a13,0                   # [5]\n    l32i    a15,a1,104                  # [6]  gra_spill_temp_24\n    s32i    a15,a1,116                  # [7]  gra_spill_temp_27\n    s32i    a13,a1,48                   # [8]  gra_spill_temp_10\n    s32i    a8,a1,124                   # [9]  gra_spill_temp_29\n    s32i    a6,a1,120                   # [10]  gra_spill_temp_28\n    l32i    a8,a1,40                    # [11]  gra_spill_temp_8\n    l32i    a6,a1,252                   # [12]  filter_ht\n    movi.n  a13,0                   # [13]\n    max     a9,a9,a10                   # [14]\n    s32i    a9,a1,160                   # [15]  gra_spill_temp_38\n    s32i    a13,a1,92                   # [16]  gra_spill_temp_21\n    min     a6,a6,a8                    # [17]\n    bnez.n  a5,.LBB10_esp_nn_avg_pool_s8_esp32s3_asm    # [18]\n\n.Lt_0_29186:    # 0x2ba\n    l32i    a8,a1,116                   # [0]  gra_spill_temp_27\n    l32i    a15,a1,120                  # [1]  gra_spill_temp_28\n    l32i    a9,a1,48                    # [2]  gra_spill_temp_10\n    l32i    a14,a1,240                  # [3]  stride_wd\n    l32i    a10,a1,124                  # [4]  gra_spill_temp_29\n    l32i    a13,a1,112                  # [5]  gra_spill_temp_26\n    add.n   a10,a10,a5                  # [6]\n    s32i    a10,a1,124                  # [7]  gra_spill_temp_29\n    sub     a13,a13,a14                 # [8]\n    add.n   a9,a9,a14                   # [9]\n    sub     a15,a15,a14                 # [10]\n    add.n   a8,a8,a14                   # [11]\n    s32i    a8,a1,116                   # [12]  gra_spill_temp_27\n    s32i    a15,a1,120                  # [13]  gra_spill_temp_28\n    s32i    a9,a1,48                    # [14]  gra_spill_temp_10\n 
   s32i    a13,a1,112                  # [15]  gra_spill_temp_26\n    l32i    a9,a1,92                    # [16]  gra_spill_temp_21\n    l32i    a10,a1,128                  # [17]  gra_spill_temp_32\n    addi.n  a9,a9,1                 # [18]\n    s32i    a9,a1,92                    # [19]  gra_spill_temp_21\n    beq     a9,a10,.Lt_0_28418          # [20]\n\n.Lt_0_28930:    # 0x2f5\n#<loop> Part of loop body line 46, head labeled .Lt_0_29186\n    beqz.n  a5,.Lt_0_29186          # [0]\n\n.LBB10_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x2f7\n#<loop> Part of loop body line 44, head labeled .Lt_0_28162\n    l32i    a14,a1,120                  # [0]  gra_spill_temp_28\n    l32i    a13,a1,248                  # [1]  filter_wd\n    l32i    a9,a1,136                   # [2]  gra_spill_temp_30\n    l32i    a8,a1,124                   # [3]  gra_spill_temp_29\n    movi.n  a15,0                   # [4]\n    s32i    a15,a1,24                   # [5]  gra_spill_temp_60\n    add.n   a10,a8,a5                   # [6]\n    movi.n  a15,0                   # [7]\n    add.n   a8,a8,a9                    # [8]\n    min     a13,a13,a14                 # [9]\n    add.n   a10,a9,a10                  # [10]\n    s32i    a10,a1,180                  # [11]  gra_spill_temp_43\n    s32i    a13,a1,76                   # [12]  gra_spill_temp_17\n    l32i    a14,a1,112                  # [13]  gra_spill_temp_26\n    s32i    a8,a1,148                   # [14]  gra_spill_temp_45\n    max     a14,a14,a15                 # [15]\n    l32i    a15,a1,116                  # [16]  gra_spill_temp_27\n    s32i    a14,a1,152                  # [17]  gra_spill_temp_63\n    add.n   a8,a15,a14                  # [18]\n    s32i    a8,a1,36                    # [19]  gra_spill_temp_7\n    add.n   a15,a15,a13                 # [20]\n    s32i    a15,a1,204                  # [21]  gra_spill_temp_39\n    sub     a13,a13,a14                 # [22]\n    s32i    a13,a1,280                  # [23]  gra_spill_temp_58\n    j  
 .Lt_0_29698                     # [24]\n\n.LBB13_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x33b\n#<loop> Part of loop body line 16, head labeled .Lt_0_29698\n    l32i    a10,a1,56                   # [0]  gra_spill_temp_12\n    l32i    a14,a1,204                  # [1]  gra_spill_temp_39\n    add.n   a10,a10,a15                 # [2]\n    mull    a10,a11,a10                 # [3]\n    movi.n  a15,0                   # [4]\n    add.n   a14,a10,a14                 # [5]\n\n.Lt_0_30466:    # 0x34a\n#<loop> Loop body line 61, nesting depth: 4, estimated iterations: 252\n    l32i    a9,a1,76                    # [0]  gra_spill_temp_17\n    l32i    a8,a1,152                   # [1]  gra_spill_temp_63\n    add.n   a14,a14,a11                 # [2]\n    bge     a8,a9,.Lt_0_30722           # [3]\n\n.LBB16_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x355\n#<loop> Part of loop body line 61, head labeled .Lt_0_30466\n    l32i    a3,a1,36                    # [0]  gra_spill_temp_7\n    l32i    a2,a1,24                    # [1]  gra_spill_temp_4\n    add.n   a3,a3,a10                   # [2]\n    mull    a3,a3,a5                    # [3]\n    movi.n  a8,0                    # [4]\n    add.n   a2,a2,a3                    # [5]\n    l32i    a3,a1,280                   # [6]  gra_spill_temp_58\n    add.n   a2,a12,a2                   # [7]\n    loopgtz a3,.LBB140_esp_nn_avg_pool_s8_esp32s3_asm   # [8]\n\n    ee.vld.l.64.xp  q0,a2,a5            # [0*II+1]  id:677\n    ee.vcmp.lt.s8   q1,q0,q4            # [0*II+3]\n    ee.vzip.8       q0,q1                   # [0*II+4]\n    ee.vcmp.lt.s16  q1,q0,q4        # [0*II+5]\n    ee.vzip.16      q0,q1               # [0*II+6]\n    ee.vadds.s32    q2,q2,q1            # [0*II+7]\n    ee.vadds.s32    q3,q3,q0            # [0*II+8]\n\n\n.LBB140_esp_nn_avg_pool_s8_esp32s3_asm: # 0x385\n#<loop> Part of loop body line 61, head labeled .Lt_0_30466\n    l32i    a2,a1,48                    # [0]  gra_spill_temp_10\n    sub     a9,a7,a2                    # [2]\n   
 sub     a2,a2,a7                    # [3]\n    max     a9,a9,a8                    # [4]\n    l32i    a8,a1,248                   # [5]  filter_wd\n    sub     a2,a11,a2                   # [6]\n    min     a8,a8,a2                    # [7]\n    sub     a8,a8,a9                    # [8]\n    add.n   a15,a15,a8                  # [9]\n\n.Lt_0_30722:    # 0x39f\n#<loop> Part of loop body line 61, head labeled .Lt_0_30466\n    add.n   a10,a10,a11                 # [0]\n    addi.n  a13,a13,1               # [1]\n    bne     a6,a13,.Lt_0_30466          # [2]\n\n.Lt_0_29954:    # 0x3a6\n    srai            a2,a15,1                    # [3]\n\n// move data to general purpose registers and average\n    ee.movi.32.a    q3,a9,0             # [0]\n    ee.movi.32.a    q3,a4,1             # [0]\n\n    blti            a9,1,.Lt_0_32258            # [4]\n    add.n           a9,a9,a2                    # [0]\n    j               .Lt_0_32002                     # [2]\n.Lt_0_32258:    # 0x45e\n    sub             a9,a9,a2                    # [0]\n.Lt_0_32002:    # 0x3b9\n\n    blti            a4,1,.Lt_0_32770            # [1]\n    add.n           a4,a2,a4                    # [0]\n    j               .Lt_0_32514                     # [2]\n.Lt_0_32770:\n    sub             a4,a4,a2                    # [0]\n.Lt_0_32514:    # 0x3c4\n\n    quos            a9,a9,a15                   # [1]\n    quos            a4,a4,a15                   # [1]\n    ee.movi.32.q    q3,a9,0             # [0]\n    ee.movi.32.q    q3,a4,1             # [1]\n\n    ee.movi.32.a    q3,a9,2             # [2]\n    ee.movi.32.a    q3,a14,3            # [0]\n\n    blti            a9,1,.Lt_0_33282            # [3]\n    add.n           a9,a9,a2                    # [0]\n    j               .Lt_0_33026                     # [2]\n.Lt_0_33282:    # 0x470\n    sub             a9,a9,a2                    # [0]\n.Lt_0_33026:    # 0x3d5\n\n    blti            a14,1,.Lt_0_33794           # [1]\n    add.n           
a14,a2,a14                  # [0]\n    j               .Lt_0_33538                     # [2]\n.Lt_0_33794:    # 0x479\n    sub             a14,a14,a2                      # [0]\n.Lt_0_33538:    # 0x3e0\n\n    quos            a9,a9,a15                   # [1]\n    quos            a14,a14,a15                 # [1]\n    ee.movi.32.q    q3,a9,2             # [0]\n    ee.movi.32.q    q3,a14,3            # [1]\n\n\n    ee.movi.32.a    q2,a9,0             # [0]\n    ee.movi.32.a    q2,a4,1             # [0]\n\n    blti            a9,1,.Lt_0_34306            # [3]\n    add.n           a9,a9,a2                    # [0]\n    j               .Lt_0_34050                     # [2]\n.Lt_0_34306:    # 0x482\n    sub             a9,a9,a2                    # [0]\n.Lt_0_34050:    # 0x3f1\n\n    blti            a4,1,.Lt_0_34818            # [1]\n    add.n           a4,a2,a4                    # [0]\n    j               .Lt_0_34562                     # [2]\n.Lt_0_34818:    # 0x48b\n    sub             a4,a4,a2                    # [0]\n.Lt_0_34562:    # 0x3fc\n\n    quos            a9,a9,a15                   # [1]\n    quos            a4,a4,a15                   # [1]\n    ee.movi.32.q    q2,a9,0             # [0]\n    ee.movi.32.q    q2,a4,1             # [1]\n\n    ee.movi.32.a    q2,a9,2             # [2]\n    ee.movi.32.a    q2,a14,3            # [0]\n\n    blti            a9,1,.Lt_0_35330            # [3]\n    add.n           a9,a9,a2                    # [0]\n    j               .Lt_0_35074                     # [2]\n.Lt_0_35330:    # 0x494\n    sub             a9,a9,a2                    # [0]\n.Lt_0_35074:    # 0x40d\n\n    blti            a14,1,.Lt_0_35842           # [1]\n    add.n           a14,a2,a14                  # [0]\n    j               .Lt_0_35586                     # [2]\n.Lt_0_35842:    # 0x49d\n    sub             a14,a14,a2                      # [0]\n.Lt_0_35586:    # 0x418\n\n    quos            a9,a9,a15                   # [1]\n    quos            
a14,a14,a15                 # [1]\n    ee.movi.32.q    q2,a9,2             # [0]\n    ee.movi.32.q    q2,a14,3            # [1]\n\n\n    l32i            a9,a1,180                   # [0]  gra_spill_temp_43\n    l32i            a14,a1,24                   # [1]  gra_spill_temp_4\n    l32i            a13,a1,148                  # [2]  gra_spill_temp_45\n    ee.vmin.s32     q1,q3,q6            # [4]\n    ee.vmax.s32     q1,q1,q7            # [5]\n    ee.vmin.s32     q5,q2,q6            # [8]\n    addi.n          a14,a14,8               # [9]\n    s32i            a14,a1,24                   # [10]  gra_spill_temp_4\n    ee.vmax.s32     q5,q5,q7            # [11]\n    addi.n          a8,a13,8                    # [12]\n    s32i            a8,a1,148                   # [13]  gra_spill_temp_45\n    ee.vunzip.16    q1,q5               # [14]\n    ee.vunzip.8     q1,q5               # [15]\n    ee.vst.l.64.ip  q1,a13,0        # [16]  id:678\n    bge             a8,a9,.Lt_0_29186           # [17]\n\n.Lt_0_29698:    # 0x44b\n#<loop> Loop body line 16, nesting depth: 3, estimated iterations: 252\n    mv.qr   q3,q4                       # [0]\n    l32i    a15,a1,160                  # [1]  gra_spill_temp_38\n    mv.qr   q2,q4                       # [2]\n    mov.n   a13,a15                     # [3]\n    blt a15,a6,.LBB13_esp_nn_avg_pool_s8_esp32s3_asm    # [4]\n\n.Lt_0_51458:    # 0x459\n#<loop> Part of loop body line 16, head labeled .Lt_0_29698\n    movi.n  a15,0                   # [0]\n    j   .Lt_0_29954                     # [1]\n\n\n.LBB52_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x4a6 // if (channels % 4 == 0)\n\n    l16ui   a7,a1,256                   # [1]  id:671 pad_wd+0x0\n    l16ui   a13,a1,260                  # [5]  id:670 pad_ht+0x0\n    s32i    a13,a1,64                   # [8]  gra_spill_temp_4\n    l32i    a8,a1,72                    # [12]  gra_spill_temp_16\n    movi.n  a15,0                   # [13]\n    movi.n  a9,0                    # [14]\n    neg     
a13,a13                     # [15]\n    s32i    a13,a1,192                  # [16]  gra_spill_temp_36\n    s32i    a9,a1,32                    # [17]  gra_spill_temp_6\n    s32i.n  a15,a1,8                # [18]  gra_spill_temp_0\n    sub     a10,a4,a13                  # [19]\n    s32i    a10,a1,28                   # [20]  gra_spill_temp_5\n    mul16u  a8,a8,a5                # [21]\n    neg     a14,a7                      # [22]\n    s32i    a14,a1,104                  # [23]  gra_spill_temp_24\n    s32i.n  a8,a1,16                # [24]  gra_spill_temp_2\n    sub     a14,a3,a14                  # [25]\n    s32i.n  a14,a1,12               # [26]  gra_spill_temp_1\n    j   .Lt_0_37890                     # [27]\n\n.Lt_0_38146:    # 0x50b\n#<loop> Part of loop body line 161, head labeled .Lt_0_37890\n    l32i    a15,a1,64                   # [0]  gra_spill_temp_4\n    l32i    a14,a1,192                  # [1]  gra_spill_temp_36\n    l32i.n  a9,a1,16                # [2]  gra_spill_temp_2\n    l32i    a13,a1,244                  # [3]  stride_ht\n    l32i    a10,a1,28                   # [4]  gra_spill_temp_5\n    l32i    a8,a1,32                    # [5]  gra_spill_temp_6\n    sub     a10,a10,a13                 # [6]\n    add.n   a8,a8,a9                    # [7]\n    add.n   a14,a14,a13                 # [8]\n    sub     a15,a15,a13                 # [9]\n    s32i    a15,a1,64                   # [10]  gra_spill_temp_4\n    s32i    a14,a1,192                  # [11]  gra_spill_temp_36\n    s32i    a8,a1,32                    # [12]  gra_spill_temp_6\n    s32i    a10,a1,28                   # [13]  gra_spill_temp_5\n    l32i.n  a8,a1,8                 # [14]  gra_spill_temp_0\n    l32i    a9,a1,72                    # [15]  gra_spill_temp_16\n    addi.n  a8,a8,1                 # [16]\n    s32i.n  a8,a1,8                 # [17]  gra_spill_temp_0\n    sub     a8,a8,a9                    # [18]\n    beqz    a8,.Lt_0_44546              # [19]\n\n.Lt_0_37890:    # 
0x541\n#<loop> Loop body line 161, nesting depth: 1, estimated iterations: 252\n    l32i    a10,a1,128                  # [0]  gra_spill_temp_32\n    beqz.n  a10,.Lt_0_38146         # [2]\n\n#<loop> Part of loop body line 161, head labeled .Lt_0_37890\n    s32i    a7,a1,96                    # [0]  gra_spill_temp_22\n    movi.n  a10,0                   # [1]\n    l32i    a9,a1,64                    # [2]  gra_spill_temp_4\n    l32i.n  a6,a1,12                # [3]  gra_spill_temp_1\n    l32i    a8,a1,32                    # [4]  gra_spill_temp_6\n    movi.n  a13,0                   # [5]\n    l32i    a15,a1,104                  # [6]  gra_spill_temp_24\n    s32i    a15,a1,100                  # [7]  gra_spill_temp_23\n    s32i    a13,a1,148                  # [8]  gra_spill_temp_35\n    s32i    a8,a1,108                   # [9]  gra_spill_temp_25\n    s32i    a6,a1,144                   # [10]  gra_spill_temp_24\n    l32i    a8,a1,28                    # [11]  gra_spill_temp_5\n    l32i    a6,a1,252                   # [12]  filter_ht\n    max     a9,a9,a10                   # [14]\n    s32i    a9,a1,168                   # [15]  gra_spill_temp_40\n    s32i    a13,a1,88                   # [16]  gra_spill_temp_20\n    min     a6,a6,a8                    # [17]\n    bnez.n  a5,.LBB59_esp_nn_avg_pool_s8_esp32s3_asm    # [18]\n\n.Lt_0_38914:    # 0x57a\n#<loop> Loop body line 163\n    l32i    a8,a1,100                   # [0]  gra_spill_temp_23\n    l32i    a15,a1,144                  # [1]  gra_spill_temp_24\n    l32i    a9,a1,148                   # [2]  gra_spill_temp_35\n    l32i    a14,a1,240                  # [3]  stride_wd\n    l32i    a10,a1,108                  # [4]  gra_spill_temp_25\n    l32i    a13,a1,96                   # [5]  gra_spill_temp_22\n    add.n   a10,a10,a5                  # [6]\n    s32i    a10,a1,108                  # [7]  gra_spill_temp_25\n    sub     a13,a13,a14                 # [8]\n    add.n   a9,a9,a14                   # [9]\n    
sub     a15,a15,a14                 # [10]\n    add.n   a8,a8,a14                   # [11]\n    s32i    a8,a1,100                   # [12]  gra_spill_temp_23\n    s32i    a15,a1,144                  # [13]  gra_spill_temp_24\n    s32i    a9,a1,148                   # [14]  gra_spill_temp_35\n    s32i    a13,a1,96                   # [15]  gra_spill_temp_22\n    l32i    a9,a1,88                    # [16]  gra_spill_temp_20\n    l32i    a10,a1,128                  # [17]  gra_spill_temp_32\n    addi.n  a9,a9,1                 # [18]\n    s32i    a9,a1,88                    # [19]  gra_spill_temp_20\n    beq     a9,a10,.Lt_0_38146          # [20]\n\n    beqz.n  a5,.Lt_0_38914          # [0]\n\n.LBB59_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x5b7\n#<loop> Part of loop body line 161, head labeled .Lt_0_37890\n    l32i    a14,a1,144                  # [0]  gra_spill_temp_24\n    l32i    a13,a1,248                  # [1]  filter_wd\n    l32i    a9,a1,136                   # [2]  gra_spill_temp_30\n    l32i    a8,a1,108                   # [3]  gra_spill_temp_25\n    movi.n  a15,0                   # [4]\n    s32i    a15,a1,216                  # [5]  gra_spill_temp_52\n    add.n   a10,a8,a5                   # [6]\n    add.n   a8,a8,a9                    # [8]\n    min     a13,a13,a14                 # [9]\n    add.n   a10,a9,a10                  # [10]\n    s32i    a10,a1,172                  # [11]  gra_spill_temp_41\n    s32i    a13,a1,132                  # [12]  gra_spill_temp_31\n    l32i    a14,a1,96                   # [13]  gra_spill_temp_22\n    s32i    a8,a1,164                   # [14]  gra_spill_temp_39\n    max     a14,a14,a15                 # [15]\n    l32i    a15,a1,100                  # [16]  gra_spill_temp_23\n    s32i    a14,a1,208                  # [17]  gra_spill_temp_50\n    add.n   a8,a15,a14                  # [18]\n    s32i    a8,a1,60                    # [19]  gra_spill_temp_13\n    add.n   a15,a15,a13                 # [20]\n    s32i    
a15,a1,196                  # [21]  gra_spill_temp_37\n    sub     a13,a13,a14                 # [22]\n    s32i    a13,a1,52                   # [23]  gra_spill_temp_11\n    j       .Lt_0_39426                     # [24]\n\n.LBB62_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x5fb\n#<loop> Part of loop body line 173, head labeled .Lt_0_39426\n    l32i    a10,a1,192                  # [0]  gra_spill_temp_36\n    l32i    a14,a1,196                  # [1]  gra_spill_temp_37\n    add.n   a10,a10,a15                 # [2]\n    mull    a10,a11,a10                 # [3]\n    movi.n  a15,0                   # [4]\n    add.n   a14,a10,a14                 # [5]\n\n.Lt_0_40194:    # 0x60a\n#<loop> Loop body line 178, nesting depth: 4, estimated iterations: 252\n    l32i    a9,a1,132                   # [0]  gra_spill_temp_31\n    l32i    a8,a1,208                   # [1]  gra_spill_temp_50\n    add.n   a14,a14,a11                 # [2]\n    bge a8,a9,.Lt_0_40450           # [3]\n\n.LBB65_esp_nn_avg_pool_s8_esp32s3_asm:  # 0x615\n#<loop> Part of loop body line 178, head labeled .Lt_0_40194\n    l32i    a3,a1,60                    # [0]  gra_spill_temp_13\n    l32i    a2,a1,216                   # [1]  gra_spill_temp_52\n    add.n   a3,a3,a10                   # [2]\n    mull    a3,a3,a5                    # [3]\n    l32i    a4,a1,52                    # [4]  gra_spill_temp_11\n    add.n   a2,a2,a3                    # [5]\n    add.n   a2,a12,a2                   # [6]\n    loopgtz a4,.LBB155_esp_nn_avg_pool_s8_esp32s3_asm   # [7]\n\n    ee.vldbc.32.xp  q0,a2,a5                # [0*II+0]  id:684\n    ee.vcmp.lt.s8   q1,q0,q4            # [0*II+2]\n    ee.vzip.8       q0,q1                   # [0*II+3]\n    ee.vcmp.lt.s16  q1,q0,q4        # [0*II+4]\n    ee.vzip.16      q0,q1               # [0*II+5]\n    ee.vadds.s32    q2,q2,q0            # [0*II+6]\n\n.LBB155_esp_nn_avg_pool_s8_esp32s3_asm: # 0x63e\n#<loop> Part of loop body line 178, head labeled .Lt_0_40194\n    l32i    a2,a1,148     
              # [0]  gra_spill_temp_35\n    movi.n  a8,0                    # [1]\n    sub     a9,a7,a2                    # [2]\n    sub     a2,a2,a7                    # [3]\n    max     a9,a9,a8                    # [4]\n    l32i    a8,a1,248                   # [5]  filter_wd\n    sub     a2,a11,a2                   # [6]\n    min     a8,a8,a2                    # [7]\n    sub     a8,a8,a9                    # [8]\n    add.n   a15,a15,a8                  # [9]\n\n.Lt_0_40450:    # 0x65a\n#<loop> Part of loop body line 178, head labeled .Lt_0_40194\n    add.n   a10,a10,a11                 # [0]\n    addi.n  a13,a13,1               # [1]\n    bne     a6,a13,.Lt_0_40194          # [2]\n\n.Lt_0_39682:    # 0x661\n#<loop> Part of loop body line 173, head labeled .Lt_0_39426\n    srai            a2,a15,1                    # [5]\n\n// move to gp registers and average\n\n    ee.movi.32.a    q2,a9,0             # [0]\n    ee.movi.32.a    q2,a4,1             # [0]\n\n    blti            a9,1,.Lt_0_41986            # [3]\n    add.n           a9,a9,a2                    # [0]\n    j               .Lt_0_41730                     # [2]\n.Lt_0_41986:    # 0x482\n    sub             a9,a9,a2                    # [0]\n.Lt_0_41730:    # 0x3f1\n\n    blti            a4,1,.Lt_0_42498            # [1]\n    add.n           a4,a2,a4                    # [0]\n    j               .Lt_0_42242                     # [2]\n.Lt_0_42498:    # 0x48b\n    sub             a4,a4,a2                    # [0]\n.Lt_0_42242:    # 0x3fc\n\n\n    quos            a9,a9,a15                   # [1]\n    quos            a4,a4,a15                   # [1]\n    ee.movi.32.q    q2,a9,0             # [0]\n    ee.movi.32.q    q2,a4,1             # [1]\n\n    ee.movi.32.a    q2,a9,2             # [2]\n    ee.movi.32.a    q2,a14,3            # [0]\n\n    blti            a9,1,.Lt_0_43010            # [3]\n    add.n           a9,a9,a2                    # [0]\n    j               .Lt_0_42754                     # 
[2]\n.Lt_0_43010:    # 0x494\n    sub             a9,a9,a2                    # [0]\n.Lt_0_42754:    # 0x40d\n\n\n    blti            a14,1,.Lt_0_43522           # [1]\n    add.n           a14,a2,a14                  # [0]\n    j               .Lt_0_43266                     # [2]\n.Lt_0_43522:    # 0x49d\n    sub             a14,a14,a2                      # [0]\n.Lt_0_43266:    # 0x418\n\n    quos            a9,a9,a15                   # [1]\n    quos            a14,a14,a15                 # [1]\n    ee.movi.32.q    q2,a9,2             # [0]\n    ee.movi.32.q    q2,a14,3            # [1]\n\n\n    l32i            a9,a1,172                   # [0]  gra_spill_temp_41\n    l32i            a8,a1,164                   # [1]  gra_spill_temp_39\n    l32i            a14,a1,216                  # [2]  gra_spill_temp_52\n    addi.n          a14,a14,4               # [5]\n    ee.vmin.s32     q2,q2,q6            # [6]\n    s32i            a14,a1,216                  # [7]  gra_spill_temp_52\n    ee.vmax.s32     q2,q2,q7            # [8]\n    ee.vunzip.16    q2,q1               # [9]\n    ee.vunzip.8     q2,q1               # [10]\n    ee.vst.l.64.ip  q2,a1,0         # [11]  id:691\n    l32i.n          a13,a1,0                # [12]  id:692\n    s32i.n          a13,a8,0                # [13]  id:693\n    addi.n          a8,a8,4                 # [14]\n    s32i            a8,a1,164                   # [15]  gra_spill_temp_39\n    bge             a8,a9,.Lt_0_38914           # [16]\n\n.Lt_0_39426:    # 0x6cb\n    l32i    a15,a1,168                  # [0]  gra_spill_temp_40\n    mv.qr   q2,q4                       # [1]\n    mov.n   a13,a15                     # [2]\n    blt     a15,a6,.LBB62_esp_nn_avg_pool_s8_esp32s3_asm    # [3]\n\n.Lt_0_52738:    # 0x6d6\n    movi.n  a15,0                   # [0]\n    j       .Lt_0_39682                     # [1]\n\n    .size   esp_nn_avg_pool_s8_esp32s3_asm, . - esp_nn_avg_pool_s8_esp32s3_asm\n"
  },
  {
    "path": "src/pooling/esp_nn_avg_pool_s8_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * ESP32-S3 optimized avg pool wrapper.\n * Routes to existing assembly for channels%4==0,\n * provides int16-accumulation C path for other cases.\n */\n\n#include <stdint.h>\n#include <string.h>\n#include <common_functions.h>\n\n/* Existing S3 assembly (handles depth%4==0) */\nextern void esp_nn_avg_pool_s8_esp32s3_asm(const int8_t *input,\n                             const uint16_t input_wd,\n                             const uint16_t input_ht,\n                             int8_t *output,\n                             const uint16_t output_wd,\n                             const uint16_t output_ht,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const uint16_t filter_wd,\n                             const uint16_t filter_ht,\n                             const uint16_t pad_wd,\n                             const uint16_t pad_ht,\n                             const int32_t activation_min,\n                             const int32_t activation_max,\n                             const uint16_t channels);\n\nvoid esp_nn_avg_pool_s8_esp32s3(const int8_t *input,\n                             const uint16_t input_wd,\n                             const uint16_t input_ht,\n                             int8_t *output,\n                             const uint16_t output_wd,\n                             const uint16_t output_ht,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const uint16_t filter_wd,\n                             const uint16_t filter_ht,\n                             const uint16_t pad_wd,\n                             const uint16_t pad_ht,\n                             const int32_t activation_min,\n                        
     const int32_t activation_max,\n                             const uint16_t channels)\n{\n    /* Use existing assembly for channels % 4 == 0 */\n    if (channels % 4 == 0) {\n        esp_nn_avg_pool_s8_esp32s3_asm(input, input_wd, input_ht, output,\n                                        output_wd, output_ht, stride_wd, stride_ht,\n                                        filter_wd, filter_ht, pad_wd, pad_ht,\n                                        activation_min, activation_max, channels);\n        return;\n    }\n\n    /* C path with int16 accumulation for non-aligned channels */\n    int16_t acc_buf[channels];\n\n    int32_t base_y = -pad_ht;\n    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {\n        int32_t base_x = -pad_wd;\n        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {\n            int32_t fy_start = max(0, -base_y);\n            int32_t fx_start = max(0, -base_x);\n            int32_t fy_end = min(filter_ht, input_ht - base_y);\n            int32_t fx_end = min(filter_wd, input_wd - base_x);\n            int32_t filter_cnt = (fy_end - fy_start) * (fx_end - fx_start);\n\n            memset(acc_buf, 0, channels * sizeof(int16_t));\n\n            for (int32_t fy = fy_start; fy < fy_end; fy++) {\n                for (int32_t fx = fx_start; fx < fx_end; fx++) {\n                    int32_t in_idx = ((base_y + fy) * input_wd + (base_x + fx)) * channels;\n                    for (int c = 0; c < channels; c++) {\n                        acc_buf[c] += (int16_t)input[in_idx + c];\n                    }\n                }\n            }\n\n            int32_t half_cnt = filter_cnt / 2;\n            int32_t out_idx = (out_y * output_wd + out_x) * channels;\n            for (int c = 0; c < channels; c++) {\n                int32_t result = acc_buf[c];\n                result = result > 0 ? 
(result + half_cnt) / filter_cnt\n                                    : (result - half_cnt) / filter_cnt;\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n                output[out_idx + c] = (int8_t)result;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/pooling/esp_nn_max_pool_ansi.c",
    "content": "// Copyright 2020-2021 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n\n#include <common_functions.h>\n\nvoid esp_nn_max_pool_s8_ansi(const int8_t *input,\n                             const uint16_t input_wd,\n                             const uint16_t input_ht,\n                             int8_t *output,\n                             const uint16_t output_wd,\n                             const uint16_t output_ht,\n                             const uint16_t stride_wd,\n                             const uint16_t stride_ht,\n                             const uint16_t filter_wd,\n                             const uint16_t filter_ht,\n                             const uint16_t pad_wd,\n                             const uint16_t pad_ht,\n                             const int32_t activation_min,\n                             const int32_t activation_max,\n                             const uint16_t channels)\n{\n    int32_t base_y = -pad_ht;\n    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {\n        int32_t base_x = -pad_wd;\n        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {\n            /* Make sure filter does not cross the input box */\n            int32_t filter_y_start = max(0, -base_y);\n            int32_t filter_x_start = max(0, -base_x);\n            int32_t filter_y_end = min(filter_ht, input_ht 
- base_y);\n            int32_t filter_x_end = min(filter_wd, input_wd - base_x);\n\n            for (int32_t ch_idx = 0; ch_idx < channels; ch_idx++) {\n                int8_t result = INT8_MIN;\n\n                for (int32_t filter_y = filter_y_start; filter_y < filter_y_end; filter_y++) {\n                    for (int32_t filter_x = filter_x_start; filter_x < filter_x_end; filter_x++) {\n                        int32_t in_x_idx = base_x + filter_x;\n                        int32_t in_y_idx = base_y + filter_y;\n                        int32_t input_index = (in_y_idx * input_wd + in_x_idx) * channels + ch_idx;\n                        result = max(input[input_index], result);\n                    }\n                }\n\n                /* Activation function */\n                result = max(result, activation_min);\n                result = min(result, activation_max);\n\n                int32_t output_index = (out_y * output_wd + out_x) * channels + ch_idx;\n                output[output_index] = result;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/pooling/esp_nn_max_pool_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <limits.h>\n#include <common_functions.h>\n\n/**\n * Max pooling for s8 using ESP32-P4 PIE SIMD.\n * Vectorizes the channel dimension: processes 16 channels per iteration\n * using esp.vmax.s8 to find running maximum across the filter window.\n */\nvoid esp_nn_max_pool_s8_esp32p4(const int8_t *input,\n                                 const uint16_t input_wd,\n                                 const uint16_t input_ht,\n                                 int8_t *output,\n                                 const uint16_t output_wd,\n                                 const uint16_t output_ht,\n                                 const uint16_t stride_wd,\n                                 const uint16_t stride_ht,\n                                 const uint16_t filter_wd,\n                                 const uint16_t filter_ht,\n                                 const uint16_t pad_wd,\n                                 const uint16_t pad_ht,\n                                 const int32_t activation_min,\n                                 const int32_t activation_max,\n                                 const uint16_t channels)\n{\n    /* Enable PIE */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n\n    /* Broadcast activation_min and activation_max into vectors */\n    int8_t act_min_val = (int8_t) activation_min;\n    int8_t act_max_val = (int8_t) activation_max;\n    int8_t int8_min_val = INT8_MIN;\n\n    asm volatile (\n        \"mv              x30, %0     \\n\\t\"\n        \"esp.vldbc.8.ip  q4, x30, 0 \\n\\t\"  /* q4 = broadcast(activation_min) */\n        \"mv              x30, %1     \\n\\t\"\n        \"esp.vldbc.8.ip  q5, x30, 0 \\n\\t\"  /* q5 = 
broadcast(activation_max) */\n        \"mv              x30, %2     \\n\\t\"\n        \"esp.vldbc.8.ip  q6, x30, 0 \\n\\t\"  /* q6 = broadcast(INT8_MIN) for init */\n        :: \"r\"(&act_min_val), \"r\"(&act_max_val), \"r\"(&int8_min_val)\n        : \"x30\"\n    );\n\n    const int32_t ch_16 = channels >> 4;  /* number of full 16-ch blocks */\n\n    int32_t base_y = -pad_ht;\n    for (int32_t out_y = 0; out_y < output_ht; out_y++, base_y += stride_ht) {\n        int32_t base_x = -pad_wd;\n        for (int32_t out_x = 0; out_x < output_wd; out_x++, base_x += stride_wd) {\n            int32_t filter_y_start = max(0, -base_y);\n            int32_t filter_x_start = max(0, -base_x);\n            int32_t filter_y_end = min(filter_ht, input_ht - base_y);\n            int32_t filter_x_end = min(filter_wd, input_wd - base_x);\n\n            int8_t *out_ptr = output + (out_y * output_wd + out_x) * channels;\n\n            /* Process channels in blocks of 16 */\n            int32_t ch_offset = 0;\n            for (int32_t ch_blk = 0; ch_blk < ch_16; ch_blk++, ch_offset += 16) {\n                /* Initialize running max to INT8_MIN (copy q6 -> q0) */\n                asm volatile (\"esp.vmax.s8 q0, q6, q6 \\n\\t\");\n\n                /* Accumulate max across filter window.\n                 * For fx loop: input channels are at stride=channels apart. 
*/\n                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {\n                    int32_t in_y = base_y + fy;\n                    const int8_t *row_ptr = input + (in_y * input_wd + base_x + filter_x_start) * channels + ch_offset;\n                    int32_t fx_count = filter_x_end - filter_x_start;\n\n                    asm volatile (\n                        \"mv     x30, %[ptr]              \\n\\t\"\n                        \"mv     s7,  %[cnt]              \\n\\t\"\n                        \"1:                              \\n\\t\"\n                        \"esp.vld.128.ip  q1, x30, 0      \\n\\t\"\n                        \"esp.vmax.s8     q0, q0, q1      \\n\\t\"\n                        \"add    x30, x30, %[stride]      \\n\\t\"\n                        \"addi   s7, s7, -1               \\n\\t\"\n                        \"bnez   s7, 1b                   \\n\\t\"\n                        :\n                        : [ptr] \"r\"(row_ptr), [cnt] \"r\"(fx_count),\n                          [stride] \"r\"((int32_t)channels)\n                        : \"x30\", \"s7\"\n                    );\n                }\n\n                /* Apply activation: max(act_min, min(act_max, result)) and store */\n                {\n                    int8_t *store_ptr = out_ptr + ch_offset;\n                    asm volatile (\n                        \"esp.vmax.s8     q0, q0, q4       \\n\\t\"  /* max(result, act_min) */\n                        \"esp.vmin.s8     q0, q0, q5       \\n\\t\"  /* min(result, act_max) */\n                        \"mv              x30, %0          \\n\\t\"\n                        \"esp.vst.128.ip  q0, x30, 0       \\n\\t\"  /* store 16 channels */\n                        :\n                        : \"r\"(store_ptr)\n                        : \"x30\", \"memory\"\n                    );\n                }\n            }\n\n            /* Handle remaining channels scalar */\n            for (int32_t ch_idx = ch_offset; ch_idx < 
channels; ch_idx++) {\n                int8_t result = INT8_MIN;\n                for (int32_t fy = filter_y_start; fy < filter_y_end; fy++) {\n                    for (int32_t fx = filter_x_start; fx < filter_x_end; fx++) {\n                        int32_t in_y = base_y + fy;\n                        int32_t in_x = base_x + fx;\n                        int32_t input_index = (in_y * input_wd + in_x) * channels + ch_idx;\n                        result = max(input[input_index], result);\n                    }\n                }\n                result = max(result, (int8_t) activation_min);\n                result = min(result, (int8_t) activation_max);\n                out_ptr[ch_idx] = result;\n            }\n        }\n    }\n}\n"
  },
  {
    "path": "src/pooling/esp_nn_max_pool_s8_esp32s3.S",
    "content": "// Copyright 2021-2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n    .text\n    .align  4\n    .literal_position\n\n    # Program Unit: esp_nn_max_pool_s8_esp32s3\n    .type   esp_nn_max_pool_s8_esp32s3, @function\n    .align   4\n    .global esp_nn_max_pool_s8_esp32s3\n\n// no of channels must be multiple of 4\n\nesp_nn_max_pool_s8_esp32s3: # 0x4\n    # int8_min = 0\n    # gra_spill_temp_0 = 4\n    # gra_spill_temp_1 = 8\n    # gra_spill_temp_2 = 12\n    # gra_spill_temp_3 = 16\n    # gra_spill_temp_4 = 20\n    # gra_spill_temp_5 = 24\n    # gra_spill_temp_6 = 28\n    # gra_spill_temp_7 = 32\n    # gra_spill_temp_8 = 36\n    # gra_spill_temp_9 = 40\n    # gra_spill_temp_10 = 44\n    # gra_spill_temp_11 = 48\n    # gra_spill_temp_12 = 52\n    # gra_spill_temp_13 = 56\n    # gra_spill_temp_14 = 60\n    # gra_spill_temp_15 = 64\n    # gra_spill_temp_16 = 68\n    # gra_spill_temp_17 = 72\n    # gra_spill_temp_18 = 76\n    # gra_spill_temp_19 = 80\n    # gra_spill_temp_20 = 84\n    # gra_spill_temp_21 = 88\n    # gra_spill_temp_22 = 92\n    # gra_spill_temp_23 = 96\n\n// a2: input\n// a3: input_wd\n// a4: input_ht\n// a5: output\n// a6: output_wd\n// a7: output_ht\n// on stack: stride_wd = 120\n// on stack: stride_ht = 124\n// on stack: filter_wd = 128\n// on stack: filter_ht = 132\n// on stack: pad_wd = 136\n// on stack: pad_ht = 140\n// on stack: activation_min\n// on stack: 
activation_max\n// on stack: channels\n\n\n    entry   a1,120                      #\n    mov.n   a12,a2                      # [0]\n    s32i    a6,a1,4                 # [2]  gra_spill_temp_0\n    s32i    a7,a1,68                    # [3]  gra_spill_temp_16\n    mov.n   a11,a3                      # [4]\n    s32i    a5,a1,96                    # [5]  gra_spill_temp_23\n\n    l16ui   a5,a1,152                   # [6]  id:465 channels+0x0\n    movi    a3,-128                     # [7]\n    s32i.n  a3,a1,0                 # [1]  int8_min\n\n    addi.n      a9,a1,148                   # [0]  activation_max\n    addi.n      a15,a1,144                  # [1]  activation_min\n    ee.vldbc.8  q3,a1               # [7]  id:473 int8_min+0x0\n    ee.vldbc.8  q5,a15                  # [8]  id:470 activation_min+0x0\n    ee.vldbc.8  q4,a9               # [9]  id:471 activation_max+0x0\n\n    extui   a8,a5,0,3                   # [8]\n    beqz.n  a8,.LBB3_esp_nn_max_pool_s8_esp32s3     # [9] // if (channels % 8 == 0)\n\n    extui   a14,a5,0,2                  # [0]\n    beqz.n  a14,.LBB25_esp_nn_max_pool_s8_esp32s3   # [1] // if (channels % 4 == 0)\n\n    retw.n                          # [0]   // exit\n\n.LBB3_esp_nn_max_pool_s8_esp32s3:   # 0x1c5 // if (channels % 8 == 0)\n\n    l16ui   a15,a1,136                  # [1]  id:475 pad_wd+0x0\n    l16ui   a14,a1,140                  # [4]  id:474 pad_ht+0x0\n    movi.n  a8,0                    # [13]\n    movi.n  a10,0                   # [15]\n    s32i    a14,a1,44                   # [7]  gra_spill_temp_10\n    neg     a15,a15                     # [12]\n    mul16u  a9,a6,a5                # [14]\n    neg     a14,a14                     # [16]\n    s32i    a14,a1,92                   # [17]  gra_spill_temp_22\n    s32i    a10,a1,52                   # [18]  gra_spill_temp_12\n    s32i    a9,a1,60                    # [19]  gra_spill_temp_14\n    s32i.n  a8,a1,36                # [16]  gra_spill_temp_8\n    s32i    a15,a1,56     
              # [21]  gra_spill_temp_13\n    sub     a13,a4,a14                  # [22]\n    s32i    a13,a1,48                   # [23]  gra_spill_temp_11\n    sub     a15,a11,a15                 # [24]\n    s32i.n  a15,a1,40               # [25]  gra_spill_temp_9\n\n.Lt_0_21506:    # 0x229\n    l32i    a8,a1,4                 # [0]  gra_spill_temp_0\n    beqz.n  a8,.Lt_0_21762          # [2]\n\n    movi.n  a10,0                   # [0]\n    l32i    a9,a1,44                    # [1]  gra_spill_temp_10\n    l32i.n  a15,a1,40               # [2]  gra_spill_temp_9\n    l32i    a8,a1,52                    # [3]  gra_spill_temp_12\n    l32i.n  a13,a1,136                  # [4]  ,pad_wd\n    l32i    a14,a1,56                   # [5]  gra_spill_temp_13\n    s32i    a14,a1,80                   # [6]  gra_spill_temp_19\n    s32i    a13,a1,76                   # [7]  gra_spill_temp_18\n    s32i    a8,a1,88                    # [8]  gra_spill_temp_21\n    s32i    a15,a1,84                   # [9]  gra_spill_temp_20\n    l32i    a8,a1,48                    # [10]  gra_spill_temp_11\n    max     a9,a9,a10                   # [11]\n    l32i    a15,a1,132                  # [12]  filter_ht\n    s32i    a9,a1,8                 # [13]  gra_spill_temp_1\n    movi.n  a9,0                    # [14]\n    min     a15,a15,a8                  # [15]\n    s32i    a9,a1,64                    # [16]  gra_spill_temp_15\n\n.Lt_0_22274:    # 0x25d\n    beqz.n  a5,.Lt_0_22530          # [0]\n\n.LBB10_esp_nn_max_pool_s8_esp32s3:  # 0x25f\n#<loop> Part of loop body line 46, head labeled .Lt_0_22274\n    l32i    a6,a1,76                    # [0]  gra_spill_temp_18\n    l32i    a13,a1,96                   # [1]  gra_spill_temp_23\n    l32i    a8,a1,84                    # [2]  gra_spill_temp_20\n    l32i    a7,a1,128                   # [3]  filter_wd\n    l32i    a10,a1,88                   # [4]  gra_spill_temp_21\n    movi.n  a9,0                    # [5]\n    s32i    a9,a1,20                    
# [6]  gra_spill_temp_4\n    add.n   a14,a10,a5                  # [7]\n    min     a7,a7,a8                    # [8]\n    add.n   a10,a10,a13                 # [9]\n    add.n   a14,a13,a14                 # [10]\n    s32i    a14,a1,12                   # [11]  gra_spill_temp_2\n    s32i    a10,a1,16                   # [12]  gra_spill_temp_3\n    movi.n  a8,0                    # [13]\n    l32i    a10,a1,80                   # [14]  gra_spill_temp_19\n    max     a6,a6,a8                    # [15]\n    sub     a9,a7,a6                    # [16]\n    s32i    a9,a1,28                    # [17]  gra_spill_temp_6\n    add.n   a13,a10,a6                  # [18]\n    s32i    a13,a1,24                   # [19]  gra_spill_temp_5\n    add.n   a10,a10,a7                  # [16]\n    s32i    a10,a1,72                   # [21]  gra_spill_temp_17\n\n.Lt_0_23042:    # 0x29a\n    l32i    a8,a1,8                 # [0]  gra_spill_temp_1\n    mv.qr   q1,q3                       # [1]\n    mov.n   a13,a8                      # [2]\n    bge     a8,a15,.Lt_0_23298          # [3]\n\n.LBB13_esp_nn_max_pool_s8_esp32s3:  # 0x2a5\n#<loop> Part of loop body line 40, head labeled .Lt_0_23042\n    l32i    a10,a1,92                   # [0]  gra_spill_temp_22\n    l32i    a14,a1,72                   # [1]  gra_spill_temp_17\n    add.n   a10,a10,a8                  # [2]\n    mull    a10,a11,a10                 # [3]\n    add.n   a14,a10,a14                 # [5]\n\n.Lt_0_23810:    # 0x2b2\n    add.n   a14,a14,a11                 # [0]\n    addi.n  a13,a13,1               # [1]\n    bge     a6,a7,.Lt_0_24066           # [2]\n\n.LBB16_esp_nn_max_pool_s8_esp32s3:  # 0x2b9\n    l32i    a3,a1,24                    # [0]  gra_spill_temp_5\n    l32i    a2,a1,20                    # [1]  gra_spill_temp_4\n    add.n   a3,a3,a10                   # [2]\n    mull    a3,a3,a5                    # [3]\n    add.n   a2,a2,a3                    # [5]\n    l32i    a3,a1,28                    # [6]  
gra_spill_temp_6\n    add.n   a2,a12,a2                   # [7]\n    loopgtz a3,.LBB93_esp_nn_max_pool_s8_esp32s3    # [8]\n\n    ee.vld.l.64.ip  q0,a2,0         # [0*II+1]  id:481\n    add.n           a2,a2,a5                    # [0*II+2]\n    ee.vmax.s8      q1,q1,q0            # [0*II+3]\n.LBB93_esp_nn_max_pool_s8_esp32s3:  # 0x2d8\n\n.Lt_0_24066:    # 0x2d8\n    add.n   a10,a10,a11                 # [0]\n    bne     a15,a13,.Lt_0_23810         # [1]\n\n.Lt_0_23298:    # 0x2dd\n    l32i    a9,a1,12                    # [0]  gra_spill_temp_2\n    l32i    a13,a1,20                   # [1]  gra_spill_temp_4\n    l32i    a8,a1,16                    # [2]  gra_spill_temp_3\n    ee.vmin.s8  q2,q1,q4            # [3]\n    ee.vmax.s8  q2,q2,q5            # [4]\n    mov.n   a10,a8                      # [5]\n    addi.n  a13,a13,8               # [6]\n    s32i    a13,a1,20                   # [7]  gra_spill_temp_4\n    ee.vst.l.64.ip  q2,a10,0        # [8]  id:482\n    addi.n  a8,a8,8                 # [9]\n    s32i    a8,a1,16                    # [10]  gra_spill_temp_3\n    blt     a8,a9,.Lt_0_23042           # [11]\n\n.Lt_0_22530:    # 0x2fe\n    l32i    a13,a1,84                   # [0]  gra_spill_temp_20\n    l32i    a14,a1,80                   # [1]  gra_spill_temp_19\n    l32i    a10,a1,120                  # [2]  stride_wd\n    l32i    a8,a1,88                    # [3]  gra_spill_temp_21\n    l32i    a9,a1,76                    # [4]  gra_spill_temp_18\n    add.n   a8,a8,a5                    # [5]\n    s32i    a8,a1,88                    # [6]  gra_spill_temp_21\n    sub     a9,a9,a10                   # [7]\n    add.n   a14,a14,a10                 # [8]\n    sub     a13,a13,a10                 # [9]\n    s32i    a13,a1,84                   # [10]  gra_spill_temp_20\n    s32i    a14,a1,80                   # [11]  gra_spill_temp_19\n    s32i    a9,a1,76                    # [12]  gra_spill_temp_18\n    l32i    a14,a1,64                   # [13]  
gra_spill_temp_15\n    l32i    a8,a1,4                 # [14]  gra_spill_temp_0\n    addi.n  a14,a14,1               # [15]\n    s32i    a14,a1,64                   # [16]  gra_spill_temp_15\n    sub     a14,a14,a8                  # [17]\n    bnez    a14,.Lt_0_22274             # [18]\n\n.Lt_0_21762:    # 0x334\n#<loop> Part of loop body line 20, head labeled .Lt_0_21506\n    l32i    a8,a1,44                    # [0]  gra_spill_temp_10\n    l32i    a15,a1,92                   # [1]  gra_spill_temp_22\n    l32i    a10,a1,60                   # [2]  gra_spill_temp_14\n    l32i    a14,a1,124                  # [3]  stride_ht\n    l32i    a13,a1,48                   # [4]  gra_spill_temp_11\n    l32i    a9,a1,52                    # [5]  gra_spill_temp_12\n    sub     a13,a13,a14                 # [6]\n    add.n   a9,a9,a10                   # [7]\n    add.n   a15,a15,a14                 # [8]\n    sub     a8,a8,a14                   # [9]\n    s32i    a8,a1,44                    # [10]  gra_spill_temp_10\n    s32i    a15,a1,92                   # [11]  gra_spill_temp_22\n    s32i    a9,a1,52                    # [12]  gra_spill_temp_12\n    s32i    a13,a1,48                   # [13]  gra_spill_temp_11\n    l32i.n  a9,a1,36                # [14]  gra_spill_temp_8\n    l32i    a10,a1,68                   # [15]  gra_spill_temp_16\n    addi.n  a9,a9,1                 # [16]\n    s32i.n  a9,a1,36                # [17]  gra_spill_temp_8\n    sub     a9,a9,a10                   # [18]\n    bnez    a9,.Lt_0_21506              # [19]\n\n    retw.n                          # [0] // exit\n\n.LBB25_esp_nn_max_pool_s8_esp32s3:  # 0x36d // if (channels % 4 == 0)\n\n    l16ui   a10,a1,136                  # [1]  id:475 pad_wd+0x0\n    l16ui   a9,a1,140                   # [4]  id:474 pad_ht+0x0\n    movi.n  a13,0                   # [13]\n    movi.n  a15,0                   # [15]\n    neg     a10,a10                     # [12]\n    s32i    a9,a1,44                    # [7]  
gra_spill_temp_10\n    mul16u  a14,a6,a5               # [14]\n    neg     a9,a9                       # [16]\n    s32i    a9,a1,92                    # [17]  gra_spill_temp_22\n    s32i    a15,a1,52                   # [18]  gra_spill_temp_12\n    s32i    a14,a1,60                   # [19]  gra_spill_temp_14\n    s32i.n  a13,a1,36               # [16]  gra_spill_temp_8\n    s32i    a10,a1,56                   # [21]  gra_spill_temp_13\n    sub     a8,a4,a9                    # [22]\n    s32i    a8,a1,48                    # [23]  gra_spill_temp_11\n    sub     a10,a11,a10                 # [24]\n    s32i.n  a10,a1,40               # [25]  gra_spill_temp_9\n\n.Lt_0_27138:    # 0x3d5\n    l32i    a13,a1,4                # [0]  gra_spill_temp_0\n    beqz.n  a13,.Lt_0_27394         # [2]\n\n.LBB29_esp_nn_max_pool_s8_esp32s3:  # 0x3da\n#<loop> Part of loop body line 107, head labeled .Lt_0_27138\n    movi.n  a10,0                   # [0]\n    l32i    a9,a1,44                    # [1]  gra_spill_temp_10\n    l32i.n  a15,a1,40               # [2]  gra_spill_temp_9\n    l32i    a8,a1,52                    # [3]  gra_spill_temp_12\n    l32i    a14,a1,56                   # [4]  gra_spill_temp_13\n    l32i.n  a13,a1,136                  # [5]  pad_wd\n    s32i    a13,a1,76                   # [6]  gra_spill_temp_18\n    s32i    a14,a1,80                   # [7]  gra_spill_temp_19\n    s32i    a8,a1,88                    # [8]  gra_spill_temp_21\n    s32i    a15,a1,84                   # [9]  gra_spill_temp_20\n    l32i    a8,a1,48                    # [10]  gra_spill_temp_11\n    l32i    a15,a1,132                  # [11]  filter_ht\n    movi.n  a14,0                   # [12]\n    max     a9,a9,a10                   # [13]\n    s32i    a9,a1,8                 # [14]  gra_spill_temp_1\n    s32i    a14,a1,64                   # [15]  gra_spill_temp_15\n    min     a15,a15,a8                  # [16]\n\n.Lt_0_27906:    # 0x409\n#<loop> Loop body line 109, nesting depth: 2, 
estimated iterations: 56\n    beqz.n  a5,.Lt_0_28162          # [0]\n\n.LBB32_esp_nn_max_pool_s8_esp32s3:  # 0x40b\n#<loop> Part of loop body line 109, head labeled .Lt_0_27906\n    l32i    a6,a1,76                    # [0]  gra_spill_temp_18\n    l32i    a13,a1,96                   # [1]  gra_spill_temp_23\n    l32i    a8,a1,84                    # [2]  gra_spill_temp_20\n    l32i    a7,a1,128                   # [3]  filter_wd\n    l32i    a10,a1,88                   # [4]  gra_spill_temp_21\n    movi.n  a9,0                    # [5]\n    s32i    a9,a1,32                    # [6]  gra_spill_temp_7\n    add.n   a14,a10,a5                  # [7]\n    min     a7,a7,a8                    # [8]\n    add.n   a10,a10,a13                 # [9]\n    add.n   a14,a13,a14                 # [10]\n    s32i    a14,a1,12                   # [11]  gra_spill_temp_2\n    s32i    a10,a1,16                   # [12]  gra_spill_temp_3\n    movi.n  a8,0                    # [13]\n    l32i    a10,a1,80                   # [14]  gra_spill_temp_19\n    max     a6,a6,a8                    # [15]\n    sub     a9,a7,a6                    # [16]\n    s32i    a9,a1,28                    # [17]  gra_spill_temp_6\n    add.n   a13,a10,a6                  # [18]\n    s32i    a13,a1,24                   # [19]  gra_spill_temp_5\n    add.n   a10,a10,a7                  # [16]\n    s32i    a10,a1,72                   # [21]  gra_spill_temp_17\n\n.Lt_0_28674:    # 0x446\n#<loop> Loop body line 8, nesting depth: 3, estimated iterations: 56\n    l32i    a8,a1,8                 # [0]  gra_spill_temp_1\n    mv.qr   q1,q3                       # [1]\n    mov.n   a13,a8                      # [2]\n    bge     a8,a15,.Lt_0_28930          # [3]\n\n.LBB35_esp_nn_max_pool_s8_esp32s3:  # 0x451\n#<loop> Part of loop body line 8, head labeled .Lt_0_28674\n    l32i    a10,a1,92                   # [0]  gra_spill_temp_22\n    l32i    a14,a1,72                   # [1]  gra_spill_temp_17\n    add.n   a10,a10,a8         
         # [2]\n    mull    a10,a11,a10                 # [3]\n    add.n   a14,a10,a14                 # [5]\n\n.Lt_0_29442:    # 0x45e\n    add.n   a14,a14,a11                 # [0]\n    addi.n  a13,a13,1               # [1]\n    bge     a6,a7,.Lt_0_29698           # [2]\n\n.LBB38_esp_nn_max_pool_s8_esp32s3:  # 0x465\n    l32i    a3,a1,24                    # [0]  gra_spill_temp_5\n    l32i    a2,a1,32                    # [1]  gra_spill_temp_7\n    add.n   a3,a3,a10                   # [2]\n    mull    a3,a3,a5                    # [3]\n    l32i    a4,a1,28                    # [4]  gra_spill_temp_6\n    add.n   a2,a2,a3                    # [5]\n    add.n   a2,a12,a2                   # [6]\n    loopgtz a4,.LBB108_esp_nn_max_pool_s8_esp32s3   # [7]\n\n    ee.vldbc.32 q0,a2               # [0*II+0]  id:489\n    add.n       a2,a2,a5                    # [0*II+1]\n    ee.vmax.s8  q1,q1,q0            # [0*II+2]\n.LBB108_esp_nn_max_pool_s8_esp32s3: # 0x482\n\n.Lt_0_29698:    # 0x482\n    add.n   a10,a10,a11                 # [0]\n    bne     a15,a13,.Lt_0_29442         # [1]\n\n.Lt_0_28930:    # 0x487\n#<loop> Part of loop body line 8, head labeled .Lt_0_28674\n    l32i            a9,a1,12                    # [0]  gra_spill_temp_2\n    l32i            a8,a1,16                    # [1]  gra_spill_temp_3\n    l32i            a10,a1,32                   # [3]  gra_spill_temp_7\n\n    ee.vmin.s8      q5,q1,q4            # [4]\n    ee.vmax.s8      q5,q5,q5            # [5]\n    addi.n          a10,a10,4               # [6]\n    ee.movi.32.a    q5,a13,0\n    s32i            a10,a1,32                   # [9]  gra_spill_temp_7\n    s32i.n          a13,a8,0                # [10]  id:492\n    addi.n          a8,a8,4                 # [11]\n    s32i            a8,a1,16                    # [12]  gra_spill_temp_3\n    blt             a8,a9,.Lt_0_28674           # [13]\n\n.Lt_0_28162:    # 0x4ad\n#<loop> Part of loop body line 109, head labeled .Lt_0_27906\n    l32i    a13,a1,84 
                  # [0]  gra_spill_temp_20\n    l32i    a14,a1,80                   # [1]  gra_spill_temp_19\n    l32i    a10,a1,120                  # [2]  stride_wd\n    l32i    a8,a1,88                    # [3]  gra_spill_temp_21\n    l32i    a9,a1,76                    # [4]  gra_spill_temp_18\n    add.n   a8,a8,a5                    # [5]\n    s32i    a8,a1,88                    # [6]  gra_spill_temp_21\n    sub     a9,a9,a10                   # [7]\n    add.n   a14,a14,a10                 # [8]\n    sub     a13,a13,a10                 # [9]\n    s32i    a13,a1,84                   # [10]  gra_spill_temp_20\n    s32i    a14,a1,80                   # [11]  gra_spill_temp_19\n    s32i    a9,a1,76                    # [12]  gra_spill_temp_18\n    l32i    a14,a1,64                   # [13]  gra_spill_temp_15\n    l32i    a8,a1,4                 # [14]  gra_spill_temp_0\n    addi.n  a14,a14,1               # [15]\n    s32i    a14,a1,64                   # [16]  gra_spill_temp_15\n    sub     a14,a14,a8                  # [17]\n    bnez    a14,.Lt_0_27906             # [18]\n\n.Lt_0_27394:    # 0x4e3\n#<loop> Part of loop body line 107, head labeled .Lt_0_27138\n    l32i    a8,a1,44                    # [0]  gra_spill_temp_10\n    l32i    a15,a1,92                   # [1]  gra_spill_temp_22\n    l32i    a10,a1,60                   # [2]  gra_spill_temp_14\n    l32i    a14,a1,124                  # [3]  stride_ht\n    l32i    a13,a1,48                   # [4]  gra_spill_temp_11\n    l32i    a9,a1,52                    # [5]  gra_spill_temp_12\n    sub     a13,a13,a14                 # [6]\n    add.n   a9,a9,a10                   # [7]\n    add.n   a15,a15,a14                 # [8]\n    sub     a8,a8,a14                   # [9]\n    s32i    a8,a1,44                    # [10]  gra_spill_temp_10\n    s32i    a15,a1,92                   # [11]  gra_spill_temp_22\n    s32i    a9,a1,52                    # [12]  gra_spill_temp_12\n    s32i    a13,a1,48                   # 
[13]  gra_spill_temp_11\n    l32i.n  a9,a1,36                # [14]  gra_spill_temp_8\n    l32i    a10,a1,68                   # [15]  gra_spill_temp_16\n    addi.n  a9,a9,1                 # [16]\n    s32i.n  a9,a1,36                # [17]  gra_spill_temp_8\n    sub     a9,a9,a10                   # [18]\n    bnez    a9,.Lt_0_27138              # [19]\n\n    retw.n                          # [0] // exit\n\n    .size   esp_nn_max_pool_s8_esp32s3, . - esp_nn_max_pool_s8_esp32s3\n"
  },
  {
    "path": "src/softmax/esp_nn_softmax_ansi.c",
    "content": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"softmax_common.h\"\n\nint32_t esp_nn_get_softmax_scratch_size_ansi(const int32_t width, const int32_t height)\n{\n    (void) width;\n    (void) height;\n    return 0;\n}\n\nvoid esp_nn_set_softmax_scratch_buf_ansi(void *buffer)\n{\n    (void) buffer;\n    return;\n}\n\nvoid esp_nn_softmax_s8_ansi(const int8_t *input_data,\n                            const int32_t height,\n                            const int32_t width,\n                            const int32_t mult,\n                            const int32_t shift,\n                            const int32_t diff_min,\n                            int8_t *output_data)\n{\n    // The representation chosen for the input to the exp() function is Q5.26.\n    // We need to leave extra space since values that we skip might be as large as\n    // -32 before multiplying by input mult, and therefore as large as\n    // -16 afterwards.  
Note that exp(-8) is definitely not insignificant to\n    // accumulation, but exp(-16) definitely is.\n#define ACCUM_BITS  12\n#define DIFF_BITS   5\n\n    const int32_t mask = (1 << shift);\n    int32_t col = 0;\n    const int8_t *in_ptr = input_data;\n    int8_t *out_ptr = output_data;\n\n    for (int row_idx = 0; row_idx < height; row_idx++) {\n        int8_t max_in_row = in_ptr[0];\n        for (col = 1; col < width; col++) {\n            max_in_row = max(max_in_row, in_ptr[col]);\n        }\n\n        int32_t input_diff = 0;\n        int32_t sum_of_exps = 0;\n\n        for (col = 0; col < width; col++) {\n            input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);\n                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);\n                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);\n            }\n        }\n\n        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);\n        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));\n        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8;\n\n        for (col = 0; col < width; col++) {\n            input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);\n                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);\n                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);\n                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;\n                out_ptr[col] = (int8_t) esp_nn_saturate8(result);\n            } else {\n                out_ptr[col] = -128;\n            }\n        }\n        in_ptr  += width;\n        out_ptr += width;\n    }\n}\n"
  },
  {
    "path": "src/softmax/esp_nn_softmax_opt.c",
    "content": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include \"softmax_common.h\"\n#include <stdio.h>\n\nstatic int32_t *scratch_buf = NULL;\n\n/**\n * @brief   Get scratch buffer size needed by softmax function\n *\n * @param   width\n * @param   height\n * @return  size in bytes\n *\n * @note    buffer must be 4 byte aligned\n */\nint32_t esp_nn_get_softmax_scratch_size_opt(const int32_t width, const int32_t height)\n{\n    (void) height;\n    return width * 4;\n}\n\n/**\n * @brief   Set scratch buffer to be used by softmax function\n *\n * @param   buffer  this can be NULL if one needs to unset it\n *                  must be aligned to 4 bytes\n */\nvoid esp_nn_set_softmax_scratch_buf_opt(void *buffer)\n{\n    scratch_buf = (int32_t *) buffer;\n}\n\nvoid esp_nn_softmax_s8_opt(const int8_t *input_data,\n                           const int32_t height,\n                           const int32_t width,\n                           const int32_t mult,\n                           const int32_t shift,\n                           const int32_t diff_min,\n                           int8_t *output_data)\n{\n    if (scratch_buf == NULL) {\n        printf(\"%s error! 
scratch buffer not set\\n\", __FUNCTION__);\n        return;\n    }\n    // The representation chosen for the input to the exp() function is Q5.26.\n    // We need to leave extra space since values that we skip might be as large as\n    // -32 before multiplying by input mult, and therefore as large as\n    // -16 afterwards.  Note that exp(-8) is definitely not insignificant to\n    // accumulation, but exp(-16) definitely is.\n#define ACCUM_BITS  12\n#define DIFF_BITS   5\n\n    const int32_t mask = (1 << shift);\n    int32_t col = 0;\n    const int8_t *in_ptr = input_data;\n    int8_t *out_ptr = output_data;\n\n    for (int row_idx = 0; row_idx < height; row_idx++) {\n        int8_t max_in_row = in_ptr[0];\n        for (col = 1; col < width; col++) {\n            max_in_row = max(max_in_row, in_ptr[col]);\n        }\n\n        int32_t input_diff = 0;\n        int32_t sum_of_exps = 0;\n\n        for (col = 0; col < width; col++) {\n            input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);\n                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);\n                scratch_buf[col] = exp_raw; // store to avoid duplicate calculation later\n                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);\n            }\n        }\n\n        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);\n        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));\n        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8;\n\n        for (col = 0; col < width; col++) {\n            input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                int32_t exp_raw = scratch_buf[col];\n                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);\n                const int32_t 
result = DIV_POW2(shifted_output, bits_over_unit) - 128;\n                out_ptr[col] = (int8_t) esp_nn_saturate8(result);\n            } else {\n                out_ptr[col] = -128;\n            }\n        }\n        in_ptr  += width;\n        out_ptr += width;\n    }\n}\n"
  },
  {
    "path": "src/softmax/esp_nn_softmax_s8_esp32p4.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include \"softmax_common.h\"\n#include <stdio.h>\n#include <limits.h>\n\nstatic int32_t *p4_scratch_buf = NULL;\n\nint32_t esp_nn_get_softmax_scratch_size_esp32p4(const int32_t width, const int32_t height)\n{\n    (void) height;\n    return width * 4;\n}\n\nvoid esp_nn_set_softmax_scratch_buf_esp32p4(void *buffer)\n{\n    /* Enable PIE */\n    asm volatile (\n        \"csrsi  0x7f2, 0b01        \\n\\t\"\n        \"li     x29, 0b10          \\n\\t\"\n        \"esp.movx.w.cfg x29        \\n\\t\"\n        ::: \"x29\"\n    );\n    p4_scratch_buf = (int32_t *) buffer;\n}\n\n/**\n * Softmax for s8 optimized for ESP32-P4.\n * Phase 1 (find-max) uses PIE esp.vmax.s8 for 16 elements at a time.\n * Phases 2-3 (exp + normalize) use cached exp values in scratch buffer.\n */\nvoid esp_nn_softmax_s8_esp32p4(const int8_t *input_data,\n                                const int32_t height,\n                                const int32_t width,\n                                const int32_t mult,\n                                const int32_t shift,\n                                const int32_t diff_min,\n                                int8_t *output_data)\n{\n    if (p4_scratch_buf == NULL) {\n        printf(\"%s error! scratch buffer not set\\n\", __FUNCTION__);\n        return;\n    }\n\n#define ACCUM_BITS  12\n#define DIFF_BITS   5\n\n    const int32_t mask = (1 << shift);\n    int32_t col = 0;\n    const int8_t *in_ptr = input_data;\n    int8_t *out_ptr = output_data;\n\n    for (int row_idx = 0; row_idx < height; row_idx++) {\n        /* Phase 1: Find max in row using PIE vectorization.\n         * Use auto-incrementing loads to avoid redundant mv per iteration. 
*/\n        int8_t max_in_row;\n        if (width >= 16) {\n            int32_t vec_count = (width >> 4);  /* number of 16-element groups */\n            int32_t vec_processed = vec_count << 4;\n\n            int32_t max_scalar;\n            asm volatile (\n                \"mv     x30, %[ptr]              \\n\\t\"\n                \"esp.vld.128.ip q0, x30, 16      \\n\\t\"  /* load first 16, advance */\n                \"addi   %[cnt], %[cnt], -1       \\n\\t\"  /* one group already loaded */\n                \"beqz   %[cnt], 2f               \\n\\t\"\n                \"1:                              \\n\\t\"\n                \"esp.vld.128.ip q1, x30, 16      \\n\\t\"  /* load next 16, advance */\n                \"esp.vmax.s8    q0, q0, q1       \\n\\t\"  /* running max */\n                \"addi   %[cnt], %[cnt], -1       \\n\\t\"\n                \"bnez   %[cnt], 1b               \\n\\t\"\n                \"2:                              \\n\\t\"\n                \"esp.max.s8.a   q0, %[max]       \\n\\t\"  /* horizontal reduce */\n                : [cnt] \"+r\"(vec_count), [max] \"=r\"(max_scalar)\n                : [ptr] \"r\"(in_ptr)\n                : \"x30\"\n            );\n            max_in_row = (int8_t) max_scalar;\n\n            /* Check remaining elements (< 16) */\n            for (int32_t i = vec_processed; i < width; i++) {\n                if (in_ptr[i] > max_in_row) max_in_row = in_ptr[i];\n            }\n        } else {\n            max_in_row = in_ptr[0];\n            for (col = 1; col < width; col++) {\n                max_in_row = max(max_in_row, in_ptr[col]);\n            }\n        }\n\n        /* Phase 2: Compute exp values and sum */\n        int32_t input_diff = 0;\n        int32_t sum_of_exps = 0;\n\n        for (col = 0; col < width; col++) {\n            input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);\n     
           const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);\n                p4_scratch_buf[col] = exp_raw;\n                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);\n            }\n        }\n\n        /* Phase 3: Normalize */\n        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);\n        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 31));\n        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - sizeof(int8_t) * 8;\n\n        for (col = 0; col < width; col++) {\n            input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                int32_t exp_raw = p4_scratch_buf[col];\n                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);\n                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;\n                out_ptr[col] = (int8_t) esp_nn_saturate8(result);\n            } else {\n                out_ptr[col] = -128;\n            }\n        }\n        in_ptr  += width;\n        out_ptr += width;\n    }\n}\n"
  },
  {
    "path": "src/softmax/esp_nn_softmax_s8_esp32s3.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n/*\n * ESP32-S3 optimized softmax with SIMD find-max for width >= 16.\n */\n\n#include <stdint.h>\n#include \"softmax_common.h\"\n\nstatic int32_t *scratch_buf_s3 = NULL;\n\nint32_t esp_nn_get_softmax_scratch_size_esp32s3(const int32_t width, const int32_t height)\n{\n    (void) height;\n    return width * 4;\n}\n\nvoid esp_nn_set_softmax_scratch_buf_esp32s3(void *buffer)\n{\n    scratch_buf_s3 = (int32_t *) buffer;\n}\n\n/* Find max of int8 array — SIMD for len >= 32, scalar for smaller */\nstatic inline int8_t find_max_s8(const int8_t *data, int32_t len)\n{\n    int8_t m = -128;\n    int32_t idx = 0;\n\n#if defined(__XTENSA__)\n    if (len >= 32) {\n        /* Use ee.vmax.s8 for 16 elements/cycle — only for len >= 32\n         * to avoid potential alignment issues with small buffers */\n        int8_t tmp_buf[16] __attribute__((aligned(16)));\n        const int8_t *ptr = data;\n        int8_t *buf_ptr = tmp_buf;\n        int32_t simd_len = len & ~15; /* round down to multiple of 16 */\n\n        asm volatile (\n            \"ee.vld.128.ip  q0, %[ptr], 16          \\n\\t\" /* q0 = running max */\n            \"movi.n %[idx], 16                       \\n\\t\"\n            \"j      2f                               \\n\\t\"\n            \"1:                                      \\n\\t\"\n            \"ee.vld.128.ip  q1, %[ptr], 16           \\n\\t\"\n            \"ee.vmax.s8     q0, q0, q1               \\n\\t\"\n            \"addi   %[idx], %[idx], 16               \\n\\t\"\n            \"2:                                      \\n\\t\"\n            \"blt    %[idx], %[slen], 1b              \\n\\t\"\n            /* Store vector max to tmp_buf for horizontal reduction */\n            \"ee.vst.128.ip  q0, %[buf], 16           \\n\\t\"\n            : [idx] \"+r\"(idx), [ptr] \"+r\"(ptr), [buf] \"+r\"(buf_ptr)\n            : [slen] 
\"r\"(simd_len)\n            : \"memory\"\n        );\n\n        /* Horizontal reduction of 16 max values */\n        for (int i = 0; i < 16; i++) {\n            if (tmp_buf[i] > m) m = tmp_buf[i];\n        }\n        idx = simd_len;\n    }\n#endif\n\n    /* Scalar for remainder or small arrays */\n    for (; idx < len; idx++) {\n        if (data[idx] > m) m = data[idx];\n    }\n    return m;\n}\n\nvoid esp_nn_softmax_s8_esp32s3(const int8_t *input_data,\n                                const int32_t height,\n                                const int32_t width,\n                                const int32_t mult,\n                                const int32_t shift,\n                                const int32_t diff_min,\n                                int8_t *output_data)\n{\n    if (scratch_buf_s3 == NULL) {\n        /* Fall through to opt version if scratch not set */\n        return;\n    }\n\n#define ACCUM_BITS  12\n\n    const int32_t mask = (1 << shift);\n    const int8_t *in_ptr = input_data;\n    int8_t *out_ptr = output_data;\n\n    for (int row_idx = 0; row_idx < height; row_idx++) {\n        /* Phase 1: Find max */\n        int8_t max_in_row = find_max_s8(in_ptr, width);\n\n        /* Phase 2: Compute exp and accumulate sum */\n        int32_t sum_of_exps = 0;\n        for (int col = 0; col < width; col++) {\n            int32_t input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                const int32_t input_diff_rescaled = SAT_HIGH_MUL(input_diff * mask, mult);\n                const int32_t exp_raw = esp_nn_exp_on_negative_values(input_diff_rescaled);\n                scratch_buf_s3[col] = exp_raw;\n                sum_of_exps += DIV_POW2(exp_raw, ACCUM_BITS);\n            }\n        }\n\n        /* Phase 3: Compute normalization scale */\n        const int32_t headroom_plus1 = esp_nn_clz32((uint32_t) sum_of_exps);\n        const int32_t shifted_scale = ONE_OVER_ONE_X((sum_of_exps << headroom_plus1) - (1 << 
31));\n        const int32_t bits_over_unit = ACCUM_BITS - headroom_plus1 + 31 - 8;\n\n        /* Phase 4: Normalize and output — unrolled 4x for reduced loop overhead */\n        int col = 0;\n        for (; col + 3 < width; col += 4) {\n            for (int k = 0; k < 4; k++) {\n                int32_t input_diff = in_ptr[col + k] - max_in_row;\n                if (input_diff >= diff_min) {\n                    int32_t exp_raw = scratch_buf_s3[col + k];\n                    const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);\n                    const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;\n                    out_ptr[col + k] = (int8_t) esp_nn_saturate8(result);\n                } else {\n                    out_ptr[col + k] = -128;\n                }\n            }\n        }\n        /* Remainder */\n        for (; col < width; col++) {\n            int32_t input_diff = in_ptr[col] - max_in_row;\n            if (input_diff >= diff_min) {\n                int32_t exp_raw = scratch_buf_s3[col];\n                const int32_t shifted_output = SAT_HIGH_MUL(shifted_scale, exp_raw);\n                const int32_t result = DIV_POW2(shifted_output, bits_over_unit) - 128;\n                out_ptr[col] = (int8_t) esp_nn_saturate8(result);\n            } else {\n                out_ptr[col] = -128;\n            }\n        }\n\n        in_ptr  += width;\n        out_ptr += width;\n    }\n#undef ACCUM_BITS\n}\n"
  },
  {
    "path": "src/softmax/softmax_common.h",
    "content": "// Copyright 2022 Espressif Systems (Shanghai) PTE LTD\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n//     http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n#include <stdint.h>\n#include <common_functions.h>\n\n#define MASK_IF_ZERO(x)                 (x) == 0 ? ~0 : 0\n#define MASK_IF_NON_ZERO(x)             (x) != 0 ? ~0 : 0\n#define SELECT_USING_MASK(mask, a, b)   ((mask) & (a)) ^ (~(mask) & (b))\n#define SAT_HIGH_MUL(x, y)              esp_nn_sat_round_doubling_high_mul((x), (y))\n#define DIV_POW2(x,y)                   esp_nn_div_by_power_of_two((x), (y))\n\n__NN_FORCE_INLINE__ int32_t mul_power_of_2(int val, int exp)\n{\n    const int32_t thresh = ((1 << (31 - exp)) - 1);\n    int32_t result = val << exp;\n    result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val > thresh), INT32_MAX, result);\n    result = SELECT_USING_MASK(MASK_IF_NON_ZERO(val < -thresh), INT32_MIN, result);\n    return result;\n}\n\n/**\n * @brief   Calculate `1 / (1 + x)` for x in [0, 1]\n *\n * @param   val     input value to calculate `1/(1+x)` for\n * @return  `int32_t` result\n * @note    Newton-Raphson division\n *\n *          https://en.wikipedia.org/wiki/Division_algorithm#Newton.E2.80.93Raphson_division\n *          Refer to that page for the logic behind the 48/17 and 32/17 constants.\n *          Pseudocode: https://en.wikipedia.org/wiki/Division_algorithm#Pseudocode\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_one_over_one_plus_x_for_x_in_0_1(int32_t val)\n{\n    const int64_t sum = (int64_t) val + 
INT32_MAX;\n    const int32_t half_denominator = (int32_t) ((sum + (sum >= 0 ? 1 : -1)) / 2L);\n    int32_t constant_48_over_17 = 1515870810;\n    int32_t constant_neg_32_over_17 = -1010580540;\n    int32_t x = constant_48_over_17 + SAT_HIGH_MUL(half_denominator, constant_neg_32_over_17);\n    const int32_t fixed_2_one = (1 << 29);\n\n    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);\n    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);\n    x += mul_power_of_2(SAT_HIGH_MUL(x, fixed_2_one - SAT_HIGH_MUL(half_denominator, x)), 2);\n\n    return mul_power_of_2(x, 1);\n}\n\n#define ONE_OVER_ONE_X(x)   esp_nn_one_over_one_plus_x_for_x_in_0_1((x))\n\n/**\n * @brief   Return exp(x) for x < 0.\n *\n */\n__NN_FORCE_INLINE__ int32_t esp_nn_exp_on_negative_values(int32_t val)\n{\n    int32_t shift = 24;\n\n    const int32_t one_quarter = (1 << shift);\n    int32_t mask = one_quarter - 1;\n    const int32_t val_mod_minus_quarter = (val & mask) - one_quarter;\n    const int32_t remainder             = val_mod_minus_quarter - val;\n\n    // calculate exponent for x in [-1/4, 0) in `result`\n    const int32_t x                     = (val_mod_minus_quarter << 5) + (1 << 28);\n    const int32_t x2                    = SAT_HIGH_MUL(x, x);\n    const int32_t x3                    = SAT_HIGH_MUL(x2, x);\n    const int32_t x4                    = SAT_HIGH_MUL(x2, x2);\n    const int32_t one_over_3            = 715827883;\n    const int32_t one_over_8            = 1895147668;\n\n    const int32_t x4_over_4 = DIV_POW2(x4, 2);\n    const int32_t x4_over_4_plus_x3_over_6_plus_x2_over_2 = DIV_POW2(SAT_HIGH_MUL(x4_over_4 + x3, one_over_3) + x2, 1);\n    int32_t result = one_over_8 + SAT_HIGH_MUL(one_over_8, x + x4_over_4_plus_x3_over_6_plus_x2_over_2);\n\n#define SELECT_IF_NON_ZERO(x) {                                   \\\n    mask   = MASK_IF_NON_ZERO(remainder & (1 << shift++));        \\\n    result = 
SELECT_USING_MASK(mask, SAT_HIGH_MUL(result, x), result); \\\n}\n\n    SELECT_IF_NON_ZERO(1672461947)\n    SELECT_IF_NON_ZERO(1302514674)\n    SELECT_IF_NON_ZERO(790015084)\n    SELECT_IF_NON_ZERO(290630308)\n    SELECT_IF_NON_ZERO(39332535)\n    SELECT_IF_NON_ZERO(720401)\n    SELECT_IF_NON_ZERO(242)\n\n#undef SELECT_IF_NON_ZERO\n\n    mask = MASK_IF_ZERO(val);\n    return SELECT_USING_MASK(mask, INT32_MAX, result);\n}"
  },
  {
    "path": "test_app/CMakeLists.txt",
    "content": "# The following lines of boilerplate have to be in your project's\n# CMakeLists in this exact order for cmake to work correctly\ncmake_minimum_required(VERSION 3.5)\n\nset(EXTRA_COMPONENT_DIRS \"../\" \"../tests/\")\nset(IDF_EXCLUDE_COMPONENTS test test_app)\n\ninclude($ENV{IDF_PATH}/tools/cmake/project.cmake)\nproject(test_app)\n"
  },
  {
    "path": "test_app/Makefile",
    "content": "#\n# This is a project Makefile. It is assumed the directory this Makefile resides in is a\n# project subdirectory.\n#\n\nPROJECT_NAME := test_app\n\n# This line has to be included into the make file\n# to include components that are located somewhere\n# but not in \"component\" directory\n\nEXTRA_COMPONENT_DIRS := $(realpath ../)\nEXCLUDE_COMPONENTS := test\n\ninclude $(IDF_PATH)/make/project.mk\n"
  },
  {
    "path": "test_app/main/CMakeLists.txt",
    "content": "\nset(COMPONENT_SRCS \"main.c\")\nset(COMPONENT_ADD_INCLUDEDIRS \"\")\n\nset(COMPONENT_PRIV_REQUIRES tests esp_timer)\n\nregister_component()\n"
  },
  {
    "path": "test_app/main/component.mk",
    "content": "#\n# Main component makefile.\n#\n# This Makefile can be left empty. By default, it will take the sources in the \n# src/ directory, compile them and link them into lib(subdirectory_name).a\n# in the build directory. This behaviour is entirely configurable,\n# please read the ESP-IDF documents if you need to do this.\n# \n"
  },
  {
    "path": "test_app/main/main.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <freertos/FreeRTOS.h>\n#include <freertos/task.h>\n#include <esp_log.h>\n\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <inttypes.h>\n\n#include <test_functions.h>\n#include <esp_timer.h>\n\n\n#if __has_include(\"esp_idf_version.h\")\n#include <esp_idf_version.h>\n#if ESP_IDF_VERSION >= ESP_IDF_VERSION_VAL(5, 0, 0)\n#define esp_cpu_get_ccount esp_cpu_get_cycle_count\n#endif\n#endif\n\nstatic const char *TAG = \"test_app\";\nstatic uint32_t start_c, start_opt, total_c, total_opt;\n\nvoid profile_c_start()\n{\n    /* initiate profiling */\n    start_c = esp_cpu_get_ccount();\n}\n\nuint32_t profile_c_end()\n{\n    /* record profile number */\n    total_c = esp_cpu_get_ccount() - start_c;\n    return total_c;\n}\n\nvoid profile_opt_start()\n{\n    /* initiate profiling */\n    start_opt = esp_cpu_get_ccount();\n}\n\nuint32_t profile_opt_end()\n{\n    /* record profile number */\n    total_opt = esp_cpu_get_ccount() - start_opt;\n    return total_opt;\n}\n\nstatic void print_profile(const char *kernel)\n{\n    float speedup = (total_c > 0 && total_opt > 0) ? 
(float)total_c / (float)total_opt : 0.0f;\n    printf(\"PROFILE: %s, ansi=%\"PRIu32\", opt=%\"PRIu32\", speedup=%.2fx\\n\",\n           kernel, total_c, total_opt, speedup);\n}\n\nvoid app_main()\n{\n    /* s8 tests */\n    ESP_LOGI(TAG, \"Running s8 tests...\");\n    esp_nn_add_elementwise_s8_test();\n    print_profile(\"add_s8\");\n    esp_nn_mul_elementwise_s8_test();\n    print_profile(\"mul_s8\");\n    esp_nn_mul_broadcast_channel_s8_test();\n    print_profile(\"mul_broadcast_ch_s8\");\n    esp_nn_depthwise_conv_s8_test();\n    print_profile(\"depthwise_conv_s8\");\n    esp_nn_conv_s8_test();\n    print_profile(\"conv_s8\");\n    esp_nn_relu6_s8_test();\n    print_profile(\"relu6_s8\");\n    esp_nn_avg_pool_s8_test();\n    print_profile(\"avg_pool_s8\");\n    esp_nn_max_pool_s8_test();\n    print_profile(\"max_pool_s8\");\n    esp_nn_fully_connected_s8_test();\n    print_profile(\"fc_s8\");\n    esp_nn_fully_connected_per_ch_s8_test();\n    print_profile(\"fc_per_ch_s8\");\n    esp_nn_softmax_s8_test();\n    print_profile(\"softmax_s8\");\n    esp_nn_hard_swish_s8_test();\n    print_profile(\"hard_swish_s8\");\n    esp_nn_mean_nhwc_s8_test();\n    print_profile(\"mean_nhwc_s8\");\n    ESP_LOGI(TAG, \"s8 tests done!\\n\");\n\n    /* u8 tests */\n    //ESP_LOGI(TAG, \"Running u8 tests...\");\n    //esp_nn_add_elementwise_u8_test();\n    //esp_nn_depthwise_conv_u8_test();\n    //esp_nn_conv_u8_test();\n    //esp_nn_avg_pool_u8_test();\n    //esp_nn_max_pool_u8_test();\n    //esp_nn_fully_connected_u8_test();\n    //ESP_LOGI(TAG, \"u8 tests done!\\n\");\n}\n"
  },
  {
    "path": "test_app/sdkconfig.defaults",
    "content": "\n#\n# esp-nn\n#\nCONFIG_NN_OPTIMIZED=y\n"
  },
  {
    "path": "test_app/sdkconfig.defaults.esp32p4",
    "content": "# Enables high speed SPIRAM and other options\nCONFIG_IDF_EXPERIMENTAL_FEATURES=y\n\n#\n# ESP System Settings\n#\nCONFIG_ESP_DEFAULT_CPU_FREQ_MHZ=360\nCONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_360=y\n\n#\n# ESP PSRAM\n#\nCONFIG_SPIRAM=y\nCONFIG_SPIRAM_BOOT_INIT=y\nCONFIG_SPIRAM_MODE_HEX=y\nCONFIG_SPIRAM_SPEED_200M=y\nCONFIG_SPIRAM_SPEED=200\nCONFIG_SPIRAM_ALLOW_STACK_EXTERNAL_MEMORY=y\nCONFIG_SPIRAM_ALLOW_BSS_SEG_EXTERNAL_MEMORY=n\nCONFIG_SPIRAM_USE_CAPS_ALLOC=y\nCONFIG_SPIRAM_TRY_ALLOCATE_WIFI_LWIP=y\n\n#\n# GDB Stub\n#\nCONFIG_ESP_GDBSTUB_ENABLED=y\nCONFIG_ESP_SYSTEM_PANIC_GDBSTUB=y\n\n#\n# Heap memory debugging\n#\n# CONFIG_HEAP_POISONING_DISABLED is not set\nCONFIG_HEAP_POISONING_LIGHT=y\n"
  },
  {
    "path": "test_app/sdkconfig.defaults.esp32s3",
    "content": "# Default configurations for ESP32-S3\n\nCONFIG_ESP32S3_DEFAULT_CPU_FREQ_240=y\n# CONFIG_ESP32S3_SPIRAM_SUPPORT is not set\n\nCONFIG_ESP32S3_DATA_CACHE_64KB=y\nCONFIG_ESP32S3_DATA_CACHE_8WAYS=y\nCONFIG_ESP32S3_DATA_CACHE_LINE_64B=y\n"
  },
  {
    "path": "tests/CMakeLists.txt",
    "content": "\nset(COMPONENT_ADD_INCLUDEDIRS ./include/)\nset(COMPONENT_SRCS \"src/basic_math_test.c\"\n                   \"src/convolution_test.c\"\n                   \"src/fully_connected_test.c\"\n                   \"src/pooling_test.c\"\n                   \"src/relu_test.c\"\n                   \"src/softmax_test.c\"\n                   \"src/hard_swish_test.c\"\n                   \"src/mean_test.c\")\n\nset(COMPONENT_REQUIRES )\nset(COMPONENT_PRIV_REQUIRES esp-nn)\n\nregister_component()\n\ntarget_compile_options(${COMPONENT_LIB} PRIVATE -Wno-unused-function)\n"
  },
  {
    "path": "tests/README.md",
    "content": "# Tests for esp_nn library\n\n- Include these test sources in your test framework and run the framework.\n- For IDF tests, please refer to `test_app`.\n"
  },
  {
    "path": "tests/component.mk",
    "content": "#FIXME\n\nCOMPONENT_ADD_INCLUDEDIRS := include/\n\nCOMPONENT_SRCDIRS :=  src/\n"
  },
  {
    "path": "tests/include/test_functions.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n\n/* int8_t ops tests */\nvoid esp_nn_add_elementwise_s8_test();\nvoid esp_nn_mul_elementwise_s8_test();\nvoid esp_nn_mul_broadcast_channel_s8_test();\n\nvoid esp_nn_depthwise_conv_s8_test();\nvoid esp_nn_conv_s8_test();\n\nvoid esp_nn_avg_pool_s8_test();\nvoid esp_nn_max_pool_s8_test();\n\nvoid esp_nn_fully_connected_s8_test();\nvoid esp_nn_fully_connected_per_ch_s8_test();\n\nvoid esp_nn_relu6_s8_test();\n\nvoid esp_nn_softmax_s8_test();\n\nvoid esp_nn_hard_swish_s8_test();\nvoid esp_nn_mean_nhwc_s8_test();\n\n/* uint8_t ops tests */\nvoid esp_nn_add_elementwise_u8_test();\n\nvoid esp_nn_depthwise_conv_u8_test();\nvoid esp_nn_conv_u8_test();\n\nvoid esp_nn_avg_pool_u8_test();\nvoid esp_nn_max_pool_u8_test();\n\nvoid esp_nn_fully_connected_u8_test();\n\n/* instructions test functions */\nvoid compare_instructions_test();\nvoid arith_instructions_test();\nvoid min_max_instructions_test();\nvoid bitwise_instructions_test();\nvoid load_store_instructions_test();\n"
  },
  {
    "path": "tests/include/test_utils.h",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <common_functions.h>\n#include <stdio.h>\n\n/* mult value range */\n#define MULT_MAX    INT32_MAX\n#define MULT_MIN    0\n\n/* shift value range */\n#define SHIFT_MIN   -31\n#define SHIFT_MAX   30\n\n/**\n * @brief callback function to run before C function\n */\nvoid profile_c_start();\n\n/**\n * @brief callback function to run after C function\n *\n * @return uint32_t cycles consumed running C function\n */\nuint32_t profile_c_end();\n\n/**\n * @brief callback function to run before optimized function\n */\nvoid profile_opt_start();\n\n/**\n * @brief callback function to run after optimized function\n *\n * @return uint32_t cycles consumed running optimized function\n */\nuint32_t profile_opt_end();\n\n#define ANSI_COLOR_RED     \"\\x1b[31m\"\n#define ANSI_COLOR_GREEN   \"\\x1b[32m\"\n#define ANSI_COLOR_YELLOW  \"\\x1b[33m\"\n#define ANSI_COLOR_BLUE    \"\\x1b[34m\"\n#define ANSI_COLOR_MAGENTA \"\\x1b[35m\"\n#define ANSI_COLOR_CYAN    \"\\x1b[36m\"\n#define ANSI_COLOR_RESET   \"\\x1b[0m\"\n\n#define CHECK_EQUAL(ARRAY1, ARRAY2, size) ({    \\\n    bool res = true;                            \\\n    for (int _i = 0; _i < size; _i++) {         \\\n        if (ARRAY1[_i] != ARRAY2[_i]) {         \\\n            res = false;                        \\\n            break;                              \\\n        }                                       \\\n    }                                           \\\n    res;                                        \\\n})\n\n#define PRINT_ARRAY_INT(ARRAY, width, height) ({        \\\n    int *_array = (int *) ARRAY;                        \\\n    for (int _j = 0; _j < height; _j++) {               \\\n        for (int _i = 0; _i < width; _i++) {            \\\n            printf(\"%d\\t\", _array[width * _j + _i]);    \\\n        }        
                                       \\\n        printf(\"\\n\");                                   \\\n    }                                                   \\\n    printf(\"\\n\");                                       \\\n})\n\n#define PRINT_ARRAY_HEX(ARRAY, width, height) ({        \\\n    uint8_t *_array = (uint8_t *) ARRAY;                \\\n    for (int _j = 0; _j < height; _j++) {               \\\n        for (int _i = 0; _i < width; _i++) {            \\\n            printf(\"%02x\\t\", _array[width * _j + _i]);  \\\n        }                                               \\\n        printf(\"\\n\");                                   \\\n    }                                                   \\\n    printf(\"\\n\");                                       \\\n})\n\n#define PRINT_ARRAY_INT8(ARRAY, width, height) ({        \\\n    int8_t *_array = (int8_t *) ARRAY;                \\\n    for (int _j = 0; _j < height; _j++) {               \\\n        for (int _i = 0; _i < width; _i++) {            \\\n            printf(\"%4d \", _array[width * _j + _i]);  \\\n        }                                               \\\n        printf(\"\\n\");                                   \\\n    }                                                   \\\n    printf(\"\\n\");                                       \\\n})\n\n#if CONFIG_IDF_CMAKE\n#if ((CONFIG_SPIRAM || CONFIG_SPIRAM_SUPPORT || CONFIG_ESP32S3_SPIRAM_SUPPORT) && \\\n        (CONFIG_SPIRAM_USE_CAPS_ALLOC || CONFIG_SPIRAM_USE_MALLOC))\n#define IDF_HEAP_CAPS 1\n#endif\n#endif\n\n#if IDF_HEAP_CAPS\n#include \"esp_heap_caps.h\"\n/* Try SPIRAM first, fall back to internal RAM */\nstatic inline void *esp_nn_test_alloc(size_t size) {\n    void *ptr = heap_caps_malloc(size, MALLOC_CAP_SPIRAM | MALLOC_CAP_8BIT);\n    if (!ptr) {\n        ptr = heap_caps_malloc(size, MALLOC_CAP_8BIT);\n    }\n    return ptr;\n}\n#define ESP_NN_TEST_ALLOC(SIZE) esp_nn_test_alloc(SIZE)\n#else\n#include <malloc.h>\n#define 
ESP_NN_TEST_ALLOC(SIZE) malloc(SIZE)\n#endif\n"
  },
  {
    "path": "tests/src/basic_math_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2023 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <malloc.h>\n#include <inttypes.h>\n\n#include <common_functions.h>\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\nconst int8_t test_add_in1[] = {\n       13,   26,  -26,   26,  -13,   13,  -13,   13,  -13,   13,  -13,   13,  -26,  -51,  -26,  -51,\n      -26,  -39,  -26,  -39,  -39,  -39,  -26,  -51,  -13,  -13,  -13,  -13,  -26,  -13,  -13,  -13,\n      -13,  -13,    0,  -26,    0,  -13,    0,  -26,  -13,  -26,  -26,  -26,  -26,  -26,  -26,  -26,\n      -13,  -13,    0,  -26,  -13,  -26,  -26,  -26,    0,    0,  -26,  -13,   13,    0,   26,    0,\n       13,    0,   13,    0,    0,    0,   13,    0,   13,   26,  -26,   13,  -26,   13,  -13,   13,\n      -13,   13,  -13,   13,  -26,  -26,  -13,  -26,  -26,  -26,  -26,  -26,  -39,  -26,  -26,  -26,\n      -13,    0,  -13,    0,  -26,    0,  -13,    0,  -13,    0,  -13,  -13,    0,    0,    0,  -13,\n      -13,  -13,  -26,  -13,  -26,  -13,  -13,  -13,  -13,    0,    0,  -13,  -13,  -13,  -13,  -13,\n        0,    0,  -13,    0,   13,   13,   13,    0,    0,    0,   13,   13,    0,    0,   13,   13,\n        0,   26,    0,   13,    0,   13,    0,   13,    0,   13,    0,   13,    0,   13,    0,    0,\n        0,    0,    0,   13,    0,    0,    0,    0,   13,   13,    0,   13,    0,   13,    0,   13,\n        0,   13,    0,   13,   13,   13,    0,   13,    0,   13,    0,   13,    0,   13,   13,   13,\n       13,   13,    0,   13,    0,   13,    0,   13,   13,   13,   13,   13,   13,   13,   13,   13,\n        0,   13,   13,   13,   13,   13,   13,   13,\n};\n\nconst int8_t test_add_in2[] = {\n     -128, -128, -103, -128,  -77, -128,  -52, -128,  -26, -128,   -1, -128, -128, -103, -103, -103,\n      -77, -103,  -52, -103,  -26, -103,   -1, -103, -128,  -77, -103,  -77,  -77,  -77,  
-52,  -77,\n      -26,  -77,   -1,  -77, -128,  -52, -103,  -52,  -77,  -52,  -52,  -52,  -26,  -52,   -1,  -52,\n     -128,  -26, -103,  -26,  -77,  -26,  -52,  -26,  -26,  -26,   -1,  -26, -128,   -1, -103,   -1,\n      -77,   -1,  -52,   -1,  -26,   -1,   -1,   -1, -128, -128, -103, -128,  -77, -128,  -52, -128,\n      -26, -128,   -1, -128, -128, -103, -103, -103,  -77, -103,  -52, -103,  -26, -103,   -1, -103,\n     -128,  -77, -103,  -77,  -77,  -77,  -52,  -77,  -26,  -77,   -1,  -77, -128,  -52, -103,  -52,\n      -77,  -52,  -52,  -52,  -26,  -52,   -1,  -52, -128,  -26, -103,  -26,  -77,  -26,  -52,  -26,\n      -26,  -26,   -1,  -26, -128,   -1, -103,   -1,  -77,   -1,  -52,   -1,  -26,   -1,   -1,   -1,\n     -128, -128, -103, -128,  -77, -128,  -52, -128,  -26, -128,   -1, -128, -128, -103, -103, -103,\n      -77, -103,  -52, -103,  -26, -103,   -1, -103, -128,  -77, -103,  -77,  -77,  -77,  -52,  -77,\n      -26,  -77,   -1,  -77, -128,  -52, -103,  -52,  -77,  -52,  -52,  -52,  -26,  -52,   -1,  -52,\n     -128,  -26, -103,  -26,  -77,  -26,  -52,  -26,  -26,  -26,   -1,  -26, -128,   -1, -103,   -1,\n      -77,   -1,  -52,   -1,  -26,   -1,   -1,   -1,\n};\n\nvoid esp_nn_add_elementwise_s8_test()\n{\n    /* prepare data */\n    int size = 1600 + 8 + 7; /* odd len to test leftover */\n    int8_t *input1;\n    int8_t *input2;\n    int8_t *out_data_c;\n    int8_t *out_data_opt;\n    int8_t *input1_orig = NULL;\n    int8_t *input2_orig = NULL;\n    int8_t *out_c_orig = NULL;\n    int8_t *out_opt_orig = NULL;\n    int32_t input1_offset = 34;\n    int32_t input2_offset = 35;\n    int32_t output_offset = 36;\n    int32_t input1_shift = -8; // right_shift amt always <= 0\n    int32_t input2_shift = -8; // right_shift amt always <= 0\n    int32_t output_shift = -9; // right_shift amt always <= 0\n    int32_t left_shift = 15; // always +ve\n    int32_t input1_mult = INT32_MAX;\n    int32_t input2_mult = INT32_MAX;\n    int32_t output_mult = INT32_MAX;\n    
int32_t activation_min = -128;\n    int32_t activation_max = 127;\n\n    for (int itr = 0; itr < 10; itr++) {\n        switch (itr) {\n        case 0: // all zeros\n            input1_offset = 0;\n            input2_offset = 0;\n            output_offset = 0;\n            input1_mult = 0;\n            input2_mult = 0;\n            output_mult = 0;\n            input1_shift = 0;\n            input2_shift = 0;\n            output_shift = 0;\n            left_shift = 0;\n        break;\n        case 1: // hit min\n            input1_offset = -127;\n            input2_offset = -127;\n            output_offset = -128;\n            input1_mult = MULT_MIN;\n            input2_mult = MULT_MIN;\n            output_mult = MULT_MIN;\n            input1_shift = 0;\n            input2_shift = 0;\n            output_shift = 0;\n            left_shift = 0;\n        break;\n        case 2: // hit max\n            input1_offset = 128;\n            input2_offset = 128;\n            output_offset = -127;\n            input1_mult = MULT_MAX;\n            input2_mult = MULT_MAX;\n            output_mult = MULT_MAX;\n            input1_shift = SHIFT_MIN;\n            input2_shift = SHIFT_MIN;\n            output_shift = SHIFT_MIN;\n            left_shift = 30 - 8; // since input is 8 bits\n        break;\n        case 3: // hit extreme max\n            input1_offset = 128;\n            input2_offset = 128;\n            output_offset = -127;\n            input1_mult = MULT_MAX;\n            input2_mult = MULT_MAX;\n            output_mult = MULT_MAX;\n            input1_shift = 0;\n            input2_shift = 0;\n            output_shift = 0;\n            left_shift = 30 - 8; // -8 since input is 8 bit\n        break;\n        case 4: // from yolo model\n            input1_offset = 64;\n            input2_offset = 128;\n            output_offset = -128;\n            input1_mult = 1705397815;\n            input2_mult = 1073741824;\n            output_mult = 1756091225;\n            
input1_shift = -3;\n            input2_shift = 0;\n            output_shift = -19;\n            left_shift = 20;\n            size = 216;\n        break;\n        default:  // practical random input\n            input1_offset = rand() % 256 - 127; // range [-127, 128]\n            input2_offset = rand() % 256 - 127; // range [-127, 128]\n            output_offset = rand() % 256 - 128; // range [-128, 127]\n            input1_mult = MULT_MAX / 2 + rand() % INT16_MAX;\n            input2_mult = MULT_MAX / 2 + rand() % INT16_MAX;\n            output_mult = MULT_MAX / 2 + rand() % INT16_MAX;\n            input1_shift = -8 + rand() % 4;\n            input2_shift = -8 + rand() % 4;\n            output_shift = -8 + rand() % 4;\n            left_shift = rand() % 15;\n        }\n\n        input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n\n        if (input1_orig == NULL || input2_orig == NULL ||\n                out_c_orig == NULL || out_opt_orig == NULL) {\n            printf(ANSI_COLOR_RED\"%s error allocating buffers\\n\"ANSI_COLOR_RESET, __FUNCTION__);\n            goto elementwise_add_test_cleanup;\n        }\n\n        input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15);\n        input2 = (int8_t *) (((uint32_t) input2_orig + 15) & ~15);\n        if (itr == 4) {\n            input2 = input2_orig; // unaligned input\n        }\n        out_data_c = (int8_t *) (((uint32_t)out_c_orig + 15) & ~15);\n        out_data_opt = (int8_t *) (((uint32_t)out_opt_orig + 15) & ~15);\n\n\n        if (itr == 4) {\n            memcpy(input1, test_add_in1, size);\n            memcpy(input2, test_add_in2, size);\n        } else {\n            for (int i = 0; i < size; ++i) {\n                input1[i] = rand() % 256 - 128;\n                input2[i] = rand() % 256 - 128;\n            }\n 
       }\n\n        if (itr == 0) {\n            /* enable profiler */\n            profile_c_start();\n        }\n        /* C function */\n        esp_nn_add_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset,\n                                       input1_mult, input2_mult, input1_shift, input2_shift,\n                                       left_shift, out_data_c, output_offset, output_mult,\n                                       output_shift, activation_min, activation_max, size);\n\n        if (itr == 0) {\n            profile_c_end();\n            profile_opt_start();\n        }\n\n        /* Optimized function */\n        esp_nn_add_elementwise_s8(input1, input2, input1_offset, input2_offset,\n                                  input1_mult, input2_mult, input1_shift, input2_shift,\n                                  left_shift, out_data_opt, output_offset, output_mult,\n                                  output_shift, activation_min, activation_max, size);\n        if (itr == 0) {\n            /* disable profiler */\n            profile_opt_end();\n        }\n\n        bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size);\n        if (ret == false) {\n            printf(ANSI_COLOR_RED\"%s[%d] failed\\n\"ANSI_COLOR_RESET, __FUNCTION__, itr);\n            printf(\"Output: \\n\");\n            PRINT_ARRAY_INT8(out_data_opt, size, 1);\n            printf(\"Expected: \\n\");\n            PRINT_ARRAY_INT8(out_data_c, size, 1);\n            printf(\"Input1:\\n\");\n            PRINT_ARRAY_INT8(input1, size, 1);\n            printf(\"Input2:\\n\");\n            PRINT_ARRAY_INT8(input2, size, 1);\n            printf(\"in1_shift %\"PRIi32\", in2_shift %\"PRIi32\", left_shift %\"PRIi32\", out_shift %\"PRIi32\"\\n\",\n                   input1_shift, input2_shift, left_shift, output_shift);\n            printf(\"in1_mult %\"PRIi32\", in2_mult %\"PRIi32\", out_mult %\"PRIi32\"\\n\",\n                   input1_mult, input2_mult, output_mult);\n            
goto elementwise_add_test_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"%s[%d] passed\\n\"ANSI_COLOR_RESET, __FUNCTION__, itr);\n\nelementwise_add_test_cleanup:\n        if (input1_orig) {\n            free(input1_orig);\n        }\n        if (input2_orig) {\n            free(input2_orig);\n        }\n        if (out_c_orig) {\n            free(out_c_orig);\n        }\n        if (out_opt_orig) {\n            free(out_opt_orig);\n        }\n    }\n}\n\nvoid esp_nn_mul_elementwise_s8_test()\n{\n    /* prepare data */\n    int size = 1600 + 8 + 7; /* odd len to test leftover */\n    int8_t *input1;\n    int8_t *input2;\n    int8_t *out_data_c;\n    int8_t *out_data_opt;\n    int32_t input1_offset = 34;\n    int32_t input2_offset = 35;\n    int32_t output_offset = 36;\n    int32_t output_shift = -7;\n    int32_t output_mult = MULT_MAX; // max out_mult\n    int32_t activation_min = -128;\n    int32_t activation_max = 127;\n    int8_t *input1_orig = NULL;\n    int8_t *input2_orig = NULL;\n    int8_t *out_c_orig = NULL;\n    int8_t *out_opt_orig = NULL;\n\n    for (int itr = 0; itr < 10; itr++) {\n        switch (itr) {\n        case 0: // all zeros\n            input1_offset = 0;\n            input2_offset = 0;\n            output_offset = 0;\n            output_mult = 0;\n            output_shift = 0;\n        break;\n        case 1: // hit min\n            input1_offset = -127;\n            input2_offset = -127;\n            output_offset = -128;\n            output_mult = MULT_MIN;\n            output_shift = 0;\n        break;\n        case 2: // hit max\n            input1_offset = 128;\n            input2_offset = 128;\n            output_offset = -127;\n            output_mult = MULT_MAX;\n            output_shift = SHIFT_MIN;\n        break;\n        case 3: // hit extreme max\n            input1_offset = 128;\n            input2_offset = 128;\n            output_offset = -127;\n            output_mult = MULT_MAX;\n            output_shift = 0;\n        
break;\n        default:  // practical random input\n            input1_offset = rand() % 256 - 127; // range [-127, 128]\n            input2_offset = rand() % 256 - 127; // range [-127, 128]\n            output_offset = rand() % 256 - 128; // range [-128, 127]\n            output_mult = MULT_MAX / 2 + rand() % INT16_MAX;\n            output_shift = -8 + rand() % 4;\n            size = 4 + rand() % 64;\n        }\n\n        input1_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n\n        if (input1_orig == NULL || input2_orig == NULL ||\n                out_c_orig == NULL || out_opt_orig == NULL) {\n            printf(ANSI_COLOR_RED\"%s error allocating buffers\\n\"ANSI_COLOR_RESET, __FUNCTION__);\n            goto elementwise_mult_test_cleanup;\n        }\n\n        input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15);\n        input2 = (int8_t *) (((uint32_t) input2_orig + 15) & ~15);\n        if (itr == 4 || itr == 5) {\n            input2 = input2_orig; // unaligned input\n        }\n\n        out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n        out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n        for (int i = 0; i < size; ++i) {\n            input1[i] = rand() % 256 - 128;\n            input2[i] = rand() % 256 - 128;\n        }\n\n        if (itr == 0) {\n            /* enable profiler */\n            profile_c_start();\n        }\n        /* C function */\n        esp_nn_mul_elementwise_s8_ansi(input1, input2, input1_offset, input2_offset,\n                                       out_data_c, output_offset, output_mult, output_shift,\n                                       activation_min, activation_max, size);\n\n        if (itr == 0) {\n            profile_c_end();\n            profile_opt_start();\n        }\n        /* 
Optimized function */\n        esp_nn_mul_elementwise_s8(input1, input2, input1_offset, input2_offset,\n                                  out_data_opt, output_offset, output_mult, output_shift,\n                                  activation_min, activation_max, size);\n\n        if (itr == 0) {\n            /* disable profiler */\n            profile_opt_end();\n        }\n\n        bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size);\n        if (ret == false) {\n            printf(ANSI_COLOR_RED\"%s[%d] failed\\n\"ANSI_COLOR_RESET, __FUNCTION__, itr);\n            printf(\"Output: \\n\");\n            PRINT_ARRAY_HEX(out_data_opt, size, 1);\n            printf(\"Expected: \\n\");\n            PRINT_ARRAY_HEX(out_data_c, size, 1);\n            printf(\"Input1:\\n\");\n            PRINT_ARRAY_HEX(input1, size, 1);\n            printf(\"Input2:\\n\");\n            PRINT_ARRAY_HEX(input2, size, 1);\n            goto elementwise_mult_test_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"%s[%d] passed\\n\"ANSI_COLOR_RESET, __FUNCTION__, itr);\n\nelementwise_mult_test_cleanup:\n        if (input1_orig) {\n            free(input1_orig);\n        }\n        if (input2_orig) {\n            free(input2_orig);\n        }\n        if (out_c_orig) {\n            free(out_c_orig);\n        }\n        if (out_opt_orig) {\n            free(out_opt_orig);\n        }\n    }\n}\n\nvoid esp_nn_mul_broadcast_channel_s8_test()\n{\n    int total_spatial = 49; /* 7x7 feature map */\n    int channels = 64;\n    int8_t *input1;\n    int8_t *input2_per_ch;\n    int8_t *out_data_c;\n    int8_t *out_data_opt;\n    int8_t *input1_orig = NULL;\n    int8_t *input2_orig = NULL;\n    int8_t *out_c_orig = NULL;\n    int8_t *out_opt_orig = NULL;\n    int32_t input1_offset = 34;\n    int32_t input2_offset = 35;\n    int32_t output_offset = 36;\n    int32_t output_shift = -7;\n    int32_t output_mult = MULT_MAX;\n    int32_t activation_min = -128;\n    int32_t activation_max = 127;\n\n    for 
(int itr = 0; itr < 10; itr++) {\n        switch (itr) {\n        case 0: // all zeros\n            input1_offset = 0;\n            input2_offset = 0;\n            output_offset = 0;\n            output_mult = 0;\n            output_shift = 0;\n            total_spatial = 49;\n            channels = 64;\n        break;\n        case 1: // hit min\n            input1_offset = -127;\n            input2_offset = -127;\n            output_offset = -128;\n            output_mult = MULT_MIN;\n            output_shift = 0;\n        break;\n        case 2: // hit max\n            input1_offset = 128;\n            input2_offset = 128;\n            output_offset = -127;\n            output_mult = MULT_MAX;\n            output_shift = SHIFT_MIN;\n        break;\n        case 3: // small channels (leftover only, no SIMD)\n            input1_offset = 64;\n            input2_offset = 32;\n            output_offset = -10;\n            output_mult = MULT_MAX / 2;\n            output_shift = -5;\n            total_spatial = 16;\n            channels = 5;\n        break;\n        case 4: // unaligned channels (SIMD + leftover)\n            total_spatial = 14;\n            channels = 19;\n        break;\n        case 5: // typical SE-block: 7x7 spatial, 96 channels\n            input1_offset = 128;\n            input2_offset = 128;\n            output_offset = -128;\n            output_mult = 1705397815;\n            output_shift = -3;\n            total_spatial = 49;\n            channels = 96;\n        break;\n        default: // random\n            input1_offset = rand() % 256 - 127;\n            input2_offset = rand() % 256 - 127;\n            output_offset = rand() % 256 - 128;\n            output_mult = MULT_MAX / 2 + rand() % INT16_MAX;\n            output_shift = -8 + rand() % 4;\n            total_spatial = 4 + rand() % 64;\n            channels = 8 + rand() % 128;\n        }\n\n        int size = total_spatial * channels;\n        input1_orig = (int8_t *) 
ESP_NN_TEST_ALLOC(size + 16);\n        input2_orig = (int8_t *) ESP_NN_TEST_ALLOC(channels + 16);\n        out_c_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n        out_opt_orig = (int8_t *) ESP_NN_TEST_ALLOC(size + 16);\n\n        if (input1_orig == NULL || input2_orig == NULL ||\n                out_c_orig == NULL || out_opt_orig == NULL) {\n            printf(ANSI_COLOR_RED\"%s error allocating buffers\\n\"ANSI_COLOR_RESET, __FUNCTION__);\n            goto broadcast_mul_test_cleanup;\n        }\n\n        input1 = (int8_t *) (((uint32_t) input1_orig + 15) & ~15);\n        input2_per_ch = (int8_t *) (((uint32_t) input2_orig + 15) & ~15);\n        out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n        out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n        if (itr == 4) {\n            input1 = input1_orig; // unaligned input\n        }\n\n        for (int i = 0; i < size; ++i) {\n            input1[i] = rand() % 256 - 128;\n        }\n        for (int i = 0; i < channels; ++i) {\n            input2_per_ch[i] = rand() % 256 - 128;\n        }\n\n        if (itr == 0) {\n            profile_c_start();\n        }\n        /* C reference */\n        esp_nn_mul_broadcast_channel_s8_ansi(input1, input2_per_ch,\n                                             input1_offset, input2_offset,\n                                             out_data_c, output_offset,\n                                             output_mult, output_shift,\n                                             activation_min, activation_max,\n                                             total_spatial, channels);\n        if (itr == 0) {\n            profile_c_end();\n            profile_opt_start();\n        }\n        /* Optimized function */\n        esp_nn_mul_broadcast_channel_s8(input1, input2_per_ch,\n                                        input1_offset, input2_offset,\n                                        out_data_opt, output_offset,\n                      
                  output_mult, output_shift,\n                                        activation_min, activation_max,\n                                        total_spatial, channels);\n        if (itr == 0) {\n            profile_opt_end();\n        }\n\n        bool ret = CHECK_EQUAL(out_data_c, out_data_opt, size);\n        if (ret == false) {\n            printf(ANSI_COLOR_RED\"%s[%d] failed\\n\"ANSI_COLOR_RESET, __FUNCTION__, itr);\n            printf(\"spatial=%d channels=%d size=%d\\n\", total_spatial, channels, size);\n            for (int idx = 0; idx < size; idx++) {\n                if (out_data_c[idx] != out_data_opt[idx]) {\n                    printf(\"first mismatch at idx=%d (row=%d ch=%d): got %02x exp %02x\\n\",\n                           idx, idx / channels, idx % channels,\n                           (uint8_t)out_data_opt[idx], (uint8_t)out_data_c[idx]);\n                    // print 8 more mismatches\n                    int cnt = 0;\n                    for (int j = idx + 1; j < size && cnt < 8; j++) {\n                        if (out_data_c[j] != out_data_opt[j]) {\n                            printf(\"  mismatch at idx=%d (row=%d ch=%d): got %02x exp %02x\\n\",\n                                   j, j / channels, j % channels,\n                                   (uint8_t)out_data_opt[j], (uint8_t)out_data_c[j]);\n                            cnt++;\n                        }\n                    }\n                    break;\n                }\n            }\n            goto broadcast_mul_test_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"%s[%d] passed\\n\"ANSI_COLOR_RESET, __FUNCTION__, itr);\n\nbroadcast_mul_test_cleanup:\n        if (input1_orig) {\n            free(input1_orig);\n        }\n        if (input2_orig) {\n            free(input2_orig);\n        }\n        if (out_c_orig) {\n            free(out_c_orig);\n        }\n        if (out_opt_orig) {\n            free(out_opt_orig);\n        }\n    }\n}\n"
  },
  {
    "path": "tests/src/convolution_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <string.h>\n#include <malloc.h>\n#include <inttypes.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\n/**\n * Checks esp_nn_depthwise_conv_s8() against esp_nn_depthwise_conv_s8_ansi().\n *\n * Runs 17 iterations: 0-9, 15 and 16 use fixed shapes, the others take the\n * default case with a random stride/padding. Every iteration fills the input,\n * filter, bias and quantisation arrays with rand(), runs both implementations\n * on the same inputs, compares the outputs and prints the result together\n * with the profiler counts. All buffers are freed before the next iteration.\n */\nvoid esp_nn_depthwise_conv_s8_test()\n{\n    uint32_t total_c = 0, total_opt = 0;\n    int8_t *input = NULL, *filter_data = NULL;\n    int8_t *out_data_c = NULL, *out_data_opt = NULL;\n    int32_t *bias = NULL;\n    int32_t input_offset = 5; /* some number in [-128, 127] */\n    int32_t out_offset = 7;\n    int32_t activation_min = -125;\n    int32_t activation_max = 120;\n    void *scratch_buf = NULL;\n\n    /* independent variables */\n    int input_wd, input_ht, channels;\n    uint16_t filter_ht, filter_wd, ch_mult, out_wd, out_ht;\n    uint16_t pad_wd, pad_ht, stride_wd, stride_ht;\n\n    printf(\"\\n######## Running %s ##########\\n\", __FUNCTION__);\n    // run for 17 iterations\n    for (int itr = 0; itr < 17; itr++) {\n        /* prepare data */\n        switch (itr) {\n        case 0: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0)\n            input_wd = 18;\n            input_ht = 18;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 16;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 1: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (1,1)\n            input_wd = 10;\n            input_ht = 10;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 16;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 2: // (ch_mult 1, (channels % 8) = 0), filter (3,3), pad (1,1)\n            input_wd = 10;\n            input_ht = 10;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 24;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 3: // other filter sizes (ch_mult 1, (channels % 8) = 0)\n            input_wd = 10;\n            input_ht = 10;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 24;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 4: // other filter sizes (ch_mult 8 = 0)\n            input_wd = 6;\n            input_ht = 6;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 8;\n            channels = 4;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 5: // other filter sizes (ch_mult 8 = 0)\n            input_wd = 12;\n            input_ht = 12;\n            filter_ht = 5;\n            filter_wd = 5;\n            ch_mult = 8;\n            channels = 4;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 6: // other filter sizes (ch_mult 4 = 0)\n            input_wd = 6;\n            input_ht = 6;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 4;\n            channels = 4;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 7: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0)  stride (2,2)\n            input_wd = 6;\n            input_ht = 6;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 16;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 8: // same as case 7, with large parameters (reduced for non-PSRAM boards)\n            input_wd = 28;\n            input_ht = 28;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 64;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 9: // (ch_mult 1, (channels % 16) = 0), filter (3,3), pad (0,0)  stride (2,2)\n            input_wd = 6;\n            input_ht = 6;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 16;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 15: // ch=8, 3x3, pad=1 (person_detection model layer, ch<12 path)\n            input_wd = 48;\n            input_ht = 48;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 8;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 16: // ch=8, 3x3, pad=0, stride=2 (another ch<12 variant)\n            input_wd = 12;\n            input_ht = 12;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 8;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        default:\n            input_wd = 6;\n            input_ht = 6;\n            filter_ht = 3;\n            filter_wd = 3;\n            ch_mult = 1;\n            channels = 16;\n            stride_wd = rand() % 2 + 1;\n            stride_ht = stride_wd;\n            pad_wd = stride_wd == 1 ? 0 : rand() % 2;\n            pad_ht = pad_wd;\n            break;\n        }\n\n        /* prepare data */\n        if (pad_wd) {\n            out_wd = (input_wd + stride_wd - 1) / stride_wd;\n        } else {\n            out_wd = (input_wd + stride_wd - filter_wd) / stride_wd;\n        }\n        if (pad_ht) {\n            out_ht = (input_ht + stride_ht - 1) / stride_ht;\n        } else {\n            out_ht = (input_ht + stride_ht - filter_ht) / stride_ht;\n        }\n\n        // if (itr == 9) {\n            // expect the function to handle this gracefully\n            // out_wd += 1;\n            // out_ht += 1;\n        // }\n        int in_size = input_wd * input_ht * channels;\n        int out_size = out_wd * out_ht * channels * ch_mult;\n        /* 4 extra filter bytes and 1 extra bias word: the kernels below are called\n         * with filter_data + 4 and bias + 1 */\n        int filter_size = filter_wd * filter_ht * channels * ch_mult + 4;\n        int bias_size = channels * ch_mult + 1;\n        int32_t out_shift[channels * ch_mult];\n        int32_t out_mult[channels * ch_mult];\n\n        int8_t *input_orig = ESP_NN_TEST_ALLOC(in_size + 16);\n        int8_t *out_c_orig = ESP_NN_TEST_ALLOC(out_size + 16);\n        int8_t *out_opt_orig = ESP_NN_TEST_ALLOC(out_size + 16);\n        filter_data = ESP_NN_TEST_ALLOC(filter_size);\n        bias = ESP_NN_TEST_ALLOC(bias_size * 4);\n\n        if (bias == NULL || input_orig == NULL || filter_data == NULL ||\n                out_c_orig == NULL || out_opt_orig == NULL) {\n            printf(ANSI_COLOR_RED\"[%d] allocations failed\\n\"ANSI_COLOR_RESET, itr);\n            goto dc_s8_cleanup;\n        }\n\n        input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);\n        out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n        out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n        /* Generate input data */\n        for (int i = 0; i < in_size; ++i) {\n            input[i] = rand() % 128;\n        }\n\n        /* Generate filter data */\n        for (int i = 0; i < filter_size; ++i) {\n            filter_data[i] = rand() % 256 - 128;\n        }\n\n        /* Generate bias data */\n        for (int i = 0; i < channels * ch_mult; ++i) {\n            bias[i + 1] = rand() % INT16_MAX; //0th index left for unalignment\n            out_shift[i] = -8 + rand() % 3;\n            out_mult[i] = 0x7eb0e200 + rand() % 50;\n        }\n\n        data_dims_t input_dims = {.width = input_wd, .height = input_ht, .channels = channels, 1};\n        data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = channels * ch_mult, 1};\n        data_dims_t filter_dims = {.width = filter_wd, .height = filter_ht, 0, 0};\n        dw_conv_params_t conv_params = {.in_offset = input_offset, .out_offset = out_offset, .ch_mult = ch_mult,\n                                        .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht},\n                                        .dilation = {0, 0}, .activation = {activation_min, activation_max}};\n        quant_data_t quant_data = {.shift = out_shift, .mult = out_mult};\n\n        int scratch_buf_size = esp_nn_get_depthwise_conv_scratch_size(&input_dims, &filter_dims,\n                                                                      &output_dims, &conv_params);\n        if (scratch_buf_size > 0) {\n            scratch_buf = ESP_NN_TEST_ALLOC(scratch_buf_size + 16);\n            if (scratch_buf == NULL) {\n                printf(ANSI_COLOR_RED\"[%d] scratch_buf alloc failed size %d\\n\"ANSI_COLOR_RESET,\n                       itr, scratch_buf_size);\n                goto dc_s8_cleanup;\n            }\n            int align_sz = 16 - (((int32_t) scratch_buf) & 0xf);\n            esp_nn_set_depthwise_conv_scratch_buf(scratch_buf + align_sz);\n        }\n\n        /* enable profiler */\n        profile_c_start();\n\n        /* C function */\n        esp_nn_depthwise_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data + 4,\n                                      bias + 1, &output_dims, out_data_c, &conv_params, &quant_data);\n\n        total_c = profile_c_end();\n        profile_opt_start();\n\n        /* Optimized function */\n        esp_nn_depthwise_conv_s8(&input_dims, input, &filter_dims, filter_data + 4,\n                                 bias + 1, &output_dims, out_data_opt, &conv_params, &quant_data);\n\n        /* disable profiler */\n        total_opt = profile_opt_end();\n\n        bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size);\n        if (ret == false) {\n        printf(ANSI_COLOR_RED\"[%3d] failed [pad: (%d, %d), stride: (%d, %d)\"\n               \" out: (%3d,%3d), filter: (%d, %d,%3d), ch_mult %d]\\n\"ANSI_COLOR_RESET,\n               itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht,\n               filter_wd, filter_ht, channels, ch_mult);\n#if 0\n            printf(\"Output: \\n\");\n            PRINT_ARRAY_HEX(out_data_opt, out_size / out_ht, out_ht);\n            printf(\"Expected: \\n\");\n            PRINT_ARRAY_HEX(out_data_c, out_size / out_ht, out_ht);\n            printf(\"Input:\\n\");\n            PRINT_ARRAY_HEX(input, in_size / input_ht, input_ht);\n            printf(\"Filter data:\\n\");\n            PRINT_ARRAY_HEX(filter_data + 4, (filter_size - 4) / filter_ht, filter_ht);\n            printf(\"bias data:\\n\");\n            PRINT_ARRAY_INT(bias + 1, ch_mult * channels, 1);\n#endif\n            goto dc_s8_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"[%3d] passed [pad: (%d, %d), stride: (%d, %d)\"\n               \" out: (%3d,%3d), filter: (%d, %d,%3d), ch_mult %d]\"ANSI_COLOR_RESET,\n               itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd,\n               out_ht, filter_wd, filter_ht, channels, ch_mult);\n        printf(\"\\tcycles: c %8\"PRIu32\", opt %8\"PRIu32\"\\n\", total_c, total_opt);\n\n    dc_s8_cleanup:\n        /* Pointers declared outside the loop are reset after free(): scratch_buf is\n         * only re-assigned when a scratch buffer is needed, so a stale value would\n         * otherwise be freed again by a later iteration. */\n        if (input_orig) {\n            free(input_orig);\n        }\n        if (filter_data) {\n            free(filter_data);\n            filter_data = NULL;\n        }\n        if (out_c_orig) {\n            free(out_c_orig);\n        }\n        if (out_opt_orig) {\n            free(out_opt_orig);\n        }\n        if (bias) {\n            free(bias);\n            bias = NULL;\n        }\n        if (scratch_buf) {\n            free(scratch_buf);\n            scratch_buf = NULL;\n        }\n    }\n}\n\n/**\n * Checks esp_nn_conv_s8() against esp_nn_conv_s8_ansi().\n *\n * Runs 18 iterations: 0-11 and 15-17 use fixed shapes, the others take the\n * default 1x1 case. Iterations 15-17 override the quantisation parameters,\n * iteration 16 overwrites the start of the random data with fixed tables and\n * iteration 17 passes the filter at a 5-byte offset with out_shift forced to\n * -6. Every iteration runs both implementations on the same inputs, compares\n * the outputs, prints the result and frees all buffers.\n */\nvoid esp_nn_conv_s8_test()\n{\n    uint32_t total_c = 0, total_opt = 0;\n    int32_t input_offset = 5; /* some number in [-128, 127] */\n    int32_t activation_min = -125;\n    int32_t activation_max = 122;\n    int32_t out_offset = 3;\n\n    void *scratch_buf = NULL;\n    int8_t *input_orig = NULL;\n    int8_t *out_c_orig = NULL;\n    int8_t *out_opt_orig = NULL;\n    int8_t *filter_data = NULL;\n    int32_t *bias = NULL;\n    int32_t *out_shift = NULL;\n    int32_t *out_mult = NULL;\n\n    /* independent variable */\n    int in_wd, in_ht, in_channels, out_channels;\n    uint16_t filter_ht, filter_wd, out_wd, out_ht;\n    uint16_t pad_wd, pad_ht, stride_wd, stride_ht;\n\n    printf(\"\\n######## Running %s ##########\\n\", __FUNCTION__);\n    for (int itr = 0; itr < 18; itr++) {\n        /* Reset quant params to defaults each iteration */\n        input_offset = 5;\n        out_offset = 3;\n        activation_min = -125;\n        activation_max = 122;\n\n        switch (itr) {\n        case 0: // ch % 8 == 0 && filter (1,1), padding (0,0)\n            in_wd = 10;\n            in_ht = 10;\n            in_channels = 64;\n            out_channels = 64;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 1: // ch % 4 == 0 && (in_wd * in_ht) % 16 == 0\n            in_wd = 4;\n            in_ht = 4;\n            in_channels = 20;\n            out_channels = 8;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 2: // ch, filter (3x3x3)\n            in_wd = 10;\n            in_ht = 10;\n            in_channels = 3;\n            out_channels = 64;\n            filter_ht = 3;\n            filter_wd = 3;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 3: // remaining pad (0, 0)\n            in_wd = 10;\n            in_ht = 10;\n            in_channels = 3;\n            out_channels = 64;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 4: // unopt case\n            in_wd = 10;\n            in_ht = 10;\n            in_channels = 12;\n            out_channels = 64;\n            filter_ht = 3;\n            filter_wd = 3;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 5: // ch % 8 == 0 & stride (2,2)\n            in_wd = 16;\n            in_ht = 16;\n            in_channels = 16;\n            out_channels = 16;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 6: // ch % 8 == 0 && filter (1,1), padding (0,0)\n            in_wd = 2;\n            in_ht = 2;\n            in_channels = 8;\n            out_channels = 8;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 7: // ch == 3, pad (0, 0)\n            in_wd = 112;\n            in_ht = 112;\n            in_channels = 3;\n            out_channels = 16;\n            filter_ht = 6;\n            filter_wd = 6;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 8: // ch == 5, remaining pad (0, 0)\n            in_wd = 8;\n            in_ht = 8;\n            in_channels = 5;\n            out_channels = 16;\n            filter_ht = 6;\n            filter_wd = 6;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 9: //\n            in_wd = 3;\n            in_ht = 3;\n            in_channels = 32;\n            out_channels = 1;\n            filter_ht = 3;\n            filter_wd = 3;\n            pad_wd = 1;\n            pad_ht = 1;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        case 10: // needs right and bottom padding\n            in_wd = 4;\n            in_ht = 8;\n            in_channels = 1;\n            out_channels = 3;\n            filter_ht = 3;\n            filter_wd = 3;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 11: // needs right and bottom padding\n            in_wd = 4;\n            in_ht = 8;\n            in_channels = 3;\n            out_channels = 4;\n            filter_ht = 3;\n            filter_wd = 3;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 2;\n            stride_ht = 2;\n            break;\n        case 15: // 1x1 conv, large spatial, YOLO-like quant params\n            in_wd = 48;\n            in_ht = 48;\n            in_channels = 32;\n            out_channels = 32;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            // Override quant params to match YOLO Op[8]\n            input_offset = 127;\n            out_offset = 39;\n            break;\n        case 16: // 1x1, YOLO exact data: 48x48x32->32 with real filter/bias/quant\n            in_wd = 48;\n            in_ht = 48;\n            in_channels = 32;\n            out_channels = 32;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            input_offset = 127;\n            out_offset = 39;\n            activation_min = -128;\n            activation_max = 127;\n            break;\n        case 17: // 1x1 conv with DELIBERATELY UNALIGNED filter + small out_shift\n            // Tests both alignment (filter+5) AND transpose correctness (shift=-6 won't mask 8x error)\n            in_wd = 24;\n            in_ht = 24;\n            in_channels = 32;\n            out_channels = 32;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            input_offset = 110; /* typical YOLO value that exposed the bug */\n            out_offset = 39;\n            activation_min = -128;\n            activation_max = 127;\n            break;\n        default: // ch % 8 == 0\n            in_wd = 8;\n            in_ht = 8;\n            in_channels = 16;\n            out_channels = 16;\n            filter_ht = 1;\n            filter_wd = 1;\n            pad_wd = 0;\n            pad_ht = 0;\n            stride_wd = 1;\n            stride_ht = 1;\n            break;\n        }\n\n        int8_t *filter_data_orig_save = NULL; /* for case 17 unaligned filter restore */\n\n        /* prepare data */\n        if (pad_wd) {\n            out_wd = (in_wd + stride_wd - 1) / stride_wd;\n        } else {\n            out_wd = (in_wd + stride_wd - filter_wd) / stride_wd;\n        }\n        if (pad_ht) {\n            out_ht = (in_ht + stride_ht - 1) / stride_ht;\n        } else {\n            out_ht = (in_ht + stride_ht - filter_ht) / stride_ht;\n        }\n\n        int in_size = in_wd * in_ht * in_channels;\n        int filter_size = filter_wd * filter_ht * in_channels * out_channels + 2;\n        int out_size = out_wd * out_ht * out_channels;\n\n        input_orig = ESP_NN_TEST_ALLOC(in_size + 16);\n        out_c_orig = ESP_NN_TEST_ALLOC(out_size + 16);\n        out_opt_orig = ESP_NN_TEST_ALLOC(out_size + 16);\n        filter_data = ESP_NN_TEST_ALLOC(filter_size + 16);\n        bias = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels);\n        out_shift = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels);\n        out_mult = ESP_NN_TEST_ALLOC(128 + sizeof (int32_t) * out_channels);\n\n        /* Remember the pointer returned by the allocator right away: cleanup restores\n         * filter_data from this copy, so it must be valid on every path that reaches\n         * conv_s8_cleanup, including the allocation-failure path just below. */\n        filter_data_orig_save = filter_data;\n\n        if (input_orig == NULL || filter_data == NULL ||\n                out_c_orig == NULL || out_opt_orig == NULL ||\n                bias == NULL || out_shift == NULL || out_mult == NULL) {\n            printf(ANSI_COLOR_RED\"[%3d] alloc failed (in=%d filter=%d out=%d)\\n\"ANSI_COLOR_RESET,\n                   itr, in_size, filter_size, out_size);\n            goto conv_s8_cleanup;\n        }\n\n        int8_t *input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);\n        int8_t *out_data_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n        int8_t *out_data_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n        /* Generate input data in [-128, 126] (rand() % 255 tops out at 254) */\n        for (int i = 0; i < in_size; ++i) {\n            input[i] = rand() % 255 - 128;\n        }\n\n        /* Generate filter data between -128 -> +127 */\n        for (int i = 0; i < filter_size; ++i) {\n            filter_data[i] = rand() % 256 - 128;\n        }\n\n        /* Case 17: deliberately misalign filter by 5 bytes to test alignment handling.\n         * This reproduces the bug where ee.vld.l.64.ip ignores lower address bits. */\n        if (itr == 17) {\n            filter_data = filter_data + 5; /* misalign by 5 bytes (like YOLO's 0x3c05fe55) */\n        }\n\n        /* Generate bias data */\n        for (int i = 0; i < out_channels; ++i) {\n            bias[i] = (int32_t)rand() % UINT16_MAX + UINT8_MAX;\n        }\n\n        /* Shift and multiplier */\n        for (int i = 0; i < out_channels; ++i) {\n            out_shift[i] = -10 + rand() % 2;\n            out_mult[i] = 0x7f67f4f8 + rand() % 50;\n        }\n\n        /* Case 17: use small out_shift to expose transpose cross-position contamination.\n         * out_shift=-6 (÷64) won't mask an 8x error like -10 (÷1024) would. */\n        if (itr == 17) {\n            for (int i = 0; i < out_channels; ++i) {\n                out_shift[i] = -6;\n            }\n        }\n\n        /* Case 16: override ALL data with exact YOLO Op[8] values */\n        if (itr == 16) {\n            static const int8_t yolo_filter[] = {\n                6,127,57,21,23,8,5,109,2,15,-1,-99,14,7,-67,-59,-12,40,-90,16,-1,-3,25,7,17,-16,14,24,-53,-2,-110,-10,\n                -6,-5,5,5,55,3,2,-6,-4,-17,0,17,-10,7,-3,-13,56,-3,-13,-83,-1,-4,-49,6,-127,1,5,1,8,-10,7,-2,\n                3,-1,-2,0,-29,-1,-5,-14,-2,-22,-1,-1,-9,1,-12,-18,-127,-1,-14,71,-1,0,-3,-2,-5,3,0,-4,0,-21,-1,-1,\n                -13,-9,-20,-77,-2,-77,-20,59,127,-7,120,-51,-9,-47,50,45,11,8,17,8,112,-20,2,-50,-12,-34,-88,-14,-59,8,-29,2,\n                -4,11,-32,-32,3,-4,5,-113,-11,2,-18,-13,-2,-7,127,8,8,7,2,6,-16,-3,1,-15,1,-5,-20,-2,13,1,24,3,\n                -8,-7,1,-1,54,1,1,-1,7,-6,5,3,-4,-5,-2,11,-68,-3,-10,27,5,4,-61,4,-127,3,0,2,1,1,2,3,\n                0,-2,0,2,11,2,-3,3,1,-61,1,-5,127,2,-2,-5,8,0,27,9,2,-2,4,1,2,-1,2,-2,0,101,0,0,\n                1,2,-51,-1,6,-6,2,10,-7,-2,2,-19,2,-3,-115,127,12,12,6,1,0,-6,2,-22,5,-4,-18,3,1,-2,51,-2,\n                -20,-21,-60,123,-4,127,-17,25,-80,-7,-95,-45,-7,11,49,51,14,0,-4,8,-73,-52,-5,-47,-11,-33,119,-21,-31,4,21,1,\n                -1,4,-1,1,-2,1,-1,1,1,71,1,-3,-127,1,2,4,-1,3,46,1,2,-2,-7,-1,2,-2,1,0,-1,85,2,0,\n                -2,-127,81,81,-5,71,18,22,80,14,83,58,14,-2,-14,36,6,29,-106,7,71,-46,-27,88,-19,-66,79,-13,77,7,66,-18,\n                46,17,-9,-24,3,17,-22,-4,-9,-1,-36,15,-1,-49,11,-3,5,-29,2,2,0,64,-1,-19,4,12,-4,32,9,-5,-9,-127,\n                -30,6,105,-67,-16,-61,-45,110,-56,-15,-50,-54,-18,-37,14,36,19,1,21,22,-66,-13,2,127,-4,-52,-60,-22,92,-6,45,-7,\n                -14,12,18,13,5,10,12,29,6,-10,2,-29,-3,-28,-3,-15,-2,39,127,14,3,-43,13,6,1,-103,9,11,4,-10,-5,-27,\n                -88,-35,-15,7,4,-6,64,-2,-48,-3,-18,-8,-3,-71,-8,0,7,63,12,3,-7,74,0,16,1,-67,-10,-78,-9,-7,5,127,\n                -3,7,2,4,15,2,0,9,2,127,2,-16,74,2,3,-5,9,1,29,13,2,-5,-16,1,8,-4,3,2,-2,-122,-2,-1,\n                8,-15,-2,3,11,-1,0,57,1,-7,2,19,-4,-2,-127,54,0,17,48,0,0,2,-2,-4,5,5,4,-5,-8,-7,-20,5,\n                11,39,-91,-65,11,-67,3,4,-56,-4,-66,-3,-3,5,4,8,-3,6,28,-8,-51,11,15,-106,10,23,-73,9,-127,-3,-78,8,\n                8,-3,1,0,-127,0,-7,-5,-2,-17,-2,0,5,-3,-7,-5,-3,11,-10,-3,-2,-2,3,1,70,-3,2,-7,2,-17,0,-1,\n                5,-127,13,6,2,3,12,25,0,-3,-2,-45,0,-6,4,-6,9,-11,-19,6,0,4,-14,9,2,25,3,0,3,5,-5,3,\n                -9,36,18,-4,-4,2,-19,-101,0,10,-9,-127,-4,-5,-37,82,-1,-20,6,-13,-2,-1,4,4,1,-11,3,-10,-27,5,-45,-3,\n                -6,-13,6,-3,4,-1,-5,5,2,-4,-2,-5,-1,-16,-5,-1,0,-1,0,-1,-15,-127,-3,-2,0,23,-3,0,-1,2,0,16,\n                3,45,17,24,8,27,-5,42,25,-9,21,-47,-14,18,27,23,-4,-15,127,19,24,14,19,21,-2,39,23,9,14,-7,15,15,\n                -127,-30,2,-10,3,-5,71,11,-16,0,-15,-18,3,-3,14,6,-1,73,-3,-1,-12,27,-2,-2,0,8,-7,-108,9,3,-6,8,\n                63,-47,0,-37,11,-20,-48,6,-19,-1,-18,13,-3,76,-18,15,3,-48,16,2,-4,-34,-6,3,7,-127,-7,58,1,-3,-23,108,\n                102,-1,0,3,11,1,-127,-7,-4,0,-2,-8,-13,-6,-6,-22,5,115,18,7,-1,-6,-4,3,-5,10,-1,-88,0,4,-1,7,\n                127,-5,12,-6,10,-13,-89,16,-20,1,-24,-12,-6,4,-4,-15,-3,-110,3,-6,-17,89,-10,9,13,-80,-18,105,-3,-4,3,-85,\n                2,7,2,-1,-25,-2,-5,-5,0,-25,-1,-6,-5,0,-14,-24,74,0,-13,-127,-1,0,-1,-1,-4,0,-1,-4,-2,-19,-8,1,\n                -84,-1,-6,-2,-19,-4,105,-3,-2,8,-2,-32,2,-3,2,-21,1,-127,-10,5,-3,8,0,2,-5,-8,-4,85,1,10,4,2,\n                19,-15,0,1,95,2,-15,-2,0,-56,-3,-4,-24,-5,-2,3,-16,-6,-37,-6,-3,1,127,-1,-119,4,2,-13,0,-41,3,3,\n                -10,-29,-13,-4,5,-9,-7,71,-6,7,-3,113,-2,0,51,-127,-10,-11,26,-3,-4,-1,0,-23,1,5,-7,-9,-20,8,-2,7,\n                -66,-1,-1,-10,3,-31,43,9,-18,-9,-2,-22,-2,75,22,-1,5,39,-14,4,-5,-62,-2,3,-1,69,-19,-61,-17,-2,-8,-127\n            };\n            static const int8_t yolo_input[] = {\n                -127,-65,-96,-127,-124,-100,-122,-127,-93,-122,-127,-127,-114,-91,-126,-105,\n                -127,-127,-128,-118,-102,-127,-127,-93,-127,-126,-127,-103,-127,-124,-127,-127,\n                -126,-63,-128,-128,-127,-127,-122,-118,-127,-126,-128,-114,-112,-122,-120,-122,\n                -114,-127,-127,-114,-126,-118,-127,-127,-127,-124,-128,-100,-128,-124,-127,-107,\n                -126,-63,-128,-128,-128,-126,-120,-118,-124,-126,-128,-112,-112,-122,-120,-122,\n                -114,-127,-127,-114,-128,-120,-127,-124,-127,-124,-127,-98,-128,-124,-127,-105,\n                -127,-62,-127,-127,-127,-128,-118,-114,-128,-126,-126,-112,-112,-124,-122,-124,\n                -114,-127,-127,-114,-127,-120,-127,-128,-127,-122,-128,-100,-128,-124,-128,-105,\n                -126,-63,-128,-127,-127,-128,-120,-116,-128,-124,-128,-112,-114,-122,-120,-124,\n                -114,-127,-127,-112,-126,-118,-127,-127,-127,-124,-127,-98,-128,-124,-128,-105,\n                -127,-63,-128,-128,-127,-128,-120,-114,-127,-124,-120,-112,-114,-122,-122,-124,\n                -114,-127,-127,-114,-127,-120,-127,-127,-127,-124,-127,-98,-124,-124,-128,-107,\n                -128,-67,-127,-126,-127,-127,-118,-112,-127,-124,-122,-111,-114,-128,-118,-127,\n                -114,-127,-127,-114,-128,-118,-127,-127,-127,-122,-127,-102,-127,-124,-128,-102,\n                -126,-69,-128,-128,-127,-127,-120,-112,-127,-124,-118,-111,-114,-124,-124,-126,\n                -112,-127,-127,-116,-128,-120,-127,-127,-127,-124,-126,-105,-128,-124,-122,-107\n            };\n            static const int32_t yolo_bias[] = {\n                2420,1649,1302,1816,-446,1562,685,32,2503,-74,3143,463,1507,1883,-932,525,\n                1205,162,540,1680,1846,388,338,274,-433,502,817,1021,812,1371,-30,1525\n            };\n            static const int32_t yolo_shifts[] = {\n                -8,-7,-6,-8,-6,-7,-8,-7,-8,-7,-9,-6,-8,-8,-7,-8,-8,-8,-7,-7,-7,-6,-8,-7,-7,-8,-8,-7,-8,-8,-8,-7\n            };\n            static const int32_t yolo_mults[] = {\n                0x52a119c9,0x53a7fce0,0x4430a104,0x5afd73fd,0x4a9394b6,0x5e2b6940,0x7c02c5c9,0x509cb64d,\n                0x5941a055,0x5d50f6be,0x60b9e0ad,0x41e9ef39,0x67d9347b,0x6b36dcc7,0x5406c784,0x70ae9dd9,\n                0x6a183a7f,0x78f48e0e,0x53e7df22,0x63cc6072,0x448b1623,0x4cd5d08c,0x6175e8be,0x5cd03362,\n                0x4de1312d,0x6c5bd16d,0x6e89094f,0x64a1947e,0x78e1060e,0x63d8179b,0x791c8d51,0x532420c2\n            };\n            memcpy(input, yolo_input, sizeof(yolo_input));\n            memcpy(filter_data, yolo_filter, sizeof(yolo_filter));\n            memcpy(bias, yolo_bias, sizeof(yolo_bias));\n            memcpy(out_shift, yolo_shifts, sizeof(yolo_shifts));\n            memcpy(out_mult, yolo_mults, sizeof(yolo_mults));\n        }\n\n        data_dims_t input_dims = {.width = in_wd, .height = in_ht, .channels = in_channels, 1};\n        data_dims_t output_dims = {.width = out_wd, .height = out_ht, .channels = out_channels, 1};\n        data_dims_t filter_dims = {.width = filter_wd, .height = filter_ht, .channels = in_channels, 1};\n        conv_params_t conv_params = {.in_offset = input_offset, .out_offset = out_offset,\n                                    .stride = {stride_wd, stride_ht}, .padding = {pad_wd, pad_ht},\n                                    .dilation = {0, 0}, .activation = {activation_min, activation_max}};\n        quant_data_t quant_data = {.shift = out_shift, .mult = out_mult};\n\n        int scratch_buf_size = esp_nn_get_conv_scratch_size(&input_dims, &filter_dims,\n                                                            &output_dims, &conv_params);\n        if (scratch_buf_size > 0) {\n            scratch_buf = ESP_NN_TEST_ALLOC(scratch_buf_size + 16);\n            if (scratch_buf == NULL) {\n                printf(ANSI_COLOR_RED\"[%3d] scratch_buf alloc failed size %d\\n\"ANSI_COLOR_RESET,\n                       itr, scratch_buf_size);\n                goto conv_s8_cleanup;\n            }\n            int align_sz = 16 - (((int32_t) scratch_buf) & 0xf);\n            esp_nn_set_conv_scratch_buf(scratch_buf + align_sz);\n        }\n\n        /* enable profiler */\n        profile_c_start();\n\n        /* C function */\n        esp_nn_conv_s8_ansi(&input_dims, input, &filter_dims, filter_data,\n                            bias, &output_dims, out_data_c, &conv_params, &quant_data);\n\n        total_c = profile_c_end();\n        profile_opt_start();\n\n        /* Optimized function */\n        esp_nn_conv_s8(&input_dims, input, &filter_dims, filter_data,\n                       bias, &output_dims, out_data_opt, &conv_params, &quant_data);\n\n        /* disable profiler */\n        total_opt = profile_opt_end();\n\n        bool ret = CHECK_EQUAL(out_data_c, out_data_opt, out_size);\n        if (ret == false) {\n            printf(ANSI_COLOR_RED\"[%3d] failed [pad: (%d, %d), stride: (%d, %d)\"\n                   \" out: (%3d,%3d,%3d), filter: (%d, %d,%3d)]\\n\"ANSI_COLOR_RESET,\n                   itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht,\n                   out_channels, filter_wd, filter_ht, in_channels);\n            goto conv_s8_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"[%3d] passed [pad: (%d, %d), stride: (%d, %d)\"\n               \" out: (%3d,%3d,%3d), filter: (%d, %d,%3d)]\"ANSI_COLOR_RESET,\n               itr, pad_wd, pad_ht, stride_wd, stride_ht, out_wd, out_ht,\n               out_channels, filter_wd, filter_ht, in_channels);\n        printf(\"\\tcycles: c %8\"PRIu32\", opt %8\"PRIu32\"\\n\", total_c, total_opt);\n\n    conv_s8_cleanup:\n        /* Restore original filter pointer (may have been offset for alignment test) */\n        filter_data = filter_data_orig_save;\n        if (input_orig) {\n            free(input_orig);\n            input_orig = NULL;\n        }\n        if (filter_data) {\n            free(filter_data);\n            filter_data = NULL;\n        }\n        if (out_c_orig) {\n            free(out_c_orig);\n            out_c_orig = NULL;\n        }\n        if (out_opt_orig) {\n            free(out_opt_orig);\n            out_opt_orig = NULL;\n        }\n        if (bias) {\n            free(bias);\n            bias = NULL;\n        }\n        if (out_shift) {\n            free(out_shift);\n            out_shift = NULL;\n        }\n        if (out_mult) {\n            free(out_mult);\n            out_mult = NULL;\n        }\n        if (scratch_buf) {\n            free(scratch_buf);\n            scratch_buf = NULL;\n        }\n    }\n}\n"
  },
  {
    "path": "tests/src/fully_connected_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <inttypes.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\n\nvoid esp_nn_fully_connected_s8_test()\n{\n    uint32_t total_c = 0, total_opt = 0;\n    /* prepare data */\n    uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */\n    const int32_t max_out_ch = 16;\n    const int32_t max_row_len = 271;\n    uint16_t out_channels = 3;\n\n    /* Use heap-allocated aligned buffers (matches TFLite real-world usage) */\n    int8_t *input_orig = malloc(max_row_len + 16);\n    int8_t *filter_orig = malloc(max_row_len * max_out_ch + 16);\n    int8_t *out_c_orig = malloc(max_out_ch + 16);\n    int8_t *out_opt_orig = malloc(max_out_ch + 16);\n    if (!input_orig || !filter_orig || !out_c_orig || !out_opt_orig) {\n        printf(ANSI_COLOR_RED\"%s allocations failed\\n\"ANSI_COLOR_RESET, __FUNCTION__);\n        goto fc_s8_cleanup;\n    }\n    int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15);\n    int8_t *filter_data = (int8_t *)(((uint32_t)filter_orig + 15) & ~15);\n    int8_t *output_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15);\n    int8_t *output_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15);\n    int32_t activation_min = -128;\n    int32_t activation_max = 127;\n    int32_t input_offset = 0;\n    int32_t filter_offset = 0;\n    int32_t out_shift = -10;\n    int32_t out_offset = 5;\n    int32_t out_mult = 0x59e492c4;\n    printf(\"\\n######## Running %s ##########\\n\", __FUNCTION__);\n    for (int itr = 0; itr < 15; itr++) {\n        out_mult = INT32_MAX / row_len + rand() % INT16_MAX;\n        switch (itr) {\n        case 0:\n            out_shift = -10;\n            break;\n        case 1:\n            out_shift = SHIFT_MIN;\n            break;\n        case 2:\n            out_shift = 
SHIFT_MAX;\n            break;\n        case 3:\n            out_shift = 0;\n            break;\n        case 4:\n            row_len = 1;\n            out_channels = 16;\n            out_shift = -10 + rand() % 5;\n            break;\n        case 5:\n            row_len = 16;\n            out_channels = 8;\n            out_shift = -10 + rand() % 5;\n            break;\n        case 6:\n            row_len = 8;\n            out_channels = 8;\n            out_shift = -10 + rand() % 5;\n            break;\n        case 7:\n            row_len = 8;\n            out_channels = 15;\n            out_shift = -10 + rand() % 5;\n            break;\n        case 8:\n            row_len = 8;\n            out_channels = 1;\n            out_shift = -10 + rand() % 5;\n            break;\n        default:\n            row_len = rand() % 7 + 1;\n            out_channels = 8;\n            out_shift = -10 + rand() % 5;\n            break;\n        }\n        if (itr == 0) {\n            out_shift = SHIFT_MAX;\n        }\n        /* Generate input and filter data */\n        for (int i = 0; i < row_len; ++i) {\n            input[i] = rand() % 256 - 128;\n        }\n        for (int i = 0; i < row_len * out_channels; ++i) {\n            filter_data[i] = rand() % 256 - 128;\n        }\n\n        /* enable profiler */\n        profile_c_start();\n\n        /* C function */\n        esp_nn_fully_connected_s8_ansi(input, input_offset, row_len, filter_data, filter_offset,\n                                    NULL, output_c, out_channels, out_offset, out_shift, out_mult,\n                                    activation_min, activation_max);\n\n        total_c = profile_c_end();\n        profile_opt_start();\n\n        /* Optimized function */\n        esp_nn_fully_connected_s8(input, input_offset, row_len, filter_data, filter_offset,\n                                NULL, output_opt, out_channels, out_offset, out_shift, out_mult,\n                                activation_min, 
activation_max);\n\n        /* disable profiler */\n        total_opt = profile_opt_end();\n\n        bool ret = CHECK_EQUAL(output_c, output_opt, out_channels);\n        if (ret == false) {\n            printf(ANSI_COLOR_RED\"[%3d] failed\\n\"ANSI_COLOR_RESET, itr);\n#if 0\n            printf(\"Output: \\n\");\n            PRINT_ARRAY_HEX(output_opt, out_channels, 1);\n            printf(\"Expected: \\n\");\n            PRINT_ARRAY_HEX(output_c, out_channels, 1);\n            printf(\"Input:\\n\");\n            PRINT_ARRAY_HEX(input, row_len, 1);\n            printf(\"Filter data:\\n\");\n            PRINT_ARRAY_HEX(filter_data, row_len, out_channels);\n            printf(\"Out shift: %d\\n\", out_shift);\n            printf(\"Out mult: %x\\n\", out_mult);\n#endif\n            goto fc_s8_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"[%3d] passed [row_len %\"PRIu16\", out_ch %\"PRIu16\"]\"ANSI_COLOR_RESET,\n               itr, row_len, out_channels);\n        printf(\"\\tcycles: c %8\"PRIu32\", opt %8\"PRIu32\"\\n\", total_c, total_opt);\n    }\n\nfc_s8_cleanup:\n    if (input_orig) {\n        free(input_orig);\n    }\n    if (filter_orig) {\n        free(filter_orig);\n    }\n    if (out_c_orig) {\n        free(out_c_orig);\n    }\n    if (out_opt_orig) {\n        free(out_opt_orig);\n    }\n}\n\nvoid esp_nn_fully_connected_per_ch_s8_test()\n{\n    uint32_t total_c = 0, total_opt = 0;\n    /* prepare data */\n    uint16_t row_len = 256 + 8 + 7; /* odd len to test unaligned+left-over */\n    const int32_t max_out_ch = 16;\n    const int32_t max_row_len = 271;\n    uint16_t out_channels = 3;\n\n    /* Use heap-allocated aligned buffers (matches TFLite real-world usage) */\n    int8_t *input_orig = malloc(max_row_len + 16);\n    int8_t *filter_orig = malloc(max_row_len * max_out_ch + 16);\n    int8_t *out_c_orig = malloc(max_out_ch + 16);\n    int8_t *out_opt_orig = malloc(max_out_ch + 16);\n    if (!input_orig || !filter_orig || !out_c_orig || !out_opt_orig) 
{\n        printf(ANSI_COLOR_RED\"%s allocations failed\\n\"ANSI_COLOR_RESET, __FUNCTION__);\n        goto fc_per_ch_s8_buffers_cleanup;\n    }\n    int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15);\n    int8_t *filter_data = (int8_t *)(((uint32_t)filter_orig + 15) & ~15);\n    int8_t *output_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15);\n    int8_t *output_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15);\n    int32_t activation_min = -128;\n    int32_t activation_max = 127;\n    int32_t input_offset = 0;\n    int32_t filter_offset = 0;\n    int32_t out_offset = 7;\n\n    int32_t* out_mult = NULL;\n    int32_t* out_shift = NULL;\n\n    printf(\"\\n######## Running %s ##########\\n\", __FUNCTION__);\n    for (int itr = 0;  itr < 15; itr++) {\n        int32_t out_shift_val = 0;\n        switch (itr) {\n        case 0:\n            out_shift_val = -10;\n            break;\n        case 1:\n            out_shift_val = SHIFT_MIN;\n            break;\n        case 2:\n            out_shift_val = SHIFT_MAX;\n            break;\n        case 3:\n            out_shift_val = 0;\n            break;\n        case 4:\n            row_len = 1;\n            out_channels = 16;\n            break;\n        case 5:\n            row_len = 16;\n            out_channels = 8;\n            break;\n        case 6:\n            row_len = 8;\n            out_channels = 8;\n            break;\n        case 7:\n            row_len = 8;\n            out_channels = 15;\n            break;\n        case 8:\n            row_len = 8;\n            out_channels = 1;\n            break;\n        default:\n            row_len = rand() % 7 + 1;\n            out_channels = 8;\n            break;\n        }\n\n        out_mult = ESP_NN_TEST_ALLOC(out_channels * sizeof(int32_t));\n        out_shift = ESP_NN_TEST_ALLOC(out_channels * sizeof(int32_t));\n\n        if (out_shift == NULL || out_mult == NULL) {\n            printf(ANSI_COLOR_RED\"out_shift/out_mult allocations 
failed\\n\"ANSI_COLOR_RESET);\n            goto fully_connected_per_ch_cleanup;\n        }\n\n        for (int i = 0; i < out_channels; i++) {\n            out_mult[i] = INT32_MAX / row_len + rand() % INT16_MAX;\n            if (i < 4) {\n                out_shift[i] = out_shift_val;\n            } else {\n                out_shift[i] = -10 + rand() % 5;\n            }\n        }\n\n        /* Generate input and filter data */\n        for (int i = 0; i < row_len; ++i) {\n            input[i] = rand() % 256 - 128;\n        }\n        for (int i = 0; i < row_len * out_channels; ++i) {\n            filter_data[i] = rand() % 256 - 128;\n        }\n        \n        /* enable profiler */\n        profile_c_start();\n\n        /* C function */\n        esp_nn_fully_connected_per_ch_s8_ansi(input, input_offset, row_len, filter_data, filter_offset,\n                                    NULL, output_c, out_channels, out_offset, out_shift, out_mult,\n                                    activation_min, activation_max);\n\n        total_c = profile_c_end();\n        profile_opt_start();\n\n        /* Optimized function */\n        esp_nn_fully_connected_per_ch_s8(input, input_offset, row_len, filter_data, filter_offset,\n                                NULL, output_opt, out_channels, out_offset, out_shift, out_mult,\n                                activation_min, activation_max);\n\n        /* disable profiler */\n        total_opt = profile_opt_end();\n\n        bool ret = CHECK_EQUAL(output_c, output_opt, out_channels);\n        if (ret == false) {\n            printf(ANSI_COLOR_RED\"[%3d] failed\\n\"ANSI_COLOR_RESET, itr);\n            goto fully_connected_per_ch_cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"[%3d] passed [row_len %\"PRIu16\", out_ch %\"PRIu16\"]\"ANSI_COLOR_RESET,\n               itr, row_len, out_channels);\n        printf(\"\\tcycles: c %8\"PRIu32\", opt %8\"PRIu32\"\\n\", total_c, total_opt);\n    \n    fully_connected_per_ch_cleanup:\n        
if (out_shift) {\n            free(out_shift);\n        }\n        if (out_mult) {\n            free(out_mult);\n        }\n    }\n\nfc_per_ch_s8_buffers_cleanup:\n    if (input_orig) {\n        free(input_orig);\n    }\n    if (filter_orig) {\n        free(filter_orig);\n    }\n    if (out_c_orig) {\n        free(out_c_orig);\n    }\n    if (out_opt_orig) {\n        free(out_opt_orig);\n    }\n}\n"
  },
  {
    "path": "tests/src/hard_swish_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <malloc.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\nvoid esp_nn_hard_swish_s8_test()\n{\n    /* Test with representative MobileNetV3 parameters */\n    const int test_sizes[] = {1, 8, 16, 32, 100, 1024, 12544};\n    const int num_tests = sizeof(test_sizes) / sizeof(test_sizes[0]);\n\n    /* Typical quantization params from MobileNetV3 layers */\n    const int16_t input_zp = -128;\n    const int16_t output_mult_fxp = 19661;  /* typical value */\n    const int16_t reluish_mult_fxp = 22938; /* typical value */\n    const int16_t output_zp = -128;\n\n    /* Test all three branches: exp > 0, exp < 0, exp == 0 */\n    int32_t reluish_exps[] = {2, -1, 0};\n    int32_t output_exps[] = {-1, -2, -1};\n\n    printf(\"\\n######## Running %s ##########\\n\", __FUNCTION__);\n\n    /* Set up scratch buffer for LUT-based optimization */\n    int32_t scratch_size = esp_nn_get_hard_swish_scratch_size();\n    void *scratch_buf = NULL;\n    if (scratch_size > 0) {\n        scratch_buf = malloc(scratch_size);\n        if (scratch_buf) {\n            esp_nn_set_hard_swish_scratch_buf(scratch_buf);\n        }\n    }\n\n    for (int t = 0; t < num_tests; t++) {\n        int size = test_sizes[t];\n        int8_t *input_orig = malloc(size + 16);\n        int8_t *out_c_orig = malloc(size + 16);\n        int8_t *out_opt_orig = malloc(size + 16);\n\n        if (!input_orig || !out_c_orig || !out_opt_orig) {\n            printf(ANSI_COLOR_RED\"hard_swish [%d] alloc failed\\n\"ANSI_COLOR_RESET, t);\n            goto cleanup;\n        }\n\n        int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15);\n        int8_t *out_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15);\n        int8_t *out_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & 
~15);\n\n        for (int i = 0; i < size; i++) {\n            input[i] = rand() % 256 - 128;\n        }\n\n        for (int exp_idx = 0; exp_idx < 3; exp_idx++) {\n            /* ANSI C reference */\n            profile_c_start();\n            esp_nn_hard_swish_s8_ansi(input, out_c, size,\n                                       input_zp, output_mult_fxp, reluish_mult_fxp,\n                                       reluish_exps[exp_idx], output_exps[exp_idx], output_zp);\n            profile_c_end();\n\n            /* Optimized */\n            profile_opt_start();\n            esp_nn_hard_swish_s8(input, out_opt, size,\n                                  input_zp, output_mult_fxp, reluish_mult_fxp,\n                                  reluish_exps[exp_idx], output_exps[exp_idx], output_zp);\n            profile_opt_end();\n\n            bool ret = CHECK_EQUAL(out_c, out_opt, size);\n            if (!ret) {\n                printf(ANSI_COLOR_RED\"hard_swish [size=%d, exp=%d] failed\\n\"ANSI_COLOR_RESET,\n                       size, (int)reluish_exps[exp_idx]);\n                goto cleanup;\n            }\n        }\n        printf(ANSI_COLOR_GREEN\"hard_swish [%2d] passed [size %d]\\n\"ANSI_COLOR_RESET, t, size);\n\n    cleanup:\n        if (input_orig) free(input_orig);\n        if (out_c_orig) free(out_c_orig);\n        if (out_opt_orig) free(out_opt_orig);\n    }\n    if (scratch_buf) free(scratch_buf);\n}\n"
  },
  {
    "path": "tests/src/mean_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <malloc.h>\n#include <inttypes.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\nvoid esp_nn_mean_nhwc_s8_test()\n{\n    /* Test dimensions matching MobileNetV3 SE blocks */\n    struct {\n        int height, width, channels;\n    } test_cases[] = {\n        {7, 7, 16},      /* small SE block */\n        {7, 7, 72},      /* medium SE block */\n        {14, 14, 40},    /* larger spatial */\n        {14, 14, 120},   /* larger channels */\n        {28, 28, 24},    /* early layer SE */\n        {1, 1, 576},     /* degenerate 1x1 */\n        {3, 3, 96},      /* small spatial */\n    };\n    const int num_tests = sizeof(test_cases) / sizeof(test_cases[0]);\n\n    const int32_t input_zp = -128;\n    const int32_t output_zp = -128;\n    const int32_t multiplier = 1073741824; /* typical */\n    const int32_t shift = -1;\n\n    printf(\"\\n######## Running %s ##########\\n\", __FUNCTION__);\n\n    for (int t = 0; t < num_tests; t++) {\n        int h = test_cases[t].height;\n        int w = test_cases[t].width;\n        int c = test_cases[t].channels;\n        int input_size = h * w * c;\n\n        int8_t *input_orig = malloc(input_size + 16);\n        int8_t *out_c_orig = malloc(c + 16);\n        int8_t *out_opt_orig = malloc(c + 16);\n\n        if (!input_orig || !out_c_orig || !out_opt_orig) {\n            printf(ANSI_COLOR_RED\"mean [%d] alloc failed\\n\"ANSI_COLOR_RESET, t);\n            goto cleanup;\n        }\n\n        int8_t *input = (int8_t *)(((uint32_t)input_orig + 15) & ~15);\n        int8_t *out_c = (int8_t *)(((uint32_t)out_c_orig + 15) & ~15);\n        int8_t *out_opt = (int8_t *)(((uint32_t)out_opt_orig + 15) & ~15);\n\n        for (int i = 0; i < input_size; i++) {\n            input[i] = rand() % 256 - 128;\n        }\n\n 
       /* ANSI C reference */\n        profile_c_start();\n        esp_nn_mean_nhwc_s8_ansi(input, out_c, h, w, c,\n                                  input_zp, output_zp, multiplier, shift);\n        profile_c_end();\n\n        /* Optimized */\n        profile_opt_start();\n        esp_nn_mean_nhwc_s8(input, out_opt, h, w, c,\n                             input_zp, output_zp, multiplier, shift);\n        profile_opt_end();\n\n        bool ret = CHECK_EQUAL(out_c, out_opt, c);\n        if (!ret) {\n            printf(ANSI_COLOR_RED\"mean [%d] failed [%dx%dx%d]\\n\"ANSI_COLOR_RESET,\n                   t, h, w, c);\n            goto cleanup;\n        }\n        printf(ANSI_COLOR_GREEN\"mean [%2d] passed [%dx%dx%d]\\n\"ANSI_COLOR_RESET,\n               t, h, w, c);\n\n    cleanup:\n        if (input_orig) free(input_orig);\n        if (out_c_orig) free(out_c_orig);\n        if (out_opt_orig) free(out_opt_orig);\n    }\n}\n"
  },
  {
    "path": "tests/src/pooling_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <malloc.h>\n#include <inttypes.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\nstatic void run_avg_pool_test(uint16_t input_wd, uint16_t input_ht, uint16_t channels,\n                              uint16_t filter_wd, uint16_t filter_ht,\n                              uint16_t stride_wd, uint16_t stride_ht,\n                              uint16_t pad_wd, uint16_t pad_ht,\n                              int iter)\n{\n    const int32_t activation_min = -128;\n    const int32_t activation_max = 127;\n    const uint16_t out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1;\n    const uint16_t out_ht = (input_ht + 2 * pad_ht - filter_ht) / stride_ht + 1;\n    const int size = input_wd * input_ht * channels;\n    const int out_size = out_wd * out_ht * channels;\n\n    int8_t *input = NULL, *output_c = NULL, *output_opt = NULL;\n    int8_t *input_orig = malloc(size + 16);\n    int8_t *out_c_orig = malloc(out_size + 16);\n    int8_t *out_opt_orig = malloc(out_size + 16);\n    if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) {\n        printf(ANSI_COLOR_RED\"avg_pool [%d] allocations failed\\n\"ANSI_COLOR_RESET, iter);\n        goto avg_pool_cleanup;\n    }\n\n    input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);\n    output_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n    output_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n    for (int i = 0; i < size; ++i) {\n        input[i] = rand() % 256 - 128;\n    }\n\n    profile_c_start();\n    esp_nn_avg_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht,\n                            stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht,\n                            activation_min, activation_max, channels);\n    
profile_c_end();\n\n    profile_opt_start();\n    esp_nn_avg_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht,\n                       stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht,\n                       activation_min, activation_max, channels);\n    profile_opt_end();\n\n    bool ret = CHECK_EQUAL(output_c, output_opt, out_size);\n    if (ret == false) {\n        printf(ANSI_COLOR_RED\"avg_pool [%d] failed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\\n\"ANSI_COLOR_RESET,\n               iter, input_wd, input_ht, channels, filter_wd, filter_ht,\n               stride_wd, stride_ht, pad_wd, pad_ht);\n        goto avg_pool_cleanup;\n    }\n    printf(ANSI_COLOR_GREEN\"avg_pool [%2d] passed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\\n\"ANSI_COLOR_RESET,\n           iter, input_wd, input_ht, channels, filter_wd, filter_ht,\n           stride_wd, stride_ht, pad_wd, pad_ht);\n\navg_pool_cleanup:\n    if (input_orig) free(input_orig);\n    if (out_c_orig) free(out_c_orig);\n    if (out_opt_orig) free(out_opt_orig);\n}\n\nvoid esp_nn_avg_pool_s8_test()\n{\n    int iter = 0;\n    /* Original test case */\n    run_avg_pool_test(16, 16, 16, 3, 3, 1, 1, 1, 1, iter++);\n    /* Varying channel counts */\n    run_avg_pool_test(16, 16, 4, 3, 3, 1, 1, 1, 1, iter++);\n    run_avg_pool_test(16, 16, 8, 3, 3, 1, 1, 1, 1, iter++);\n    run_avg_pool_test(16, 16, 32, 3, 3, 1, 1, 1, 1, iter++);\n    run_avg_pool_test(16, 16, 64, 3, 3, 1, 1, 1, 1, iter++);\n    /* Note: non-multiple-of-4 channels not supported by S3 optimized path */\n    /* Different filter sizes */\n    run_avg_pool_test(16, 16, 16, 1, 1, 1, 1, 0, 0, iter++);\n    run_avg_pool_test(16, 16, 16, 2, 2, 1, 1, 0, 0, iter++);\n    run_avg_pool_test(16, 16, 16, 5, 5, 1, 1, 2, 2, iter++);\n    /* Stride > 1 */\n    run_avg_pool_test(16, 16, 16, 3, 3, 2, 2, 1, 1, iter++);\n    run_avg_pool_test(24, 24, 32, 3, 3, 2, 2, 1, 1, iter++);\n    /* Person detection final pooling: 6x6x128, filter 6x6 */\n    
run_avg_pool_test(6, 6, 128, 6, 6, 1, 1, 0, 0, iter++);\n    /* No padding */\n    run_avg_pool_test(16, 16, 16, 3, 3, 1, 1, 0, 0, iter++);\n}\n\nstatic void run_max_pool_test(uint16_t input_wd, uint16_t input_ht, uint16_t channels,\n                              uint16_t filter_wd, uint16_t filter_ht,\n                              uint16_t stride_wd, uint16_t stride_ht,\n                              uint16_t pad_wd, uint16_t pad_ht,\n                              int iter)\n{\n    const int32_t activation_min = -128;\n    const int32_t activation_max = 127;\n    const uint16_t out_wd = (input_wd + 2 * pad_wd - filter_wd) / stride_wd + 1;\n    const uint16_t out_ht = (input_ht + 2 * pad_ht - filter_ht) / stride_ht + 1;\n    const int size = input_wd * input_ht * channels;\n    const int out_size = out_wd * out_ht * channels;\n\n    int8_t *input = NULL, *output_c = NULL, *output_opt = NULL;\n    int8_t *input_orig = malloc(size + 16);\n    int8_t *out_c_orig = malloc(out_size + 16);\n    int8_t *out_opt_orig = malloc(out_size + 16);\n    if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) {\n        printf(ANSI_COLOR_RED\"max_pool [%d] allocations failed\\n\"ANSI_COLOR_RESET, iter);\n        goto max_pool_cleanup;\n    }\n\n    input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);\n    output_c = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n    output_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n    for (int i = 0; i < size; ++i) {\n        input[i] = rand() % 256 - 128;\n    }\n\n    profile_c_start();\n    esp_nn_max_pool_s8_ansi(input, input_wd, input_ht, output_c, out_wd, out_ht,\n                            stride_wd, stride_ht, filter_wd, filter_ht, pad_wd, pad_ht,\n                            activation_min, activation_max, channels);\n    profile_c_end();\n\n    profile_opt_start();\n    esp_nn_max_pool_s8(input, input_wd, input_ht, output_opt, out_wd, out_ht,\n                       stride_wd, stride_ht, 
filter_wd, filter_ht, pad_wd, pad_ht,\n                       activation_min, activation_max, channels);\n    profile_opt_end();\n\n    bool ret = CHECK_EQUAL(output_c, output_opt, out_size);\n    if (ret == false) {\n        printf(ANSI_COLOR_RED\"max_pool [%d] failed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\\n\"ANSI_COLOR_RESET,\n               iter, input_wd, input_ht, channels, filter_wd, filter_ht,\n               stride_wd, stride_ht, pad_wd, pad_ht);\n        goto max_pool_cleanup;\n    }\n    printf(ANSI_COLOR_GREEN\"max_pool [%2d] passed [in %dx%dx%d, f %dx%d, s %dx%d, p %dx%d]\\n\"ANSI_COLOR_RESET,\n           iter, input_wd, input_ht, channels, filter_wd, filter_ht,\n           stride_wd, stride_ht, pad_wd, pad_ht);\n\nmax_pool_cleanup:\n    if (input_orig) free(input_orig);\n    if (out_c_orig) free(out_c_orig);\n    if (out_opt_orig) free(out_opt_orig);\n}\n\nvoid esp_nn_max_pool_s8_test()\n{\n    int iter = 0;\n    /* Original test case */\n    run_max_pool_test(16, 16, 16, 3, 3, 1, 1, 1, 1, iter++);\n    /* Varying channel counts */\n    run_max_pool_test(16, 16, 4, 3, 3, 1, 1, 1, 1, iter++);\n    run_max_pool_test(16, 16, 8, 3, 3, 1, 1, 1, 1, iter++);\n    run_max_pool_test(16, 16, 32, 3, 3, 1, 1, 1, 1, iter++);\n    run_max_pool_test(16, 16, 64, 3, 3, 1, 1, 1, 1, iter++);\n    /* Note: non-multiple-of-4 channels not supported by S3 optimized path */\n    /* Different filter sizes */\n    run_max_pool_test(16, 16, 16, 1, 1, 1, 1, 0, 0, iter++);\n    run_max_pool_test(16, 16, 16, 2, 2, 1, 1, 0, 0, iter++);\n    run_max_pool_test(16, 16, 16, 5, 5, 1, 1, 2, 2, iter++);\n    /* Stride > 1 */\n    run_max_pool_test(16, 16, 16, 3, 3, 2, 2, 1, 1, iter++);\n    run_max_pool_test(24, 24, 32, 3, 3, 2, 2, 1, 1, iter++);\n    /* Person detection final pooling-like: 6x6x128 */\n    run_max_pool_test(6, 6, 128, 6, 6, 1, 1, 0, 0, iter++);\n    /* No padding */\n    run_max_pool_test(16, 16, 16, 3, 3, 1, 1, 0, 0, iter++);\n}\n"
  },
  {
    "path": "tests/src/relu_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2020-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <malloc.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\nstatic void run_relu6_test(int size, int iter)\n{\n    int8_t *input = NULL, *inout_ansi = NULL, *inout_opt = NULL;\n\n    int8_t *input_orig = malloc(size + 16);\n    int8_t *inout_c_orig = malloc(size + 16);\n    int8_t *inout_opt_orig = malloc(size + 16);\n\n    if (input_orig == NULL || inout_c_orig == NULL || inout_opt_orig == NULL) {\n        printf(ANSI_COLOR_RED\"relu6 [%d] allocations failed\\n\"ANSI_COLOR_RESET, iter);\n        goto relu6_cleanup;\n    }\n    input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);\n    inout_ansi = (int8_t *) (((uint32_t) inout_c_orig + 15) & ~15);\n    inout_opt = (int8_t *) (((uint32_t) inout_opt_orig + 15) & ~15);\n\n    for (int i = 0; i < size; ++i) {\n        input[i] = rand() % 255 - 128;\n        inout_ansi[i] = input[i];\n        inout_opt[i] = input[i];\n    }\n\n    profile_c_start();\n    esp_nn_relu6_s8_ansi(inout_ansi, size);\n    profile_c_end();\n\n    profile_opt_start();\n    esp_nn_relu6_s8(inout_opt, size);\n    profile_opt_end();\n\n    bool ret = CHECK_EQUAL(inout_ansi, inout_opt, size);\n    if (ret == false) {\n        printf(ANSI_COLOR_RED\"relu6 [%d] failed [size %d]\\n\"ANSI_COLOR_RESET, iter, size);\n        goto relu6_cleanup;\n    }\n    printf(ANSI_COLOR_GREEN\"relu6 [%2d] passed [size %d]\\n\"ANSI_COLOR_RESET, iter, size);\n\nrelu6_cleanup:\n    if (input_orig) free(input_orig);\n    if (inout_c_orig) free(inout_c_orig);\n    if (inout_opt_orig) free(inout_opt_orig);\n}\n\nvoid esp_nn_relu6_s8_test()\n{\n    int iter = 0;\n    /* Original test case: odd size with leftover */\n    run_relu6_test(1600 + 8 + 7, iter++);\n    /* Very small sizes (< 8 elements, below SIMD width) */\n    
run_relu6_test(1, iter++);\n    run_relu6_test(3, iter++);\n    run_relu6_test(7, iter++);\n    /* Between 8 and 16 (partial SIMD) */\n    run_relu6_test(8, iter++);\n    run_relu6_test(12, iter++);\n    run_relu6_test(15, iter++);\n    /* Exact multiple of 16 (full SIMD, no leftover) */\n    run_relu6_test(16, iter++);\n    run_relu6_test(32, iter++);\n    run_relu6_test(256, iter++);\n    /* Non-aligned sizes */\n    run_relu6_test(17, iter++);\n    run_relu6_test(33, iter++);\n    run_relu6_test(100, iter++);\n}\n"
  },
  {
    "path": "tests/src/softmax_test.c",
    "content": "/*\n * SPDX-FileCopyrightText: 2022-2026 Espressif Systems (Shanghai) CO LTD\n *\n * SPDX-License-Identifier: Apache-2.0\n */\n\n#include <stdint.h>\n#include <stdbool.h>\n#include <stdio.h>\n#include <stdlib.h>\n#include <malloc.h>\n#include <inttypes.h>\n\n#include <esp_nn.h>\n#include \"test_utils.h\"\n\nstatic void run_softmax_test(int32_t height, int32_t width, int32_t mult,\n                             int32_t shift, int32_t diff_min, int iter)\n{\n    void *scratch_buf = NULL, *scratch_buf_orig = NULL;\n    const int size = width * height;\n    int8_t *input = NULL, *out_ansi = NULL, *out_opt = NULL;\n\n    int8_t *input_orig = malloc(size + 16);\n    int8_t *out_c_orig = malloc(size + 16);\n    int8_t *out_opt_orig = malloc(size + 16);\n    if (input_orig == NULL || out_c_orig == NULL || out_opt_orig == NULL) {\n        printf(ANSI_COLOR_RED\"softmax [%d] allocations failed\\n\"ANSI_COLOR_RESET, iter);\n        goto softmax_cleanup;\n    }\n\n    input = (int8_t *) (((uint32_t) input_orig + 15) & ~15);\n    out_ansi = (int8_t *) (((uint32_t) out_c_orig + 15) & ~15);\n    out_opt = (int8_t *) (((uint32_t) out_opt_orig + 15) & ~15);\n\n    for (int i = 0; i < size; ++i) {\n        input[i] = rand() % 255 - 128;\n    }\n\n    profile_c_start();\n    esp_nn_softmax_s8_ansi(input, height, width, mult, shift, diff_min, out_ansi);\n    profile_c_end();\n\n    int32_t scratch_buf_size = esp_nn_get_softmax_scratch_size(width, height);\n    if (scratch_buf_size) {\n        scratch_buf_orig = malloc(scratch_buf_size * 4 + 16);\n        if (scratch_buf_orig == NULL) {\n            printf(ANSI_COLOR_RED\"softmax [%d] scratch alloc failed size %\"PRIi32\"\\n\"ANSI_COLOR_RESET,\n                   iter, scratch_buf_size);\n            goto softmax_cleanup;\n        }\n        scratch_buf = (void *)(((uint32_t) scratch_buf_orig + 15) & ~15);\n        esp_nn_set_softmax_scratch_buf(scratch_buf);\n    }\n\n    profile_opt_start();\n    
esp_nn_softmax_s8(input, height, width, mult, shift, diff_min, out_opt);\n    profile_opt_end();\n\n    bool ret = CHECK_EQUAL(out_ansi, out_opt, size);\n    if (ret == false) {\n        printf(ANSI_COLOR_RED\"softmax [%d] failed [h %\"PRIi32\", w %\"PRIi32\", mult %\"PRIi32\", shift %\"PRIi32\", diff_min %\"PRIi32\"]\\n\"ANSI_COLOR_RESET,\n               iter, height, width, mult, shift, diff_min);\n        printf(\"Output: \\n\");\n        PRINT_ARRAY_HEX(out_opt, width, height);\n        printf(\"Expected: \\n\");\n        PRINT_ARRAY_HEX(out_ansi, width, height);\n        goto softmax_cleanup;\n    }\n    printf(ANSI_COLOR_GREEN\"softmax [%2d] passed [h %\"PRIi32\", w %\"PRIi32\", mult %\"PRIi32\", shift %\"PRIi32\"]\\n\"ANSI_COLOR_RESET,\n           iter, height, width, mult, shift);\n\nsoftmax_cleanup:\n    if (input_orig) free(input_orig);\n    if (out_c_orig) free(out_c_orig);\n    if (out_opt_orig) free(out_opt_orig);\n    if (scratch_buf_orig) free(scratch_buf_orig);\n}\n\nvoid esp_nn_softmax_s8_test()\n{\n    int iter = 0;\n    /* Original test case */\n    run_softmax_test(8, 32, INT32_MAX / 2, 7, -128, iter++);\n    /* Small output classes (person_detection: 2, micro_speech: 4) */\n    run_softmax_test(1, 2, INT32_MAX / 2, 7, -128, iter++);\n    run_softmax_test(1, 4, INT32_MAX / 2, 7, -128, iter++);\n    /* Single element (degenerate) */\n    run_softmax_test(1, 1, INT32_MAX / 2, 7, -128, iter++);\n    /* Medium width */\n    run_softmax_test(1, 10, INT32_MAX / 2, 7, -128, iter++);\n    run_softmax_test(4, 10, INT32_MAX / 2, 7, -128, iter++);\n    /* Large width (ImageNet-class) */\n    run_softmax_test(1, 1000, INT32_MAX / 2, 7, -128, iter++);\n    /* Large height */\n    run_softmax_test(64, 32, INT32_MAX / 2, 7, -128, iter++);\n    /* Varying diff_min */\n    run_softmax_test(8, 32, INT32_MAX / 2, 7, -64, iter++);\n    run_softmax_test(8, 32, INT32_MAX / 2, 7, -32, iter++);\n    run_softmax_test(8, 32, INT32_MAX / 2, 7, 0, iter++);\n    /* Varying 
multiplier and shift */\n    run_softmax_test(8, 32, INT32_MAX / 4, 5, -128, iter++);\n    run_softmax_test(8, 32, INT32_MAX, 10, -128, iter++);\n    /* Odd width (non-aligned) */\n    run_softmax_test(8, 17, INT32_MAX / 2, 7, -128, iter++);\n    run_softmax_test(8, 3, INT32_MAX / 2, 7, -128, iter++);\n}\n"
  }
]