[
  {
    "path": ".github/workflows/build.yml",
    "content": "name: CI\n\non:\n  workflow_dispatch: # allows manual triggering\n    inputs:\n      create_release:\n        description: \"Create new release\"\n        required: true\n        type: boolean\n  push:\n    paths: [\".github/workflows/**\", \"Sources/**/*\", \"Package.swift\", \"llama.xcodeproj/**/*\"]\n  pull_request:\n    paths: [\".github/workflows/**\", \"Sources/**/*\", \"Package.swift\", \"llama.xcodeproj/**/*\"]\n\njobs:\n  swift-build:\n    runs-on: macos-latest\n    steps:\n      - name: Clone repo\n        id: checkout\n        uses: actions/checkout@v1\n      - name: Update dependencies\n        id: depends\n        run: |\n          brew update\n      - name: Swift build\n        id: swift_build\n        run: |\n          swift build\n"
  },
  {
    "path": ".gitignore",
    "content": "*.o\n*.a\n.cache/\n.vs/\n.vscode/\n.DS_Store\n\nbuild/\nbuild-em/\nbuild-debug/\nbuild-release/\nbuild-static/\nbuild-no-accel/\nbuild-sanitize-addr/\nbuild-sanitize-thread/\n\nmodels/*\n\n/main\n/quantize\n\narm_neon.h\ncompile_commands.json\n# Xcode\n#\n# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore\n\n## User settings\nxcuserdata/\n\n## compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9)\n*.xcscmblueprint\n*.xccheckout\n\n## compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4)\nbuild/\nDerivedData/\n*.moved-aside\n*.pbxuser\n!default.pbxuser\n*.mode1v3\n!default.mode1v3\n*.mode2v3\n!default.mode2v3\n*.perspectivev3\n!default.perspectivev3\n\n## Obj-C/Swift specific\n*.hmap\n\n## App packaging\n*.ipa\n*.dSYM.zip\n*.dSYM\n\n## Playgrounds\ntimeline.xctimeline\nplayground.xcworkspace\n\n# Swift Package Manager\n#\n# Add this line if you want to avoid checking in source code from Swift Package Manager dependencies.\n# Packages/\n# Package.pins\n# Package.resolved\n# *.xcodeproj\n#\n# Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata\n# hence it is not needed unless you have added a package configuration file to your project\n# .swiftpm\n\n.build/\n\n# CocoaPods\n#\n# We recommend against adding the Pods directory to your .gitignore. 
However\n# you should judge for yourself, the pros and cons are mentioned at:\n# https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control\n#\n# Pods/\n#\n# Add this line if you want to avoid checking in source code from the Xcode workspace\n# *.xcworkspace\n\n# Carthage\n#\n# Add this line if you want to avoid checking in source code from Carthage dependencies.\n# Carthage/Checkouts\n\nCarthage/Build/\n\n# Accio dependency management\nDependencies/\n.accio/\n\n# fastlane\n#\n# It is recommended to not store the screenshots in the git repo.\n# Instead, use fastlane to re-generate the screenshots whenever they are needed.\n# For more information about the recommended setup visit:\n# https://docs.fastlane.tools/best-practices/source-control/#source-control\n\nfastlane/report.xml\nfastlane/Preview.html\nfastlane/screenshots/**/*.png\nfastlane/test_output\n\n# Code Injection\n#\n# After new code Injection tools there's a generated folder /iOSInjectionProject\n# https://github.com/johnno1962/injectionforxcode\n\niOSInjectionProject/\n"
  },
  {
    "path": "CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a harassment-free experience for everyone, regardless of age, body\nsize, visible or invisible disability, ethnicity, sex characteristics, gender\nidentity and expression, level of experience, education, socio-economic status,\nnationality, personal appearance, race, religion, or sexual identity\nand orientation.\n\nWe pledge to act and interact in ways that contribute to an open, welcoming,\ndiverse, inclusive, and healthy community.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment for our\ncommunity include:\n\n* Demonstrating empathy and kindness toward other people\n* Being respectful of differing opinions, viewpoints, and experiences\n* Giving and gracefully accepting constructive feedback\n* Accepting responsibility and apologizing to those affected by our mistakes,\n  and learning from the experience\n* Focusing on what is best not just for us as individuals, but for the\n  overall community\n\nExamples of unacceptable behavior include:\n\n* The use of sexualized language or imagery, and sexual attention or\n  advances of any kind\n* Trolling, insulting or derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or email\n  address, without their explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Enforcement Responsibilities\n\nCommunity leaders are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate, threatening, offensive,\nor harmful.\n\nCommunity leaders have the right and responsibility to remove, edit, or reject\ncomments, commits, code, wiki edits, issues, 
and other contributions that are\nnot aligned to this Code of Conduct, and will communicate reasons for moderation\ndecisions when appropriate.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\nExamples of representing our community include using an official e-mail address,\nposting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported to the community leaders responsible for enforcement at\nalex@rozanski.me.\nAll complaints will be reviewed and investigated promptly and fairly.\n\nAll community leaders are obligated to respect the privacy and security of the\nreporter of any incident.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact**: Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence**: A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the\nbehavior was inappropriate. A public apology may be requested.\n\n### 2. Warning\n\n**Community Impact**: A violation through a single incident or series\nof actions.\n\n**Consequence**: A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or\npermanent ban.\n\n### 3. 
Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\nversion 2.0, available at\nhttps://www.contributor-covenant.org/version/2/0/code_of_conduct.html.\n\nCommunity Impact Guidelines were inspired by [Mozilla's code of conduct\nenforcement ladder](https://github.com/mozilla/diversity).\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see the FAQ at\nhttps://www.contributor-covenant.org/faq. Translations are available at\nhttps://www.contributor-covenant.org/translations.\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2023 Georgi Gerganov, Alex Rozanski and others\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "Package.swift",
    "content": "// swift-tools-version:5.5\n\nimport PackageDescription\n\nlet package = Package(\n  name: \"llama.swift\",\n  platforms: [\n    .macOS(.v10_15),\n    .iOS(.v13),\n  ],\n  products: [\n    .library(name: \"llama\", targets: [\"llama\"]),\n  ],\n  targets: [\n    .target(\n      name: \"llama\",\n      dependencies: [\"llamaObjCxx\"],\n      path: \"Sources/llama\"),\n    .target(\n      name: \"llamaObjCxx\",\n      dependencies: [],\n      path: \"Sources/llamaObjCxx\",\n      exclude: [\n        \"cpp/quantize.cpp\"\n      ],\n      publicHeadersPath: \"headers\",\n      cxxSettings: [\n        .headerSearchPath(\"cpp\")\n      ])\n  ],\n  cLanguageStandard: .gnu11,\n  cxxLanguageStandard: .gnucxx20\n)\n"
  },
  {
    "path": "README.md",
    "content": "# 🦙 llama.swift\n\n[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)\n\nA fork of [@ggerganov](https://github.com/ggerganov)'s [llama.cpp](https://github.com/ggerganov/llama.cpp) to use [Facebook's LLaMA](https://github.com/facebookresearch/llama) models in Swift.\n\nSee the [llama.cpp repository](https://github.com/ggerganov/llama.cpp/) for info about the original goals of the project and implementation.\n\n## 🚀 llama.swift → future\n\nVersion 1 of llama.swift provides a simple, clean wrapper around the original LLaMA models and some of their early derivatives.\n\nThe future of llama.swift is [CameLLM](https://github.com/CameLLM/), which provides clean, Swift interfaces to run LLMs locally on macOS (and hopefully in the future, iOS, too). CameLLM is still in development, and you can star or watch the [main repository](https://github.com/CameLLM/CameLLM) for updates.\n\n<hr/>\n\n## 🔨 Setup\n\nClone the repo:\n\n```bash\ngit clone https://github.com/alexrozanski/llama.swift.git\ncd llama.swift\n```\n\nGrab the LLaMA model weights and place them in `./models`. `ls` should print something like:\n\n```bash\nls ./models\n65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model\n```\n\nTo convert the LLaMA-7B model and quantize:\n\n```bash\n# install Python dependencies\npython3 -m pip install torch numpy sentencepiece\n\n# the command-line tools are in `./tools` instead of the repo root like in llama.cpp\ncd tools\n\n# convert the 7B model to ggml FP16 format\npython3 convert-pth-to-ggml.py ../models/7B/ 1\n\n# quantize the model to 4-bits\nmake\n./quantize.sh 7B\n```\n\nWhen running the larger models, make sure you have enough disk space to store all of the intermediate files.\n\n## ⬇️ Installation\n\n### Swift Package Manager\n\nAdd `llama.swift` to your project using Xcode (File > Add Packages...) 
or by adding it to your project's `Package.swift` file:\n\n```swift\ndependencies: [\n  .package(url: \"https://github.com/alexrozanski/llama.swift.git\", .upToNextMajor(from: \"1.0.0\"))\n]\n```\n\n## 👩‍💻 Usage\n\n### Swift library\n\nTo generate output from a prompt, first instantiate a `LlamaRunner` instance with the URL to your LLaMA model file:\n\n```swift\nimport llama\n\nlet url = ... // URL to the ggml-model-q4_0.bin model file\nlet runner = LlamaRunner(modelURL: url)\n```\n\nGenerating output is as simple as calling `run()` with your prompt on the `LlamaRunner` instance. Since tokens are generated asynchronously this returns an `AsyncThrowingStream` which you can enumerate over to process tokens as they are returned:\n\n```swift\ndo {\n  for try await token in runner.run(with: \"Building a website can be done in 10 simple steps:\") {\n    print(token, terminator: \"\")\n  }\n} catch let error {\n  // Handle error\n}\n```\n\nNote that tokens don't necessarily correspond to a single word, and also include any whitespace and newlines.\n\n#### Configuration\n\n`LlamaRunner.run()` takes an optional `LlamaRunner.Config` instance which lets you control the number of threads inference is run on (default: `8`), the maximum number of tokens returned (default: `512`) and an optional reverse/negative prompt:\n\n```swift\nlet prompt = \"...\"\nlet config = LlamaRunner.Config(numThreads: 8, numTokens: 20, reversePrompt: \"...\")\nlet tokenStream = runner.run(with: prompt, config: config)\n\ndo {\n  for try await token in tokenStream {\n    ...\n  }\n} catch let error {\n  ...\n}\n```\n\n#### State Changes\n\n`LlamaRunner.run()` also takes an optional `stateChangeHandler` closure, which is invoked whenever the run state changes:\n\n```\nlet prompt = \"...\"\nlet tokenStream = runner.run(\n  with: prompt,\n  config: .init(numThreads: 8, numTokens: 20),\n  stateChangeHandler: { state in\n    switch state {\n      case .notStarted:\n        // Initial state\n        break\n 
     case .initializing:\n        // Loading the model and initializing\n        break\n      case .generatingOutput:\n        // Generating tokens\n        break\n      case .completed:\n        // Completed successfully\n        break\n      case .failed:\n        // Failed. This is also the error thrown by the `AsyncThrowingStream` returned from `LlamaRunner.run()`\n        break\n    }\n  })\n```\n\n#### Closure-based API\n\nIf you don't want to use Swift concurrency there is an alternative version of `run()` which returns tokens via a `tokenHandler` closure instead:\n\n```swift\nlet prompt = \"...\"\nrunner.run(\n  with: prompt,\n  config: ...,\n  tokenHandler: { token in\n    ...\n  },\n  stateChangeHandler: ...\n)\n```\n\n#### Other notes\n\n- Build for Release if you want token generation to be snappy, since `llama` will generate tokens slowly in Debug builds.\n- Because of the way the Swift package is structured (and some gaps in my knowledge around exported symbols from modules), including `llama.swift` also leaks the name of the internal module containing the Objective-C/C++ implementation, `llamaObjCxx`, as well as some internal classes prefixed with `_Llama`. Pull requests welcome if you have any ideas on fixing this!\n\n\n### `llamaTest` app\n\nThe repo contains a barebones command-line tool, `llamaTest`, which uses the `llama` Framework to run a simple input loop to run inference on a given input prompt.\n\n- Make sure to set `MODEL_PATH` in `LlamaTest.xcconfig` to point to your `path/to/ggml-model-q4_0.bin` (without quotes or spaces after `MODEL_PATH=`), for example:\n\n```\nMODEL_PATH=/path/to/ggml-model-q4_0.bin\n```\n\n## 📃 Misc\n\n- License: MIT\n- Other matters: See the [llama.cpp repo](https://github.com/ggerganov/llama.cpp/).\n"
  },
  {
    "path": "Sources/cpp/ggml.c",
    "content": "#include \"ggml.h\"\n\n#if defined(_MSC_VER) || defined(__MINGW32__)\n#include <malloc.h> // using malloc.h with MSC/MINGW\n#elif !defined(__FreeBSD__) && !defined(__NetBSD__)\n#include <alloca.h>\n#endif\n\n#include <assert.h>\n#include <time.h>\n#include <math.h>\n#include <stdlib.h>\n#include <string.h>\n#include <stdint.h>\n#include <stdio.h>\n#include <float.h>\n\n// if C99 - static_assert is noop\n// ref: https://stackoverflow.com/a/53923785/4039976\n#ifndef static_assert\n#define static_assert(cond, msg) struct global_scope_noop_trick\n#endif\n\n#if defined _MSC_VER || defined(__MINGW32__)\n\n#if !defined(__MINGW32__)\n#include <Windows.h>\n#else\n// ref: https://github.com/ggerganov/whisper.cpp/issues/168\n#include <windows.h>\n#include <errno.h>\n#endif\n\ntypedef volatile LONG atomic_int;\ntypedef atomic_int atomic_bool;\n\nstatic void atomic_store(atomic_int* ptr, LONG val) {\n    InterlockedExchange(ptr, val);\n}\nstatic LONG atomic_load(atomic_int* ptr) {\n    return InterlockedCompareExchange(ptr, 0, 0);\n}\nstatic LONG atomic_fetch_add(atomic_int* ptr, LONG inc) {\n    return InterlockedExchangeAdd(ptr, inc);\n}\nstatic LONG atomic_fetch_sub(atomic_int* ptr, LONG dec) {\n    return atomic_fetch_add(ptr, -(dec));\n}\n\ntypedef HANDLE pthread_t;\n\ntypedef DWORD thread_ret_t;\nstatic int pthread_create(pthread_t* out, void* unused, thread_ret_t(*func)(void*), void* arg) {\n    HANDLE handle = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) func, arg, 0, NULL);\n    if (handle == NULL)\n    {\n        return EAGAIN;\n    }\n\n    *out = handle;\n    return 0;\n}\n\nstatic int pthread_join(pthread_t thread, void* unused) {\n    return (int) WaitForSingleObject(thread, INFINITE);\n}\n\nstatic int sched_yield (void) {\n    Sleep (0);\n    return 0;\n}\n#else\n#include <pthread.h>\n#include <stdatomic.h>\n\ntypedef void* thread_ret_t;\n#endif\n\n#ifdef __HAIKU__\n#define static_assert(cond, msg) _Static_assert(cond, msg)\n#endif\n\n/*#define 
GGML_PERF*/\n#define GGML_DEBUG 0\n#define GGML_GELU_FP16\n#define GGML_SILU_FP16\n\n#define GGML_SOFT_MAX_UNROLL 4\n#define GGML_VEC_DOT_UNROLL  2\n\n#ifdef GGML_USE_ACCELERATE\n// uncomment to use vDSP for soft max computation\n// note: not sure if it is actually faster\n//#define GGML_SOFT_MAX_ACCELERATE\n#endif\n\n#if UINTPTR_MAX == 0xFFFFFFFF\n    #define GGML_MEM_ALIGN 4\n#else\n    #define GGML_MEM_ALIGN 16\n#endif\n\n#define UNUSED(x) (void)(x)\n#define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)\n\n#define GGML_ASSERT(x) \\\n    do { \\\n        if (!(x)) { \\\n            fprintf(stderr, \"GGML_ASSERT: %s:%d: %s\\n\", __FILE__, __LINE__, #x); \\\n            abort(); \\\n        } \\\n    } while (0)\n\n#ifdef GGML_USE_ACCELERATE\n#include <Accelerate/Accelerate.h>\n#elif GGML_USE_OPENBLAS\n#include <cblas.h>\n#endif\n\n#undef MIN\n#undef MAX\n#define MIN(a, b) ((a) < (b) ? (a) : (b))\n#define MAX(a, b) ((a) > (b) ? (a) : (b))\n\n// floating point type used to accumulate sums\ntypedef double ggml_float;\n\n// 16-bit float\n// on Arm, we use __fp16\n// on x86, we use uint16_t\n#ifdef __ARM_NEON\n\n// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:\n//\n//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/\n//\n#include <arm_neon.h>\n\n#define GGML_COMPUTE_FP16_TO_FP32(x) (x)\n#define GGML_COMPUTE_FP32_TO_FP16(x) (x)\n\n#define GGML_FP16_TO_FP32(x) (x)\n#define GGML_FP32_TO_FP16(x) (x)\n\n#else\n\n#ifdef __wasm_simd128__\n#include <wasm_simd128.h>\n#else\n#ifdef __POWER9_VECTOR__\n#include <altivec.h>\n#undef bool\n#define bool _Bool\n#else\n#include <immintrin.h>\n#endif\n#endif\n\n#ifdef __F16C__\n\n#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)\n#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)\n\n#else\n\n// FP16 <-> FP32\n// ref: https://github.com/Maratyszcza/FP16\n\nstatic inline float fp32_from_bits(uint32_t w) {\n    union {\n        uint32_t as_bits;\n      
  float as_value;\n    } fp32;\n    fp32.as_bits = w;\n    return fp32.as_value;\n}\n\nstatic inline uint32_t fp32_to_bits(float f) {\n\tunion {\n\t\tfloat as_value;\n\t\tuint32_t as_bits;\n\t} fp32;\n\tfp32.as_value = f;\n\treturn fp32.as_bits;\n}\n\nstatic inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {\n    const uint32_t w = (uint32_t) h << 16;\n    const uint32_t sign = w & UINT32_C(0x80000000);\n    const uint32_t two_w = w + w;\n\n    const uint32_t exp_offset = UINT32_C(0xE0) << 23;\n#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)\n    const float exp_scale = 0x1.0p-112f;\n#else\n    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));\n#endif\n    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;\n\n    const uint32_t magic_mask = UINT32_C(126) << 23;\n    const float magic_bias = 0.5f;\n    const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;\n\n    const uint32_t denormalized_cutoff = UINT32_C(1) << 27;\n    const uint32_t result = sign |\n        (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));\n    return fp32_from_bits(result);\n}\n\nstatic inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {\n#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)\n    const float scale_to_inf = 0x1.0p+112f;\n    const float scale_to_zero = 0x1.0p-110f;\n#else\n    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));\n    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));\n#endif\n    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;\n\n    const uint32_t w = fp32_to_bits(f);\n    const uint32_t shl1_w = w + w;\n    const uint32_t sign = w & UINT32_C(0x80000000);\n    uint32_t bias = shl1_w & UINT32_C(0xFF000000);\n    if (bias < UINT32_C(0x71000000)) {\n        bias = UINT32_C(0x71000000);\n    }\n\n    base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;\n    const uint32_t bits = fp32_to_bits(base);\n    const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);\n    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);\n    const uint32_t nonsign = exp_bits + mantissa_bits;\n    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign);\n}\n\n#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)\n#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)\n\n#endif // __F16C__\n\n#endif // __ARM_NEON\n\n//\n// global data\n//\n\n// precomputed gelu table for f16 (128 KB)\nstatic ggml_fp16_t table_gelu_f16[1 << 16];\n\n// precomputed silu table for f16 (128 KB)\nstatic ggml_fp16_t table_silu_f16[1 << 16];\n\n// precomputed exp table for f16 (128 KB)\nstatic ggml_fp16_t table_exp_f16[1 << 16];\n\n// precomputed f32 table for f16 (256 KB)\nstatic float table_f32_f16[1 << 16];\n\n// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,\n// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.\n#if !defined(GGML_FP16_TO_FP32) || !defined(GGML_FP32_TO_FP16)\n\ninline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {\n    uint16_t s;\n    memcpy(&s, &f, sizeof(uint16_t));\n    return table_f32_f16[s];\n}\n\n#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)\n#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)\n\n#endif\n\n// note: do not use these inside ggml.c\n// these are meant to be used via the ggml.h API\nfloat ggml_fp16_to_fp32(ggml_fp16_t x) {\n    return GGML_FP16_TO_FP32(x);\n}\n\nggml_fp16_t ggml_fp32_to_fp16(float x) {\n    return GGML_FP32_TO_FP16(x);\n}\n\n//\n// timing\n//\n\n#if defined(_MSC_VER) || defined(__MINGW32__)\nstatic int64_t timer_freq;\nvoid ggml_time_init(void) {\n    LARGE_INTEGER frequency;\n    QueryPerformanceFrequency(&frequency);\n    timer_freq = frequency.QuadPart;\n}\nint64_t ggml_time_ms(void) {\n    LARGE_INTEGER t;\n    QueryPerformanceCounter(&t);\n    return (t.QuadPart * 1000) / timer_freq;\n}\nint64_t ggml_time_us(void) {\n    LARGE_INTEGER t;\n    QueryPerformanceCounter(&t);\n    return (t.QuadPart * 1000000) / timer_freq;\n}\n#else\nvoid ggml_time_init(void) {}\nint64_t ggml_time_ms(void) {\n    struct timespec ts;\n    
clock_gettime(CLOCK_MONOTONIC, &ts);\n    return (int64_t)ts.tv_sec*1000 + (int64_t)ts.tv_nsec/1000000;\n}\n\nint64_t ggml_time_us(void) {\n    struct timespec ts;\n    clock_gettime(CLOCK_MONOTONIC, &ts);\n    return (int64_t)ts.tv_sec*1000000 + (int64_t)ts.tv_nsec/1000;\n}\n#endif\n\nint64_t ggml_cycles(void) {\n    return clock();\n}\n\nint64_t ggml_cycles_per_ms(void) {\n    return CLOCKS_PER_SEC/1000;\n}\n\n#ifdef GGML_PERF\n#define ggml_perf_time_ms()       ggml_time_ms()\n#define ggml_perf_time_us()       ggml_time_us()\n#define ggml_perf_cycles()        ggml_cycles()\n#define ggml_perf_cycles_per_ms() ggml_cycles_per_ms()\n#else\n#define ggml_perf_time_ms()       0\n#define ggml_perf_time_us()       0\n#define ggml_perf_cycles()        0\n#define ggml_perf_cycles_per_ms() 0\n#endif\n\n//\n// cache line\n//\n\n#if defined(__cpp_lib_hardware_interference_size)\n#define CACHE_LINE_SIZE hardware_destructive_interference_size\n#else\n#if defined(__POWER9_VECTOR__)\n#define CACHE_LINE_SIZE 128\n#else\n#define CACHE_LINE_SIZE 64\n#endif\n#endif\n\nstatic const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);\n\n//\n// quantization\n//\n\n#define QK 32\n\n// AVX routines provided by GH user Const-me\n// ref: https://github.com/ggerganov/ggml/pull/27#issuecomment-1464934600\n#if __AVX2__\n// Unpack 32 4-bit fields into 32 bytes\n// The output vector contains 32 bytes, each one in [ 0 .. 
15 ] interval\ninline __m256i bytesFromNibbles( const uint8_t* rsi )\n{\n    // Load 16 bytes from memory\n    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );\n\n    // Expand bytes into uint16_t values\n    __m256i bytes = _mm256_cvtepu8_epi16( tmp );\n\n    // Unpack values into individual bytes\n    const __m256i lowMask = _mm256_set1_epi8( 0xF );\n    __m256i high = _mm256_andnot_si256( lowMask, bytes );\n    __m256i low = _mm256_and_si256( lowMask, bytes );\n    high = _mm256_slli_epi16( high, 4 );\n    bytes = _mm256_or_si256( low, high );\n    return bytes;\n}\n\ninline __m128i packNibbles( __m256i bytes )\n{\n    // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh\n    const __m256i lowByte = _mm256_set1_epi16( 0xFF );\n    __m256i high = _mm256_andnot_si256( lowByte, bytes );\n    __m256i low = _mm256_and_si256( lowByte, bytes );\n    high = _mm256_srli_epi16( high, 4 );\n    bytes = _mm256_or_si256( low, high );\n\n    // Compress uint16_t lanes into bytes\n    __m128i r0 = _mm256_castsi256_si128( bytes );\n    __m128i r1 = _mm256_extracti128_si256( bytes, 1 );\n    return _mm_packus_epi16( r0, r1 );\n}\n#endif\n\n\n// method 5\n// blocks of QK elements\n// represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors)\nvoid quantize_row_q4_0(const float * restrict x, void * restrict y, int k) {\n    assert(k % QK == 0);\n\n    const int nb = k / QK;\n    const size_t bs = sizeof(float) + QK/2;\n\n    uint8_t * restrict pd = ((uint8_t *)y + 0*bs);\n    uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float));\n\n    uint8_t pp[QK/2];\n\n#if __ARM_NEON\n#if QK == 32\n    for (int i = 0; i < nb; i++) {\n        float amax = 0.0f; // absolute max\n\n        float32x4_t srcv [8];\n        float32x4_t asrcv[8];\n        float32x4_t amaxv[8];\n\n        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);\n        for (int l = 0; l < 8; l++) asrcv[l] = 
vabsq_f32(srcv[l]);\n\n        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);\n        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);\n        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);\n\n        amax = MAX(\n                MAX(vgetq_lane_f32(amaxv[0], 0), vgetq_lane_f32(amaxv[0], 1)),\n                MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3)));\n\n        const float d = amax / ((1 << 3) - 1);\n        const float id = d ? 1.0/d : 0.0;\n\n        *(float *)pd = d;\n        pd += bs;\n\n        for (int l = 0; l < 8; l++) {\n            const float32x4_t v  = vmulq_n_f32(srcv[l], id);\n            const float32x4_t vf = vaddq_f32(v, vdupq_n_f32(8.5f));\n            const int32x4_t   vi = vcvtq_s32_f32(vf);\n\n            pp[2*l + 0] = vgetq_lane_s32(vi, 0) | (vgetq_lane_s32(vi, 1) << 4);\n            pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4);\n        }\n\n        memcpy(pb, pp, sizeof(pp));\n        pb += bs;\n    }\n#else\n#error \"not implemented for QK\"\n#endif\n#elif defined(__AVX2__)\n#if QK == 32\n    for (int i = 0; i < nb; i++) {\n        // Load elements into 4 AVX vectors\n        __m256 v0 = _mm256_loadu_ps( x );\n        __m256 v1 = _mm256_loadu_ps( x + 8 );\n        __m256 v2 = _mm256_loadu_ps( x + 16 );\n        __m256 v3 = _mm256_loadu_ps( x + 24 );\n        x += 32;\n\n        // Compute max(abs(e)) for the block\n        const __m256 signBit = _mm256_set1_ps( -0.0f );\n        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );\n        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );\n        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );\n        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );\n\n        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );\n        max4 = _mm_max_ps( max4, _mm_movehl_ps( 
max4, max4 ) );\n        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );\n        const float maxScalar = _mm_cvtss_f32( max4 );\n\n        // Quantize these floats\n        const float d = maxScalar / 7.0f;\n        *(float *)pd = d;\n        pd += bs;\n        const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f;\n        const __m256 mul = _mm256_set1_ps( id );\n\n        // Apply the multiplier\n        v0 = _mm256_mul_ps( v0, mul );\n        v1 = _mm256_mul_ps( v1, mul );\n        v2 = _mm256_mul_ps( v2, mul );\n        v3 = _mm256_mul_ps( v3, mul );\n\n        // Round to nearest integer\n        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );\n        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );\n        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );\n        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );\n\n        // Convert floats to integers\n        __m256i i0 = _mm256_cvtps_epi32( v0 );\n        __m256i i1 = _mm256_cvtps_epi32( v1 );\n        __m256i i2 = _mm256_cvtps_epi32( v2 );\n        __m256i i3 = _mm256_cvtps_epi32( v3 );\n\n        // Convert int32 to int16\n        i0 = _mm256_packs_epi32( i0, i1 );\t// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15\n        i2 = _mm256_packs_epi32( i2, i3 );\t// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31\n                                            // Convert int16 to int8\n        i0 = _mm256_packs_epi16( i0, i2 );\t// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31\n\n        // We got our precious signed bytes, but the order is now wrong\n        // These AVX2 pack instructions process 16-byte pieces independently\n        // The following instruction is fixing the order\n        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );\n        i0 = _mm256_permutevar8x32_epi32( i0, perm );\n\n        // Apply offset to translate the range from [ -7 .. +7 ] into [ +1 .. 
+15 ]\n        const __m256i off = _mm256_set1_epi8( 8 );\n        i0 = _mm256_add_epi8( i0, off );\n\n        // Compress the vector into 4 bit/value, and store\n        __m128i res = packNibbles( i0 );\n        _mm_storeu_si128( ( __m128i* )pb, res );\n        pb += bs;\n    }\n#else\n#error \"not implemented for QK\"\n#endif\n#elif defined(__wasm_simd128__)\n#if QK == 32\n    for (int i = 0; i < nb; i++) {\n        float amax = 0.0f; // absolute max\n\n        v128_t srcv [8];\n        v128_t asrcv[8];\n        v128_t amaxv[8];\n\n        for (int l = 0; l < 8; l++) srcv[l]  = wasm_v128_load(x + i*32 + 4*l);\n        for (int l = 0; l < 8; l++) asrcv[l] = wasm_f32x4_abs(srcv[l]);\n\n        for (int l = 0; l < 4; l++) amaxv[2*l] = wasm_f32x4_max(asrcv[2*l], asrcv[2*l+1]);\n        for (int l = 0; l < 2; l++) amaxv[4*l] = wasm_f32x4_max(amaxv[4*l], amaxv[4*l+2]);\n        for (int l = 0; l < 1; l++) amaxv[8*l] = wasm_f32x4_max(amaxv[8*l], amaxv[8*l+4]);\n\n        amax = MAX(\n                MAX(wasm_f32x4_extract_lane(amaxv[0], 0), wasm_f32x4_extract_lane(amaxv[0], 1)),\n                MAX(wasm_f32x4_extract_lane(amaxv[0], 2), wasm_f32x4_extract_lane(amaxv[0], 3)));\n\n        const float d = amax / ((1 << 3) - 1);\n        const float id = d ? 
1.0/d : 0.0;\n\n        *(float *)pd = d;\n        pd += bs;\n\n        for (int l = 0; l < 8; l++) {\n            const v128_t v  = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));\n            const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));\n            const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);\n\n            pp[2*l + 0] = wasm_i32x4_extract_lane(vi, 0) | (wasm_i32x4_extract_lane(vi, 1) << 4);\n            pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4);\n        }\n\n        memcpy(pb, pp, sizeof(pp));\n        pb += bs;\n    }\n#else\n#error \"not implemented for QK\"\n#endif\n#else\n    // scalar\n    for (int i = 0; i < nb; i++) {\n        float amax = 0.0f; // absolute max\n\n        for (int l = 0; l < QK; l++) {\n            const float v = x[i*QK + l];\n            amax = MAX(amax, fabsf(v));\n        }\n\n        const float d = amax / ((1 << 3) - 1);\n        const float id = d ? 1.0f/d : 0.0f;\n\n        *(float *)pd = d;\n        pd += bs;\n\n        for (int l = 0; l < QK; l += 2) {\n            const float v0 = x[i*QK + l + 0]*id;\n            const float v1 = x[i*QK + l + 1]*id;\n\n            const uint8_t vi0 = ((int8_t) (round(v0))) + 8;\n            const uint8_t vi1 = ((int8_t) (round(v1))) + 8;\n\n            assert(vi0 >= 0 && vi0 < 16);\n            assert(vi1 >= 0 && vi1 < 16);\n\n            pp[l/2] = vi0 | (vi1 << 4);\n        }\n\n        memcpy(pb, pp, sizeof(pp));\n        pb += bs;\n    }\n#endif\n}\n\n// method 4\n// blocks of QK elements\n// represented with 2 floats (min + delta) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors)\nvoid quantize_row_q4_1(const float * restrict x, void * restrict y, int k) {\n    assert(k % QK == 0);\n\n    const int nb = k / QK;\n\n    float   * restrict pm = (float *)   (y);\n    float   * restrict pd = (float *)   (pm + nb);\n    uint8_t * restrict pb = (uint8_t *) (pd + nb);\n\n    uint8_t pp[QK/2];\n\n    for (int i = 0; i < nb; 
i++) {\n        float min = FLT_MAX;\n        float max = -FLT_MAX;\n\n        for (int l = 0; l < QK; l++) {\n            const float v = x[i*QK + l];\n            if (v < min) min = v;\n            if (v > max) max = v;\n        }\n\n        const float d = (max - min) / ((1 << 4) - 1);\n        const float id = d ? 1.0f/d : 0.0f;\n\n        pm[i] = min;\n        pd[i] = d;\n\n        for (int l = 0; l < QK; l += 2) {\n            const float v0 = (x[i*QK + l + 0] - min)*id;\n            const float v1 = (x[i*QK + l + 1] - min)*id;\n\n            const uint8_t vi0 = round(v0);\n            const uint8_t vi1 = round(v1);\n\n            assert(vi0 >= 0 && vi0 < 16);\n            assert(vi1 >= 0 && vi1 < 16);\n\n            pp[l/2] = vi0 | (vi1 << 4);\n        }\n\n        memcpy(pb + i*QK/2, pp, sizeof(pp));\n    }\n}\n\n// TODO: vectorize\nvoid dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) {\n    assert(k % QK == 0);\n\n    const int nb = k / QK;\n    const size_t bs = sizeof(float) + QK/2;\n\n    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);\n    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));\n\n    // scalar\n    for (int i = 0; i < nb; i++) {\n        const float d = *(const float *) (pd + i*bs);\n\n        const uint8_t * restrict pp = pb + i*bs;\n\n        for (int l = 0; l < QK; l += 2) {\n            const uint8_t vi = pp[l/2];\n\n            const int8_t vi0 = vi & 0xf;\n            const int8_t vi1 = vi >> 4;\n\n            const float v0 = (vi0 - 8)*d;\n            const float v1 = (vi1 - 8)*d;\n\n            //printf(\"d = %f, vi = %d, vi0 = %d, vi1 = %d, v0 = %f, v1 = %f\\n\", d, vi, vi0, vi1, v0, v1);\n\n            y[i*QK + l + 0] = v0;\n            y[i*QK + l + 1] = v1;\n\n            assert(!isnan(y[i*QK + l + 0]));\n            assert(!isnan(y[i*QK + l + 1]));\n        }\n    }\n}\n\nvoid dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) {\n    assert(k % QK 
== 0);\n\n    const int nb = k / QK;\n\n    const float   * restrict pm = (const float *)   (x);\n    const float   * restrict pd = (const float *)   (pm + nb);\n    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);\n\n    for (int i = 0; i < nb; i++) {\n        const float m = pm[i];\n        const float d = pd[i];\n\n        const uint8_t * restrict pp = pb + i*QK/2;\n\n        for (int l = 0; l < QK; l += 2) {\n            const uint8_t vi = pp[l/2];\n\n            const int8_t vi0 = vi & 0xf;\n            const int8_t vi1 = vi >> 4;\n\n            const float v0 = vi0*d + m;\n            const float v1 = vi1*d + m;\n\n            y[i*QK + l + 0] = v0;\n            y[i*QK + l + 1] = v1;\n\n            assert(!isnan(y[i*QK + l + 0]));\n            assert(!isnan(y[i*QK + l + 1]));\n        }\n    }\n}\n\n//\n// simd mappings\n//\n\n// we define a common set of C macros which map to specific intrinsics based on the current architecture\n// we then implement the fundamental computation operations below using only these macros\n// adding support for new architectures requires to define the corresponding SIMD macros\n//\n// GGML_F32_STEP / GGML_F16_STEP\n//   number of elements to process in a single step\n//\n// GGML_F32_EPR / GGML_F16_EPR\n//   number of elements to fit in a single register\n//\n\n#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)\n\n#define GGML_SIMD\n\n// F32 NEON\n\n#define GGML_F32_STEP 16\n#define GGML_F32_EPR  4\n\n#define GGML_F32x4              float32x4_t\n#define GGML_F32x4_ZERO         vdupq_n_f32(0.0f)\n#define GGML_F32x4_SET1(x)      vdupq_n_f32(x)\n#define GGML_F32x4_LOAD         vld1q_f32\n#define GGML_F32x4_STORE        vst1q_f32\n#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)\n#define GGML_F32x4_ADD          vaddq_f32\n#define GGML_F32x4_MUL          vmulq_f32\n#if defined(__ARM_FEATURE_QRDMX)\n    #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)\n#else\n    #define GGML_F32x4_REDUCE_ONE(x) \\\n    (vgetq_lane_f32(x, 
0) +          \\\n     vgetq_lane_f32(x, 1) +          \\\n     vgetq_lane_f32(x, 2) +          \\\n     vgetq_lane_f32(x, 3))\n#endif\n#define GGML_F32x4_REDUCE(res, x)              \\\n{                                              \\\n    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \\\n        x[2*i] = vaddq_f32(x[2*i], x[2*i+1]);  \\\n    }                                          \\\n    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \\\n        x[4*i] = vaddq_f32(x[4*i], x[4*i+2]);  \\\n    }                                          \\\n    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \\\n        x[8*i] = vaddq_f32(x[8*i], x[8*i+4]);  \\\n    }                                          \\\n    res = GGML_F32x4_REDUCE_ONE(x[0]);         \\\n}\n\n#define GGML_F32_VEC        GGML_F32x4\n#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO\n#define GGML_F32_VEC_SET1   GGML_F32x4_SET1\n#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD\n#define GGML_F32_VEC_STORE  GGML_F32x4_STORE\n#define GGML_F32_VEC_FMA    GGML_F32x4_FMA\n#define GGML_F32_VEC_ADD    GGML_F32x4_ADD\n#define GGML_F32_VEC_MUL    GGML_F32x4_MUL\n#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE\n\n// F16 NEON\n\n#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)\n    #define GGML_F16_STEP 32\n    #define GGML_F16_EPR  8\n\n    #define GGML_F16x8              float16x8_t\n    #define GGML_F16x8_ZERO         vdupq_n_f16(0.0f)\n    #define GGML_F16x8_SET1(x)      vdupq_n_f16(x)\n    #define GGML_F16x8_LOAD         vld1q_f16\n    #define GGML_F16x8_STORE        vst1q_f16\n    #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)\n    #define GGML_F16x8_ADD          vaddq_f16\n    #define GGML_F16x8_MUL          vmulq_f16\n    #define GGML_F16x8_REDUCE(res, x)                             \\\n    {                                                             \\\n        for (int i = 0; i < GGML_F16_ARR/2; ++i) {                \\\n            x[2*i] = vaddq_f16(x[2*i], x[2*i+1]);                 \\\n        }                             
                            \\\n        for (int i = 0; i < GGML_F16_ARR/4; ++i) {                \\\n            x[4*i] = vaddq_f16(x[4*i], x[4*i+2]);                 \\\n        }                                                         \\\n        for (int i = 0; i < GGML_F16_ARR/8; ++i) {                \\\n            x[8*i] = vaddq_f16(x[8*i], x[8*i+4]);                 \\\n        }                                                         \\\n        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \\\n        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \\\n        res = vaddvq_f32(vaddq_f32(t0, t1));                      \\\n    }\n\n    #define GGML_F16_VEC                GGML_F16x8\n    #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO\n    #define GGML_F16_VEC_SET1           GGML_F16x8_SET1\n    #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)\n    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE(p, r[i])\n    #define GGML_F16_VEC_FMA            GGML_F16x8_FMA\n    #define GGML_F16_VEC_ADD            GGML_F16x8_ADD\n    #define GGML_F16_VEC_MUL            GGML_F16x8_MUL\n    #define GGML_F16_VEC_REDUCE         GGML_F16x8_REDUCE\n#else\n    // if FP16 vector arithmetic is not supported, we use FP32 instead\n    // and take advantage of the vcvt_ functions to convert to/from FP16\n\n    #define GGML_F16_STEP 16\n    #define GGML_F16_EPR  4\n\n    #define GGML_F32Cx4              float32x4_t\n    #define GGML_F32Cx4_ZERO         vdupq_n_f32(0.0f)\n    #define GGML_F32Cx4_SET1(x)      vdupq_n_f32(x)\n    #define GGML_F32Cx4_LOAD(x)      vcvt_f32_f16(vld1_f16(x))\n    #define GGML_F32Cx4_STORE(x, y)  vst1_f16(x, vcvt_f16_f32(y))\n    #define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)\n    #define GGML_F32Cx4_ADD          vaddq_f32\n    #define GGML_F32Cx4_MUL          vmulq_f32\n    #define GGML_F32Cx4_REDUCE       GGML_F32x4_REDUCE\n\n    #define GGML_F16_VEC                GGML_F32Cx4\n    #define GGML_F16_VEC_ZERO          
 GGML_F32Cx4_ZERO\n    #define GGML_F16_VEC_SET1           GGML_F32Cx4_SET1\n    #define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx4_LOAD(p)\n    #define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE(p, r[i])\n    #define GGML_F16_VEC_FMA            GGML_F32Cx4_FMA\n    #define GGML_F16_VEC_ADD            GGML_F32Cx4_ADD\n    #define GGML_F16_VEC_MUL            GGML_F32Cx4_MUL\n    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE\n#endif\n\n#elif defined(__AVX__)\n\n#define GGML_SIMD\n\n// F32 AVX\n\n#define GGML_F32_STEP 32\n#define GGML_F32_EPR  8\n\n#define GGML_F32x8         __m256\n#define GGML_F32x8_ZERO    _mm256_setzero_ps()\n#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)\n#define GGML_F32x8_LOAD    _mm256_loadu_ps\n#define GGML_F32x8_STORE   _mm256_storeu_ps\n#if defined(__FMA__)\n    #define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)\n#else\n    #define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)\n#endif\n#define GGML_F32x8_ADD     _mm256_add_ps\n#define GGML_F32x8_MUL     _mm256_mul_ps\n#define GGML_F32x8_REDUCE(res, x)                                 \\\n{                                                                 \\\n    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \\\n        x[2*i] = _mm256_add_ps(x[2*i], x[2*i+1]);                 \\\n    }                                                             \\\n    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \\\n        x[4*i] = _mm256_add_ps(x[4*i], x[4*i+2]);                 \\\n    }                                                             \\\n    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \\\n        x[8*i] = _mm256_add_ps(x[8*i], x[8*i+4]);                 \\\n    }                                                             \\\n    const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]),    \\\n                                 _mm256_extractf128_ps(x[0], 1)); \\\n    const __m128 t1 = _mm_hadd_ps(t0, t0);           
             \\\n    res = _mm_cvtss_f32(_mm_hadd_ps(t1, t1));                     \\\n}\n// TODO: is this optimal ?\n\n#define GGML_F32_VEC        GGML_F32x8\n#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO\n#define GGML_F32_VEC_SET1   GGML_F32x8_SET1\n#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD\n#define GGML_F32_VEC_STORE  GGML_F32x8_STORE\n#define GGML_F32_VEC_FMA    GGML_F32x8_FMA\n#define GGML_F32_VEC_ADD    GGML_F32x8_ADD\n#define GGML_F32_VEC_MUL    GGML_F32x8_MUL\n#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE\n\n// F16 AVX\n\n#define GGML_F16_STEP 32\n#define GGML_F16_EPR  8\n\n// F16 arithmetic is not supported by AVX, so we use F32 instead\n// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32\n\n#define GGML_F32Cx8             __m256\n#define GGML_F32Cx8_ZERO        _mm256_setzero_ps()\n#define GGML_F32Cx8_SET1(x)     _mm256_set1_ps(x)\n#define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x)))\n#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))\n#define GGML_F32Cx8_FMA         GGML_F32x8_FMA\n#define GGML_F32Cx8_ADD         _mm256_add_ps\n#define GGML_F32Cx8_MUL         _mm256_mul_ps\n#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE\n\n#define GGML_F16_VEC                GGML_F32Cx8\n#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO\n#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1\n#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)\n#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])\n#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA\n#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD\n#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL\n#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE\n\n#elif defined(__POWER9_VECTOR__)\n\n#define GGML_SIMD\n\n// F32 POWER9\n\n#define GGML_F32_STEP 32\n#define GGML_F32_EPR  4\n\n#define GGML_F32x4              vector float\n#define GGML_F32x4_ZERO         0.0f\n#define GGML_F32x4_SET1         vec_splats\n#define 
GGML_F32x4_LOAD(p)      vec_xl(0, p)\n#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)\n#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)\n#define GGML_F32x4_ADD          vec_add\n#define GGML_F32x4_MUL          vec_mul\n#define GGML_F32x4_REDUCE(res, x)              \\\n{                                              \\\n    for (int i = 0; i < GGML_F32_ARR/2; ++i) { \\\n        x[2*i] = vec_add(x[2*i], x[2*i+1]);    \\\n    }                                          \\\n    for (int i = 0; i < GGML_F32_ARR/4; ++i) { \\\n        x[4*i] = vec_add(x[4*i], x[4*i+2]);    \\\n    }                                          \\\n    for (int i = 0; i < GGML_F32_ARR/8; ++i) { \\\n        x[8*i] = vec_add(x[8*i], x[8*i+4]);    \\\n    }                                          \\\n    res = vec_extract(x[0], 0) +               \\\n          vec_extract(x[0], 1) +               \\\n          vec_extract(x[0], 2) +               \\\n          vec_extract(x[0], 3);                \\\n}\n\n#define GGML_F32_VEC        GGML_F32x4\n#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO\n#define GGML_F32_VEC_SET1   GGML_F32x4_SET1\n#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD\n#define GGML_F32_VEC_STORE  GGML_F32x4_STORE\n#define GGML_F32_VEC_FMA    GGML_F32x4_FMA\n#define GGML_F32_VEC_ADD    GGML_F32x4_ADD\n#define GGML_F32_VEC_MUL    GGML_F32x4_MUL\n#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE\n\n// F16 POWER9\n#define GGML_F16_STEP       GGML_F32_STEP\n#define GGML_F16_EPR        GGML_F32_EPR\n#define GGML_F16_VEC        GGML_F32x4\n#define GGML_F16_VEC_ZERO   GGML_F32x4_ZERO\n#define GGML_F16_VEC_SET1   GGML_F32x4_SET1\n#define GGML_F16_VEC_FMA    GGML_F32x4_FMA\n#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE\n// Use vec_xl, not vec_ld, in case the load address is not aligned.\n#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ?                   
\\\n  vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \\\n  vec_extract_fp32_from_shortl(vec_xl(0, p))\n#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]\n#define GGML_F16_VEC_STORE(p, r, i)                             \\\n  if (i & 0x1)                                                  \\\n    vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)],  \\\n                                   r[i - GGML_ENDIAN_BYTE(0)]), \\\n            0, p - GGML_F16_EPR)\n\n#elif defined(__wasm_simd128__)\n\n#define GGML_SIMD\n\n// F32 WASM\n\n#define GGML_F32_STEP 16\n#define GGML_F32_EPR  4\n\n#define GGML_F32x4              v128_t\n#define GGML_F32x4_ZERO         wasm_f32x4_splat(0.0f)\n#define GGML_F32x4_SET1(x)      wasm_f32x4_splat(x)\n#define GGML_F32x4_LOAD         wasm_v128_load\n#define GGML_F32x4_STORE        wasm_v128_store\n#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)\n#define GGML_F32x4_ADD          wasm_f32x4_add\n#define GGML_F32x4_MUL          wasm_f32x4_mul\n#define GGML_F32x4_REDUCE(res, x)                  \\\n{                                                  \\\n    for (int i = 0; i < GGML_F32_ARR/2; ++i) {     \\\n        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \\\n    }                                              \\\n    for (int i = 0; i < GGML_F32_ARR/4; ++i) {     \\\n        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \\\n    }                                              \\\n    for (int i = 0; i < GGML_F32_ARR/8; ++i) {     \\\n        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \\\n    }                                              \\\n    res = wasm_f32x4_extract_lane(x[0], 0) +       \\\n          wasm_f32x4_extract_lane(x[0], 1) +       \\\n          wasm_f32x4_extract_lane(x[0], 2) +       \\\n          wasm_f32x4_extract_lane(x[0], 3);        \\\n}\n\n#define GGML_F32_VEC        GGML_F32x4\n#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO\n#define GGML_F32_VEC_SET1   GGML_F32x4_SET1\n#define 
GGML_F32_VEC_LOAD   GGML_F32x4_LOAD\n#define GGML_F32_VEC_STORE  GGML_F32x4_STORE\n#define GGML_F32_VEC_FMA    GGML_F32x4_FMA\n#define GGML_F32_VEC_ADD    GGML_F32x4_ADD\n#define GGML_F32_VEC_MUL    GGML_F32x4_MUL\n#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE\n\n// F16 WASM\n\n#define GGML_F16_STEP 16\n#define GGML_F16_EPR  4\n\ninline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {\n    float tmp[4];\n\n    tmp[0] = GGML_FP16_TO_FP32(p[0]);\n    tmp[1] = GGML_FP16_TO_FP32(p[1]);\n    tmp[2] = GGML_FP16_TO_FP32(p[2]);\n    tmp[3] = GGML_FP16_TO_FP32(p[3]);\n\n    return wasm_v128_load(tmp);\n}\n\ninline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {\n    float tmp[4];\n\n    wasm_v128_store(tmp, x);\n\n    p[0] = GGML_FP32_TO_FP16(tmp[0]);\n    p[1] = GGML_FP32_TO_FP16(tmp[1]);\n    p[2] = GGML_FP32_TO_FP16(tmp[2]);\n    p[3] = GGML_FP32_TO_FP16(tmp[3]);\n}\n\n#define GGML_F16x4             v128_t\n#define GGML_F16x4_ZERO        wasm_f32x4_splat(0.0f)\n#define GGML_F16x4_SET1(x)     wasm_f32x4_splat(x)\n#define GGML_F16x4_LOAD(x)     __wasm_f16x4_load(x)\n#define GGML_F16x4_STORE(x, y) __wasm_f16x4_store(x, y)\n#define GGML_F16x4_FMA         GGML_F32x4_FMA\n#define GGML_F16x4_ADD         wasm_f32x4_add\n#define GGML_F16x4_MUL         wasm_f32x4_mul\n#define GGML_F16x4_REDUCE(res, x)                  \\\n{                                                  \\\n    for (int i = 0; i < GGML_F16_ARR/2; ++i) {     \\\n        x[2*i] = wasm_f32x4_add(x[2*i], x[2*i+1]); \\\n    }                                              \\\n    for (int i = 0; i < GGML_F16_ARR/4; ++i) {     \\\n        x[4*i] = wasm_f32x4_add(x[4*i], x[4*i+2]); \\\n    }                                              \\\n    for (int i = 0; i < GGML_F16_ARR/8; ++i) {     \\\n        x[8*i] = wasm_f32x4_add(x[8*i], x[8*i+4]); \\\n    }                                              \\\n    res = wasm_f32x4_extract_lane(x[0], 0) +       \\\n          wasm_f32x4_extract_lane(x[0], 1) +   
    \\\n          wasm_f32x4_extract_lane(x[0], 2) +       \\\n          wasm_f32x4_extract_lane(x[0], 3);        \\\n}\n\n#define GGML_F16_VEC                GGML_F16x4\n#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO\n#define GGML_F16_VEC_SET1           GGML_F16x4_SET1\n#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)\n#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])\n#define GGML_F16_VEC_FMA            GGML_F16x4_FMA\n#define GGML_F16_VEC_ADD            GGML_F16x4_ADD\n#define GGML_F16_VEC_MUL            GGML_F16x4_MUL\n#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE\n\n#elif defined(__SSE3__)\n\n#define GGML_SIMD\n\n// F32 SSE\n\n#define GGML_F32_STEP 32\n#define GGML_F32_EPR  4\n\n#define GGML_F32x4         __m128\n#define GGML_F32x4_ZERO    _mm_setzero_ps()\n#define GGML_F32x4_SET1(x) _mm_set1_ps(x)\n#define GGML_F32x4_LOAD    _mm_loadu_ps\n#define GGML_F32x4_STORE   _mm_storeu_ps\n#if defined(__FMA__)\n    // TODO: Does this work?\n    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)\n#else\n    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)\n#endif\n#define GGML_F32x4_ADD     _mm_add_ps\n#define GGML_F32x4_MUL     _mm_mul_ps\n#define GGML_F32x4_REDUCE(res, x)                                 \\\n{                                                                 \\\n    for (int i = 0; i < GGML_F32_ARR/2; ++i) {                    \\\n        x[2*i] = _mm_add_ps(x[2*i], x[2*i+1]);                    \\\n    }                                                             \\\n    for (int i = 0; i < GGML_F32_ARR/4; ++i) {                    \\\n        x[4*i] = _mm_add_ps(x[4*i], x[4*i+2]);                    \\\n    }                                                             \\\n    for (int i = 0; i < GGML_F32_ARR/8; ++i) {                    \\\n        x[8*i] = _mm_add_ps(x[8*i], x[8*i+4]);                    \\\n    }                                                             \\\n    const __m128 t0 = 
_mm_hadd_ps(x[0], x[0]);                    \\\n    res = _mm_cvtss_f32(_mm_hadd_ps(t0, t0));                     \\\n}\n// TODO: is this optimal ?\n\n#define GGML_F32_VEC        GGML_F32x4\n#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO\n#define GGML_F32_VEC_SET1   GGML_F32x4_SET1\n#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD\n#define GGML_F32_VEC_STORE  GGML_F32x4_STORE\n#define GGML_F32_VEC_FMA    GGML_F32x4_FMA\n#define GGML_F32_VEC_ADD    GGML_F32x4_ADD\n#define GGML_F32_VEC_MUL    GGML_F32x4_MUL\n#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE\n\n// F16 SSE\n\n#define GGML_F16_STEP 32\n#define GGML_F16_EPR  4\n\nstatic inline __m128 __sse_f16x4_load(ggml_fp16_t *x) {\n    float tmp[4];\n\n    tmp[0] = GGML_FP16_TO_FP32(x[0]);\n    tmp[1] = GGML_FP16_TO_FP32(x[1]);\n    tmp[2] = GGML_FP16_TO_FP32(x[2]);\n    tmp[3] = GGML_FP16_TO_FP32(x[3]);\n\n    return _mm_loadu_ps(tmp);\n}\n\nstatic inline void __sse_f16x4_store(ggml_fp16_t *x, __m128 y) {\n    float arr[4];\n\n    _mm_storeu_ps(arr, y);\n\n    x[0] = GGML_FP32_TO_FP16(arr[0]);\n    x[1] = GGML_FP32_TO_FP16(arr[1]);\n    x[2] = GGML_FP32_TO_FP16(arr[2]);\n    x[3] = GGML_FP32_TO_FP16(arr[3]);\n}\n\n#define GGML_F32Cx4             __m128\n#define GGML_F32Cx4_ZERO        _mm_setzero_ps()\n#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)\n#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)\n#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)\n#define GGML_F32Cx4_FMA         GGML_F32x4_FMA\n#define GGML_F32Cx4_ADD         _mm_add_ps\n#define GGML_F32Cx4_MUL         _mm_mul_ps\n#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE\n\n#define GGML_F16_VEC                 GGML_F32Cx4\n#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO\n#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1\n#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)\n#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])\n#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA\n#define GGML_F16_VEC_ADD             
GGML_F32Cx4_ADD\n#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL\n#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE\n\n#endif\n\n// GGML_F32_ARR / GGML_F16_ARR\n//   number of registers to use per step\n#ifdef GGML_SIMD\n#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)\n#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)\n#endif\n\n//\n// fundamental operations\n//\n\ninline static void ggml_vec_set_i8(const int n, int8_t * x, const int8_t v) { for (int i = 0; i < n; ++i) x[i] = v; }\n\ninline static void ggml_vec_set_i16(const int n, int16_t * x, const int16_t v) { for (int i = 0; i < n; ++i) x[i] = v; }\n\ninline static void ggml_vec_set_i32(const int n, int32_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }\n\ninline static void ggml_vec_set_f16(const int n, ggml_fp16_t * x, const int32_t v) { for (int i = 0; i < n; ++i) x[i] = v; }\n\ninline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] + y[i]; }\ninline static void ggml_vec_acc_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i] += x[i];        }\ninline static void ggml_vec_acc1_f32(const int n, float * y, const float   v)                  { for (int i = 0; i < n; ++i) y[i] += v;           }\ninline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i] - y[i]; }\ninline static void ggml_vec_set_f32 (const int n, float * x, const float   v)                  { for (int i = 0; i < n; ++i) x[i]  = v;           }\ninline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = x[i];        }\ninline static void ggml_vec_neg_f32 (const int n, float * y, const float * x)                  { for (int i = 0; i < n; ++i) y[i]  = -x[i];       }\ninline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, 
const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]*y[i];   }\ninline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i]  = x[i]/y[i];   }\n\ninline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) {\n    ggml_float sumf = 0.0;\n\n#ifdef GGML_SIMD\n    const int np = (n & ~(GGML_F32_STEP - 1));\n\n    GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO };\n\n    GGML_F32_VEC ax[GGML_F32_ARR];\n    GGML_F32_VEC ay[GGML_F32_ARR];\n\n    for (int i = 0; i < np; i += GGML_F32_STEP) {\n        for (int j = 0; j < GGML_F32_ARR; j++) {\n            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);\n            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);\n\n            sum[j] = GGML_F32_VEC_FMA(sum[j], ax[j], ay[j]);\n        }\n    }\n\n    // reduce sum0..sum3 to sum0\n    GGML_F32_VEC_REDUCE(sumf, sum);\n\n    // leftovers\n    for (int i = np; i < n; ++i) {\n        sumf += x[i]*y[i];\n    }\n#else\n    // scalar\n    for (int i = 0; i < n; ++i) {\n        sumf += x[i]*y[i];\n    }\n#endif\n\n    *s = sumf;\n}\n\ninline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) {\n    ggml_float sumf = 0.0;\n\n#if defined(GGML_SIMD)\n    const int np = (n & ~(GGML_F16_STEP - 1));\n\n    GGML_F16_VEC sum[GGML_F16_ARR] = { GGML_F16_VEC_ZERO };\n\n    GGML_F16_VEC ax[GGML_F16_ARR];\n    GGML_F16_VEC ay[GGML_F16_ARR];\n\n    for (int i = 0; i < np; i += GGML_F16_STEP) {\n        for (int j = 0; j < GGML_F16_ARR; j++) {\n            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);\n            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);\n\n            sum[j] = GGML_F16_VEC_FMA(sum[j], ax[j], ay[j]);\n        }\n    }\n\n    // reduce sum0..sum3 to sum0\n    GGML_F16_VEC_REDUCE(sumf, sum);\n\n    // leftovers\n    for (int i = np; i < n; ++i) {\n        
sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);\n    }\n#else\n    for (int i = 0; i < n; ++i) {\n        sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]);\n    }\n#endif\n\n    *s = sumf;\n}\n\ninline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) {\n    const int nb = n / QK;\n\n    assert(n % QK == 0);\n    assert(nb % 2 == 0);\n\n    const size_t bs = sizeof(float) + QK/2;\n\n    const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs);\n    const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs);\n\n    const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + sizeof(float));\n    const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + sizeof(float));\n\n    float sumf = 0.0;\n\n#ifdef __ARM_NEON\n#if QK == 32\n    float sum0 = 0.0f;\n    float sum1 = 0.0f;\n\n    for (int i = 0; i < nb; i += 2) {\n        const float d0_0 = *(const float *) (pd0 + i*bs);\n        const float d1_0 = *(const float *) (pd1 + i*bs);\n        const float d0_1 = *(const float *) (pd0 + (i + 1)*bs);\n        const float d1_1 = *(const float *) (pd1 + (i + 1)*bs);\n\n        //printf(\"d0_0: %f, d1_0: %f, d0_1: %f, d1_1: %f\\n\", d0_0, d1_0, d0_1, d1_1);\n\n        const uint8_t * restrict p0 = pb0 + i*bs;\n        const uint8_t * restrict p1 = pb1 + i*bs;\n\n        const uint8x16_t m4b = vdupq_n_u8(0xf);\n        const int8x16_t  s8b = vdupq_n_s8(0x8);\n\n        const uint8x16_t v0_0 = vld1q_u8(p0);\n        const uint8x16_t v1_0 = vld1q_u8(p1);\n        const uint8x16_t v0_1 = vld1q_u8(p0 + bs);\n        const uint8x16_t v1_1 = vld1q_u8(p1 + bs);\n\n        // 4-bit -> 8-bit\n        const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b));\n        const int8x16_t v1_0l = vreinterpretq_s8_u8(vandq_u8(v1_0, m4b));\n\n        const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));\n        const int8x16_t v1_0h = vreinterpretq_s8_u8(vshrq_n_u8(v1_0, 4));\n\n        const 
int8x16_t v0_1l = vreinterpretq_s8_u8(vandq_u8(v0_1, m4b));\n        const int8x16_t v1_1l = vreinterpretq_s8_u8(vandq_u8(v1_1, m4b));\n\n        const int8x16_t v0_1h = vreinterpretq_s8_u8(vshrq_n_u8(v0_1, 4));\n        const int8x16_t v1_1h = vreinterpretq_s8_u8(vshrq_n_u8(v1_1, 4));\n\n        // sub 8\n        const int8x16_t v0_0ls = vsubq_s8(v0_0l, s8b);\n        const int8x16_t v1_0ls = vsubq_s8(v1_0l, s8b);\n\n        const int8x16_t v0_0hs = vsubq_s8(v0_0h, s8b);\n        const int8x16_t v1_0hs = vsubq_s8(v1_0h, s8b);\n\n        const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);\n        const int8x16_t v1_1ls = vsubq_s8(v1_1l, s8b);\n\n        const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);\n        const int8x16_t v1_1hs = vsubq_s8(v1_1h, s8b);\n\n#if defined(__ARM_FEATURE_DOTPROD)\n        // dot product into int16x8_t\n        int32x4_t p_0 = vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls);\n        int32x4_t p_1 = vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls);\n\n        p_0 = vdotq_s32(p_0, v0_0hs, v1_0hs);\n        p_1 = vdotq_s32(p_1, v0_1hs, v1_1hs);\n\n        // scalar\n#if defined(__ARM_FEATURE_QRDMX)\n        sum0 += d0_0*d1_0*vaddvq_s32(p_0);\n        sum1 += d0_1*d1_1*vaddvq_s32(p_1);\n#else\n        sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3));\n        sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3));\n#endif\n#else\n\t    const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));\n        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));\n\n        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));\n        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));\n\n        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));\n        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), 
vget_high_s8(v1_1ls));\n\n        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));\n        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));\n\n        const int16x8_t pl_0 = vaddq_s16(pl0l, pl0h);\n        const int16x8_t ph_0 = vaddq_s16(ph0l, ph0h);\n\n        const int16x8_t pl_1 = vaddq_s16(pl1l, pl1h);\n        const int16x8_t ph_1 = vaddq_s16(ph1l, ph1h);\n\n        const int16x8_t p_0 = vaddq_s16(pl_0, ph_0);\n        const int16x8_t p_1 = vaddq_s16(pl_1, ph_1);\n\n        // scalar\n#if defined(__ARM_FEATURE_QRDMX)\n        sum0 += d0_0*d1_0*vaddvq_s16(p_0);\n        sum1 += d0_1*d1_1*vaddvq_s16(p_1);\n#else\n        sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7));\n        sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7));\n#endif\n#endif\n    }\n\n    sumf = sum0 + sum1;\n#else\n#error \"not implemented for QK\"\n#endif\n#elif defined(__AVX2__)\n#if QK == 32\n    const size_t countBlocks = nb;\n\n    // Initialize accumulator with zeros\n    __m256 acc = _mm256_setzero_ps();\n\n    // Main loop\n    for (int i = 0; i < nb; ++i) {\n        const float * d0_0 = (const float *) (pd0 + i*bs);\n        const float * d1_0 = (const float *) (pd1 + i*bs);\n\n        const uint8_t * restrict p0 = pb0 + i*bs;\n        const uint8_t * restrict p1 = pb1 + i*bs;\n\n        // Compute combined scale for the block\n        const __m256 scale = _mm256_mul_ps( _mm256_broadcast_ss( d0_0 ), _mm256_broadcast_ss( d1_0 ) );\n\n        // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes\n        __m256i bx = bytesFromNibbles( p0 );\n        __m256i by = bytesFromNibbles( 
p1 );\n\n        // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.\n        const __m256i off = _mm256_set1_epi8( 8 );\n        bx = _mm256_sub_epi8( bx, off );\n        by = _mm256_sub_epi8( by, off );\n\n        // Sign-extend first 16 signed bytes into int16_t\n        __m256i x16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( bx ) );\n        __m256i y16 = _mm256_cvtepi8_epi16( _mm256_castsi256_si128( by ) );\n        // Compute products of int16_t integers, add pairwise\n        __m256i i32 = _mm256_madd_epi16( x16, y16 );\n\n        // Sign-extend last 16 signed bytes into int16_t vectors\n        x16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( bx, 1 ) );\n        y16 = _mm256_cvtepi8_epi16( _mm256_extracti128_si256( by, 1 ) );\n        // Accumulate products of int16_t integers\n        i32 = _mm256_add_epi32( i32, _mm256_madd_epi16( x16, y16 ) );\n\n        // Convert int32_t to float\n        __m256 p = _mm256_cvtepi32_ps( i32 );\n        // Apply the scale, and accumulate\n        acc = _mm256_fmadd_ps( scale, p, acc );\n    }\n\n    // Return horizontal sum of the acc vector\n    __m128 res = _mm256_extractf128_ps( acc, 1 );\n    res = _mm_add_ps( res, _mm256_castps256_ps128( acc ) );\n    res = _mm_add_ps( res, _mm_movehl_ps( res, res ) );\n    res = _mm_add_ss( res, _mm_movehdup_ps( res ) );\n\n    sumf = _mm_cvtss_f32( res );\n#else\n#error \"not implemented for QK\"\n#endif\n#elif defined(__wasm_simd128__)\n#if QK == 32\n    // wasm simd\n    float sum0 = 0.0f;\n    float sum1 = 0.0f;\n\n    for (int i = 0; i < nb; i += 2) {\n        const float d0_0 = *(const float *) (pd0 + i*bs);\n        const float d1_0 = *(const float *) (pd1 + i*bs);\n        const float d0_1 = *(const float *) (pd0 + (i + 1)*bs);\n        const float d1_1 = *(const float *) (pd1 + (i + 1)*bs);\n\n        const uint8_t * restrict p0 = pb0 + i*bs;\n        const uint8_t * restrict p1 = pb1 + i*bs;\n\n        const v128_t 
m4b = wasm_u8x16_splat(0xf);\n        const v128_t s8b = wasm_i8x16_splat(0x8);\n\n        const v128_t v0_0 = wasm_v128_load(p0);\n        const v128_t v0_1 = wasm_v128_load(p0 + bs);\n        const v128_t v1_0 = wasm_v128_load(p1);\n        const v128_t v1_1 = wasm_v128_load(p1 + bs);\n\n        // 4-bit -> 8-bit\n        const v128_t v0_0l = wasm_v128_and(v0_0, m4b);\n        const v128_t v1_0l = wasm_v128_and(v1_0, m4b);\n\n        const v128_t v0_0h = wasm_u8x16_shr(v0_0, 4);\n        const v128_t v1_0h = wasm_u8x16_shr(v1_0, 4);\n\n        const v128_t v0_1l = wasm_v128_and(v0_1, m4b);\n        const v128_t v1_1l = wasm_v128_and(v1_1, m4b);\n\n        const v128_t v0_1h = wasm_u8x16_shr(v0_1, 4);\n        const v128_t v1_1h = wasm_u8x16_shr(v1_1, 4);\n\n        // sub 8\n        const v128_t v0_0ls = wasm_i8x16_sub(v0_0l, s8b);\n        const v128_t v1_0ls = wasm_i8x16_sub(v1_0l, s8b);\n\n        const v128_t v0_0hs = wasm_i8x16_sub(v0_0h, s8b);\n        const v128_t v1_0hs = wasm_i8x16_sub(v1_0h, s8b);\n\n        const v128_t v0_1ls = wasm_i8x16_sub(v0_1l, s8b);\n        const v128_t v1_1ls = wasm_i8x16_sub(v1_1l, s8b);\n\n        const v128_t v0_1hs = wasm_i8x16_sub(v0_1h, s8b);\n        const v128_t v1_1hs = wasm_i8x16_sub(v1_1h, s8b);\n\n        // dot product into int16x8_t\n        const v128_t pl0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0ls), wasm_i16x8_extend_low_i8x16(v1_0ls));\n        const v128_t pl0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0ls), wasm_i16x8_extend_high_i8x16(v1_0ls));\n\n        const v128_t ph0l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_0hs), wasm_i16x8_extend_low_i8x16(v1_0hs));\n        const v128_t ph0h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_0hs), wasm_i16x8_extend_high_i8x16(v1_0hs));\n\n        const v128_t pl1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1ls), wasm_i16x8_extend_low_i8x16(v1_1ls));\n        const v128_t pl1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1ls), 
wasm_i16x8_extend_high_i8x16(v1_1ls));\n\n        const v128_t ph1l = wasm_i16x8_mul(wasm_i16x8_extend_low_i8x16(v0_1hs), wasm_i16x8_extend_low_i8x16(v1_1hs));\n        const v128_t ph1h = wasm_i16x8_mul(wasm_i16x8_extend_high_i8x16(v0_1hs), wasm_i16x8_extend_high_i8x16(v1_1hs));\n\n        const v128_t pl_0 = wasm_i16x8_add(pl0l, pl0h);\n        const v128_t ph_0 = wasm_i16x8_add(ph0l, ph0h);\n\n        const v128_t pl_1 = wasm_i16x8_add(pl1l, pl1h);\n        const v128_t ph_1 = wasm_i16x8_add(ph1l, ph1h);\n\n        const v128_t p_0 = wasm_i16x8_add(pl_0, ph_0);\n        const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1);\n\n        sum0 += d0_0*d1_0*(\n                wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) +\n                wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) +\n                wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) +\n                wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7));\n        sum1 += d0_1*d1_1*(\n                wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) +\n                wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) +\n                wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) +\n                wasm_i16x8_extract_lane(p_1, 6) + wasm_i16x8_extract_lane(p_1, 7));\n    }\n\n    sumf = sum0 + sum1;\n#else\n#error \"not implemented for QK\"\n#endif\n#else\n    // scalar\n    for (int i = 0; i < nb; i++) {\n        const float d0 = *(const float *) (pd0 + i*bs);\n        const float d1 = *(const float *) (pd1 + i*bs);\n\n        const uint8_t * restrict p0 = pb0 + i*bs;\n        const uint8_t * restrict p1 = pb1 + i*bs;\n\n        for (int j = 0; j < QK/2; j++) {\n            const uint8_t v0 = p0[j];\n            const uint8_t v1 = p1[j];\n\n            const float f0 = d0*((int8_t) (v0 & 0xf) - 8);\n            const float f1 = d0*((int8_t) (v0 >> 4)  - 8);\n\n            const float 
f2 = d1*((int8_t) (v1 & 0xf) - 8);\n            const float f3 = d1*((int8_t) (v1 >> 4)  - 8);\n\n            sumf += f0*f2 + f1*f3;\n        }\n    }\n#endif\n\n    *s = sumf;\n}\n\ninline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) {\n    const int nb = n / QK;\n\n    const float * restrict pm0 = (const float *) x;\n    const float * restrict pm1 = (const float *) y;\n\n    const float * restrict pd0 = (const float *) (pm0 + nb);\n    const float * restrict pd1 = (const float *) (pm1 + nb);\n\n    const uint8_t * restrict pb0 = (const uint8_t *) (pd0 + nb);\n    const uint8_t * restrict pb1 = (const uint8_t *) (pd1 + nb);\n\n    float sumf = 0.0;\n\n#if 1\n    // scalar\n    for (int i = 0; i < nb; i++) {\n        const float m0 = pm0[i];\n        const float m1 = pm1[i];\n\n        const float d0 = pd0[i];\n        const float d1 = pd1[i];\n\n        const uint8_t * restrict p0 = pb0 + i*QK/2;\n        const uint8_t * restrict p1 = pb1 + i*QK/2;\n\n        for (int j = 0; j < QK/2; j++) {\n            const uint8_t v0 = p0[j];\n            const uint8_t v1 = p1[j];\n\n            const float f0 = d0*(v0 & 0xf) + m0;\n            const float f1 = d0*(v0 >> 4)  + m0;\n\n            const float f2 = d1*(v1 & 0xf) + m1;\n            const float f3 = d1*(v1 >> 4)  + m1;\n\n            sumf += f0*f2 + f1*f3;\n        }\n    }\n#endif\n\n    *s = sumf;\n}\n\n// compute GGML_VEC_DOT_UNROLL dot products at once\n// xs - x row stride in bytes\ninline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {\n    ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };\n\n    ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];\n\n    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {\n        x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);\n    }\n\n#if defined(GGML_SIMD)\n    const int np = (n & ~(GGML_F16_STEP - 1));\n\n    GGML_F16_VEC 
sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };\n\n    GGML_F16_VEC ax[GGML_F16_ARR];\n    GGML_F16_VEC ay[GGML_F16_ARR];\n\n    for (int i = 0; i < np; i += GGML_F16_STEP) {\n        for (int j = 0; j < GGML_F16_ARR; j++) {\n            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);\n\n            for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {\n                ax[j] = GGML_F16_VEC_LOAD(x[k] + i + j*GGML_F16_EPR, j);\n\n                sum[k][j] = GGML_F16_VEC_FMA(sum[k][j], ax[j], ay[j]);\n            }\n        }\n    }\n\n    // reduce sum0..sum3 to sum0\n    for (int k = 0; k < GGML_VEC_DOT_UNROLL; ++k) {\n        GGML_F16_VEC_REDUCE(sumf[k], sum[k]);\n    }\n\n    // leftovers\n    for (int i = np; i < n; ++i) {\n        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {\n            sumf[j] += GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]);\n        }\n    }\n#else\n    for (int i = 0; i < n; ++i) {\n        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {\n            sumf[j] += GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]);\n        }\n    }\n#endif\n\n    for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {\n        s[i] = sumf[i];\n    }\n}\n\ninline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) {\n#if defined(GGML_SIMD)\n    const int np = (n & ~(GGML_F32_STEP - 1));\n\n    GGML_F32_VEC vx = GGML_F32_VEC_SET1(v);\n\n    GGML_F32_VEC ax[GGML_F32_ARR];\n    GGML_F32_VEC ay[GGML_F32_ARR];\n\n    for (int i = 0; i < np; i += GGML_F32_STEP) {\n        for (int j = 0; j < GGML_F32_ARR; j++) {\n            ax[j] = GGML_F32_VEC_LOAD(x + i + j*GGML_F32_EPR);\n            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);\n            ay[j] = GGML_F32_VEC_FMA(ay[j], ax[j], vx);\n\n            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);\n        }\n    }\n\n    // leftovers\n    for (int i = np; i < n; ++i) {\n        y[i] += x[i]*v;\n    }\n#else\n    // scalar\n    for 
(int i = 0; i < n; ++i) {\n        y[i] += x[i]*v;\n    }\n#endif\n}\n\ninline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * restrict y, ggml_fp16_t * restrict x, const float v) {\n#if defined(GGML_SIMD)\n    const int np = (n & ~(GGML_F16_STEP - 1));\n\n    GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);\n\n    GGML_F16_VEC ax[GGML_F16_ARR];\n    GGML_F16_VEC ay[GGML_F16_ARR];\n\n    for (int i = 0; i < np; i += GGML_F16_STEP) {\n        for (int j = 0; j < GGML_F16_ARR; j++) {\n            ax[j] = GGML_F16_VEC_LOAD(x + i + j*GGML_F16_EPR, j);\n            ay[j] = GGML_F16_VEC_LOAD(y + i + j*GGML_F16_EPR, j);\n            ay[j] = GGML_F16_VEC_FMA(ay[j], ax[j], vx);\n\n            GGML_F16_VEC_STORE(y + i + j*GGML_F16_EPR, ay, j);\n        }\n    }\n\n    // leftovers\n    for (int i = np; i < n; ++i) {\n        GGML_ASSERT(false);\n        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);\n    }\n#else\n    for (int i = 0; i < n; ++i) {\n        y[i] = GGML_FP32_TO_FP16(GGML_FP16_TO_FP32(y[i]) + GGML_FP16_TO_FP32(x[i])*v);\n    }\n#endif\n}\n\ninline static void ggml_vec_mad_q4_0(const int n, float * restrict y, void * restrict x, const float v) {\n    assert(n % QK == 0);\n\n    const int nb = n / QK;\n    const size_t bs = sizeof(float) + QK/2;\n\n    const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs);\n    const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float));\n\n#if __ARM_NEON\n#if QK == 32\n    for (int i = 0; i < nb; ++i) {\n        const float d0 = v*(*(const float *) (pd + i*bs));\n\n        const uint8_t * restrict pp = pb + i*bs;\n\n        const uint8x8_t m4b = vdup_n_u8(0xf);\n        const int8x8_t  s8b = vdup_n_s8(0x8);\n\n        const float32x4_t vd = vdupq_n_f32(d0);\n\n        for (int j = 0; j < 2; j++) {\n            const uint8x8_t vx = vld1_u8(pp + j*8);\n\n            const int8x8_t vxl = vreinterpret_s8_u8(vand_u8(vx, m4b));\n            const int8x8_t vxh = 
vreinterpret_s8_u8(vshr_n_u8(vx, 4));\n\n            // sub 8\n            const int8x8_t vxls = vsub_s8(vxl, s8b);\n            const int8x8_t vxhs = vsub_s8(vxh, s8b);\n\n            //const int8x8_t vxlt = vzip_s8(vxls, vxhs)[0];\n            //const int8x8_t vxht = vzip_s8(vxls, vxhs)[1];\n            const int8x8_t vxlt = vzip1_s8(vxls, vxhs);\n            const int8x8_t vxht = vzip2_s8(vxls, vxhs);\n\n            const int8x16_t vxq = vcombine_s8(vxlt, vxht);\n\n            // convert to 2x int16x8_t\n            const int16x8_t vxq0 = vmovl_s8(vget_low_s8 (vxq));\n            const int16x8_t vxq1 = vmovl_s8(vget_high_s8(vxq));\n\n            // convert to 4x float32x4_t\n            const float32x4_t vx0 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq0)));\n            const float32x4_t vx1 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq0)));\n            const float32x4_t vx2 = vcvtq_f32_s32(vmovl_s16(vget_low_s16 (vxq1)));\n            const float32x4_t vx3 = vcvtq_f32_s32(vmovl_s16(vget_high_s16(vxq1)));\n\n            const float32x4_t vy0 = vld1q_f32(y + i*32 + j*16 + 0);\n            const float32x4_t vy1 = vld1q_f32(y + i*32 + j*16 + 4);\n            const float32x4_t vy2 = vld1q_f32(y + i*32 + j*16 + 8);\n            const float32x4_t vy3 = vld1q_f32(y + i*32 + j*16 + 12);\n\n            const float32x4_t vr0 = vfmaq_f32(vy0, vx0, vd);\n            const float32x4_t vr1 = vfmaq_f32(vy1, vx1, vd);\n            const float32x4_t vr2 = vfmaq_f32(vy2, vx2, vd);\n            const float32x4_t vr3 = vfmaq_f32(vy3, vx3, vd);\n\n            vst1q_f32(y + i*32 + j*16 + 0,  vr0);\n            vst1q_f32(y + i*32 + j*16 + 4,  vr1);\n            vst1q_f32(y + i*32 + j*16 + 8,  vr2);\n            vst1q_f32(y + i*32 + j*16 + 12, vr3);\n        }\n    }\n#endif\n#else\n    // scalar\n    for (int i = 0; i < nb; i++) {\n        const float d = *(const float *) (pd + i*bs);\n\n        const uint8_t * restrict pp = pb + i*bs;\n\n        for (int l = 0; l < QK; l += 2) {\n      
      const uint8_t vi = pp[l/2];\n\n            const int8_t vi0 = vi & 0xf;\n            const int8_t vi1 = vi >> 4;\n\n            const float v0 = (vi0 - 8)*d;\n            const float v1 = (vi1 - 8)*d;\n\n            y[i*QK + l + 0] += v0*v;\n            y[i*QK + l + 1] += v1*v;\n\n            assert(!isnan(y[i*QK + l + 0]));\n            assert(!isnan(y[i*QK + l + 1]));\n            assert(!isinf(y[i*QK + l + 0]));\n            assert(!isinf(y[i*QK + l + 1]));\n        }\n    }\n#endif\n}\n\ninline static void ggml_vec_mad_q4_1(const int n, float * restrict y, void * restrict x, const float v) {\n    assert(n % QK == 0);\n\n    const int nb = n / QK;\n\n    const float   * restrict pm = (const float *)   (x);\n    const float   * restrict pd = (const float *)   (pm + nb);\n    const uint8_t * restrict pb = (const uint8_t *) (pd + nb);\n\n    for (int i = 0; i < nb; i++) {\n        const float m = pm[i];\n        const float d = pd[i];\n\n        const uint8_t * restrict pp = pb + i*QK/2;\n\n        for (int l = 0; l < QK; l += 2) {\n            const uint8_t vi = pp[l/2];\n\n            const uint8_t vi0 = vi & 0xf;\n            const uint8_t vi1 = vi >> 4;\n\n            const float v0 = d*vi0 + m;\n            const float v1 = d*vi1 + m;\n\n            y[i*QK + l + 0] += v0*v;\n            y[i*QK + l + 1] += v1*v;\n\n            assert(!isnan(y[i*QK + l + 0]));\n            assert(!isnan(y[i*QK + l + 1]));\n            assert(!isinf(y[i*QK + l + 0]));\n            assert(!isinf(y[i*QK + l + 1]));\n            //printf(\"mad: v0 %f v1 %f, i = %d, l = %d, d = %f, vi = %d, vi0 = %d, vi1 = %d\\n\", v0, v1, i, l, d, vi, vi0, vi1);\n        }\n    }\n}\n\n//inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) { for (int i = 0; i < n; ++i) y[i] *= v;          }\ninline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {\n#if defined(GGML_SIMD)\n    const int np = (n & ~(GGML_F32_STEP - 1));\n\n    GGML_F32_VEC vx = 
GGML_F32_VEC_SET1(v);\n\n    GGML_F32_VEC ay[GGML_F32_ARR];\n\n    for (int i = 0; i < np; i += GGML_F32_STEP) {\n        for (int j = 0; j < GGML_F32_ARR; j++) {\n            ay[j] = GGML_F32_VEC_LOAD(y + i + j*GGML_F32_EPR);\n            ay[j] = GGML_F32_VEC_MUL(ay[j], vx);\n\n            GGML_F32_VEC_STORE(y + i + j*GGML_F32_EPR, ay[j]);\n        }\n    }\n\n    // leftovers\n    for (int i = np; i < n; ++i) {\n        y[i] *= v;\n    }\n#else\n    // scalar\n    for (int i = 0; i < n; ++i) {\n        y[i] *= v;\n    }\n#endif\n}\n\ninline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrt(*s);   }\ninline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }\ninline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrt(x[i]); }\ninline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }\ninline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }\ninline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }\ninline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 
x[i] : 0.f; }\n\nstatic const ggml_float GELU_COEF_A    = 0.044715;\nstatic const ggml_float SQRT_2_OVER_PI = 0.79788456080286535587989211986876;\n\ninline static float ggml_gelu_f32(float x) {\n    return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x)));\n}\n\ninline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {\n    const uint16_t * i16 = (const uint16_t *) x;\n    for (int i = 0; i < n; ++i) {\n        y[i] = table_gelu_f16[i16[i]];\n    }\n}\n\n#ifdef GGML_GELU_FP16\ninline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {\n    uint16_t t;\n    for (int i = 0; i < n; ++i) {\n        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);\n        memcpy(&t, &fp16, sizeof(uint16_t));\n        y[i] = GGML_FP16_TO_FP32(table_gelu_f16[t]);\n    }\n}\n#else\ninline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) {\n    for (int i = 0; i < n; ++i) {\n        y[i] = ggml_gelu_f32(x[i]);\n    }\n}\n#endif\n\n// Sigmoid Linear Unit (SiLU) function\ninline static float ggml_silu_f32(float x) {\n    return x/(1.0 + exp(-x));\n}\n\ninline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {\n    const uint16_t * i16 = (const uint16_t *) x;\n    for (int i = 0; i < n; ++i) {\n        y[i] = table_silu_f16[i16[i]];\n    }\n}\n\n#ifdef GGML_SILU_FP16\ninline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {\n    uint16_t t;\n    for (int i = 0; i < n; ++i) {\n        ggml_fp16_t fp16 = GGML_FP32_TO_FP16(x[i]);\n        memcpy(&t, &fp16, sizeof(uint16_t));\n        y[i] = GGML_FP16_TO_FP32(table_silu_f16[t]);\n    }\n}\n#else\ninline static void ggml_vec_silu_f32(const int n, float * y, const float * x) {\n    for (int i = 0; i < n; ++i) {\n        y[i] = ggml_silu_f32(x[i]);\n    }\n}\n#endif\n\ninline static void ggml_vec_sum_f32(const int n, float * s, const float * x) {\n#ifndef GGML_USE_ACCELERATE\n    ggml_float sum = 0.0;\n    for 
(int i = 0; i < n; ++i) {\n        sum += x[i];\n    }\n    *s = sum;\n#else\n    vDSP_sve(x, 1, s, n);\n#endif\n}\n\ninline static void ggml_vec_max_f32(const int n, float * s, const float * x) {\n#ifndef GGML_USE_ACCELERATE\n    ggml_float max = -INFINITY;\n    for (int i = 0; i < n; ++i) {\n        max = MAX(max, x[i]);\n    }\n    *s = max;\n#else\n    vDSP_maxv(x, 1, s, n);\n#endif\n}\n\ninline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); }\n\n//\n// logging\n//\n\n#if (GGML_DEBUG >= 1)\n#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)\n#else\n#define GGML_PRINT_DEBUG(...)\n#endif\n\n#if (GGML_DEBUG >= 5)\n#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)\n#else\n#define GGML_PRINT_DEBUG_5(...)\n#endif\n\n#if (GGML_DEBUG >= 10)\n#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)\n#else\n#define GGML_PRINT_DEBUG_10(...)\n#endif\n\n#define GGML_PRINT(...) printf(__VA_ARGS__)\n\n//\n// data types\n//\n\nstatic const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {\n    QK,\n    QK,\n    1,\n    1,\n    1,\n    1,\n    1,\n};\n\nstatic_assert(GGML_TYPE_COUNT == 7, \"GGML_TYPE_COUNT != 5\");\n\nstatic const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {\n    sizeof(float  )   + QK/2,\n    sizeof(float  )*2 + QK/2,\n    sizeof(int8_t ),\n    sizeof(int16_t),\n    sizeof(int32_t),\n    sizeof(ggml_fp16_t),\n    sizeof(float  ),\n};\n\n// don't forget to update the array above when adding new types\nstatic_assert(GGML_TYPE_COUNT == 7, \"GGML_TYPE_COUNT != 5\");\n\nstatic const char * GGML_OP_LABEL[GGML_OP_COUNT] = {\n    \"NONE\",\n\n    \"DUP\",\n    \"ADD\",\n    \"SUB\",\n    \"MUL\",\n    \"DIV\",\n    \"SQR\",\n    \"SQRT\",\n    \"SUM\",\n    \"MEAN\",\n    \"REPEAT\",\n    \"ABS\",\n    \"SGN\",\n    \"NEG\",\n    \"STEP\",\n    \"RELU\",\n    \"GELU\",\n    \"SILU\",\n    \"NORM\",\n\n    \"MUL_MAT\",\n\n    \"SCALE\",\n    \"CPY\",\n    \"RESHAPE\",\n    \"VIEW\",\n    \"PERMUTE\",\n    
\"TRANSPOSE\",\n    \"GET_ROWS\",\n    \"DIAG_MASK_INF\",\n    \"SOFT_MAX\",\n    \"ROPE\",\n    \"CONV_1D_1S\",\n    \"CONV_1D_2S\",\n\n    \"FLASH_ATTN\",\n    \"FLASH_FF\",\n};\n\nstatic_assert(GGML_OP_COUNT == 34, \"GGML_OP_COUNT != 34\");\n\nstatic const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {\n    \"none\",\n\n    \"x\",\n    \"x+y\",\n    \"x-y\",\n    \"x*y\",\n    \"x/y\",\n    \"x^2\",\n    \"√x\",\n    \"Σx\",\n    \"Σx/n\",\n    \"repeat(x)\",\n    \"abs(x)\",\n    \"sgn(x)\",\n    \"-x\",\n    \"step(x)\",\n    \"relu(x)\",\n    \"gelu(x)\",\n    \"silu(x)\",\n    \"norm(x)\",\n\n    \"X*Y\",\n\n    \"x*v\",\n    \"x-\\\\>y\",\n    \"reshape(x)\",\n    \"view(x)\",\n    \"permute(x)\",\n    \"transpose(x)\",\n    \"get_rows(x)\",\n    \"diag_mask_inf(x)\",\n    \"soft_max(x)\",\n    \"rope(x)\",\n    \"conv_1d_1s(x)\",\n    \"conv_1d_2s(x)\",\n\n    \"flash_attn(x)\",\n    \"flash_ff(x)\",\n};\n\nstatic_assert(GGML_OP_COUNT == 34, \"GGML_OP_COUNT != 34\");\n\n//\n// ggml object\n//\n\nstruct ggml_object {\n    size_t offs;\n    size_t size;\n\n    struct ggml_object * next;\n\n    char padding[8];\n};\n\nstatic const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);\n\nstatic_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, \"ggml_object size must be a multiple of GGML_MEM_ALIGN\");\nstatic_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, \"ggml_tensor size must be a multiple of GGML_MEM_ALIGN\");\n\n//\n// ggml context\n//\n\nstruct ggml_context {\n    size_t mem_size;\n    void * mem_buffer;\n    bool   mem_buffer_owned;\n\n    int n_objects;\n\n    struct ggml_object * objects_begin;\n    struct ggml_object * objects_end;\n\n    struct ggml_scratch scratch;\n    struct ggml_scratch scratch_save;\n};\n\nstruct ggml_context_container {\n    bool used;\n\n    struct ggml_context context;\n};\n\n//\n// compute types\n//\n\nenum ggml_task_type {\n    GGML_TASK_INIT = 0,\n    GGML_TASK_COMPUTE,\n    GGML_TASK_FINALIZE,\n};\n\nstruct 
ggml_compute_params {\n    enum ggml_task_type type;\n\n    int ith, nth;\n\n    // work buffer for all threads\n    size_t wsize;\n    void * wdata;\n};\n\n//\n// ggml state\n//\n\nstruct ggml_state {\n    struct ggml_context_container contexts[GGML_MAX_CONTEXTS];\n};\n\n// global state\nstatic struct ggml_state g_state;\nstatic atomic_int g_state_barrier = 0;\n\n// barrier via spin lock\ninline static void ggml_critical_section_start(void) {\n    int processing = atomic_fetch_add(&g_state_barrier, 1);\n\n    while (processing > 0) {\n        // wait for other threads to finish\n        atomic_fetch_sub(&g_state_barrier, 1);\n        sched_yield(); // TODO: reconsider this\n        processing = atomic_fetch_add(&g_state_barrier, 1);\n    }\n}\n\n// TODO: make this somehow automatically executed\n//       some sort of \"sentry\" mechanism\ninline static void ggml_critical_section_end(void) {\n    atomic_fetch_sub(&g_state_barrier, 1);\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\nvoid ggml_print_object(const struct ggml_object * obj) {\n    GGML_PRINT(\" - ggml_object: offset = %zu, size = %zu, next = %p\\n\",\n            obj->offs, obj->size, (const void *) obj->next);\n}\n\nvoid ggml_print_objects(const struct ggml_context * ctx) {\n    struct ggml_object * obj = ctx->objects_begin;\n\n    GGML_PRINT(\"%s: objects in context %p:\\n\", __func__, (const void *) ctx);\n\n    while (obj != NULL) {\n        ggml_print_object(obj);\n        obj = obj->next;\n    }\n\n    GGML_PRINT(\"%s: --- end ---\\n\", __func__);\n}\n\nint ggml_nelements(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3];\n}\n\nint ggml_nrows(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return 
tensor->ne[1]*tensor->ne[2]*tensor->ne[3];\n}\n\nsize_t ggml_nbytes(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return (ggml_nelements(tensor)*GGML_TYPE_SIZE[tensor->type])/GGML_BLCK_SIZE[tensor->type];\n}\n\nint ggml_blck_size(enum ggml_type type) {\n    return GGML_BLCK_SIZE[type];\n}\n\nsize_t ggml_type_size(enum ggml_type type) {\n    return GGML_TYPE_SIZE[type];\n}\n\nfloat ggml_type_sizef(enum ggml_type type) {\n    return ((float)(GGML_TYPE_SIZE[type]))/GGML_BLCK_SIZE[type];\n}\n\nsize_t ggml_element_size(const struct ggml_tensor * tensor) {\n    return GGML_TYPE_SIZE[tensor->type];\n}\n\nstatic inline bool ggml_is_scalar(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return tensor->ne[0] == 1 && tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;\n}\n\nstatic inline bool ggml_is_vector(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return tensor->ne[1] == 1 && tensor->ne[2] == 1 && tensor->ne[3] == 1;\n}\n\nstatic inline bool ggml_is_matrix(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return tensor->ne[2] == 1 && tensor->ne[3] == 1;\n}\n\nstatic inline bool ggml_can_mul_mat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return\n        (t0->ne[0]  == t1->ne[0])  &&\n        (t0->ne[2]  == t1->ne[2])  &&\n        (t0->ne[3]  == t1->ne[3]);\n}\n\nstatic inline bool ggml_is_contiguous(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return\n        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&\n    
    tensor->nb[1] == (tensor->nb[0]*tensor->ne[0])/GGML_BLCK_SIZE[tensor->type] &&\n        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&\n        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];\n}\n\nstatic inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return\n        tensor->nb[0] == GGML_TYPE_SIZE[tensor->type] &&\n        tensor->nb[2] == tensor->nb[1]*tensor->ne[1] &&\n        tensor->nb[3] == tensor->nb[2]*tensor->ne[2];\n}\n\nstatic inline bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return\n        (t0->ne[0] == t1->ne[0] ) &&\n        (t0->ne[1] == t1->ne[1] ) &&\n        (t0->ne[2] == t1->ne[2] ) &&\n        (t0->ne[3] == t1->ne[3] );\n}\n\n// check if t1 can be represented as a repeatition of t0\nstatic inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {\n    static_assert(GGML_MAX_DIMS == 4, \"GGML_MAX_DIMS is not 4 - update this function\");\n\n    return\n        (t1->ne[0]%t0->ne[0] == 0) &&\n        (t1->ne[1]%t0->ne[1] == 0) &&\n        (t1->ne[2]%t0->ne[2] == 0) &&\n        (t1->ne[3]%t0->ne[3] == 0);\n}\n\nstatic inline int ggml_up32(int n) {\n    return (n + 31) & ~31;\n}\n\nstatic inline int ggml_up64(int n) {\n    return (n + 63) & ~63;\n}\n\nstatic inline int ggml_up(int n, int m) {\n    // assert m is a power of 2\n    GGML_ASSERT((m & (m - 1)) == 0);\n    return (n + m - 1) & ~(m - 1);\n}\n\n// assert that pointer is aligned to GGML_MEM_ALIGN\n#define ggml_assert_aligned(ptr) \\\n    assert(((uintptr_t) (ptr))%GGML_MEM_ALIGN == 0)\n\n////////////////////////////////////////////////////////////////////////////////\n\nstruct ggml_context * ggml_init(struct ggml_init_params params) {\n    // make this function thread safe\n    
ggml_critical_section_start();\n\n    static bool is_first_call = true;\n\n    if (is_first_call) {\n        // initialize GELU, SILU and EXP F32 tables\n        {\n            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);\n\n            ggml_fp16_t ii;\n            for (int i = 0; i < (1 << 16); ++i) {\n                uint16_t ui = i;\n                memcpy(&ii, &ui, sizeof(ii));\n                const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii);\n                table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f));\n                table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f));\n                table_exp_f16[i]  = GGML_FP32_TO_FP16(exp(f));\n            }\n\n            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);\n\n            GGML_PRINT_DEBUG(\"%s: GELU, SILU and EXP tables initialized in %f ms\\n\", __func__, (t_end - t_start)/1000.0f);\n        }\n\n        // initialize g_state\n        {\n            const uint64_t t_start = ggml_time_us(); UNUSED(t_start);\n\n            g_state = (struct ggml_state) {\n                /*.contexts =*/ { { 0 } },\n            };\n\n            for (int i = 0; i < GGML_MAX_CONTEXTS; ++i) {\n                g_state.contexts[i].used = false;\n            }\n\n            const uint64_t t_end = ggml_time_us(); UNUSED(t_end);\n\n            GGML_PRINT_DEBUG(\"%s: g_state initialized in %f ms\\n\", __func__, (t_end - t_start)/1000.0f);\n        }\n\n        is_first_call = false;\n    }\n\n    // find non-used context in g_state\n    struct ggml_context * ctx = NULL;\n\n    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {\n        if (!g_state.contexts[i].used) {\n            g_state.contexts[i].used = true;\n            ctx = &g_state.contexts[i].context;\n\n            GGML_PRINT_DEBUG(\"%s: found unused context %d\\n\", __func__, i);\n            break;\n        }\n    }\n\n    if (ctx == NULL) {\n        GGML_PRINT_DEBUG(\"%s: no unused context found\\n\", __func__);\n\n        
ggml_critical_section_end();\n\n        return NULL;\n    }\n\n    *ctx = (struct ggml_context) {\n        /*.mem_size         =*/ params.mem_size,\n        /*.mem_buffer       =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),\n        /*.mem_buffer_owned =*/ params.mem_buffer ? false : true,\n        /*.n_objects        =*/ 0,\n        /*.objects_begin    =*/ NULL,\n        /*.objects_end      =*/ NULL,\n        /*.scratch          =*/ { 0, 0, NULL, },\n        /*.scratch_save     =*/ { 0, 0, NULL, },\n    };\n\n    ggml_assert_aligned(ctx->mem_buffer);\n\n    GGML_PRINT_DEBUG(\"%s: context initialized\\n\", __func__);\n\n    ggml_critical_section_end();\n\n    return ctx;\n}\n\nvoid ggml_free(struct ggml_context * ctx) {\n    // make this function thread safe\n    ggml_critical_section_start();\n\n    bool found = false;\n\n    for (int i = 0; i < GGML_MAX_CONTEXTS; i++) {\n        if (&g_state.contexts[i].context == ctx) {\n            g_state.contexts[i].used = false;\n\n            GGML_PRINT_DEBUG(\"%s: context %d with %d objects has been freed. memory used = %zu\\n\",\n                    __func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);\n\n            if (ctx->mem_buffer_owned) {\n                free(ctx->mem_buffer);\n            }\n\n            found = true;\n            break;\n        }\n    }\n\n    if (!found) {\n        GGML_PRINT_DEBUG(\"%s: context not found\\n\", __func__);\n    }\n\n    ggml_critical_section_end();\n}\n\nsize_t ggml_used_mem(const struct ggml_context * ctx) {\n    return ctx->objects_end->offs + ctx->objects_end->size;\n}\n\nsize_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {\n    const size_t result = ctx->scratch.data ? 
ctx->scratch.offs : 0;\n\n    ctx->scratch = scratch;\n\n    return result;\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\nstruct ggml_tensor * ggml_new_tensor_impl(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    n_dims,\n        const int* ne,\n        void*  data) {\n    // always insert objects at the end of the context's memory pool\n    struct ggml_object * obj_cur = ctx->objects_end;\n\n    const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;\n    const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;\n    const size_t cur_end  = cur_offs + cur_size;\n\n    size_t size_needed = 0;\n\n    if (data == NULL) {\n        size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);\n        for (int i = 1; i < n_dims; i++) {\n            size_needed *= ne[i];\n        }\n        // align to GGML_MEM_ALIGN\n        size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;\n    }\n\n    char * const mem_buffer = ctx->mem_buffer;\n    struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);\n\n    if (ctx->scratch.data == NULL || data != NULL) {\n        size_needed += sizeof(struct ggml_tensor);\n\n        if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {\n            GGML_PRINT(\"%s: not enough space in the context's memory pool (needed %zu, available %zu)\\n\",\n                    __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);\n            assert(false);\n            return NULL;\n        }\n\n        *obj_new = (struct ggml_object) {\n            .offs = cur_end + GGML_OBJECT_SIZE,\n            .size = size_needed,\n            .next = NULL,\n        };\n    } else {\n        if (ctx->scratch.offs + size_needed > ctx->scratch.size) {\n            GGML_PRINT(\"%s: not enough space in the scratch memory\\n\", __func__);\n            assert(false);\n            return NULL;\n        
}\n\n        if (cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE > ctx->mem_size) {\n            GGML_PRINT(\"%s: not enough space in the context's memory pool (needed %zu, available %zu)\\n\",\n                    __func__, cur_end + sizeof(struct ggml_tensor) + GGML_OBJECT_SIZE, ctx->mem_size);\n            assert(false);\n            return NULL;\n        }\n\n        data = (char * const) ctx->scratch.data + ctx->scratch.offs;\n\n        *obj_new = (struct ggml_object) {\n            .offs = cur_end + GGML_OBJECT_SIZE,\n            .size = sizeof(struct ggml_tensor),\n            .next = NULL,\n        };\n\n        //printf(\"scratch offs = %zu, size_needed = %zu\\n\", ctx->scratch.offs, size_needed);\n\n        ctx->scratch.offs += size_needed;\n    }\n\n    if (obj_cur != NULL) {\n        obj_cur->next = obj_new;\n    } else {\n        // this is the first object in this context\n        ctx->objects_begin = obj_new;\n    }\n\n    ctx->objects_end = obj_new;\n\n    //printf(\"%s: inserted new object at %zu, size = %zu\\n\", __func__, cur_end, obj_new->size);\n\n    struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);\n\n    ggml_assert_aligned(result);\n\n    *result = (struct ggml_tensor) {\n        /*.type         =*/ type,\n        /*.n_dims       =*/ n_dims,\n        /*.ne           =*/ { 1, 1, 1, 1 },\n        /*.nb           =*/ { 0, 0, 0, 0 },\n        /*.op           =*/ GGML_OP_NONE,\n        /*.is_param     =*/ false,\n        /*.grad         =*/ NULL,\n        /*.src0         =*/ NULL,\n        /*.src1         =*/ NULL,\n        /*.opt          =*/ { NULL },\n        /*.n_tasks      =*/ 0,\n        /*.perf_runs    =*/ 0,\n        /*.perf_cycles  =*/ 0,\n        /*.perf_time_us =*/ 0,\n        /*.data         =*/ data == NULL ? 
(void *)(result + 1) : data,\n        /*.pad          =*/ { 0 },\n    };\n\n    ggml_assert_aligned(result->data);\n\n    for (int i = 0; i < n_dims; i++) {\n        result->ne[i] = ne[i];\n    }\n\n    result->nb[0] = GGML_TYPE_SIZE[type];\n    result->nb[1] = result->nb[0]*(result->ne[0]/GGML_BLCK_SIZE[type]);\n    for (int i = 2; i < GGML_MAX_DIMS; i++) {\n        result->nb[i] = result->nb[i - 1]*result->ne[i - 1];\n    }\n\n    ctx->n_objects++;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_new_tensor(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    n_dims,\n        const int * ne) {\n    return ggml_new_tensor_impl(ctx, type, n_dims, ne, NULL);\n}\n\nstruct ggml_tensor * ggml_new_tensor_1d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0) {\n    return ggml_new_tensor(ctx, type, 1, &ne0);\n}\n\nstruct ggml_tensor * ggml_new_tensor_2d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0,\n        int    ne1) {\n    const int ne[2] = { ne0, ne1 };\n    return ggml_new_tensor(ctx, type, 2, ne);\n}\n\nstruct ggml_tensor * ggml_new_tensor_3d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0,\n        int    ne1,\n        int    ne2) {\n    const int ne[3] = { ne0, ne1, ne2 };\n    return ggml_new_tensor(ctx, type, 3, ne);\n}\n\nstruct ggml_tensor * ggml_new_tensor_4d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0,\n        int    ne1,\n        int    ne2,\n        int    ne3) {\n    const int ne[4] = { ne0, ne1, ne2, ne3 };\n    return ggml_new_tensor(ctx, type, 4, ne);\n}\n\nstruct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) {\n    ctx->scratch_save = ctx->scratch;\n    ctx->scratch.data = NULL;\n\n    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);\n\n    ctx->scratch = ctx->scratch_save;\n\n    ggml_set_i32(result, 
value);\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value) {\n    ctx->scratch_save = ctx->scratch;\n    ctx->scratch.data = NULL;\n\n    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);\n\n    ctx->scratch = ctx->scratch_save;\n\n    ggml_set_f32(result, value);\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_dup_tensor(struct ggml_context * ctx, const struct ggml_tensor * src) {\n    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, NULL);\n}\n\nstruct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor) {\n    memset(tensor->data, 0, ggml_nbytes(tensor));\n    return tensor;\n}\n\nstruct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value) {\n    const int n     = ggml_nrows(tensor);\n    const int nc    = tensor->ne[0];\n    const size_t n1 = tensor->nb[1];\n\n    char * const data = tensor->data;\n\n    switch (tensor->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_I8:\n            {\n                assert(tensor->nb[0] == sizeof(int8_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_I16:\n            {\n                assert(tensor->nb[0] == sizeof(int16_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_I32:\n            {\n                assert(tensor->nb[0] == sizeof(int32_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_F16:\n 
           {\n                assert(tensor->nb[0] == sizeof(ggml_fp16_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_F32:\n            {\n                assert(tensor->nb[0] == sizeof(float));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n    return tensor;\n}\n\nstruct ggml_tensor * ggml_set_f32(struct ggml_tensor * tensor, float value) {\n    const int n     = ggml_nrows(tensor);\n    const int nc    = tensor->ne[0];\n    const size_t n1 = tensor->nb[1];\n\n    char * const data = tensor->data;\n\n    switch (tensor->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_I8:\n            {\n                assert(tensor->nb[0] == sizeof(int8_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_i8(nc, (int8_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_I16:\n            {\n                assert(tensor->nb[0] == sizeof(int16_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_i16(nc, (int16_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_I32:\n            {\n                assert(tensor->nb[0] == sizeof(int32_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_i32(nc, (int32_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_F16:\n            {\n                
assert(tensor->nb[0] == sizeof(ggml_fp16_t));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_f16(nc, (ggml_fp16_t *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_F32:\n            {\n                assert(tensor->nb[0] == sizeof(float));\n                for (int i = 0; i < n; i++) {\n                    ggml_vec_set_f32(nc, (float *)(data + i*n1), value);\n                }\n            } break;\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n    return tensor;\n}\n\nint32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i) {\n    switch (tensor->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_I8:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));\n                return ((int8_t *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_I16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));\n                return ((int16_t *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_I32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));\n                return ((int32_t *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_F16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));\n                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(float));\n                return ((float *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n    return 
0.0f;\n}\n\nvoid ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value) {\n    switch (tensor->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_I8:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));\n                ((int8_t *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_I16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));\n                ((int16_t *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_I32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));\n                ((int32_t *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_F16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));\n                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(float));\n                ((float *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\nfloat ggml_get_f32_1d(const struct ggml_tensor * tensor, int i) {\n    switch (tensor->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_I8:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));\n                return ((int8_t *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_I16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));\n   
             return ((int16_t *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_I32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));\n                return ((int32_t *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_F16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));\n                return GGML_FP16_TO_FP32(((ggml_fp16_t *)(tensor->data))[i]);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(float));\n                return ((float *)(tensor->data))[i];\n            } break;\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n    return 0.0f;\n}\n\nvoid ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value) {\n    switch (tensor->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                GGML_ASSERT(false);\n            } break;\n        case GGML_TYPE_I8:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int8_t));\n                ((int8_t *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_I16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int16_t));\n                ((int16_t *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_I32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(int32_t));\n                ((int32_t *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_F16:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(ggml_fp16_t));\n                ((ggml_fp16_t *)(tensor->data))[i] = GGML_FP32_TO_FP16(value);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                GGML_ASSERT(tensor->nb[0] == sizeof(float));\n             
   ((float *)(tensor->data))[i] = value;\n            } break;\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\nvoid * ggml_get_data(const struct ggml_tensor * tensor) {\n    return tensor->data;\n}\n\nfloat * ggml_get_data_f32(const struct ggml_tensor * tensor) {\n    assert(tensor->type == GGML_TYPE_F32);\n    return (float *)(tensor->data);\n}\n\nstruct ggml_tensor * ggml_view_tensor(\n        struct ggml_context * ctx,\n        const struct ggml_tensor * src) {\n    return ggml_new_tensor_impl(ctx, src->type, src->n_dims, src->ne, src->data);\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\n// ggml_dup\n\nstruct ggml_tensor * ggml_dup_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_DUP;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_dup(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a) {\n    return ggml_dup_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_dup_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a) {\n    return ggml_dup_impl(ctx, a, true);\n}\n\n// ggml_add\n\nstruct ggml_tensor * ggml_add_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b,\n        bool inplace) {\n    GGML_ASSERT(ggml_are_same_shape(a, b));\n\n    bool is_node = false;\n\n    if (!inplace && (a->grad || b->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_ADD;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_add(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_add_impl(ctx, a, b, false);\n}\n\nstruct ggml_tensor * ggml_add_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_add_impl(ctx, a, b, true);\n}\n\n// ggml_sub\n\nstruct ggml_tensor * ggml_sub_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b,\n        bool inplace) {\n    GGML_ASSERT(ggml_are_same_shape(a, b));\n\n    bool is_node = false;\n\n    if (!inplace && (a->grad || b->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_SUB;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_sub(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_sub_impl(ctx, a, b, false);\n}\n\nstruct ggml_tensor * ggml_sub_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_sub_impl(ctx, a, b, true);\n}\n\n// ggml_mul\n\nstruct ggml_tensor * ggml_mul_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b,\n        bool inplace) {\n    GGML_ASSERT(ggml_are_same_shape(a, b));\n\n    bool is_node = false;\n\n    if (!inplace && (a->grad || b->grad)) {\n        is_node = true;\n    }\n\n    if (inplace) {\n        GGML_ASSERT(is_node == false);\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_MUL;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_mul(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    return ggml_mul_impl(ctx, a, b, false);\n}\n\nstruct ggml_tensor * ggml_mul_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    return ggml_mul_impl(ctx, a, b, true);\n}\n\n// ggml_div\n\nstruct ggml_tensor * ggml_div_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b,\n        bool inplace) {\n    GGML_ASSERT(ggml_are_same_shape(a, b));\n\n    bool is_node = false;\n\n    if (!inplace && (a->grad || b->grad)) {\n        is_node = true;\n    }\n\n    if (inplace) {\n        GGML_ASSERT(is_node == false);\n    }\n\n    struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_DIV;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_div(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    return ggml_div_impl(ctx, a, b, false);\n}\n\nstruct ggml_tensor * ggml_div_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    return ggml_div_impl(ctx, a, b, true);\n}\n\n// ggml_sqr\n\nstruct ggml_tensor * ggml_sqr_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_SQR;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_sqr(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_sqr_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_sqr_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_sqr_impl(ctx, a, true);\n}\n\n// ggml_sqrt\n\nstruct ggml_tensor * ggml_sqrt_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_SQRT;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_sqrt(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_sqrt_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_sqrt_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_sqrt_impl(ctx, a, true);\n}\n\n// ggml_sum\n\nstruct ggml_tensor * ggml_sum(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a) {\n    bool is_node = false;\n\n    if (a->grad) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = ggml_new_tensor_1d(ctx, a->type, 1);\n\n    result->op   = GGML_OP_SUM;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\n// ggml_mean\n\nstruct ggml_tensor * ggml_mean(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a) {\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement\n        is_node = true;\n    }\n\n    int ne[GGML_MAX_DIMS] = { 1, a->ne[1], a->ne[2], a->ne[3] };\n    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, a->n_dims, ne);\n\n    result->op   = GGML_OP_MEAN;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\n// ggml_repeat\n\nstruct ggml_tensor * ggml_repeat(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    GGML_ASSERT(ggml_can_repeat(a, b));\n\n    bool is_node = false;\n\n    if (a->grad) {\n        is_node = true;\n    }\n\n    if (ggml_are_same_shape(a, b) && !is_node) {\n        return a;\n    }\n\n    struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);\n\n    result->op   = GGML_OP_REPEAT;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_abs\n\nstruct ggml_tensor * ggml_abs_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_ABS;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_abs(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_abs_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_abs_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_abs_impl(ctx, a, true);\n}\n\n\n// ggml_sgn\n\nstruct ggml_tensor * ggml_sgn_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_SGN;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_sgn(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_sgn_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_sgn_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_sgn_impl(ctx, a, true);\n}\n\n// ggml_neg\n\nstruct ggml_tensor * ggml_neg_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_NEG;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_neg(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_neg_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_neg_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_neg_impl(ctx, a, true);\n}\n\n// ggml_step\n\nstruct ggml_tensor * ggml_step_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_STEP;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_step(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_step_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_step_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_step_impl(ctx, a, true);\n}\n\n// ggml_relu\n\nstruct ggml_tensor * ggml_relu_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_RELU;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_relu(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_relu_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_relu_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_relu_impl(ctx, a, true);\n}\n\n// ggml_gelu\n\nstruct ggml_tensor * ggml_gelu_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_GELU;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_gelu(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_gelu_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_gelu_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_gelu_impl(ctx, a, true);\n}\n\n// ggml_silu\n\nstruct ggml_tensor * ggml_silu_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_SILU;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_silu(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_silu_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_silu_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_silu_impl(ctx, a, true);\n}\n\n// ggml_norm\n\nstruct ggml_tensor * ggml_norm_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        bool inplace) {\n    bool is_node = false;\n\n    if (!inplace && (a->grad)) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n\n    result->op   = GGML_OP_NORM;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL; // TODO: maybe store epsilon here?\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_norm(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_norm_impl(ctx, a, false);\n}\n\nstruct ggml_tensor * ggml_norm_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    return ggml_norm_impl(ctx, a, true);\n}\n\n// ggml_mul_mat\n\nstruct ggml_tensor * ggml_mul_mat(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    GGML_ASSERT(ggml_can_mul_mat(a, b));\n\n    bool is_node = false;\n\n    if (a->grad || b->grad) {\n        is_node = true;\n    }\n\n    const int ne[4] = { a->ne[1], b->ne[1], a->ne[2], b->ne[3] };\n    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, MIN(a->n_dims, b->n_dims), ne);\n\n    result->op   = GGML_OP_MUL_MAT;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_scale\n\nstruct ggml_tensor * ggml_scale_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b,\n        bool inplace) {\n    GGML_ASSERT(ggml_is_scalar(b));\n    GGML_ASSERT(ggml_is_padded_1d(a));\n\n    bool is_node = false;\n\n    if (!inplace && (a->grad || b->grad)) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    // TODO: when implement backward, fix this:\n    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n    struct ggml_tensor * result = ggml_view_tensor(ctx, a);\n\n    result->op   = GGML_OP_SCALE;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_scale(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_scale_impl(ctx, a, b, false);\n}\n\nstruct ggml_tensor * ggml_scale_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_scale_impl(ctx, a, b, true);\n}\n\n// ggml_cpy\n\nstruct ggml_tensor * ggml_cpy_impl(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b,\n        bool inplace) {\n    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));\n\n    bool is_node = false;\n\n    if (!inplace && (a->grad || b->grad)) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    // make a view of the destination\n    struct ggml_tensor * result = ggml_view_tensor(ctx, b);\n\n    result->op   = GGML_OP_CPY;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_cpy(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_cpy_impl(ctx, a, b, false);\n}\n\nstruct ggml_tensor * ggml_cpy_inplace(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    return ggml_cpy_impl(ctx, a, b, true);\n}\n\n// ggml_reshape\n\nstruct ggml_tensor * ggml_reshape(\n        struct ggml_context * ctx,\n        struct ggml_tensor * a,\n        struct ggml_tensor * b) {\n    GGML_ASSERT(ggml_is_contiguous(a));\n    GGML_ASSERT(ggml_is_contiguous(b));\n    GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b));\n\n    bool is_node = false;\n\n    if (a->grad || b->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, b->n_dims, b->ne, a->data);\n\n    result->op   = GGML_OP_RESHAPE;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_reshape_2d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        int                   ne1) {\n    GGML_ASSERT(ggml_is_contiguous(a));\n    GGML_ASSERT(ggml_nelements(a) == ne0*ne1);\n\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    const int ne[2] = { ne0, ne1 };\n    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, a->data);\n\n    result->op   = GGML_OP_RESHAPE;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\nstruct ggml_tensor * ggml_reshape_3d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        int                   ne1,\n        int                   ne2) {\n    GGML_ASSERT(ggml_is_contiguous(a));\n    GGML_ASSERT(ggml_nelements(a) == ne0*ne1*ne2);\n\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    const int ne[3] = { ne0, ne1, ne2 };\n    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, a->data);\n\n    result->op   = GGML_OP_RESHAPE;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\n// ggml_view_1d\n\nstruct ggml_tensor * ggml_view_1d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        size_t                offset) {\n    if (a->grad) {\n        GGML_ASSERT(false); // gradient propagation is not supported\n    }\n\n    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset);\n\n    result->op   = GGML_OP_VIEW;\n    result->grad = NULL;\n    result->src0 = a;\n    result->src1 = NULL; // TODO: maybe store the offset here?\n\n    return result;\n}\n\n// ggml_view_2d\n\nstruct ggml_tensor * ggml_view_2d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        int                   ne1,\n        size_t                nb1,\n        size_t                offset) {\n    if (a->grad) {\n        GGML_ASSERT(false); // gradient propagation is not supported\n    }\n\n    const int ne[GGML_MAX_DIMS] = { ne0, ne1, 1, 1 };\n\n    struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + 
offset);\n\n    result->nb[1] = nb1;\n    result->nb[2] = result->nb[1]*ne1;\n    result->nb[3] = result->nb[2];\n\n    result->op   = GGML_OP_VIEW;\n    result->grad = NULL;\n    result->src0 = a;\n    result->src1 = NULL; // TODO: maybe store the offset here?\n\n    return result;\n}\n\n// ggml_permute\n\nstruct ggml_tensor * ggml_permute(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   axis0,\n        int                   axis1,\n        int                   axis2,\n        int                   axis3) {\n    GGML_ASSERT(axis0 >= 0 && axis0 < GGML_MAX_DIMS);\n    GGML_ASSERT(axis1 >= 0 && axis1 < GGML_MAX_DIMS);\n    GGML_ASSERT(axis2 >= 0 && axis2 < GGML_MAX_DIMS);\n    GGML_ASSERT(axis3 >= 0 && axis3 < GGML_MAX_DIMS);\n\n    GGML_ASSERT(axis0 != axis1);\n    GGML_ASSERT(axis0 != axis2);\n    GGML_ASSERT(axis0 != axis3);\n    GGML_ASSERT(axis1 != axis2);\n    GGML_ASSERT(axis1 != axis3);\n    GGML_ASSERT(axis2 != axis3);\n\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = ggml_view_tensor(ctx, a);\n\n    int ne[GGML_MAX_DIMS];\n    int nb[GGML_MAX_DIMS];\n\n    ne[axis0] = a->ne[0];\n    ne[axis1] = a->ne[1];\n    ne[axis2] = a->ne[2];\n    ne[axis3] = a->ne[3];\n\n    nb[axis0] = a->nb[0];\n    nb[axis1] = a->nb[1];\n    nb[axis2] = a->nb[2];\n    nb[axis3] = a->nb[3];\n\n    result->ne[0] = ne[0];\n    result->ne[1] = ne[1];\n    result->ne[2] = ne[2];\n    result->ne[3] = ne[3];\n\n    result->nb[0] = nb[0];\n    result->nb[1] = nb[1];\n    result->nb[2] = nb[2];\n    result->nb[3] = nb[3];\n\n    result->op   = GGML_OP_PERMUTE;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL; // TODO: maybe store the permutation here?\n\n    return result;\n}\n\n// ggml_transpose\n\nstruct ggml_tensor * ggml_transpose(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    struct ggml_tensor * result = ggml_view_tensor(ctx, a);\n\n    result->ne[0] = a->ne[1];\n    result->ne[1] = a->ne[0];\n\n    result->nb[0] = a->nb[1];\n    result->nb[1] = a->nb[0];\n\n    result->op   = GGML_OP_TRANSPOSE;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\n// ggml_get_rows\n\nstruct ggml_tensor * ggml_get_rows(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    GGML_ASSERT(ggml_is_matrix(a) && ggml_is_vector(b) && b->type == GGML_TYPE_I32);\n\n    bool is_node = false;\n\n    if (a->grad || b->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    // TODO: implement non F32 return\n    //struct ggml_tensor * result = ggml_new_tensor_2d(ctx, a->type, a->ne[0], b->ne[0]);\n    struct ggml_tensor * result = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, a->ne[0], b->ne[0]);\n\n    result->op   = GGML_OP_GET_ROWS;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_diag_mask_inf\n\nstruct ggml_tensor * ggml_diag_mask_inf(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   n_past) {\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    // TODO: when implement backward, fix this:\n    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n    struct ggml_tensor * result = ggml_view_tensor(ctx, a);\n    struct ggml_tensor * b = ggml_new_i32(ctx, n_past);\n\n    result->op   = GGML_OP_DIAG_MASK_INF;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_soft_max\n\nstruct ggml_tensor * ggml_soft_max(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a) {\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    // TODO: when implement backward, fix this:\n    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n    struct ggml_tensor * result = ggml_view_tensor(ctx, a);\n\n    result->op   = GGML_OP_SOFT_MAX;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = NULL;\n\n    return result;\n}\n\n// ggml_rope\n\nstruct ggml_tensor * ggml_rope(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   n_past,\n        int                   n_dims,\n        int                   mode) {\n    GGML_ASSERT(n_past >= 0);\n    bool is_node = false;\n\n    if (a->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    // TODO: when implement backward, fix this:\n    //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);\n    struct ggml_tensor * result = ggml_view_tensor(ctx, a);\n\n    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3);\n    ((int32_t *) b->data)[0] = n_past;\n    ((int32_t *) b->data)[1] = n_dims;\n    ((int32_t *) b->data)[2] = mode;\n\n    result->op   = GGML_OP_ROPE;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_conv_1d_1s\n\nstruct ggml_tensor * ggml_conv_1d_1s(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    GGML_ASSERT(ggml_is_matrix(b));\n    GGML_ASSERT(a->ne[1] == b->ne[1]);\n    GGML_ASSERT(a->ne[3] == 1);\n    bool is_node = false;\n\n    if (a->grad || b->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    const int ne[4] = { b->ne[0], a->ne[2], 1, 1, };\n    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);\n\n    result->op   = GGML_OP_CONV_1D_1S;\n    result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_conv_1d_2s\n\nstruct ggml_tensor * ggml_conv_1d_2s(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b) {\n    GGML_ASSERT(ggml_is_matrix(b));\n    GGML_ASSERT(a->ne[1] == b->ne[1]);\n    GGML_ASSERT(a->ne[3] == 1);\n    bool is_node = false;\n\n    if (a->grad || b->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    const int ne[4] = { b->ne[0]/2, a->ne[2], 1, 1, };\n    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne);\n\n    result->op   = GGML_OP_CONV_1D_2S;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b;\n\n    return result;\n}\n\n// ggml_flash_attn\n\nstruct ggml_tensor * ggml_flash_attn(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * q,\n        struct ggml_tensor  * k,\n        struct ggml_tensor  * v,\n        bool                  masked) {\n    GGML_ASSERT(ggml_can_mul_mat(k, q));\n    // TODO: check if vT can be multiplied by (k*qT)\n\n    bool is_node = false;\n\n    if (q->grad || k->grad || v->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    //struct ggml_tensor * result = ggml_dup_tensor(ctx, q);\n    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, q->ne);\n\n    result->op   = GGML_OP_FLASH_ATTN;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = q;\n    result->src1 = k;\n    result->opt[0] = v;\n    result->opt[1] = ggml_new_i32(ctx, masked ? 
1 : 0);\n\n    return result;\n}\n\n// ggml_flash_ff\n\nstruct ggml_tensor * ggml_flash_ff(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b0,\n        struct ggml_tensor  * b1,\n        struct ggml_tensor  * c0,\n        struct ggml_tensor  * c1) {\n    GGML_ASSERT(ggml_can_mul_mat(b0, a));\n    // TODO: more checks\n\n    bool is_node = false;\n\n    if (a->grad || b0->grad || b1->grad || c0->grad || c1->grad) {\n        GGML_ASSERT(false); // TODO: implement backward\n        is_node = true;\n    }\n\n    //struct ggml_tensor * result = ggml_dup_tensor(ctx, a);\n    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, a->ne);\n\n    result->op   = GGML_OP_FLASH_FF;\n    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;\n    result->src0 = a;\n    result->src1 = b0;\n    result->opt[0] = b1;\n    result->opt[1] = c0;\n    result->opt[2] = c1;\n\n    return result;\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\nvoid ggml_set_param(\n        struct ggml_context * ctx,\n        struct ggml_tensor * tensor) {\n    tensor->is_param = true;\n\n    GGML_ASSERT(tensor->grad == NULL);\n    tensor->grad = ggml_dup_tensor(ctx, tensor);\n}\n\n// ggml_compute_forward_dup\n\nstatic void ggml_compute_forward_dup_f16(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(params->ith == 0);\n    GGML_ASSERT(ggml_is_contiguous(dst));\n    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const size_t nb00 = src0->nb[0];\n    const size_t nb01 = src0->nb[1];\n    const size_t nb02 = src0->nb[2];\n    const 
size_t nb03 = src0->nb[3];\n\n    if (ggml_is_contiguous(src0) && src0->type == dst->type) {\n        memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);\n        return;\n    }\n\n    if (src0->nb[0] == sizeof(ggml_fp16_t)) {\n        if (dst->type == GGML_TYPE_F16) {\n            int id = 0;\n            const size_t rs = ne00*nb00;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;\n                        char * dst_ptr = (char *) dst->data + id*rs;\n\n                        memcpy(dst_ptr, src0_ptr, rs);\n\n                        id++;\n                    }\n                }\n            }\n        } else if (dst->type == GGML_TYPE_F32) {\n            int id = 0;\n            float * dst_ptr = (float *) dst->data;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        for (int i00 = 0; i00 < ne00; i00++) {\n                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);\n\n                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);\n                            id++;\n                        }\n                    }\n                }\n            }\n        } else {\n            GGML_ASSERT(false); // TODO: implement\n        }\n    } else {\n        //printf(\"%s: this is not optimal - fix me\\n\", __func__);\n\n        if (dst->type == GGML_TYPE_F32) {\n            int id = 0;\n            float * dst_ptr = (float *) dst->data;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < 
ne01; i01++) {\n                        for (int i00 = 0; i00 < ne00; i00++) {\n                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);\n\n                            dst_ptr[id] = GGML_FP16_TO_FP32(*src0_ptr);\n                            id++;\n                        }\n                    }\n                }\n            }\n        } else if (dst->type == GGML_TYPE_F16) {\n            int id = 0;\n            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        for (int i00 = 0; i00 < ne00; i00++) {\n                            const ggml_fp16_t * src0_ptr = (ggml_fp16_t *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);\n\n                            dst_ptr[id] = *src0_ptr;\n                            id++;\n                        }\n                    }\n                }\n            }\n        } else {\n            GGML_ASSERT(false); // TODO: implement\n        }\n    }\n}\n\nstatic void ggml_compute_forward_dup_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(params->ith == 0);\n    GGML_ASSERT(ggml_is_contiguous(dst));\n    GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const size_t nb00 = src0->nb[0];\n    const size_t nb01 = src0->nb[1];\n    const size_t nb02 = src0->nb[2];\n    const size_t nb03 = src0->nb[3];\n\n    if (ggml_is_contiguous(src0) && src0->type == dst->type) {\n        
memcpy(dst->data, src0->data, ggml_nelements(dst) * GGML_TYPE_SIZE[src0->type]);\n        return;\n    }\n\n    if (src0->nb[0] == sizeof(float)) {\n        if (dst->type == GGML_TYPE_F32) {\n            int id = 0;\n            const size_t rs = ne00*nb00;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        const char * src0_ptr = (char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03;\n                        char * dst_ptr = (char *) dst->data + id*rs;\n\n                        memcpy(dst_ptr, src0_ptr, rs);\n\n                        id++;\n                    }\n                }\n            }\n        } else if (dst->type == GGML_TYPE_F16) {\n            int id = 0;\n            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        for (int i00 = 0; i00 < ne00; i00++) {\n                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);\n\n                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);\n                            id++;\n                        }\n                    }\n                }\n            }\n        } else {\n            GGML_ASSERT(false); // TODO: implement\n        }\n    } else {\n        //printf(\"%s: this is not optimal - fix me\\n\", __func__);\n\n        if (dst->type == GGML_TYPE_F32) {\n            int id = 0;\n            float * dst_ptr = (float *) dst->data;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        for (int i00 = 0; i00 < ne00; i00++) {\n                            
const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);\n\n                            dst_ptr[id] = *src0_ptr;\n                            id++;\n                        }\n                    }\n                }\n            }\n        } else if (dst->type == GGML_TYPE_F16) {\n            int id = 0;\n            ggml_fp16_t * dst_ptr = (ggml_fp16_t *) dst->data;\n\n            for (int i03 = 0; i03 < ne03; i03++) {\n                for (int i02 = 0; i02 < ne02; i02++) {\n                    for (int i01 = 0; i01 < ne01; i01++) {\n                        for (int i00 = 0; i00 < ne00; i00++) {\n                            const float * src0_ptr = (float *) ((char *) src0->data + i00*nb00 + i01*nb01 + i02*nb02 + i03*nb03);\n\n                            dst_ptr[id] = GGML_FP32_TO_FP16(*src0_ptr);\n                            id++;\n                        }\n                    }\n                }\n            }\n        } else {\n            GGML_ASSERT(false); // TODO: implement\n        }\n    }\n}\n\nstatic void ggml_compute_forward_dup(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_dup_f16(params, src0, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_dup_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_add\n\nstatic void ggml_compute_forward_add_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * 
src1,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    const size_t nb00 = src0->nb[0];\n    const size_t nb01 = src0->nb[1];\n\n    const size_t nb10 = src1->nb[0];\n    const size_t nb11 = src1->nb[1];\n\n    const size_t nb0 = dst->nb[0];\n    const size_t nb1 = dst->nb[1];\n\n    GGML_ASSERT( nb0 == sizeof(float));\n    GGML_ASSERT(nb00 == sizeof(float));\n\n    if (nb10 == sizeof(float)) {\n        const int j0 = (n/nth)*ith;\n        const int j1 = ith == nth - 1 ? n : (n/nth)*(ith + 1);\n\n        for (int j = j0; j < j1; j++) {\n            ggml_vec_add_f32(nc,\n                    (float *) ((char *) dst->data  + j*nb1),\n                    (float *) ((char *) src0->data + j*nb01),\n                    (float *) ((char *) src1->data + j*nb11));\n        }\n    } else {\n        // src1 is not contiguous\n        for (int j = ith; j < n; j += nth) {\n            float * dst_ptr  = (float *) ((char *) dst->data  + j*nb1);\n            float * src0_ptr = (float *) ((char *) src0->data + j*nb01);\n            for (int i = 0; i < nc; i++) {\n                float * src1_ptr = (float *) ((char *) src1->data + j*nb11 + i*nb10);\n\n                dst_ptr[i] = src0_ptr[i] + *src1_ptr;\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_add(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_add_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case 
GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_sub\n\nstatic void ggml_compute_forward_sub_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n    assert(src1->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_sub_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])),\n                (float *) ((char *) src1->data + i*(src1->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_sub(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_sub_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_mul\n\nstatic void ggml_compute_forward_mul_f32(\n        const struct ggml_compute_params * params,\n   
     const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n    assert(src1->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_mul_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])),\n                (float *) ((char *) src1->data + i*(src1->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_mul(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_mul_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_div\n\nstatic void ggml_compute_forward_div_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n  
  const int nc = src0->ne[0];\n\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n    assert(src1->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_div_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])),\n                (float *) ((char *) src1->data + i*(src1->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_div(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_div_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_sqr\n\nstatic void ggml_compute_forward_sqr_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n     = ggml_nrows(src0);\n    const int nc    = src0->ne[0];\n\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_sqr_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_sqr(\n        const struct ggml_compute_params * params,\n        const struct 
ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_sqr_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_sqrt\n\nstatic void ggml_compute_forward_sqrt_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_sqrt_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_sqrt(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_sqrt_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_sum\n\nstatic void ggml_compute_forward_sum_f32(\n        const struct 
ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_is_scalar(dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    assert(ggml_is_scalar(dst));\n    assert(src0->nb[0] == sizeof(float));\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const size_t nb01 = src0->nb[1];\n    const size_t nb02 = src0->nb[2];\n    const size_t nb03 = src0->nb[3];\n\n    for (int i03 = 0; i03 < ne03; i03++) {\n        for (int i02 = 0; i02 < ne02; i02++) {\n            for (int i01 = 0; i01 < ne01; i01++) {\n                ggml_vec_sum_f32(ne00,\n                        (float *) (dst->data),\n                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_sum(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_sum_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_mean\n\nstatic void ggml_compute_forward_mean_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    assert(src0->nb[0] == sizeof(float));\n\n    
const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const size_t nb01 = src0->nb[1];\n    const size_t nb02 = src0->nb[2];\n    const size_t nb03 = src0->nb[3];\n\n    const int ne0 = dst->ne[0];\n    const int ne1 = dst->ne[1];\n    const int ne2 = dst->ne[2];\n    const int ne3 = dst->ne[3];\n\n    assert(ne0 == 1);\n    assert(ne1 == ne01);\n    assert(ne2 == ne02);\n    assert(ne3 == ne03);\n\n    UNUSED(ne0);\n    UNUSED(ne1);\n    UNUSED(ne2);\n    UNUSED(ne3);\n\n    const size_t nb1 = dst->nb[1];\n    const size_t nb2 = dst->nb[2];\n    const size_t nb3 = dst->nb[3];\n\n    for (int i03 = 0; i03 < ne03; i03++) {\n        for (int i02 = 0; i02 < ne02; i02++) {\n            for (int i01 = 0; i01 < ne01; i01++) {\n                ggml_vec_sum_f32(ne00,\n                        (float *) ((char *)  dst->data + i01*nb1  + i02*nb2  + i03*nb3),\n                        (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));\n\n                *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) /= (float) ne00;\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_mean(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_mean_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_repeat\n\nstatic void ggml_compute_forward_repeat_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct 
ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_can_repeat(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // TODO: implement support for rank > 2 tensors\n    assert(src0->ne[2] == 1);\n    assert(src0->ne[3] == 1);\n    assert( dst->ne[2] == 1);\n    assert( dst->ne[3] == 1);\n\n    const int nc  = dst->ne[0];\n    const int nr  = dst->ne[1];\n    const int nc0 = src0->ne[0];\n    const int nr0 = src0->ne[1];\n    const int ncr = nc/nc0; // guaranteed to be an integer due to the check in ggml_can_repeat\n    const int nrr = nr/nr0; // guaranteed to be an integer due to the check in ggml_can_repeat\n\n    // TODO: support for transposed / permuted tensors\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    // TODO: maybe this is not optimal?\n    for (int i = 0; i < nrr; i++) {\n        for (int j = 0; j < ncr; j++) {\n            for (int k = 0; k < nr0; k++) {\n                ggml_vec_cpy_f32(nc0,\n                        (float *) ((char *)  dst->data + (i*nr0 + k)*( dst->nb[1]) + j*nc0*( dst->nb[0])),\n                        (float *) ((char *) src0->data + (        k)*(src0->nb[1])));\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_repeat(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_repeat_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_abs\n\nstatic void ggml_compute_forward_abs_f32(\n        
const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert(dst->nb[0]  == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_abs_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_abs(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_abs_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_sgn\n\nstatic void ggml_compute_forward_sgn_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert(dst->nb[0]  == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_sgn_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n            
    (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_sgn(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_sgn_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_neg\n\nstatic void ggml_compute_forward_neg_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert(dst->nb[0]  == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_neg_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_neg(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_neg_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n          
  {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_step\n\nstatic void ggml_compute_forward_step_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert(dst->nb[0]  == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_step_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_step(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_step_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_relu\n\nstatic void ggml_compute_forward_relu_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n\n    assert(dst->nb[0]  == sizeof(float));\n    assert(src0->nb[0] 
== sizeof(float));\n\n    for (int i = 0; i < n; i++) {\n        ggml_vec_relu_f32(nc,\n                (float *) ((char *) dst->data  + i*( dst->nb[1])),\n                (float *) ((char *) src0->data + i*(src0->nb[1])));\n    }\n}\n\nstatic void ggml_compute_forward_relu(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_relu_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_gelu\n\nstatic void ggml_compute_forward_gelu_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(ggml_is_contiguous(src0));\n    GGML_ASSERT(ggml_is_contiguous(dst));\n    GGML_ASSERT(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nrows(src0);\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        ggml_vec_gelu_f32(nc,\n                (float *) ((char *) dst->data  + i1*( dst->nb[1])),\n                (float *) ((char *) src0->data + i1*(src0->nb[1])));\n\n#ifndef NDEBUG\n        for (int k = 0; k < nc; k++) {\n            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];\n            
UNUSED(x);\n            assert(!isnan(x));\n            assert(!isinf(x));\n        }\n#endif\n    }\n}\n\nstatic void ggml_compute_forward_gelu(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_gelu_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n    //printf(\"XXXXXXXX gelu\\n\");\n}\n\n// ggml_compute_forward_silu\n\nstatic void ggml_compute_forward_silu_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(ggml_is_contiguous(src0));\n    GGML_ASSERT(ggml_is_contiguous(dst));\n    GGML_ASSERT(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nrows(src0);\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        ggml_vec_silu_f32(nc,\n                (float *) ((char *) dst->data  + i1*( dst->nb[1])),\n                (float *) ((char *) src0->data + i1*(src0->nb[1])));\n\n#ifndef NDEBUG\n        for (int k = 0; k < nc; k++) {\n            const float x = ((float *) ((char *) dst->data + i1*( dst->nb[1])))[k];\n            UNUSED(x);\n            assert(!isnan(x));\n            assert(!isinf(x));\n        }\n#endif\n  
  }\n}\n\nstatic void ggml_compute_forward_silu(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_silu_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n\n// ggml_compute_forward_norm\n\nstatic void ggml_compute_forward_norm_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    GGML_ASSERT(src0->nb[0] == sizeof(float));\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const size_t nb01 = src0->nb[1];\n    const size_t nb02 = src0->nb[2];\n    const size_t nb03 = src0->nb[3];\n\n    const size_t nb1 = dst->nb[1];\n    const size_t nb2 = dst->nb[2];\n    const size_t nb3 = dst->nb[3];\n\n    const ggml_float eps = 1e-5f; // TODO: make this a parameter\n\n    // TODO: optimize\n    for (int i03 = 0; i03 < ne03; i03++) {\n        for (int i02 = 0; i02 < ne02; i02++) {\n            for (int i01 = ith; i01 < ne01; i01 += nth) {\n                const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03);\n\n                ggml_float mean = 0.0;\n                for (int i00 = 0; i00 < ne00; i00++) {\n                    mean += x[i00];\n                }\n\n            
    mean /= ne00;\n\n                float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3);\n\n                ggml_float sum2 = 0.0;\n                for (int i00 = 0; i00 < ne00; i00++) {\n                    ggml_float v = x[i00] - mean;\n                    y[i00] = v;\n                    sum2 += v*v;\n                }\n\n                const float scale = 1.0/sqrt(sum2/ne00 + eps);\n\n                ggml_vec_scale_f32(ne00, y, scale);\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_norm(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_norm_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_mul_mat\n\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n// helper function to determine if it is better to use BLAS or not\n// for large matrices, BLAS is faster\nstatic bool ggml_compute_forward_mul_mat_use_blas(\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    UNUSED(src0);\n\n    const int ne10 = src1->ne[0];\n\n    const int ne0 = dst->ne[0];\n    const int ne1 = dst->ne[1];\n\n    // TODO: find the optimal values for these\n    if (ggml_is_contiguous(src0) &&\n        ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {\n        //printf(\"BLAS: %d %d %d\\n\", ne0, ne1, ne10);\n        return true;\n    }\n\n    return false;\n}\n#endif\n\nstatic void ggml_compute_forward_mul_mat_f32(\n        const struct 
ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    const int ne12 = src1->ne[2];\n    const int ne13 = src1->ne[3];\n\n    const int ne0  = dst->ne[0];\n    const int ne1  = dst->ne[1];\n    const int ne2  = dst->ne[2];\n    const int ne3  = dst->ne[3];\n    const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    const int nb12 = src1->nb[2];\n    const int nb13 = src1->nb[3];\n\n    const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    const int nb2  = dst->nb[2];\n    const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    assert(ne02 == ne12);\n    assert(ne03 == ne13);\n    assert(ne2  == ne12);\n    assert(ne3  == ne13);\n\n    // TODO: we don't support permuted src0\n    assert(nb00 == sizeof(float) || nb01 == sizeof(float));\n\n    // dst cannot be transposed or permuted\n    assert(nb0 == sizeof(float));\n    assert(nb0 <= nb1);\n    assert(nb1 <= nb2);\n    assert(nb2 <= nb3);\n\n    assert(ne0 == ne01);\n    assert(ne1 == ne11);\n    assert(ne2 == ne02);\n    assert(ne3 == ne03);\n\n    // nb01 >= nb00 - src0 is not transposed\n    //   compute by src0 rows\n    //\n    // nb00 <  nb01 - src0 is transposed\n    //   compute by src0 columns\n\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {\n        GGML_ASSERT(nb10 == sizeof(float));\n\n        
if (params->ith != 0) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_INIT) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_FINALIZE) {\n            return;\n        }\n\n        for (int i03 = 0; i03 < ne03; i03++) {\n            for (int i02 = 0; i02 < ne02; i02++) {\n                const float * x = (float *) (src0->data);\n                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);\n\n                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);\n\n                // zT = y * xT\n                {\n                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                            ne11, ne01, ne10,\n                            1.0f,    y, ne10,\n                                     x, ne10,\n                            0.0f,    d, ne01);\n                }\n            }\n        }\n\n        //printf(\"CBLAS F32 = %f ms, %d x %d x %d x %d\\n\", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);\n\n        return;\n    }\n#endif\n\n    if (params->type == GGML_TASK_INIT) {\n        if (nb01 >= nb00) {\n            return;\n        }\n\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        if (nb01 >= nb00) {\n            return;\n        }\n\n        // TODO: fix this memset (wsize is overestimated)\n        //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);\n\n        float * const wdata = params->wdata;\n\n        // cols per thread\n        const int dc = (ne + nth - 1)/nth;\n\n        // col range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, ne);\n\n        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);\n\n        for (int k = 1; k < nth; k++) {\n            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + 
(ne + CACHE_LINE_SIZE_F32)*k + ic0);\n        }\n\n        return;\n    }\n\n    if (nb01 >= nb00) {\n        // TODO: do not support transposed src1\n        assert(nb10 == sizeof(float));\n\n        // parallelize by src0 rows using ggml_vec_dot_f32\n\n        // total rows in src0\n        const int nr = ne01*ne02*ne03;\n\n        // rows per thread\n        const int dr = (nr + nth - 1)/nth;\n\n        // row range for this thread\n        const int ir0 = dr*ith;\n        const int ir1 = MIN(ir0 + dr, nr);\n\n        for (int ir = ir0; ir < ir1; ++ir) {\n            // src0 indices\n            const int i03 = ir/(ne02*ne01);\n            const int i02 = (ir - i03*ne02*ne01)/ne01;\n            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);\n\n            for (int ic = 0; ic < ne11; ++ic) {\n                // src1 indices\n                const int i13 = i03;\n                const int i12 = i02;\n                const int i11 = ic;\n\n                // dst indices\n                const int i0 = i01;\n                const int i1 = i11;\n                const int i2 = i02;\n                const int i3 = i03;\n\n                ggml_vec_dot_f32(ne00,\n                        (float *) ((char *)  dst->data + (i0*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),\n                        (float *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03)),\n                        (float *) ((char *) src1->data + (i11*nb11 + i12*nb12 + i13*nb13)));\n            }\n        }\n    } else {\n        // parallelize by src1 columns using ggml_vec_mad_f32\n        // each thread has its own work data\n        // during FINALIZE we accumulate all work data into dst\n\n        // total columns in src1\n        const int nc = ne10;\n\n        // columns per thread\n        const int dc = (nc + nth - 1)/nth;\n\n        // column range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, nc);\n\n        // work data for thread\n        const int wo = (ne 
+ CACHE_LINE_SIZE_F32)*ith;\n        float * const wdata = params->wdata;\n\n        for (int i13 = 0; i13 < ne13; ++i13) {\n            for (int i12 = 0; i12 < ne12; ++i12) {\n                for (int i11 = 0; i11 < ne11; ++i11) {\n                    for (int ic = ic0; ic < ic1; ++ic) {\n                        // src1 indices\n                        const int i10 = ic;\n\n                        // src0 indices\n                        const int i03 = i13;\n                        const int i02 = i12;\n                        const int i00 = ic;\n\n                        // dst indices\n                        const int i1 = i11;\n                        const int i2 = i12;\n                        const int i3 = i13;\n\n                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);\n\n                        ggml_vec_mad_f32(ne01,\n                                (float *) (wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0),\n                                (float *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03)),\n                               *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13)));\n                    }\n                }\n            }\n        }\n    }\n\n    //int64_t t1 = ggml_perf_time_us();\n    //static int64_t acc = 0;\n    //acc += t1 - t0;\n    //if (t1 - t0 > 10) {\n    //    printf(\"\\n\");\n    //    printf(\"ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\\n\", ne00, ne01, ne02, ne03);\n    //    printf(\"nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\\n\", nb00, nb01, nb02, nb03);\n    //    printf(\"ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\\n\", ne10, ne11, ne12, ne13);\n    //    printf(\"nb10 = %5d, nb11 = %5d, nb12 = %5d, nb13 = %5d\\n\", nb10, nb11, nb12, nb13);\n\n    //    printf(\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\\n\", ith, nth, (int) (t1 - t0), (int) acc);\n    //}\n}\n\nstatic 
void ggml_compute_forward_mul_mat_f16_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    const int ne12 = src1->ne[2];\n    const int ne13 = src1->ne[3];\n\n    const int ne0  = dst->ne[0];\n    const int ne1  = dst->ne[1];\n    const int ne2  = dst->ne[2];\n    const int ne3  = dst->ne[3];\n    const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    const int nb12 = src1->nb[2];\n    const int nb13 = src1->nb[3];\n\n    const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    const int nb2  = dst->nb[2];\n    const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    GGML_ASSERT(ne02 == ne12);\n    GGML_ASSERT(ne03 == ne13);\n    GGML_ASSERT(ne2  == ne12);\n    GGML_ASSERT(ne3  == ne13);\n\n    // TODO: we don't support permuted src0\n    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t) || nb01 == sizeof(ggml_fp16_t));\n\n    // dst cannot be transposed or permuted\n    GGML_ASSERT(nb0 == sizeof(float));\n    GGML_ASSERT(nb0 <= nb1);\n    GGML_ASSERT(nb1 <= nb2);\n    GGML_ASSERT(nb2 <= nb3);\n\n    GGML_ASSERT(ne0 == ne01);\n    GGML_ASSERT(ne1 == ne11);\n    GGML_ASSERT(ne2 == ne02);\n    GGML_ASSERT(ne3 == ne03);\n\n    // nb01 >= nb00 - src0 is not transposed\n    //   compute by src0 rows\n    //\n    // nb00 <  nb01 - src0 is transposed\n    //   compute by src0 columns\n\n#if defined(GGML_USE_ACCELERATE) || 
defined(GGML_USE_OPENBLAS)\n    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {\n        GGML_ASSERT(nb10 == sizeof(float));\n\n        if (params->ith != 0) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_INIT) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_FINALIZE) {\n            return;\n        }\n\n        float * const wdata = params->wdata;\n\n        for (int i03 = 0; i03 < ne03; i03++) {\n            for (int i02 = 0; i02 < ne02; i02++) {\n                {\n                    int id = 0;\n                    for (int i01 = 0; i01 < ne01; ++i01) {\n                        for (int i00 = 0; i00 < ne00; ++i00) {\n                            wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));\n                        }\n                    }\n                }\n\n                const float * x = wdata;\n                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);\n\n                //      float * z =                          wdata + ne00*ne01;\n\n                // z = x * yT\n                //{\n                //    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                //            ne01, ne11, ne00,\n                //            1.0f, x, ne00,\n                //                  y, ne00,\n                //            0.0f, z, ne11);\n                //}\n\n                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);\n\n                // transpose z\n                //for (int j = 0; j < ne11; ++j) {\n                //    for (int i = 0; i < ne01; ++i) {\n                //        d[j*ne01 + i] = z[i*ne11 + j];\n                //    }\n                //}\n\n                {\n#if 1\n                    // zT = y * xT\n                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                            ne11, ne01, ne10,\n                
            1.0f,    y, ne00,\n                                     x, ne00,\n                            0.0f,    d, ne01);\n#else\n                    // zT = (xT * y)T\n                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,\n                            ne01, ne11, ne10,\n                            1.0f,    x, ne00,\n                                     y, ne00,\n                            0.0f,    d, ne01);\n#endif\n                }\n            }\n        }\n\n        /*printf(\"CBLAS F16 = %f ms, %d x %d x %d x %d\\n\", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/\n\n        return;\n    }\n#endif\n\n    if (params->type == GGML_TASK_INIT) {\n        if (nb01 >= nb00) {\n            ggml_fp16_t * const wdata = params->wdata;\n\n            int id = 0;\n            for (int i13 = 0; i13 < ne13; ++i13) {\n                for (int i12 = 0; i12 < ne12; ++i12) {\n                    for (int i11 = 0; i11 < ne11; ++i11) {\n                        for (int i10 = 0; i10 < ne10; ++i10) {\n                            wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));\n                        }\n                    }\n                }\n            }\n\n            GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);\n\n            return;\n        }\n\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        if (nb01 >= nb00) {\n            return;\n        }\n\n        // TODO: fix this memset (wsize is overestimated)\n        //assert(params->wsize == (ggml_nbytes(dst) + CACHE_LINE_SIZE)*nth);\n\n        ggml_fp16_t * const wdata = params->wdata;\n\n        // cols per thread\n        const int dc = (ne + nth - 1)/nth;\n\n        // col range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, ne);\n\n        
for (int i = ic0; i < ic1; ++i) {\n            ((float *) dst->data)[i] = GGML_FP16_TO_FP32(wdata[i]);\n        }\n\n        for (int k = 1; k < nth; k++) {\n            for (int i = ic0; i < ic1; ++i) {\n                ((float *) dst->data)[i] += GGML_FP16_TO_FP32(wdata[(ne + CACHE_LINE_SIZE_F32)*k + i]);\n            }\n        }\n\n        return;\n    }\n\n    if (nb01 >= nb00) {\n        // fp16 -> half the size, so divide by 2\n        // TODO: do not support transposed src1\n        assert(nb10/2 == sizeof(ggml_fp16_t));\n\n        // parallelize by src0 rows using ggml_vec_dot_f16\n\n        // total rows in src0\n        const int nr = ne01*ne02*ne03;\n\n        // rows per thread\n        const int dr = (nr + nth - 1)/nth;\n\n        // row range for this thread\n        const int ir0 = dr*ith;\n        const int ir1 = MIN(ir0 + dr, nr);\n\n        ggml_fp16_t * wdata = params->wdata;\n\n        for (int ir = ir0; ir < ir1; ++ir) {\n            // src0 indices\n            const int i03 = ir/(ne02*ne01);\n            const int i02 = (ir - i03*ne02*ne01)/ne01;\n            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);\n\n            const int i13 = i03;\n            const int i12 = i02;\n\n            const int i0 = i01;\n            const int i2 = i02;\n            const int i3 = i03;\n\n            ggml_fp16_t * src0_row = (ggml_fp16_t *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));\n            ggml_fp16_t * src1_col =                                wdata + (       0 + i12*ne11 + i13*ne12*ne11)*ne00;\n\n            float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));\n\n            assert(ne00 % 32 == 0);\n\n            for (int ic = 0; ic < ne11; ++ic) {\n                ggml_vec_dot_f16(ne00, &dst_col[ic*ne0], src0_row, src1_col + ic*ne00);\n            }\n        }\n    } else {\n        // parallelize by src1 columns using ggml_vec_mad_f16\n        // each thread has its own work data\n        // 
during FINALIZE we accumulate all work data into dst\n\n        // total columns in src1\n        const int nc = ne10;\n\n        // columns per thread\n        const int dc = (nc + nth - 1)/nth;\n\n        // column range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, nc);\n\n        // work data for thread\n        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;\n        ggml_fp16_t * const wdata = params->wdata;\n\n        for (int i13 = 0; i13 < ne13; ++i13) {\n            for (int i12 = 0; i12 < ne12; ++i12) {\n                for (int i11 = 0; i11 < ne11; ++i11) {\n                    // dst indices\n                    const int i1 = i11;\n                    const int i2 = i12;\n                    const int i3 = i13;\n\n                    ggml_fp16_t * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;\n\n                    for (int ic = ic0; ic < ic1; ++ic) {\n                        // src1 indices\n                        const int i10 = ic;\n\n                        // src0 indices\n                        const int i03 = i13;\n                        const int i02 = i12;\n                        const int i00 = ic;\n\n                        assert(sizeof(ggml_fp16_t)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);\n\n                        ggml_fp16_t * src0_col =  (ggml_fp16_t *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));\n                        float         src1_val = *      (float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));\n\n                        ggml_vec_mad_f16(ne01, dst_row, src0_col, src1_val);\n                    }\n                }\n            }\n        }\n    }\n\n    //int64_t t1 = ggml_time_us();\n    //static int64_t acc = 0;\n    //acc += t1 - t0;\n    //if (t1 - t0 > 10) {\n    //    printf(\"\\n\");\n    //    printf(\"ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\\n\", ne00, ne01, ne02, ne03);\n    //    
printf(\"nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\\n\", nb00, nb01, nb02, nb03);\n    //    printf(\"ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\\n\", ne10, ne11, ne12, ne13);\n\n    //    printf(\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\\n\", ith, nth, (int) (t1 - t0), (int) acc);\n    //}\n}\n\nstatic void ggml_compute_forward_mul_mat_q4_0_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    const int ne12 = src1->ne[2];\n    const int ne13 = src1->ne[3];\n\n    const int ne0  = dst->ne[0];\n    const int ne1  = dst->ne[1];\n    const int ne2  = dst->ne[2];\n    const int ne3  = dst->ne[3];\n    const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    const int nb12 = src1->nb[2];\n    const int nb13 = src1->nb[3];\n\n    const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    const int nb2  = dst->nb[2];\n    const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    GGML_ASSERT(ne02 == ne12);\n    GGML_ASSERT(ne03 == ne13);\n    GGML_ASSERT(ne2  == ne12);\n    GGML_ASSERT(ne3  == ne13);\n\n    // TODO: we don't support permuted src0\n    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);\n\n    // dst cannot be transposed or permuted\n    GGML_ASSERT(nb0 == sizeof(float));\n    GGML_ASSERT(nb0 <= nb1);\n    
GGML_ASSERT(nb1 <= nb2);\n    GGML_ASSERT(nb2 <= nb3);\n\n    GGML_ASSERT(ne0 == ne01);\n    GGML_ASSERT(ne1 == ne11);\n    GGML_ASSERT(ne2 == ne02);\n    GGML_ASSERT(ne3 == ne03);\n\n    // nb01 >= nb00 - src0 is not transposed\n    //   compute by src0 rows\n    //\n    // nb00 <  nb01 - src0 is transposed\n    //   compute by src0 columns\n\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {\n        GGML_ASSERT(nb10 == sizeof(float));\n\n        if (params->ith != 0) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_INIT) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_FINALIZE) {\n            return;\n        }\n\n        float * const wdata = params->wdata;\n\n        for (int i03 = 0; i03 < ne03; i03++) {\n            for (int i02 = 0; i02 < ne02; i02++) {\n                {\n                    int id = 0;\n                    for (int i01 = 0; i01 < ne01; ++i01) {\n                        //for (int i00 = 0; i00 < ne00; ++i00) {\n                        //    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));\n                        //}\n                        dequantize_row_q4_0((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);\n                        id += ne00;\n                    }\n                }\n\n                const float * x = wdata;\n                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);\n\n                //      float * z =                          wdata + ne00*ne01;\n\n                // z = x * yT\n                //{\n                //    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                //            ne01, ne11, ne00,\n                //            1.0f, x, ne00,\n                //                  y, ne00,\n                //            0.0f, z, ne11);\n       
         //}\n\n                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);\n\n                // transpose z\n                //for (int j = 0; j < ne11; ++j) {\n                //    for (int i = 0; i < ne01; ++i) {\n                //        d[j*ne01 + i] = z[i*ne11 + j];\n                //    }\n                //}\n\n                {\n#if 1\n                    // zT = y * xT\n                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                            ne11, ne01, ne10,\n                            1.0f,    y, ne00,\n                                     x, ne00,\n                            0.0f,    d, ne01);\n#else\n                    // zT = (xT * y)T\n                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,\n                            ne01, ne11, ne10,\n                            1.0f,    x, ne00,\n                                     y, ne00,\n                            0.0f,    d, ne01);\n#endif\n                }\n            }\n        }\n\n        /*printf(\"CBLAS Q4_0 = %f ms, %d x %d x %d x %d\\n\", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/\n\n        return;\n    }\n#endif\n\n    if (params->type == GGML_TASK_INIT) {\n        //printf(\"HHHHHHHHH ith = %d, nth = %d\\n\", ith, nth);\n        if (nb01 >= nb00) {\n            char * wdata = params->wdata;\n\n            for (int i13 = 0; i13 < ne13; ++i13) {\n                for (int i12 = 0; i12 < ne12; ++i12) {\n                    for (int i11 = 0; i11 < ne11; ++i11) {\n                        //for (int i10 = 0; i10 < ne10; ++i10) {\n                        //    wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));\n                        //}\n                        quantize_row_q4_0((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);\n                        wdata += 
(ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0];\n                    }\n                }\n            }\n\n            return;\n        }\n\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        if (nb01 >= nb00) {\n            return;\n        }\n\n        float * const wdata = params->wdata;\n\n        // cols per thread\n        const int dc = (ne + nth - 1)/nth;\n\n        // col range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, ne);\n\n        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);\n\n        for (int k = 1; k < nth; k++) {\n            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);\n        }\n\n        return;\n    }\n\n    if (nb01 >= nb00) {\n        // TODO: do not support transposed src1\n\n        // parallelize by src0 rows using ggml_vec_dot_q4_0\n\n        // total rows in src0\n        const int nr = ne01*ne02*ne03;\n\n        // rows per thread\n        const int dr = (nr + nth - 1)/nth;\n\n        // row range for this thread\n        const int ir0 = dr*ith;\n        const int ir1 = MIN(ir0 + dr, nr);\n\n        void * wdata = params->wdata;\n\n        for (int ir = ir0; ir < ir1; ++ir) {\n            // src0 indices\n            const int i03 = ir/(ne02*ne01);\n            const int i02 = (ir - i03*ne02*ne01)/ne01;\n            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);\n\n            const int i13 = i03;\n            const int i12 = i02;\n\n            const int i0 = i01;\n            const int i2 = i02;\n            const int i3 = i03;\n\n            void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));\n            char * src1_col =          ((char *)      wdata + (      (0 + i12*ne11 + 
i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0]);\n\n            float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));\n\n            assert(ne00 % 32 == 0);\n\n            for (int ic = 0; ic < ne11; ++ic) {\n                ggml_vec_dot_q4_0(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_0])/GGML_BLCK_SIZE[GGML_TYPE_Q4_0])));\n            }\n        }\n    } else {\n        //printf(\"AAAAA ith = %d, nth = %d\\n\", ith, nth);\n        // parallelize by src1 columns using ggml_vec_mad_q4_0\n        // each thread has its own work data\n        // during FINALIZE we accumulate all work data into dst\n\n        // total columns in src1\n        const int nc = ne10;\n\n        // columns per thread\n        const int dc = (nc + nth - 1)/nth;\n\n        // column range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, nc);\n\n        // work data for thread\n        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;\n        float * const wdata = params->wdata;\n\n        for (int i13 = 0; i13 < ne13; ++i13) {\n            for (int i12 = 0; i12 < ne12; ++i12) {\n                for (int i11 = 0; i11 < ne11; ++i11) {\n                    // dst indices\n                    const int i1 = i11;\n                    const int i2 = i12;\n                    const int i3 = i13;\n\n                    float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;\n\n                    for (int ic = ic0; ic < ic1; ++ic) {\n                        // src1 indices\n                        const int i10 = ic;\n\n                        // src0 indices\n                        const int i03 = i13;\n                        const int i02 = i12;\n                        const int i00 = ic;\n\n                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);\n\n                       
 void * src0_col =   (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));\n                        float  src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));\n\n                        ggml_vec_mad_q4_0(ne01, dst_row, src0_col, src1_val);\n                    }\n                }\n            }\n        }\n    }\n\n    //int64_t t1 = ggml_time_us();\n    //static int64_t acc = 0;\n    //acc += t1 - t0;\n    //if (t1 - t0 > 10) {\n    //    printf(\"\\n\");\n    //    printf(\"ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\\n\", ne00, ne01, ne02, ne03);\n    //    printf(\"nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\\n\", nb00, nb01, nb02, nb03);\n    //    printf(\"ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\\n\", ne10, ne11, ne12, ne13);\n\n    //    printf(\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\\n\", ith, nth, (int) (t1 - t0), (int) acc);\n    //}\n}\n\nstatic void ggml_compute_forward_mul_mat_q4_1_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    const int ne12 = src1->ne[2];\n    const int ne13 = src1->ne[3];\n\n    const int ne0  = dst->ne[0];\n    const int ne1  = dst->ne[1];\n    const int ne2  = dst->ne[2];\n    const int ne3  = dst->ne[3];\n    const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    const int nb12 = src1->nb[2];\n    const int nb13 = 
src1->nb[3];\n\n    const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    const int nb2  = dst->nb[2];\n    const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    GGML_ASSERT(ne02 == ne12);\n    GGML_ASSERT(ne03 == ne13);\n    GGML_ASSERT(ne2  == ne12);\n    GGML_ASSERT(ne3  == ne13);\n\n    // TODO: we don't support permuted src0\n    GGML_ASSERT(nb00 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1] || nb01 == (int) GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);\n\n    // dst cannot be transposed or permuted\n    GGML_ASSERT(nb0 == sizeof(float));\n    GGML_ASSERT(nb0 <= nb1);\n    GGML_ASSERT(nb1 <= nb2);\n    GGML_ASSERT(nb2 <= nb3);\n\n    GGML_ASSERT(ne0 == ne01);\n    GGML_ASSERT(ne1 == ne11);\n    GGML_ASSERT(ne2 == ne02);\n    GGML_ASSERT(ne3 == ne03);\n\n    // nb01 >= nb00 - src0 is not transposed\n    //   compute by src0 rows\n    //\n    // nb00 <  nb01 - src0 is transposed\n    //   compute by src0 columns\n\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {\n        GGML_ASSERT(nb10 == sizeof(float));\n\n        if (params->ith != 0) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_INIT) {\n            return;\n        }\n\n        if (params->type == GGML_TASK_FINALIZE) {\n            return;\n        }\n\n        float * const wdata = params->wdata;\n\n        for (int i03 = 0; i03 < ne03; i03++) {\n            for (int i02 = 0; i02 < ne02; i02++) {\n                {\n                    int id = 0;\n                    for (int i01 = 0; i01 < ne01; ++i01) {\n                        //for (int i00 = 0; i00 < ne00; ++i00) {\n                        //    wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));\n                        //}\n                        dequantize_row_q4_1((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + 
id, ne00);\n                        id += ne00;\n                    }\n                }\n\n                const float * x = wdata;\n                const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);\n\n                //      float * z =                          wdata + ne00*ne01;\n\n                // z = x * yT\n                //{\n                //    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                //            ne01, ne11, ne00,\n                //            1.0f, x, ne00,\n                //                  y, ne00,\n                //            0.0f, z, ne11);\n                //}\n\n                float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);\n\n                // transpose z\n                //for (int j = 0; j < ne11; ++j) {\n                //    for (int i = 0; i < ne01; ++i) {\n                //        d[j*ne01 + i] = z[i*ne11 + j];\n                //    }\n                //}\n\n                {\n#if 1\n                    // zT = y * xT\n                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,\n                            ne11, ne01, ne10,\n                            1.0f,    y, ne00,\n                                     x, ne00,\n                            0.0f,    d, ne01);\n#else\n                    // zT = (xT * y)T\n                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,\n                            ne01, ne11, ne10,\n                            1.0f,    x, ne00,\n                                     y, ne00,\n                            0.0f,    d, ne01);\n#endif\n                }\n            }\n        }\n\n        //printf(\"CBLAS = %f ms, %d x %d x %d x %d\\n\", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);\n\n        return;\n    }\n#endif\n\n    if (params->type == GGML_TASK_INIT) {\n        //printf(\"HHHHHHHHH ith = %d, nth = %d\\n\", ith, nth);\n        if (nb01 >= nb00) {\n            char * wdata = 
params->wdata;\n\n            for (int i13 = 0; i13 < ne13; ++i13) {\n                for (int i12 = 0; i12 < ne12; ++i12) {\n                    for (int i11 = 0; i11 < ne11; ++i11) {\n                        //for (int i10 = 0; i10 < ne10; ++i10) {\n                        //    wdata[id++] = GGML_FP32_TO_FP16(*(float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10));\n                        //}\n                        quantize_row_q4_1((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11), (void *) wdata, ne10);\n                        wdata += (ne10*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1];\n                    }\n                }\n            }\n\n            return;\n        }\n\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        if (nb01 >= nb00) {\n            return;\n        }\n\n        float * const wdata = params->wdata;\n\n        // cols per thread\n        const int dc = (ne + nth - 1)/nth;\n\n        // col range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, ne);\n\n        ggml_vec_cpy_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + ic0);\n\n        for (int k = 1; k < nth; k++) {\n            ggml_vec_acc_f32(ic1 - ic0, (float *) dst->data + ic0, wdata + (ne + CACHE_LINE_SIZE_F32)*k + ic0);\n        }\n\n        return;\n    }\n\n    if (nb01 >= nb00) {\n        // TODO: do not support transposed src1\n\n        // parallelize by src0 rows using ggml_vec_dot_q4_1\n\n        // total rows in src0\n        const int nr = ne01*ne02*ne03;\n\n        // rows per thread\n        const int dr = (nr + nth - 1)/nth;\n\n        // row range for this thread\n        const int ir0 = dr*ith;\n        const int ir1 = MIN(ir0 + dr, nr);\n\n        void * wdata = params->wdata;\n\n        for (int ir = ir0; ir < ir1; ++ir) {\n   
         // src0 indices\n            const int i03 = ir/(ne02*ne01);\n            const int i02 = (ir - i03*ne02*ne01)/ne01;\n            const int i01 = (ir - i03*ne02*ne01 - i02*ne01);\n\n            const int i13 = i03;\n            const int i12 = i02;\n\n            const int i0 = i01;\n            const int i2 = i02;\n            const int i3 = i03;\n\n            void * src0_row = (void *) ((char *) src0->data + (i01*nb01 + i02*nb02 + i03*nb03));\n            char * src1_col =          ((char *)      wdata + (      (0 + i12*ne11 + i13*ne12*ne11)*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1]);\n\n            float * dst_col = (float *) ((char *) dst->data + (i0*nb0 + 0*nb1 + i2*nb2 + i3*nb3));\n\n            assert(ne00 % 32 == 0);\n\n            for (int ic = 0; ic < ne11; ++ic) {\n                ggml_vec_dot_q4_1(ne00, &dst_col[ic*ne0], src0_row, ((void *) (src1_col + (ic*ne00*GGML_TYPE_SIZE[GGML_TYPE_Q4_1])/GGML_BLCK_SIZE[GGML_TYPE_Q4_1])));\n            }\n        }\n    } else {\n        //printf(\"AAAAA ith = %d, nth = %d\\n\", ith, nth);\n        // parallelize by src1 columns using ggml_vec_mad_q4_1\n        // each thread has its own work data\n        // during FINALIZE we accumulate all work data into dst\n\n        // total columns in src1\n        const int nc = ne10;\n\n        // columns per thread\n        const int dc = (nc + nth - 1)/nth;\n\n        // column range for this thread\n        const int ic0 = dc*ith;\n        const int ic1 = MIN(ic0 + dc, nc);\n\n        // work data for thread\n        const int wo = (ne + CACHE_LINE_SIZE_F32)*ith;\n        float * const wdata = params->wdata;\n\n        for (int i13 = 0; i13 < ne13; ++i13) {\n            for (int i12 = 0; i12 < ne12; ++i12) {\n                for (int i11 = 0; i11 < ne11; ++i11) {\n                    // dst indices\n                    const int i1 = i11;\n                    const int i2 = i12;\n                    const int i3 = i13;\n\n              
      float * dst_row = wdata + wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0;\n\n                    for (int ic = ic0; ic < ic1; ++ic) {\n                        // src1 indices\n                        const int i10 = ic;\n\n                        // src0 indices\n                        const int i03 = i13;\n                        const int i02 = i12;\n                        const int i00 = ic;\n\n                        assert(sizeof(float)*(wo + i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + ne01) <= params->wsize);\n\n                        void * src0_col =   (void *) ((char *) src0->data + (i00*nb00 + i02*nb02 + i03*nb03));\n                        float  src1_val = *(float *) ((char *) src1->data + (i10*nb10 + i11*nb11 + i12*nb12 + i13*nb13));\n\n                        ggml_vec_mad_q4_1(ne01, dst_row, src0_col, src1_val);\n                    }\n                }\n            }\n        }\n    }\n\n    //int64_t t1 = ggml_time_us();\n    //static int64_t acc = 0;\n    //acc += t1 - t0;\n    //if (t1 - t0 > 10) {\n    //    printf(\"\\n\");\n    //    printf(\"ne00 = %5d, ne01 = %5d, ne02 = %5d, ne03 = %5d\\n\", ne00, ne01, ne02, ne03);\n    //    printf(\"nb00 = %5d, nb01 = %5d, nb02 = %5d, nb03 = %5d\\n\", nb00, nb01, nb02, nb03);\n    //    printf(\"ne10 = %5d, ne11 = %5d, ne12 = %5d, ne13 = %5d\\n\", ne10, ne11, ne12, ne13);\n\n    //    printf(\"XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX task %d/%d: %d us, acc = %d\\n\", ith, nth, (int) (t1 - t0), (int) acc);\n    //}\n}\n\nstatic void ggml_compute_forward_mul_mat(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                ggml_compute_forward_mul_mat_q4_0_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                
ggml_compute_forward_mul_mat_q4_1_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_mul_mat_f16_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_mul_mat_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n#if 0\n    if (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_Q4_1) {\n        static int first = 8;\n        printf(\"src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\\n\", src0->ne[0], src0->ne[1], src0->ne[2]);\n        printf(\"src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\\n\", src1->ne[0], src1->ne[1], src1->ne[2]);\n        printf(\"dst:  ne0 = %5d, ne1 = %5d, ne2 = %5d\\n\", dst->ne[0], dst->ne[1], dst->ne[2]);\n        if (first) {\n            --first;\n        } else {\n            for (int k = 0; k < dst->ne[1]; ++k) {\n                for (int j = 0; j < dst->ne[0]/16; ++j) {\n                    for (int i = 0; i < 16; ++i) {\n                        printf(\"%8.4f \", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);\n                    }\n                    printf(\"\\n\");\n                }\n                printf(\"\\n\");\n            }\n            printf(\"\\n\");\n            exit(0);\n        }\n    } else {\n        printf(\"aaaa src0: ne0 = %5d, ne1 = %5d, ne2 = %5d\\n\", src0->ne[0], src0->ne[1], src0->ne[2]);\n        printf(\"aaaa src1: ne0 = %5d, ne1 = %5d, ne2 = %5d\\n\", src1->ne[0], src1->ne[1], src1->ne[2]);\n        printf(\"aaaa dst:  ne0 = %5d, ne1 = %5d, ne2 = %5d\\n\", dst->ne[0], dst->ne[1], dst->ne[2]);\n    }\n#endif\n}\n\n// ggml_compute_forward_scale\n\nstatic void ggml_compute_forward_scale_f32(\n        const struct ggml_compute_params * params,\n        const 
struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    GGML_ASSERT(ggml_is_contiguous(src0));\n    GGML_ASSERT(ggml_is_contiguous(dst));\n    GGML_ASSERT(ggml_are_same_shape(src0, dst));\n    GGML_ASSERT(ggml_is_scalar(src1));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // scale factor\n    const float v = *(float *) src1->data;\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nrows(src0);\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        ggml_vec_scale_f32(nc, (float *) ((char *) dst->data + i1*(dst->nb[1])), v);\n    }\n}\n\nstatic void ggml_compute_forward_scale(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_scale_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_cpy\n\nstatic void ggml_compute_forward_cpy(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    ggml_compute_forward_dup(params, src0, dst);\n}\n\n// ggml_compute_forward_reshape\n\nstatic void ggml_compute_forward_reshape(\n        const struct ggml_compute_params * params,\n        const struct 
ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    // NOP\n    UNUSED(params);\n    UNUSED(src0);\n    UNUSED(dst);\n}\n\n// ggml_compute_forward_view\n\nstatic void ggml_compute_forward_view(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0) {\n    // NOP\n    UNUSED(params);\n    UNUSED(src0);\n}\n\n// ggml_compute_forward_permute\n\nstatic void ggml_compute_forward_permute(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0) {\n    // NOP\n    UNUSED(params);\n    UNUSED(src0);\n}\n\n// ggml_compute_forward_transpose\n\nstatic void ggml_compute_forward_transpose(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0) {\n    // NOP\n    UNUSED(params);\n    UNUSED(src0);\n}\n\n// ggml_compute_forward_get_rows\n\nstatic void ggml_compute_forward_get_rows_q4_0(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nelements(src1);\n\n    assert( dst->ne[0] == nc);\n    assert( dst->ne[1] == nr);\n    assert(src0->nb[0] == GGML_TYPE_SIZE[GGML_TYPE_Q4_0]);\n\n    for (int i = 0; i < nr; ++i) {\n        const int r = ((int32_t *) src1->data)[i];\n\n        dequantize_row_q4_0(\n                (const void *) ((char *) src0->data + r*src0->nb[1]),\n                     (float *) ((char *)  dst->data + i*dst->nb[1]), nc);\n    }\n}\n\nstatic void ggml_compute_forward_get_rows_q4_1(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n\n    if (params->type == 
GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nelements(src1);\n\n    assert( dst->ne[0] == nc);\n    assert( dst->ne[1] == nr);\n    assert(src0->nb[0] == GGML_TYPE_SIZE[GGML_TYPE_Q4_1]);\n\n    for (int i = 0; i < nr; ++i) {\n        const int r = ((int32_t *) src1->data)[i];\n\n        dequantize_row_q4_1(\n                (const void *) ((char *) src0->data + r*src0->nb[1]),\n                     (float *) ((char *)  dst->data + i*dst->nb[1]), nc);\n    }\n}\n\nstatic void ggml_compute_forward_get_rows_f16(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nelements(src1);\n\n    assert( dst->ne[0] == nc);\n    assert( dst->ne[1] == nr);\n    assert(src0->nb[0] == sizeof(ggml_fp16_t));\n\n    for (int i = 0; i < nr; ++i) {\n        const int r = ((int32_t *) src1->data)[i];\n\n        for (int j = 0; j < nc; ++j) {\n            ggml_fp16_t v = ((ggml_fp16_t *) ((char *) src0->data + r*src0->nb[1]))[j];\n            ((float *) ((char *)  dst->data + i*dst->nb[1]))[j] = GGML_FP16_TO_FP32(v);\n        }\n    }\n}\n\nstatic void ggml_compute_forward_get_rows_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nelements(src1);\n\n    assert( dst->ne[0] == nc);\n    assert( dst->ne[1] == nr);\n    assert(src0->nb[0] == 
sizeof(float));\n\n    for (int i = 0; i < nr; ++i) {\n        const int r = ((int32_t *) src1->data)[i];\n\n        ggml_vec_cpy_f32(nc,\n                (float *) ((char *)  dst->data + i*dst->nb[1]),\n                (float *) ((char *) src0->data + r*src0->nb[1]));\n    }\n}\n\nstatic void ggml_compute_forward_get_rows(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_Q4_0:\n            {\n                ggml_compute_forward_get_rows_q4_0(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_1:\n            {\n                ggml_compute_forward_get_rows_q4_1(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_get_rows_f16(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_get_rows_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n\n    //static bool first = true;\n    //printf(\"ne0 = %d, ne1 = %d, ne2 = %d\\n\", dst->ne[0], dst->ne[1], dst->ne[2]);\n    //if (first) {\n    //    first = false;\n    //} else {\n    //    for (int k = 0; k < dst->ne[1]; ++k) {\n    //        for (int j = 0; j < dst->ne[0]/16; ++j) {\n    //            for (int i = 0; i < 16; ++i) {\n    //                printf(\"%8.4f \", ((float *) dst->data)[k*dst->ne[0] + j*16 + i]);\n    //            }\n    //            printf(\"\\n\");\n    //        }\n    //        printf(\"\\n\");\n    //    }\n    //    printf(\"\\n\");\n    //    exit(0);\n    //}\n}\n\n// ggml_compute_forward_diag_mask_inf\n\nstatic void 
ggml_compute_forward_diag_mask_inf_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(src1->type == GGML_TYPE_I32);\n    assert(ggml_nelements(src1) == 1);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n_past = ((int32_t *) src1->data)[0];\n\n    // TODO: handle transposed/permuted matrices\n\n    const int n  = ggml_nrows(src0);\n    const int nc = src0->ne[0];\n    const int nr = src0->ne[1];\n    const int nz = n/nr;\n\n    assert( dst->nb[0] == sizeof(float));\n    assert(src0->nb[0] == sizeof(float));\n\n    for (int k = 0; k < nz; k++) {\n        for (int j = 0; j < nr; j++) {\n            for (int i = n_past; i < nc; i++) {\n                if (i > n_past + j) {\n                    *(float *)((char *) dst->data + k*dst->nb[2] + j*dst->nb[1] + i*dst->nb[0]) = -INFINITY;\n                }\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_diag_mask_inf(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_diag_mask_inf_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_soft_max\n\nstatic void ggml_compute_forward_soft_max_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n  
  GGML_ASSERT(ggml_is_contiguous(src0));\n    GGML_ASSERT(ggml_is_contiguous(dst));\n    GGML_ASSERT(ggml_are_same_shape(src0, dst));\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // TODO: handle transposed/permuted matrices\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nc = src0->ne[0];\n    const int nr = ggml_nrows(src0);\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        float *p = (float *)((char *) dst->data + i1*dst->nb[1]);\n\n#ifndef NDEBUG\n        for (int i = 0; i < nc; ++i) {\n            //printf(\"p[%d] = %f\\n\", i, p[i]);\n            assert(!isnan(p[i]));\n        }\n#endif\n\n        float max = -INFINITY;\n        ggml_vec_max_f32(nc, &max, p);\n\n        ggml_float sum = 0.0;\n\n        uint16_t scvt;\n        for (int i = 0; i < nc; i++) {\n            if (p[i] == -INFINITY) {\n                p[i] = 0.0f;\n            } else {\n                //const float val = (p[i] == -INFINITY) ? 
0.0 : exp(p[i] - max);\n                ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max);\n                memcpy(&scvt, &s, sizeof(scvt));\n                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]);\n                sum += val;\n                p[i] = val;\n            }\n        }\n\n        assert(sum > 0.0f);\n\n        sum = 1.0/sum;\n        ggml_vec_scale_f32(nc, p, sum);\n\n#ifndef NDEBUG\n        for (int i = 0; i < nc; ++i) {\n            assert(!isnan(p[i]));\n            assert(!isinf(p[i]));\n        }\n#endif\n    }\n}\n\nstatic void ggml_compute_forward_soft_max(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_soft_max_f32(params, src0, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_F16:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_rope\n\nstatic void ggml_compute_forward_rope_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(src1->type == GGML_TYPE_I32);\n    assert(ggml_nelements(src1) == 3);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n_past = ((int32_t *) src1->data)[0];\n    const int n_dims = ((int32_t *) src1->data)[1];\n    const int mode   = ((int32_t *) src1->data)[2];\n\n    //const int ne0 = src0->ne[0];\n    const int ne1 = src0->ne[1];\n    const int ne2 = src0->ne[2];\n    const int ne3 = src0->ne[3];\n\n    const int nb0 = src0->nb[0];\n    
const int nb1 = src0->nb[1];\n    const int nb2 = src0->nb[2];\n    const int nb3 = src0->nb[3];\n\n    //printf(\"ne0: %d, ne1: %d, ne2: %d, ne3: %d\\n\", ne0, ne1, ne2, ne3);\n    //printf(\"n_past = %d, ne2 = %d\\n\", n_past, ne2);\n\n    assert(nb0 == sizeof(float));\n\n    // TODO: optimize\n    for (int i3 = 0; i3 < ne3; i3++) {\n        for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {\n            const int p = (mode == 0 ? n_past + i2 : i2);\n            for (int i1 = 0; i1 < ne1; i1++) {\n                for (int i0 = 0; i0 < n_dims; i0 += 2) {\n                    const double theta = pow(10000.0, ((double)-i0)/n_dims);\n\n                    const double cos_theta = cos(p*theta);\n                    const double sin_theta = sin(p*theta);\n\n                    const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);\n                          float * dst_data  = (float *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);\n\n                    double x0 = src[0];\n                    double x1 = src[1];\n\n                    dst_data[0] = x0*cos_theta - x1*sin_theta;\n                    dst_data[1] = x0*sin_theta + x1*cos_theta;\n                }\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_rope_f16(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    assert(params->ith == 0);\n    assert(src1->type == GGML_TYPE_I32);\n    assert(ggml_nelements(src1) == 3);\n\n    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    const int n_past = ((int32_t *) src1->data)[0];\n    const int n_dims = ((int32_t *) src1->data)[1];\n    const int mode   = ((int32_t *) src1->data)[2];\n\n    //const int ne0 = src0->ne[0];\n    const int ne1 = src0->ne[1];\n    const int ne2 = src0->ne[2];\n    const int ne3 = 
src0->ne[3];\n\n    const int nb0 = src0->nb[0];\n    const int nb1 = src0->nb[1];\n    const int nb2 = src0->nb[2];\n    const int nb3 = src0->nb[3];\n\n    //printf(\"ne0: %d, ne1: %d, ne2: %d, ne3: %d\\n\", ne0, ne1, ne2, ne3);\n    //printf(\"n_past = %d, ne2 = %d\\n\", n_past, ne2);\n\n    assert(nb0 == sizeof(ggml_fp16_t));\n\n    for (int i3 = 0; i3 < ne3; i3++) {\n        for (int i2 = (mode == 0 ? 0 : n_past); i2 < ne2; i2++) {\n            const int p = (mode == 0 ? n_past + i2 : i2);\n            for (int i1 = 0; i1 < ne1; i1++) {\n                for (int i0 = 0; i0 < n_dims; i0 += 2) {\n                    const double theta = pow(10000.0, ((double)-i0)/n_dims);\n\n                    const double cos_theta = cos(p*theta);\n                    const double sin_theta = sin(p*theta);\n\n                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);\n                          ggml_fp16_t * dst_data  = (ggml_fp16_t *)((char *)  dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0);\n\n                    double x0 = ggml_fp16_to_fp32(src[0]);\n                    double x1 = ggml_fp16_to_fp32(src[1]);\n\n                    dst_data[0] = ggml_fp32_to_fp16(x0*cos_theta - x1*sin_theta);\n                    dst_data[1] = ggml_fp32_to_fp16(x0*sin_theta + x1*cos_theta);\n                }\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_rope(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_rope_f16(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_rope_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        
case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_conv_1d_1s\n\nstatic void ggml_compute_forward_conv_1d_1s_f16_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    GGML_ASSERT(src0->type == GGML_TYPE_F16);\n    GGML_ASSERT(src1->type == GGML_TYPE_F32);\n    GGML_ASSERT( dst->type == GGML_TYPE_F32);\n\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    //const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    //const int ne12 = src1->ne[2];\n    //const int ne13 = src1->ne[3];\n\n    //const int ne0  = dst->ne[0];\n    //const int ne1  = dst->ne[1];\n    //const int ne2  = dst->ne[2];\n    //const int ne3  = dst->ne[3];\n    //const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    //const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    //const int nb12 = src1->nb[2];\n    //const int nb13 = src1->nb[3];\n\n    //const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    //const int nb2  = dst->nb[2];\n    //const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nk = ne00;\n    const int nh = nk/2;\n\n    const int ew0 = ggml_up32(ne01);\n\n    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes\n    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nb10 == sizeof(float));\n\n    if (params->type == GGML_TASK_INIT) {\n        // TODO: fix this memset (wsize is 
overestimated)\n        memset(params->wdata, 0, params->wsize);\n\n        // prepare kernel data (src0)\n        {\n            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;\n\n            for (int i02 = 0; i02 < ne02; i02++) {\n                for (int i01 = 0; i01 < ne01; i01++) {\n                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);\n                    ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;\n                    for (int i00 = 0; i00 < ne00; i00++) {\n                        dst_data[i00*ew0 + i01] = src[i00];\n                    }\n                }\n            }\n        }\n\n        // prepare source data (src1)\n        {\n            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;\n\n            for (int i11 = 0; i11 < ne11; i11++) {\n                const float * const src = (float *)((char *) src1->data + i11*nb11);\n                ggml_fp16_t * dst_data = wdata;\n                for (int i10 = 0; i10 < ne10; i10++) {\n                    dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);\n                }\n            }\n        }\n\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // total rows in dst\n    const int nr = ne02;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        float * dst_data = (float *)((char *) dst->data + i1*nb1);\n        for (int i0 = 0; i0 < ne10; ++i0) {\n            dst_data[i0] = 0;\n            for (int k = -nh; k <= nh; k++) {\n                float v = 0.0f;\n                ggml_vec_dot_f16(ew0, &v,\n                        (ggml_fp16_t *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,\n                        (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + 
k)*ew0);\n\n                dst_data[i0] += v;\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_conv_1d_1s_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    GGML_ASSERT(src0->type == GGML_TYPE_F32);\n    GGML_ASSERT(src1->type == GGML_TYPE_F32);\n    GGML_ASSERT( dst->type == GGML_TYPE_F32);\n\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    //const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    //const int ne12 = src1->ne[2];\n    //const int ne13 = src1->ne[3];\n\n    //const int ne0  = dst->ne[0];\n    //const int ne1  = dst->ne[1];\n    //const int ne2  = dst->ne[2];\n    //const int ne3  = dst->ne[3];\n    //const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    //const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    //const int nb12 = src1->nb[2];\n    //const int nb13 = src1->nb[3];\n\n    //const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    //const int nb2  = dst->nb[2];\n    //const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nk = ne00;\n    const int nh = nk/2;\n\n    const int ew0 = ggml_up32(ne01);\n\n    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes\n    GGML_ASSERT(nb00 == sizeof(float));\n    GGML_ASSERT(nb10 == sizeof(float));\n\n    if (params->type == GGML_TASK_INIT) {\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n\n        // prepare kernel data (src0)\n        {\n            float * const wdata = (float *) 
params->wdata + 0;\n\n            for (int i02 = 0; i02 < ne02; i02++) {\n                for (int i01 = 0; i01 < ne01; i01++) {\n                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);\n                    float * dst_data = wdata + i02*ew0*ne00;\n                    for (int i00 = 0; i00 < ne00; i00++) {\n                        dst_data[i00*ew0 + i01] = src[i00];\n                    }\n                }\n            }\n        }\n\n        // prepare source data (src1)\n        {\n            float * const wdata = (float *) params->wdata + ne02*ew0*ne00;\n\n            for (int i11 = 0; i11 < ne11; i11++) {\n                const float * const src = (float *)((char *) src1->data + i11*nb11);\n                float * dst_data = wdata;\n                for (int i10 = 0; i10 < ne10; i10++) {\n                    dst_data[(i10 + nh)*ew0 + i11] = src[i10];\n                }\n            }\n        }\n\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // total rows in dst\n    const int nr = ne02;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        float * dst_data = (float *)((char *) dst->data + i1*nb1);\n        for (int i0 = 0; i0 < ne10; ++i0) {\n            dst_data[i0] = 0;\n            for (int k = -nh; k <= nh; k++) {\n                float v = 0.0f;\n                ggml_vec_dot_f32(ew0, &v,\n                        (float *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,\n                        (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);\n\n                dst_data[i0] += v;\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_conv_1d_1s(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const 
struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_conv_1d_1s_f16_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_conv_1d_1s_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_conv_1d_2s\n\nstatic void ggml_compute_forward_conv_1d_2s_f16_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    GGML_ASSERT(src0->type == GGML_TYPE_F16);\n    GGML_ASSERT(src1->type == GGML_TYPE_F32);\n    GGML_ASSERT( dst->type == GGML_TYPE_F32);\n\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    //const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    //const int ne12 = src1->ne[2];\n    //const int ne13 = src1->ne[3];\n\n    //const int ne0  = dst->ne[0];\n    //const int ne1  = dst->ne[1];\n    //const int ne2  = dst->ne[2];\n    //const int ne3  = dst->ne[3];\n    //const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    //const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    //const int nb12 = src1->nb[2];\n    //const int nb13 = src1->nb[3];\n\n    //const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    //const int nb2  = dst->nb[2];\n    
//const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nk = ne00;\n    const int nh = nk/2;\n\n    const int ew0 = ggml_up32(ne01);\n\n    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes\n    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nb10 == sizeof(float));\n\n    if (params->type == GGML_TASK_INIT) {\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n\n        // prepare kernel data (src0)\n        {\n            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;\n\n            for (int i02 = 0; i02 < ne02; i02++) {\n                for (int i01 = 0; i01 < ne01; i01++) {\n                    const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i02*nb02 + i01*nb01);\n                    ggml_fp16_t * dst_data = wdata + i02*ew0*ne00;\n                    for (int i00 = 0; i00 < ne00; i00++) {\n                        dst_data[i00*ew0 + i01] = src[i00];\n                    }\n                }\n            }\n        }\n\n        // prepare source data (src1)\n        {\n            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + ne02*ew0*ne00;\n\n            for (int i11 = 0; i11 < ne11; i11++) {\n                const float * const src = (float *)((char *) src1->data + i11*nb11);\n                ggml_fp16_t * dst_data = wdata;\n                for (int i10 = 0; i10 < ne10; i10++) {\n                    dst_data[(i10 + nh)*ew0 + i11] = GGML_FP32_TO_FP16(src[i10]);\n                }\n            }\n        }\n\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // total rows in dst\n    const int nr = ne02;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n   
     float * dst_data = (float *)((char *) dst->data + i1*nb1);\n        for (int i0 = 0; i0 < ne10; i0 += 2) {\n            dst_data[i0/2] = 0;\n            for (int k = -nh; k <= nh; k++) {\n                float v = 0.0f;\n                ggml_vec_dot_f16(ew0, &v,\n                        (ggml_fp16_t *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,\n                        (ggml_fp16_t *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);\n\n                dst_data[i0/2] += v;\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_conv_1d_2s_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n              struct ggml_tensor * dst) {\n    GGML_ASSERT(src0->type == GGML_TYPE_F32);\n    GGML_ASSERT(src1->type == GGML_TYPE_F32);\n    GGML_ASSERT( dst->type == GGML_TYPE_F32);\n\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int ne00 = src0->ne[0];\n    const int ne01 = src0->ne[1];\n    const int ne02 = src0->ne[2];\n    //const int ne03 = src0->ne[3];\n\n    const int ne10 = src1->ne[0];\n    const int ne11 = src1->ne[1];\n    //const int ne12 = src1->ne[2];\n    //const int ne13 = src1->ne[3];\n\n    //const int ne0  = dst->ne[0];\n    //const int ne1  = dst->ne[1];\n    //const int ne2  = dst->ne[2];\n    //const int ne3  = dst->ne[3];\n    //const int ne   = ne0*ne1*ne2*ne3;\n\n    const int nb00 = src0->nb[0];\n    const int nb01 = src0->nb[1];\n    const int nb02 = src0->nb[2];\n    //const int nb03 = src0->nb[3];\n\n    const int nb10 = src1->nb[0];\n    const int nb11 = src1->nb[1];\n    //const int nb12 = src1->nb[2];\n    //const int nb13 = src1->nb[3];\n\n    //const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    //const int nb2  = dst->nb[2];\n    //const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int nk = ne00;\n    const int nh = nk/2;\n\n   
 const int ew0 = ggml_up32(ne01);\n\n    GGML_ASSERT(ne00 % 2 == 1); // TODO: support even kernel sizes\n    GGML_ASSERT(nb00 == sizeof(float));\n    GGML_ASSERT(nb10 == sizeof(float));\n\n    if (params->type == GGML_TASK_INIT) {\n        // TODO: fix this memset (wsize is overestimated)\n        memset(params->wdata, 0, params->wsize);\n\n        // prepare kernel data (src0)\n        {\n            float * const wdata = (float *) params->wdata + 0;\n\n            for (int i02 = 0; i02 < ne02; i02++) {\n                for (int i01 = 0; i01 < ne01; i01++) {\n                    const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);\n                    float * dst_data = wdata + i02*ew0*ne00;\n                    for (int i00 = 0; i00 < ne00; i00++) {\n                        dst_data[i00*ew0 + i01] = src[i00];\n                    }\n                }\n            }\n        }\n\n        // prepare source data (src1)\n        {\n            float * const wdata = (float *) params->wdata + ne02*ew0*ne00;\n\n            for (int i11 = 0; i11 < ne11; i11++) {\n                const float * const src = (float *)((char *) src1->data + i11*nb11);\n                float * dst_data = wdata;\n                for (int i10 = 0; i10 < ne10; i10++) {\n                    dst_data[(i10 + nh)*ew0 + i11] = src[i10];\n                }\n            }\n        }\n\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // total rows in dst\n    const int nr = ne02;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int i1 = ir0; i1 < ir1; i1++) {\n        float * dst_data = (float *)((char *) dst->data + i1*nb1);\n        for (int i0 = 0; i0 < ne10; i0 += 2) {\n            dst_data[i0/2] = 0;\n            for (int k = -nh; k <= nh; k++) {\n                float v = 0.0f;\n        
        ggml_vec_dot_f32(ew0, &v,\n                        (float *) params->wdata +   i1*ew0*ne00 +      (nh + k)*ew0,\n                        (float *) params->wdata + ne02*ew0*ne00 + (i0 + nh + k)*ew0);\n\n                dst_data[i0/2] += v;\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_conv_1d_2s(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * src0,\n        const struct ggml_tensor * src1,\n        struct ggml_tensor * dst) {\n    switch (src0->type) {\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_conv_1d_2s_f16_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_conv_1d_2s_f32(params, src0, src1, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_flash_attn\n\nstatic void ggml_compute_forward_flash_attn_f32(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * q,\n        const struct ggml_tensor * k,\n        const struct ggml_tensor * v,\n        const bool masked,\n             struct ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int neq0 = q->ne[0];\n    const int neq1 = q->ne[1];\n    const int neq2 = q->ne[2];\n    const int neq3 = q->ne[3];\n\n    const int nek0 = k->ne[0];\n    const int nek1 = k->ne[1];\n    //const int nek2 = k->ne[2];\n    //const int nek3 = k->ne[3];\n\n    //const int nev0 = v->ne[0];\n    const int nev1 = v->ne[1];\n    //const int nev2 = v->ne[2];\n    //const int nev3 = v->ne[3];\n\n    const int ne0  = dst->ne[0];\n    const int ne1  = dst->ne[1];\n    //const int ne2  = dst->ne[2];\n    //const int ne3  
= dst->ne[3];\n\n    const int nbk0 = k->nb[0];\n    const int nbk1 = k->nb[1];\n    const int nbk2 = k->nb[2];\n    const int nbk3 = k->nb[3];\n\n    const int nbq0 = q->nb[0];\n    const int nbq1 = q->nb[1];\n    const int nbq2 = q->nb[2];\n    const int nbq3 = q->nb[3];\n\n    const int nbv0 = v->nb[0];\n    const int nbv1 = v->nb[1];\n    const int nbv2 = v->nb[2];\n    const int nbv3 = v->nb[3];\n\n    const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    const int nb2  = dst->nb[2];\n    const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int D = neq0;\n    const int N = neq1;\n    const int P = nek1 - N;\n    const int M = P + N;\n\n    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);\n\n    GGML_ASSERT(ne0 == D);\n    GGML_ASSERT(ne1 == N);\n    GGML_ASSERT(P >= 0);\n\n    GGML_ASSERT(nbq0 == sizeof(float));\n    GGML_ASSERT(nbk0 == sizeof(float));\n    GGML_ASSERT(nbv0 == sizeof(float));\n\n    GGML_ASSERT(neq0 == D);\n    GGML_ASSERT(nek0 == D);\n    GGML_ASSERT(nev1 == D);\n\n    GGML_ASSERT(neq1 == N);\n    GGML_ASSERT(nek1 == N + P);\n    GGML_ASSERT(nev1 == D);\n\n    // dst cannot be transposed or permuted\n    GGML_ASSERT(nb0 == sizeof(float));\n    GGML_ASSERT(nb0 <= nb1);\n    GGML_ASSERT(nb1 <= nb2);\n    GGML_ASSERT(nb2 <= nb3);\n\n    if (params->type == GGML_TASK_INIT) {\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // parallelize by q rows using ggml_vec_dot_f32\n\n    // total rows in q\n    const int nr = neq1*neq2*neq3;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    const float scale = 1.0/sqrt((double) D);\n\n    //printf(\"P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\\n\", P, N, D, ir0, ir1, scale);\n\n    for (int ir = ir0; ir < ir1; ++ir) {\n        // q indices\n        const int 
iq3 = ir/(neq2*neq1);\n        const int iq2 = (ir - iq3*neq2*neq1)/neq1;\n        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);\n\n        float * S = (float *) params->wdata + ith*(Mup + CACHE_LINE_SIZE_F32);\n\n        for (int i = M; i < Mup; ++i) {\n            S[i] = -INFINITY;\n        }\n\n        for (int ic = 0; ic < nek1; ++ic) {\n            // k indices\n            const int ik3 = iq3;\n            const int ik2 = iq2;\n            const int ik1 = ic;\n\n            // S indices\n            const int i1 = ik1;\n\n            ggml_vec_dot_f32(neq0,\n                    S + i1,\n                    (float *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),\n                    (float *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));\n        }\n\n        // scale\n        ggml_vec_scale_f32(nek1, S, scale);\n\n        if (masked) {\n            for (int i = P; i < M; i++) {\n                if (i > P + iq1) {\n                    S[i] = -INFINITY;\n                }\n            }\n        }\n\n        // softmax\n        {\n            float max = -INFINITY;\n            ggml_vec_max_f32(M, &max, S);\n\n            float sum = 0.0f;\n            {\n#ifdef GGML_SOFT_MAX_ACCELERATE\n                max = -max;\n                vDSP_vsadd(S, 1, &max, S, 1, Mup);\n                vvexpf(S, S, &Mup);\n                ggml_vec_sum_f32(Mup, &sum, S);\n#else\n                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];\n                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };\n\n                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {\n                    float * SS = S + i;\n\n                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {\n                        if (SS[j] == -INFINITY) {\n                            SS[j] = 0.0f;\n                        } else {\n                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);\n                            memcpy(&scvt[j], &s, sizeof(uint16_t));\n            
                const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);\n                            sump[j] += val;\n                            SS[j] = val;\n                        }\n                    }\n                }\n\n                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {\n                    sum += sump[i];\n                }\n#endif\n            }\n\n            assert(sum > 0.0f);\n\n            sum = 1.0/sum;\n            ggml_vec_scale_f32(M, S, sum);\n\n#ifndef NDEBUG\n            for (int i = 0; i < M; ++i) {\n                assert(!isnan(S[i]));\n                assert(!isinf(S[i]));\n            }\n#endif\n        }\n\n        for (int ic = 0; ic < nev1; ++ic) {\n            // dst indices\n            const int i1 = iq1;\n            const int i2 = iq2;\n            const int i3 = iq3;\n\n            ggml_vec_dot_f32(nek1,\n                    (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),\n                    (float *) ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),\n                    S);\n        }\n    }\n}\n\nstatic void ggml_compute_forward_flash_attn_f16(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * q,\n        const struct ggml_tensor * k,\n        const struct ggml_tensor * v,\n        const bool masked,\n             struct ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int neq0 = q->ne[0];\n    const int neq1 = q->ne[1];\n    const int neq2 = q->ne[2];\n    const int neq3 = q->ne[3];\n\n    const int nek0 = k->ne[0];\n    const int nek1 = k->ne[1];\n    //const int nek2 = k->ne[2];\n    //const int nek3 = k->ne[3];\n\n    //const int nev0 = v->ne[0];\n    const int nev1 = v->ne[1];\n    //const int nev2 = v->ne[2];\n    //const int nev3 = v->ne[3];\n\n    const int ne0  = dst->ne[0];\n    const int ne1  = dst->ne[1];\n    //const int ne2  = dst->ne[2];\n    //const int ne3  = dst->ne[3];\n\n 
   const int nbk0 = k->nb[0];\n    const int nbk1 = k->nb[1];\n    const int nbk2 = k->nb[2];\n    const int nbk3 = k->nb[3];\n\n    const int nbq0 = q->nb[0];\n    const int nbq1 = q->nb[1];\n    const int nbq2 = q->nb[2];\n    const int nbq3 = q->nb[3];\n\n    const int nbv0 = v->nb[0];\n    const int nbv1 = v->nb[1];\n    const int nbv2 = v->nb[2];\n    const int nbv3 = v->nb[3];\n\n    const int nb0  = dst->nb[0];\n    const int nb1  = dst->nb[1];\n    const int nb2  = dst->nb[2];\n    const int nb3  = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int D = neq0;\n    const int N = neq1;\n    const int P = nek1 - N;\n    const int M = P + N;\n\n    const int Mup = ggml_up(M, GGML_SOFT_MAX_UNROLL);\n\n    GGML_ASSERT(ne0 == D);\n    GGML_ASSERT(ne1 == N);\n    GGML_ASSERT(P >= 0);\n\n    GGML_ASSERT(nbq0 == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nbk0 == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nbv0 == sizeof(ggml_fp16_t));\n\n    GGML_ASSERT(neq0 == D);\n    GGML_ASSERT(nek0 == D);\n    GGML_ASSERT(nev1 == D);\n\n    GGML_ASSERT(neq1 == N);\n    GGML_ASSERT(nek1 == N + P);\n    GGML_ASSERT(nev1 == D);\n\n    // dst cannot be transposed or permuted\n    GGML_ASSERT(nb0 == sizeof(float));\n    GGML_ASSERT(nb0 <= nb1);\n    GGML_ASSERT(nb1 <= nb2);\n    GGML_ASSERT(nb2 <= nb3);\n\n    if (params->type == GGML_TASK_INIT) {\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // parallelize by q rows using ggml_vec_dot_f32\n\n    // total rows in q\n    const int nr = neq1*neq2*neq3;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    const float scale = 1.0/sqrt((double) D);\n\n    //printf(\"P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\\n\", P, N, D, ir0, ir1, scale);\n\n    for (int ir = ir0; ir < ir1; ++ir) {\n        // q indices\n        const int 
iq3 = ir/(neq2*neq1);\n        const int iq2 = (ir - iq3*neq2*neq1)/neq1;\n        const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);\n\n        float * S = (float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32);\n\n        for (int i = M; i < Mup; ++i) {\n            S[i] = -INFINITY;\n        }\n\n        if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {\n            for (int ic = 0; ic < nek1; ++ic) {\n                // k indices\n                const int ik3 = iq3;\n                const int ik2 = iq2;\n                const int ik1 = ic;\n\n                // S indices\n                const int i1 = ik1;\n\n                ggml_vec_dot_f16(neq0,\n                        S + i1,\n                        (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),\n                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));\n            }\n        } else {\n            for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {\n                // k indices\n                const int ik3 = iq3;\n                const int ik2 = iq2;\n                const int ik1 = ic;\n\n                // S indices\n                const int i1 = ik1;\n\n                ggml_vec_dot_f16_unroll(neq0, nbk1,\n                        S + i1,\n                        ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),\n                        (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));\n            }\n        }\n\n        // scale\n        ggml_vec_scale_f32(nek1, S, scale);\n\n        if (masked) {\n            for (int i = P; i < M; i++) {\n                if (i > P + iq1) {\n                    S[i] = -INFINITY;\n                }\n            }\n        }\n\n        // softmax\n        {\n            float max = -INFINITY;\n            ggml_vec_max_f32(M, &max, S);\n\n            float sum = 0.0f;\n            {\n#ifdef GGML_SOFT_MAX_ACCELERATE\n                max = -max;\n           
     vDSP_vsadd(S, 1, &max, S, 1, Mup);\n                vvexpf(S, S, &Mup);\n                ggml_vec_sum_f32(Mup, &sum, S);\n#else\n                uint16_t   scvt[GGML_SOFT_MAX_UNROLL];\n                ggml_float sump[GGML_SOFT_MAX_UNROLL] = { 0.0 };\n\n                for (int i = 0; i < Mup; i += GGML_SOFT_MAX_UNROLL) {\n                    float * SS = S + i;\n\n                    for (int j = 0; j < GGML_SOFT_MAX_UNROLL; ++j) {\n                        if (SS[j] == -INFINITY) {\n                            SS[j] = 0.0f;\n                        } else {\n                            ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max);\n                            memcpy(&scvt[j], &s, sizeof(uint16_t));\n                            const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]);\n                            sump[j] += val;\n                            SS[j] = val;\n                        }\n                    }\n                }\n\n                for (int i = 0; i < GGML_SOFT_MAX_UNROLL; i++) {\n                    sum += sump[i];\n                }\n#endif\n            }\n\n            assert(sum > 0.0f);\n\n            sum = 1.0/sum;\n            ggml_vec_scale_f32(M, S, sum);\n\n#ifndef NDEBUG\n            for (int i = 0; i < M; ++i) {\n                assert(!isnan(S[i]));\n                assert(!isinf(S[i]));\n            }\n#endif\n        }\n\n        ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*Mup + CACHE_LINE_SIZE_F32) + Mup);\n\n        for (int i = 0; i < M; i++) {\n            S16[i] = GGML_FP32_TO_FP16(S[i]);\n        }\n\n        if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {\n            for (int ic = 0; ic < nev1; ++ic) {\n                // dst indices\n                const int i1 = iq1;\n                const int i2 = iq2;\n                const int i3 = iq3;\n\n                ggml_vec_dot_f16(nek1,\n                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1  
+ i2*nb2  + i3*nb3)),\n                        (ggml_fp16_t *) ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),\n                        S16);\n            }\n        } else {\n            for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {\n                // dst indices\n                const int i1 = iq1;\n                const int i2 = iq2;\n                const int i3 = iq3;\n\n                ggml_vec_dot_f16_unroll(nek1, nbv1,\n                        (float *) ((char *) dst->data + (ic*nb0 + i1*nb1  + i2*nb2  + i3*nb3)),\n                        ((char *) v->data   + (         ic*nbv1 + i2*nbv2 + i3*nbv3)),\n                        S16);\n            }\n        }\n    }\n}\n\nstatic void ggml_compute_forward_flash_attn(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * q,\n        const struct ggml_tensor * k,\n        const struct ggml_tensor * v,\n        const bool masked,\n        struct ggml_tensor * dst) {\n    switch (q->type) {\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_flash_attn_f16(params, q, k, v, masked, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                ggml_compute_forward_flash_attn_f32(params, q, k, v, masked, dst);\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n// ggml_compute_forward_flash_ff\n\nstatic void ggml_compute_forward_flash_ff_f16(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * a,  // F16\n        const struct ggml_tensor * b0, // F16 fc_w\n        const struct ggml_tensor * b1, // F32 fc_b\n        const struct ggml_tensor * c0, // F16 proj_w\n        const struct ggml_tensor * c1, // F32 proj_b\n        struct 
ggml_tensor * dst) {\n    int64_t t0 = ggml_perf_time_us();\n    UNUSED(t0);\n\n    const int nea0 = a->ne[0];\n    const int nea1 = a->ne[1];\n    const int nea2 = a->ne[2];\n    const int nea3 = a->ne[3];\n\n    const int neb00 = b0->ne[0];\n    const int neb01 = b0->ne[1];\n    //const int neb02 = b0->ne[2];\n    //const int neb03 = b0->ne[3];\n\n    const int neb10 = b1->ne[0];\n    const int neb11 = b1->ne[1];\n    //const int neb12 = b1->ne[2];\n    //const int neb13 = b1->ne[3];\n\n    const int nec00 = c0->ne[0];\n    const int nec01 = c0->ne[1];\n    //const int nec02 = c0->ne[2];\n    //const int nec03 = c0->ne[3];\n\n    const int nec10 = c1->ne[0];\n    const int nec11 = c1->ne[1];\n    //const int nec12 = c1->ne[2];\n    //const int nec13 = c1->ne[3];\n\n    const int ne0 = dst->ne[0];\n    const int ne1 = dst->ne[1];\n    const int ne2 = dst->ne[2];\n    //const int ne3 = dst->ne[3];\n\n    const int nba0 = a->nb[0];\n    const int nba1 = a->nb[1];\n    const int nba2 = a->nb[2];\n    const int nba3 = a->nb[3];\n\n    const int nbb00 = b0->nb[0];\n    const int nbb01 = b0->nb[1];\n    const int nbb02 = b0->nb[2];\n    const int nbb03 = b0->nb[3];\n\n    const int nbb10 = b1->nb[0];\n    //const int nbb11 = b1->nb[1];\n    //const int nbb12 = b1->nb[2];\n    //const int nbb13 = b1->nb[3];\n\n    const int nbc00 = c0->nb[0];\n    const int nbc01 = c0->nb[1];\n    const int nbc02 = c0->nb[2];\n    const int nbc03 = c0->nb[3];\n\n    const int nbc10 = c1->nb[0];\n    //const int nbc11 = c1->nb[1];\n    //const int nbc12 = c1->nb[2];\n    //const int nbc13 = c1->nb[3];\n\n    const int nb0 = dst->nb[0];\n    const int nb1 = dst->nb[1];\n    const int nb2 = dst->nb[2];\n    const int nb3 = dst->nb[3];\n\n    const int ith = params->ith;\n    const int nth = params->nth;\n\n    const int D = nea0;\n    //const int N = nea1;\n    const int M = neb01;\n\n    GGML_ASSERT(ne0 == nea0);\n    GGML_ASSERT(ne1 == nea1);\n    GGML_ASSERT(ne2 == nea2);\n\n    
GGML_ASSERT(nba0  == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nbb00 == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nbb10 == sizeof(float));\n    GGML_ASSERT(nbc00 == sizeof(ggml_fp16_t));\n    GGML_ASSERT(nbc10 == sizeof(float));\n\n    GGML_ASSERT(neb00 == D);\n    GGML_ASSERT(neb01 == M);\n    GGML_ASSERT(neb10 == M);\n    GGML_ASSERT(neb11 == 1);\n\n    GGML_ASSERT(nec00 == M);\n    GGML_ASSERT(nec01 == D);\n    GGML_ASSERT(nec10 == D);\n    GGML_ASSERT(nec11 == 1);\n\n    // dst cannot be transposed or permuted\n    GGML_ASSERT(nb0 == sizeof(float));\n    GGML_ASSERT(nb0 <= nb1);\n    GGML_ASSERT(nb1 <= nb2);\n    GGML_ASSERT(nb2 <= nb3);\n\n    if (params->type == GGML_TASK_INIT) {\n        return;\n    }\n\n    if (params->type == GGML_TASK_FINALIZE) {\n        return;\n    }\n\n    // parallelize by a rows using ggml_vec_dot_f32\n\n    // total rows in a\n    const int nr = nea1*nea2*nea3;\n\n    // rows per thread\n    const int dr = (nr + nth - 1)/nth;\n\n    // row range for this thread\n    const int ir0 = dr*ith;\n    const int ir1 = MIN(ir0 + dr, nr);\n\n    for (int ir = ir0; ir < ir1; ++ir) {\n        // a indices\n        const int ia3 = ir/(nea2*nea1);\n        const int ia2 = (ir - ia3*nea2*nea1)/nea1;\n        const int ia1 = (ir - ia3*nea2*nea1 - ia2*nea1);\n\n        float * S = (float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32);\n\n        for (int ic = 0; ic < neb01; ++ic) {\n            // b0 indices\n            const int ib03 = ia3;\n            const int ib02 = ia2;\n            const int ib01 = ic;\n\n            // S indices\n            const int i1 = ib01;\n\n            ggml_vec_dot_f16(nea0,\n                    S + i1,\n                    (ggml_fp16_t *) ((char *) b0->data + (ib01*nbb01 + ib02*nbb02 + ib03*nbb03)),\n                    (ggml_fp16_t *) ((char *)  a->data + ( ia1*nba1  +  ia2*nba2  +  ia3*nba3)));\n        }\n\n        ggml_vec_add_f32(neb01, S, S, (float *) b1->data);\n        //ggml_vec_gelu_f32(neb01, S, S);\n\n  
      ggml_fp16_t * S16 = (ggml_fp16_t *) ((float *) params->wdata + ith*(2*M + CACHE_LINE_SIZE_F32) + M);\n\n        for (int i = 0; i < M; i++) {\n            S16[i] = GGML_FP32_TO_FP16(S[i]);\n        }\n\n        ggml_vec_gelu_f16(neb01, S16, S16);\n\n        {\n            // dst indices\n            const int i1 = ia1;\n            const int i2 = ia2;\n            const int i3 = ia3;\n\n            for (int ic = 0; ic < nec01; ++ic) {\n\n                ggml_vec_dot_f16(neb01,\n                        (float *)       ((char *) dst->data + (ic*nb0 + i1*nb1   + i2*nb2   + i3*nb3)),\n                        (ggml_fp16_t *) ((char *) c0->data  + (         ic*nbc01 + i2*nbc02 + i3*nbc03)),\n                        S16);\n            }\n\n            ggml_vec_add_f32(nec01,\n                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),\n                    (float *) ((char *) dst->data + (i1*nb1 + i2*nb2 + i3*nb3)),\n                    (float *) c1->data);\n        }\n    }\n}\n\nstatic void ggml_compute_forward_flash_ff(\n        const struct ggml_compute_params * params,\n        const struct ggml_tensor * a,\n        const struct ggml_tensor * b0,\n        const struct ggml_tensor * b1,\n        const struct ggml_tensor * c0,\n        const struct ggml_tensor * c1,\n        struct ggml_tensor * dst) {\n    switch (b0->type) {\n        case GGML_TYPE_F16:\n            {\n                ggml_compute_forward_flash_ff_f16(params, a, b0, b1, c0, c1, dst);\n            } break;\n        case GGML_TYPE_F32:\n            {\n                GGML_ASSERT(false); // TODO\n            } break;\n        case GGML_TYPE_Q4_0:\n        case GGML_TYPE_Q4_1:\n        case GGML_TYPE_I8:\n        case GGML_TYPE_I16:\n        case GGML_TYPE_I32:\n        case GGML_TYPE_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n/////////////////////////////////\n\nstatic void ggml_compute_forward(struct ggml_compute_params * 
params, struct ggml_tensor * tensor) {\n    GGML_ASSERT(params);\n\n    switch (tensor->op) {\n        case GGML_OP_DUP:\n            {\n                ggml_compute_forward_dup(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_ADD:\n            {\n                ggml_compute_forward_add(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_SUB:\n            {\n                ggml_compute_forward_sub(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_MUL:\n            {\n                ggml_compute_forward_mul(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_DIV:\n            {\n                ggml_compute_forward_div(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_SQR:\n            {\n                ggml_compute_forward_sqr(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_SQRT:\n            {\n                ggml_compute_forward_sqrt(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_SUM:\n            {\n                ggml_compute_forward_sum(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_MEAN:\n            {\n                ggml_compute_forward_mean(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_REPEAT:\n            {\n                ggml_compute_forward_repeat(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_ABS:\n            {\n                ggml_compute_forward_abs(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_SGN:\n            {\n                ggml_compute_forward_sgn(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_NEG:\n            {\n                ggml_compute_forward_neg(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_STEP:\n            {\n               
 ggml_compute_forward_step(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_RELU:\n            {\n                ggml_compute_forward_relu(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_GELU:\n            {\n                ggml_compute_forward_gelu(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_SILU:\n            {\n                ggml_compute_forward_silu(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_NORM:\n            {\n                ggml_compute_forward_norm(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_MUL_MAT:\n            {\n                ggml_compute_forward_mul_mat(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_SCALE:\n            {\n                ggml_compute_forward_scale(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_CPY:\n            {\n                ggml_compute_forward_cpy(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_RESHAPE:\n            {\n                ggml_compute_forward_reshape(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_VIEW:\n            {\n                ggml_compute_forward_view(params, tensor->src0);\n            } break;\n        case GGML_OP_PERMUTE:\n            {\n                ggml_compute_forward_permute(params, tensor->src0);\n            } break;\n        case GGML_OP_TRANSPOSE:\n            {\n                ggml_compute_forward_transpose(params, tensor->src0);\n            } break;\n        case GGML_OP_GET_ROWS:\n            {\n                ggml_compute_forward_get_rows(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_DIAG_MASK_INF:\n            {\n                ggml_compute_forward_diag_mask_inf(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case 
GGML_OP_SOFT_MAX:\n            {\n                ggml_compute_forward_soft_max(params, tensor->src0, tensor);\n            } break;\n        case GGML_OP_ROPE:\n            {\n                ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_CONV_1D_1S:\n            {\n                ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_CONV_1D_2S:\n            {\n                ggml_compute_forward_conv_1d_2s(params, tensor->src0, tensor->src1, tensor);\n            } break;\n        case GGML_OP_FLASH_ATTN:\n            {\n                int32_t t = ggml_get_i32_1d(tensor->opt[1], 0);\n                GGML_ASSERT(t == 0 || t == 1);\n                bool masked = t != 0;\n                ggml_compute_forward_flash_attn(params, tensor->src0, tensor->src1, tensor->opt[0], masked, tensor);\n            } break;\n        case GGML_OP_FLASH_FF:\n            {\n                ggml_compute_forward_flash_ff(params, tensor->src0, tensor->src1, tensor->opt[0], tensor->opt[1], tensor->opt[2], tensor);\n            } break;\n        case GGML_OP_NONE:\n            {\n                // nop\n            } break;\n        case GGML_OP_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\nstatic void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor * tensor, bool inplace) {\n    struct ggml_tensor * src0 = tensor->src0;\n    struct ggml_tensor * src1 = tensor->src1;\n\n    switch (tensor->op) {\n        case GGML_OP_DUP:\n            {\n                if (src0->grad) {\n                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);\n                }\n            } break;\n        case GGML_OP_ADD:\n            {\n                if (src0->grad) {\n                    src0->grad 
= ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);\n                }\n                if (src1->grad) {\n                    src1->grad = ggml_add_impl(ctx, src1->grad, tensor->grad, inplace);\n                }\n            } break;\n        case GGML_OP_SUB:\n            {\n                if (src0->grad) {\n                    src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace);\n                }\n                if (src1->grad) {\n                    src1->grad = ggml_sub_impl(ctx, src1->grad, tensor->grad, inplace);\n                }\n            } break;\n        case GGML_OP_MUL:\n            {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                ggml_mul(ctx, src1, tensor->grad),\n                                inplace);\n                }\n                if (src1->grad) {\n                    src1->grad =\n                        ggml_add_impl(ctx,\n                                src1->grad,\n                                ggml_mul(ctx, src0, tensor->grad),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_DIV:\n            {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                ggml_div(ctx, tensor->grad, src1),\n                                inplace);\n                }\n                if (src1->grad) {\n                    src1->grad =\n                        ggml_sub_impl(ctx,\n                                src1->grad,\n                                ggml_mul(ctx,\n                                    tensor->grad,\n                                    ggml_div(ctx, tensor, src1)),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_SQR:\n    
        {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                ggml_mul(ctx,\n                                    ggml_mul(ctx, src0, tensor->grad),\n                                    ggml_repeat(ctx, ggml_new_f32(ctx, 2.0f), src0)),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_SQRT:\n            {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                ggml_div(ctx,\n                                    ggml_repeat(ctx, ggml_new_f32(ctx, 0.5f), tensor),\n                                    tensor),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_SUM:\n            {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                ggml_repeat(ctx, tensor->grad, src0->grad),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_MEAN:\n            {\n                GGML_ASSERT(false); // TODO: implement\n            } break;\n        case GGML_OP_REPEAT:\n            {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                ggml_sum(ctx, tensor->grad),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_ABS:\n            {\n                if (src0->grad) {\n                    src0->grad =\n                        ggml_add_impl(ctx,\n                                src0->grad,\n                                
ggml_mul(ctx,\n                                    ggml_sgn(ctx, src0),\n                                    tensor->grad),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_SGN:\n            {\n                if (src0->grad) {\n                    // noop\n                }\n            } break;\n        case GGML_OP_NEG:\n            {\n                if (src0->grad) {\n                    src0->grad = ggml_sub_impl(ctx, src0->grad, tensor->grad, inplace);\n                }\n            } break;\n        case GGML_OP_STEP:\n            {\n                if (src0->grad) {\n                    // noop\n                }\n            } break;\n        case GGML_OP_RELU:\n            {\n                if (src0->grad) {\n                    src0->grad = ggml_sub_impl(ctx,\n                            src0->grad,\n                            ggml_mul(ctx,\n                                ggml_step(ctx, src0),\n                                tensor->grad),\n                            inplace);\n                }\n            } break;\n        case GGML_OP_GELU:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_SILU:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_NORM:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_MUL_MAT:\n            {\n                if (src0->grad) {\n                    // TODO: this requires outer product - ggml_out_prod(ctx, src1, tensor->grad);\n                    GGML_ASSERT(false);\n                }\n                if (src1->grad) {\n                    src1->grad =\n                        ggml_add_impl(ctx,\n                                src1->grad,\n                                // TODO: fix transpose, the node will break the graph connections\n            
                    ggml_mul_mat(ctx, ggml_transpose(ctx, src0), tensor->grad),\n                                inplace);\n                }\n            } break;\n        case GGML_OP_SCALE:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_CPY:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_RESHAPE:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_VIEW:\n            {\n                GGML_ASSERT(false); // not supported\n            } break;\n        case GGML_OP_PERMUTE:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_TRANSPOSE:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_GET_ROWS:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_DIAG_MASK_INF:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_SOFT_MAX:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_ROPE:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_CONV_1D_1S:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_CONV_1D_2S:\n            {\n                GGML_ASSERT(false); // TODO: not implemented\n            } break;\n        case GGML_OP_FLASH_ATTN:\n            {\n                GGML_ASSERT(false); // not supported\n            } break;\n        case GGML_OP_FLASH_FF:\n            {\n                GGML_ASSERT(false); // not supported\n            } break;\n        case GGML_OP_NONE:\n   
         {\n                // nop\n            } break;\n        case GGML_OP_COUNT:\n            {\n                GGML_ASSERT(false);\n            } break;\n    }\n}\n\nstatic void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {\n    if (node->grad == NULL) {\n        // this usually happens when we generate intermediate nodes from constants in the backward pass\n        // it can also happen during forward pass, if the user performs computations with constants\n        if (node->op != GGML_OP_NONE) {\n            //GGML_PRINT_DEBUG(\"%s: warning: node %p has no grad, but op %d\\n\", __func__, (void *) node, node->op);\n        }\n    }\n\n    // check if already visited\n    for (int i = 0; i < cgraph->n_nodes; i++) {\n        if (cgraph->nodes[i] == node) {\n            return;\n        }\n    }\n\n    for (int i = 0; i < cgraph->n_leafs; i++) {\n        if (cgraph->leafs[i] == node) {\n            return;\n        }\n    }\n\n    if (node->src0) {\n        ggml_visit_parents(cgraph, node->src0);\n    }\n\n    if (node->src1) {\n        ggml_visit_parents(cgraph, node->src1);\n    }\n\n    for (int i = 0; i < GGML_MAX_OPT; ++i) {\n        if (node->opt[i]) {\n            ggml_visit_parents(cgraph, node->opt[i]);\n        }\n    }\n\n    if (node->op == GGML_OP_NONE && node->grad == NULL) {\n        // reached a leaf node, not part of the gradient graph (e.g. 
a constant)\n        GGML_ASSERT(cgraph->n_leafs < GGML_MAX_NODES);\n\n        cgraph->leafs[cgraph->n_leafs] = node;\n        cgraph->n_leafs++;\n    } else {\n        GGML_ASSERT(cgraph->n_nodes < GGML_MAX_NODES);\n\n        cgraph->nodes[cgraph->n_nodes] = node;\n        cgraph->grads[cgraph->n_nodes] = node->grad;\n        cgraph->n_nodes++;\n    }\n}\n\nstatic void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {\n    if (!expand) {\n        cgraph->n_nodes = 0;\n        cgraph->n_leafs = 0;\n    }\n\n    const int n0 = cgraph->n_nodes;\n    UNUSED(n0);\n\n    ggml_visit_parents(cgraph, tensor);\n\n    const int n_new = cgraph->n_nodes - n0;\n    GGML_PRINT_DEBUG(\"%s: visited %d new nodes\\n\", __func__, n_new);\n\n    if (n_new > 0) {\n        // the last added node should always be starting point\n        GGML_ASSERT(cgraph->nodes[cgraph->n_nodes - 1] == tensor);\n    }\n}\n\nvoid ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {\n    ggml_build_forward_impl(cgraph, tensor, true);\n}\n\nstruct ggml_cgraph ggml_build_forward(struct ggml_tensor * tensor) {\n    struct ggml_cgraph result = {\n        /*.n_nodes      =*/ 0,\n        /*.n_leafs      =*/ 0,\n        /*.n_threads    =*/ 0,\n        /*.work_size    =*/ 0,\n        /*.work         =*/ NULL,\n        /*.nodes        =*/ { NULL },\n        /*.grads        =*/ { NULL },\n        /*.leafs        =*/ { NULL },\n        /*.perf_runs    =*/ 0,\n        /*.perf_cycles  =*/ 0,\n        /*.perf_time_us =*/ 0,\n    };\n\n    ggml_build_forward_impl(&result, tensor, false);\n\n    return result;\n}\n\nstruct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep) {\n    struct ggml_cgraph result = *gf;\n\n    GGML_ASSERT(gf->n_nodes > 0);\n\n    // if we are keeping the gradient graph, we have to detach the gradient nodes from the original graph\n    if (keep) {\n        for (int i = 0; i < 
gf->n_nodes; i++) {\n            struct ggml_tensor * node = gf->nodes[i];\n\n            if (node->grad) {\n                node->grad = ggml_dup_tensor(ctx, node);\n                gf->grads[i] = node->grad;\n            }\n        }\n    }\n\n    for (int i = gf->n_nodes - 1; i >= 0; i--) {\n        struct ggml_tensor * node = gf->nodes[i];\n\n        // because we detached the grad nodes from the original graph, we can afford inplace operations\n        if (node->grad) {\n            ggml_compute_backward(ctx, node, keep);\n        }\n    }\n\n    for (int i = gf->n_nodes - 1; i >= 0; i--) {\n        struct ggml_tensor * node = gf->nodes[i];\n\n        if (node->is_param) {\n            GGML_PRINT_DEBUG(\"%s: found root node %p\\n\", __func__, (void *) node);\n            ggml_build_forward_impl(&result, node->grad, true);\n        }\n    }\n\n    return result;\n}\n\n//\n// thread data\n//\n// synchronization is done via busy loops\n// I tried using spin locks, but not sure how to use them correctly - the things I tried were slower than busy loops\n//\n\n#ifdef __APPLE__\n\n//#include <os/lock.h>\n//\n//typedef os_unfair_lock ggml_lock_t;\n//\n//#define ggml_lock_init(x)    UNUSED(x)\n//#define ggml_lock_destroy(x) UNUSED(x)\n//#define ggml_lock_lock       os_unfair_lock_lock\n//#define ggml_lock_unlock     os_unfair_lock_unlock\n//\n//#define GGML_LOCK_INITIALIZER OS_UNFAIR_LOCK_INIT\n\ntypedef int ggml_lock_t;\n\n#define ggml_lock_init(x)    UNUSED(x)\n#define ggml_lock_destroy(x) UNUSED(x)\n#define ggml_lock_lock(x)    UNUSED(x)\n#define ggml_lock_unlock(x)  UNUSED(x)\n\n#define GGML_LOCK_INITIALIZER 0\n\ntypedef pthread_t ggml_thread_t;\n\n#define ggml_thread_create pthread_create\n#define ggml_thread_join   pthread_join\n\n#else\n\n//typedef pthread_spinlock_t ggml_lock_t;\n\n//#define ggml_lock_init(x) pthread_spin_init(x, PTHREAD_PROCESS_PRIVATE)\n//#define ggml_lock_destroy pthread_spin_destroy\n//#define ggml_lock_lock    pthread_spin_lock\n//#define 
ggml_lock_unlock  pthread_spin_unlock\n\ntypedef int ggml_lock_t;\n\n#define ggml_lock_init(x)    UNUSED(x)\n#define ggml_lock_destroy(x) UNUSED(x)\n#define ggml_lock_lock(x)    UNUSED(x)\n#define ggml_lock_unlock(x)  UNUSED(x)\n\n#define GGML_LOCK_INITIALIZER 0\n\ntypedef pthread_t ggml_thread_t;\n\n#define ggml_thread_create pthread_create\n#define ggml_thread_join   pthread_join\n\n#endif\n\nstruct ggml_compute_state_shared {\n    ggml_lock_t spin;\n\n    int n_threads;\n\n    // synchronization primitives\n    atomic_int  n_ready;\n    atomic_bool has_work;\n    atomic_bool stop; // stop all threads\n};\n\nstruct ggml_compute_state {\n    ggml_thread_t thrd;\n\n    struct ggml_compute_params params;\n    struct ggml_tensor * node;\n\n    struct ggml_compute_state_shared * shared;\n};\n\nstatic thread_ret_t ggml_graph_compute_thread(void * data) {\n    struct ggml_compute_state * state = (struct ggml_compute_state *) data;\n\n    const int n_threads = state->shared->n_threads;\n\n    while (true) {\n        if (atomic_fetch_add(&state->shared->n_ready, 1) == n_threads - 1) {\n            atomic_store(&state->shared->has_work, false);\n        } else {\n            while (atomic_load(&state->shared->has_work)) {\n                if (atomic_load(&state->shared->stop)) {\n                    return 0;\n                }\n                ggml_lock_lock  (&state->shared->spin);\n                ggml_lock_unlock(&state->shared->spin);\n            }\n        }\n\n        atomic_fetch_sub(&state->shared->n_ready, 1);\n\n        // wait for work\n        while (!atomic_load(&state->shared->has_work)) {\n            if (atomic_load(&state->shared->stop)) {\n                return 0;\n            }\n            ggml_lock_lock  (&state->shared->spin);\n            ggml_lock_unlock(&state->shared->spin);\n        }\n\n        // check if we should stop\n        if (atomic_load(&state->shared->stop)) {\n            break;\n        }\n\n        if (state->node) {\n            
if (state->params.ith < state->params.nth) {\n                ggml_compute_forward(&state->params, state->node);\n            }\n\n            state->node = NULL;\n        } else {\n            break;\n        }\n    }\n\n    return 0;\n}\n\nvoid ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) {\n    if (cgraph->n_threads <= 0) {\n        cgraph->n_threads = 8;\n    }\n\n    const int n_threads = cgraph->n_threads;\n\n    struct ggml_compute_state_shared state_shared = {\n        /*.spin      =*/ GGML_LOCK_INITIALIZER,\n        /*.n_threads =*/ n_threads,\n        /*.n_ready   =*/ 0,\n        /*.has_work  =*/ false,\n        /*.stop      =*/ false,\n    };\n    struct ggml_compute_state * workers = n_threads > 1 ? alloca(sizeof(struct ggml_compute_state)*(n_threads - 1)) : NULL;\n\n    // create thread pool\n    if (n_threads > 1) {\n        ggml_lock_init(&state_shared.spin);\n\n        atomic_store(&state_shared.has_work, true);\n\n        for (int j = 0; j < n_threads - 1; j++) {\n            workers[j] = (struct ggml_compute_state) {\n                .thrd   = 0,\n                .params = {\n                    .type  = GGML_TASK_COMPUTE,\n                    .ith   = j + 1,\n                    .nth   = n_threads,\n                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,\n                    .wdata = cgraph->work ? 
cgraph->work->data : NULL,\n                },\n                .node   = NULL,\n                .shared = &state_shared,\n            };\n\n            int rc = ggml_thread_create(&workers[j].thrd, NULL, ggml_graph_compute_thread, &workers[j]);\n            GGML_ASSERT(rc == 0);\n            UNUSED(rc);\n        }\n    }\n\n    // initialize tasks + work buffer\n    {\n        size_t work_size = 0;\n\n        // thread scheduling for the different operations\n        for (int i = 0; i < cgraph->n_nodes; i++) {\n            struct ggml_tensor * node = cgraph->nodes[i];\n\n            switch (node->op) {\n                case GGML_OP_DUP:\n                    {\n                        node->n_tasks = 1;\n                    } break;\n                case GGML_OP_ADD:\n                    {\n                        node->n_tasks = n_threads;\n                    } break;\n                case GGML_OP_SUB:\n                case GGML_OP_MUL:\n                case GGML_OP_DIV:\n                case GGML_OP_SQR:\n                case GGML_OP_SQRT:\n                case GGML_OP_SUM:\n                case GGML_OP_MEAN:\n                case GGML_OP_REPEAT:\n                case GGML_OP_ABS:\n                case GGML_OP_SGN:\n                case GGML_OP_NEG:\n                case GGML_OP_STEP:\n                case GGML_OP_RELU:\n                    {\n                        node->n_tasks = 1;\n                    } break;\n                case GGML_OP_GELU:\n                    {\n                        node->n_tasks = n_threads;\n                    } break;\n                case GGML_OP_SILU:\n                    {\n                        node->n_tasks = n_threads;\n                    } break;\n                case GGML_OP_NORM:\n                    {\n                        node->n_tasks = n_threads;\n                    } break;\n                case GGML_OP_MUL_MAT:\n                    {\n                        node->n_tasks = n_threads;\n\n                  
      // TODO: use different scheduling for different matrix sizes\n                        //const int nr0 = ggml_nrows(node->src0);\n                        //const int nr1 = ggml_nrows(node->src1);\n\n                        //node->n_tasks = MIN(n_threads, MAX(1, nr0/128));\n                        //printf(\"nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks = %d\\n\", nr0, nr1, nr0*nr1, node->n_tasks);\n\n                        size_t cur = 0;\n\n                        // TODO: better way to determine if the matrix is transposed\n                        if (node->src0->nb[1] < node->src0->nb[0]) {\n                            cur = ggml_nbytes(node)*node->n_tasks; // TODO: this can become (n_tasks-1)\n                                                                   // TODO: overestimated by factor of x2 for FP16\n                        } else {\n                            if (node->src0->type == GGML_TYPE_F16 &&\n                                node->src1->type == GGML_TYPE_F32) {\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n                                if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {\n                                    node->n_tasks = 1; // TODO: this actually is doing nothing\n                                                       //       the threads are still spinning\n                                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);\n                                    //printf(\"src0: ne0 = %d, ne1 = %d, ne = %d\\n\", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);\n                                    //printf(\"src1: ne0 = %d, ne1 = %d, ne = %d\\n\", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);\n                                    //printf(\"cur = %zu\\n\", cur);\n                                } else {\n                                    cur = 
GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);\n                                }\n#else\n                                cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);\n#endif\n                            } else if (node->src0->type == GGML_TYPE_F32 &&\n                                       node->src1->type == GGML_TYPE_F32) {\n                                cur = 0;\n                            } else if (node->src0->type == GGML_TYPE_Q4_0 &&\n                                       node->src1->type == GGML_TYPE_F32) {\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n                                if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {\n                                    node->n_tasks = 1;\n                                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);\n                                } else {\n                                    cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0];\n                                }\n#else\n                                cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_0]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_0];\n#endif\n                            } else if (node->src0->type == GGML_TYPE_Q4_1 &&\n                                       node->src1->type == GGML_TYPE_F32) {\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n                                if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {\n                                    node->n_tasks = 1;\n                                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);\n                                } else {\n                                    cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1];\n                                }\n#else\n                               
 cur = (GGML_TYPE_SIZE[GGML_TYPE_Q4_1]*ggml_nelements(node->src1))/GGML_BLCK_SIZE[GGML_TYPE_Q4_1];\n#endif\n                            } else {\n                                GGML_ASSERT(false);\n                            }\n                        }\n\n                        work_size = MAX(work_size, cur);\n                    } break;\n                case GGML_OP_SCALE:\n                    {\n                        node->n_tasks = n_threads;\n                    } break;\n                case GGML_OP_CPY:\n                case GGML_OP_RESHAPE:\n                case GGML_OP_VIEW:\n                case GGML_OP_PERMUTE:\n                case GGML_OP_TRANSPOSE:\n                case GGML_OP_GET_ROWS:\n                case GGML_OP_DIAG_MASK_INF:\n                    {\n                        node->n_tasks = 1;\n                    } break;\n                case GGML_OP_SOFT_MAX:\n                    {\n                        node->n_tasks = n_threads;\n                    } break;\n                case GGML_OP_ROPE:\n                    {\n                        node->n_tasks = 1;\n                    } break;\n                case GGML_OP_CONV_1D_1S:\n                case GGML_OP_CONV_1D_2S:\n                    {\n                        node->n_tasks = n_threads;\n\n                        GGML_ASSERT(node->src0->ne[3] == 1);\n                        GGML_ASSERT(node->src1->ne[2] == 1);\n                        GGML_ASSERT(node->src1->ne[3] == 1);\n\n                        size_t cur = 0;\n                        const int nk = node->src0->ne[0];\n\n                        if (node->src0->type == GGML_TYPE_F16 &&\n                            node->src1->type == GGML_TYPE_F32) {\n                            cur = sizeof(ggml_fp16_t)*(\n                                    nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +\n                                    ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]\n                                    );\n     
                   } else if (node->src0->type == GGML_TYPE_F32 &&\n                                   node->src1->type == GGML_TYPE_F32) {\n                            cur = sizeof(float)*(\n                                    nk*ggml_up32(node->src0->ne[1])*node->src0->ne[2] +\n                                    ( 2*(nk/2) + node->src1->ne[0])*node->src1->ne[1]\n                                    );\n                        } else {\n                            GGML_ASSERT(false);\n                        }\n\n                        work_size = MAX(work_size, cur);\n                    } break;\n                case GGML_OP_FLASH_ATTN:\n                    {\n                        node->n_tasks = n_threads;\n\n                        size_t cur = 0;\n\n                        const int ne11 = ggml_up(node->src1->ne[1], GGML_SOFT_MAX_UNROLL);\n\n                        if (node->src1->type == GGML_TYPE_F32) {\n                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)\n                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2\n                        }\n\n                        if (node->src1->type == GGML_TYPE_F16) {\n                            cur  = sizeof(float)*ne11*node->n_tasks; // TODO: this can become (n_tasks-1)\n                            cur += sizeof(float)*ne11*node->n_tasks; // this is overestimated by x2\n                        }\n\n                        work_size = MAX(work_size, cur);\n                    } break;\n                case GGML_OP_FLASH_FF:\n                    {\n                        node->n_tasks = n_threads;\n\n                        size_t cur = 0;\n\n                        if (node->src1->type == GGML_TYPE_F32) {\n                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)\n                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is 
overestimated by x2\n                        }\n\n                        if (node->src1->type == GGML_TYPE_F16) {\n                            cur  = sizeof(float)*node->src1->ne[1]*node->n_tasks; // TODO: this can become (n_tasks-1)\n                            cur += sizeof(float)*node->src1->ne[1]*node->n_tasks; // this is overestimated by x2\n                        }\n\n                        work_size = MAX(work_size, cur);\n                    } break;\n                case GGML_OP_NONE:\n                    {\n                        node->n_tasks = 1;\n                    } break;\n                case GGML_OP_COUNT:\n                    {\n                        GGML_ASSERT(false);\n                    } break;\n            }\n        }\n\n        if (cgraph->work != NULL && work_size > cgraph->work_size) {\n            GGML_ASSERT(false); // TODO: better handling\n        }\n\n        if (work_size > 0 && cgraph->work == NULL) {\n            cgraph->work_size = work_size + CACHE_LINE_SIZE*(n_threads - 1);\n\n            GGML_PRINT_DEBUG(\"%s: allocating work buffer for graph (%zu bytes)\\n\", __func__, cgraph->work_size);\n            cgraph->work = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cgraph->work_size);\n        }\n    }\n\n    const int64_t perf_start_cycles  = ggml_perf_cycles();\n    const int64_t perf_start_time_us = ggml_perf_time_us();\n\n    for (int i = 0; i < cgraph->n_nodes; i++) {\n        GGML_PRINT_DEBUG_5(\"%s: %d/%d\\n\", __func__, i, cgraph->n_nodes);\n\n        struct ggml_tensor * node = cgraph->nodes[i];\n\n        // TODO: this could be used to avoid unnecessary computations, but it needs to be improved\n        //if (node->grad == NULL && node->perf_runs > 0) {\n        //    continue;\n        //}\n\n        const int64_t perf_node_start_cycles  = ggml_perf_cycles();\n        const int64_t perf_node_start_time_us = ggml_perf_time_us();\n\n        // INIT\n        struct ggml_compute_params params = {\n            /*.type  =*/ 
GGML_TASK_INIT,\n            /*.ith   =*/ 0,\n            /*.nth   =*/ node->n_tasks,\n            /*.wsize =*/ cgraph->work ? ggml_nbytes(cgraph->work) : 0,\n            /*.wdata =*/ cgraph->work ? cgraph->work->data : NULL,\n        };\n\n        ggml_compute_forward(&params, node);\n\n        // COMPUTE\n        if (node->n_tasks > 1) {\n            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {\n                atomic_store(&state_shared.has_work, false);\n            }\n\n            while (atomic_load(&state_shared.has_work)) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n\n            // launch thread pool\n            for (int j = 0; j < n_threads - 1; j++) {\n                workers[j].params = (struct ggml_compute_params) {\n                    .type  = GGML_TASK_COMPUTE,\n                    .ith   = j + 1,\n                    .nth   = node->n_tasks,\n                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,\n                    .wdata = cgraph->work ? 
cgraph->work->data : NULL,\n                };\n                workers[j].node = node;\n            }\n\n            atomic_fetch_sub(&state_shared.n_ready, 1);\n\n            while (atomic_load(&state_shared.n_ready) > 0) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n\n            atomic_store(&state_shared.has_work, true);\n        }\n\n        params.type = GGML_TASK_COMPUTE;\n        ggml_compute_forward(&params, node);\n\n        // wait for thread pool\n        if (node->n_tasks > 1) {\n            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {\n                atomic_store(&state_shared.has_work, false);\n            }\n\n            while (atomic_load(&state_shared.has_work)) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n\n            atomic_fetch_sub(&state_shared.n_ready, 1);\n\n            while (atomic_load(&state_shared.n_ready) != 0) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n        }\n\n        // FINALIZE\n        if (node->n_tasks > 1) {\n            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {\n                atomic_store(&state_shared.has_work, false);\n            }\n\n            while (atomic_load(&state_shared.has_work)) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n\n            // launch thread pool\n            for (int j = 0; j < n_threads - 1; j++) {\n                workers[j].params = (struct ggml_compute_params) {\n                    .type  = GGML_TASK_FINALIZE,\n                    .ith   = j + 1,\n                    .nth   = node->n_tasks,\n                    .wsize = cgraph->work ? ggml_nbytes(cgraph->work) : 0,\n                    .wdata = cgraph->work ? 
cgraph->work->data : NULL,\n                };\n                workers[j].node = node;\n            }\n\n            atomic_fetch_sub(&state_shared.n_ready, 1);\n\n            while (atomic_load(&state_shared.n_ready) > 0) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n\n            atomic_store(&state_shared.has_work, true);\n        }\n\n        params.type = GGML_TASK_FINALIZE;\n        ggml_compute_forward(&params, node);\n\n        // wait for thread pool\n        if (node->n_tasks > 1) {\n            if (atomic_fetch_add(&state_shared.n_ready, 1) == n_threads - 1) {\n                atomic_store(&state_shared.has_work, false);\n            }\n\n            while (atomic_load(&state_shared.has_work)) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n\n            atomic_fetch_sub(&state_shared.n_ready, 1);\n\n            while (atomic_load(&state_shared.n_ready) != 0) {\n                ggml_lock_lock  (&state_shared.spin);\n                ggml_lock_unlock(&state_shared.spin);\n            }\n        }\n\n        // performance stats (node)\n        {\n            int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_node_start_cycles;\n            int64_t perf_time_us_cur = ggml_perf_time_us() - perf_node_start_time_us;\n\n            node->perf_runs++;\n            node->perf_cycles  += perf_cycles_cur;\n            node->perf_time_us += perf_time_us_cur;\n        }\n    }\n\n    // join thread pool\n    if (n_threads > 1) {\n        atomic_store(&state_shared.stop, true);\n        atomic_store(&state_shared.has_work, true);\n\n        for (int j = 0; j < n_threads - 1; j++) {\n            int rc = ggml_thread_join(workers[j].thrd, NULL);\n            GGML_ASSERT(rc == 0);\n            UNUSED(rc);\n        }\n\n        ggml_lock_destroy(&state_shared.spin);\n    }\n\n    // performance stats 
(graph)\n    {\n        int64_t perf_cycles_cur  = ggml_perf_cycles()  - perf_start_cycles;\n        int64_t perf_time_us_cur = ggml_perf_time_us() - perf_start_time_us;\n\n        cgraph->perf_runs++;\n        cgraph->perf_cycles  += perf_cycles_cur;\n        cgraph->perf_time_us += perf_time_us_cur;\n\n        GGML_PRINT_DEBUG(\"%s: perf (%d) - cpu = %.3f / %.3f ms, wall = %.3f / %.3f ms\\n\",\n                __func__, cgraph->perf_runs,\n                (double) perf_cycles_cur      / (double) ggml_cycles_per_ms(),\n                (double) cgraph->perf_cycles  / (double) ggml_cycles_per_ms() / (double) cgraph->perf_runs,\n                (double) perf_time_us_cur     / 1000.0,\n                (double) cgraph->perf_time_us / 1000.0 / cgraph->perf_runs);\n    }\n}\n\nvoid ggml_graph_reset(struct ggml_cgraph * cgraph) {\n    for (int i = 0; i < cgraph->n_nodes; i++) {\n        struct ggml_tensor * grad = cgraph->grads[i];\n\n        if (grad) {\n            ggml_set_zero(grad);\n        }\n    }\n}\n\nvoid ggml_graph_print(const struct ggml_cgraph * cgraph) {\n    int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};\n\n    GGML_PRINT(\"=== GRAPH ===\\n\");\n\n    GGML_PRINT_DEBUG(\"n_threads       = %d\\n\",       cgraph->n_threads);\n    GGML_PRINT_DEBUG(\"total work size = %zu bytes\\n\",cgraph->work_size);\n\n    GGML_PRINT(\"n_nodes = %d\\n\", cgraph->n_nodes);\n    for (int i = 0; i < cgraph->n_nodes; i++) {\n        struct ggml_tensor * node = cgraph->nodes[i];\n\n        perf_total_per_op_us[node->op] += node->perf_time_us;\n\n        GGML_PRINT(\" - %3d: [ %6d, %6d, %6d] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\\n\",\n                i,\n                node->ne[0], node->ne[1], node->ne[2],\n                GGML_OP_LABEL[node->op], node->is_param ? \"x\" : node->grad ? 
\"g\" : \" \", node->perf_runs,\n                (double) node->perf_cycles  / (double) ggml_cycles_per_ms(),\n                (double) node->perf_cycles  / (double) ggml_cycles_per_ms() / (double) node->perf_runs,\n                (double) node->perf_time_us / 1000.0,\n                (double) node->perf_time_us / 1000.0 / node->perf_runs);\n    }\n\n    GGML_PRINT(\"n_leafs = %d\\n\", cgraph->n_leafs);\n    for (int i = 0; i < cgraph->n_leafs; i++) {\n        struct ggml_tensor * node = cgraph->leafs[i];\n\n        GGML_PRINT(\" - %3d: [ %6d, %6d] %8s\\n\",\n                i,\n                node->ne[0], node->ne[1],\n                GGML_OP_LABEL[node->op]);\n    }\n\n    for (int i = 0; i < GGML_OP_COUNT; i++) {\n        GGML_PRINT(\"perf_total_per_op_us[%16s] = %7.3f ms\\n\", GGML_OP_LABEL[i], (double) perf_total_per_op_us[i] / 1000.0);\n    }\n\n    GGML_PRINT(\"========================================\\n\");\n}\n\n// check if node is part of the graph\nstatic bool ggml_graph_find(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {\n    if (cgraph == NULL) {\n        return true;\n    }\n\n    for (int i = 0; i < cgraph->n_nodes; i++) {\n        if (cgraph->nodes[i] == node) {\n            return true;\n        }\n    }\n\n    return false;\n}\n\nstatic struct ggml_tensor * ggml_graph_get_parent(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node) {\n    for (int i = 0; i < cgraph->n_nodes; i++) {\n        struct ggml_tensor * parent = cgraph->nodes[i];\n\n        if (parent->grad == node) {\n            return parent;\n        }\n    }\n\n    return NULL;\n}\n\nvoid ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename) {\n    char color[16];\n\n    FILE * fp = fopen(filename, \"w\");\n    GGML_ASSERT(fp);\n\n    fprintf(fp, \"digraph G {\\n\");\n    fprintf(fp, \"  newrank = true;\\n\");\n    fprintf(fp, \"  rankdir = LR;\\n\");\n\n    for (int i = 0; i < gb->n_nodes; i++) 
{\n        struct ggml_tensor * node = gb->nodes[i];\n\n        if (ggml_graph_get_parent(gb, node) != NULL) {\n            continue;\n        }\n\n        if (node->is_param) {\n            snprintf(color, sizeof(color), \"yellow\");\n        } else if (node->grad) {\n            if (ggml_graph_find(gf, node)) {\n                snprintf(color, sizeof(color), \"green\");\n            } else {\n                snprintf(color, sizeof(color), \"lightblue\");\n            }\n        } else {\n            snprintf(color, sizeof(color), \"white\");\n        }\n\n        fprintf(fp, \"  \\\"%p\\\" [ \\\nstyle = filled; fillcolor = %s; shape = record; \\\nlabel=\\\"%d [%d, %d] | <x>%s\",\n                (void *) node, color,\n                i, node->ne[0], node->ne[1],\n                GGML_OP_SYMBOL[node->op]);\n\n        if (node->grad) {\n            fprintf(fp, \" | <g>%s\\\"; ]\\n\", GGML_OP_SYMBOL[node->grad->op]);\n        } else {\n            fprintf(fp, \"\\\"; ]\\n\");\n        }\n    }\n\n    for (int i = 0; i < gb->n_leafs; i++) {\n        struct ggml_tensor * node = gb->leafs[i];\n\n        snprintf(color, sizeof(color), \"pink\");\n\n        if (ggml_nelements(node) == 1) {\n            fprintf(fp, \"  \\\"%p\\\" [ \\\nstyle = filled; fillcolor = %s; shape = record; \\\nlabel=\\\"<x>%.1e\\\"; ]\\n\",\n                    (void *) node, color, ggml_get_f32_1d(node, 0));\n        } else {\n            fprintf(fp, \"  \\\"%p\\\" [ \\\nstyle = filled; fillcolor = %s; shape = record; \\\nlabel=\\\"<x>CONST %d [%d, %d]\\\"; ]\\n\",\n                    (void *) node, color,\n                    i, node->ne[0], node->ne[1]);\n        }\n    }\n\n    for (int i = 0; i < gb->n_nodes; i++) {\n        struct ggml_tensor * node = gb->nodes[i];\n\n        struct ggml_tensor * parent = ggml_graph_get_parent(gb, node);\n\n        if (node->src0) {\n            struct ggml_tensor * parent0 = ggml_graph_get_parent(gb, node->src0);\n\n            fprintf(fp, \"  
\\\"%p\\\":%s -> \\\"%p\\\":%s [ arrowhead = %s; style = %s; label = \\\"x\\\"; ]\\n\",\n                    parent0 ? (void *) parent0 : (void *) node->src0,\n                    parent0 ? \"g\" : \"x\",\n                    parent ? (void *) parent : (void *) node,\n                    parent ? \"g\" : \"x\",\n                    parent ? \"empty\" : \"vee\",\n                    parent ? \"dashed\" : \"solid\");\n        }\n\n        if (node->src1) {\n            struct ggml_tensor * parent1 = ggml_graph_get_parent(gb, node->src1);\n\n            fprintf(fp, \"  \\\"%p\\\":%s -> \\\"%p\\\":%s [ arrowhead = %s; style = %s; label = \\\"y\\\"; ]\\n\",\n                    parent1 ? (void *) parent1 : (void *) node->src1,\n                    parent1 ? \"g\" : \"x\",\n                    parent ? (void *) parent : (void *) node,\n                    parent ? \"g\" : \"x\",\n                    parent ? \"empty\" : \"vee\",\n                    parent ? \"dashed\" : \"solid\");\n        }\n    }\n\n    for (int i = 0; i < gb->n_leafs; i++) {\n        struct ggml_tensor * node = gb->leafs[i];\n\n        if (node->src0) {\n            fprintf(fp, \"  \\\"%p\\\":%s -> \\\"%p\\\":%s [ label = \\\"x\\\"; ]\\n\",\n                    (void *) node->src0, \"x\",\n                    (void *) node, \"x\");\n        }\n\n        if (node->src1) {\n            fprintf(fp, \"  \\\"%p\\\":%s -> \\\"%p\\\":%s [ label = \\\"y\\\"; ]\\n\",\n                    (void *) node->src1, \"x\",\n                    (void *) node, \"x\");\n        }\n    }\n\n    fprintf(fp, \"}\\n\");\n\n    fclose(fp);\n\n    GGML_PRINT(\"%s: dot -Tpng %s -o %s.png && open %s.png\\n\", __func__, filename, filename, filename);\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\nstatic void ggml_opt_set_params(int np, struct ggml_tensor * const ps[], const float * x) {\n    int i = 0;\n    for (int p = 0; p < np; ++p) {\n        const int ne = ggml_nelements(ps[p]) 
;\n        // TODO: add function to set tensor from array\n        for (int j = 0; j < ne; ++j) {\n            ggml_set_f32_1d(ps[p], j, x[i++]);\n        }\n    }\n}\n\nstatic void ggml_opt_get_params(int np, struct ggml_tensor * const ps[], float * x) {\n    int i = 0;\n    for (int p = 0; p < np; ++p) {\n        const int ne = ggml_nelements(ps[p]) ;\n        // TODO: add function to get all elements at once\n        for (int j = 0; j < ne; ++j) {\n            x[i++] = ggml_get_f32_1d(ps[p], j);\n        }\n    }\n}\n\nstatic void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g) {\n    int i = 0;\n    for (int p = 0; p < np; ++p) {\n        const int ne = ggml_nelements(ps[p]) ;\n        // TODO: add function to get all elements at once\n        for (int j = 0; j < ne; ++j) {\n            g[i++] = ggml_get_f32_1d(ps[p]->grad, j);\n        }\n    }\n}\n\n//\n// ADAM\n//\n//   ref: https://arxiv.org/pdf/1412.6980.pdf\n//\n\nstatic enum ggml_opt_result ggml_opt_adam(\n        struct ggml_context * ctx,\n        struct ggml_opt_params params,\n        struct ggml_tensor * f,\n        struct ggml_cgraph * gf,\n        struct ggml_cgraph * gb) {\n    GGML_ASSERT(ggml_is_scalar(f));\n\n    gf->n_threads = params.n_threads;\n    gb->n_threads = params.n_threads;\n\n    // these will store the parameters we want to optimize\n    struct ggml_tensor * ps[GGML_MAX_PARAMS];\n\n    int np = 0;\n    int nx = 0;\n    for (int i = 0; i < gf->n_nodes; ++i) {\n        if (gf->nodes[i]->is_param) {\n            GGML_PRINT_DEBUG(\"found param %d: grad->op = %d\\n\", np, gf->nodes[i]->grad->op);\n\n            GGML_ASSERT(np < GGML_MAX_PARAMS);\n\n            ps[np++] = gf->nodes[i];\n            nx += ggml_nelements(gf->nodes[i]);\n        }\n    }\n\n    // constants\n    const float alpha = params.adam.alpha;\n    const float beta1 = params.adam.beta1;\n    const float beta2 = params.adam.beta2;\n    const float eps   = params.adam.eps;\n\n    float * x  = 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // view of the parameters\n    float * g1 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient\n    float * g2 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // gradient squared\n    float * m  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment\n    float * v  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment\n    float * mh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // first moment hat\n    float * vh = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // second moment hat\n\n    float * pf = params.past > 0 ? ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values\n\n    // initialize\n    ggml_vec_set_f32(nx, m, 0.0f);\n    ggml_vec_set_f32(nx, v, 0.0f);\n\n    // update view\n    ggml_opt_get_params(np, ps, x);\n\n    // compute the function value\n    ggml_graph_reset  (gf);\n    ggml_set_f32      (f->grad, 1.0f);\n    ggml_graph_compute(ctx, gb);\n\n    float fx_prev = ggml_get_f32_1d(f, 0);\n    if (pf) {\n        pf[0] = fx_prev;\n    }\n\n    int n_no_improvement = 0;\n    float fx_best = fx_prev;\n\n    // run the optimizer\n    for (int t = 0; t < params.adam.n_iter; ++t) {\n        GGML_PRINT_DEBUG  (\"=== iter %d ===\\n\", t);\n\n        GGML_PRINT_DEBUG  (\"f      = %10.6f\\n\", ggml_get_f32_1d(f, 0));\n        GGML_PRINT_DEBUG_5(\"df/dx0 = %10.6f\\n\", ggml_get_f32_1d(ps[0]->grad, 0));\n        GGML_PRINT_DEBUG_5(\"df/dx1 = %10.6f\\n\", ggml_get_f32_1d(ps[1]->grad, 0));\n\n        for (int i = 0; i < np; ++i) {\n            GGML_PRINT_DEBUG(\"param %d: %10.6f, g = %10.6f\\n\", i,\n                    ggml_get_f32_1d(ps[i], 0), ggml_get_f32_1d(ps[i]->grad, 0));\n        }\n\n        const int64_t t_start_wall = ggml_time_us();\n        const int64_t t_start_cpu = ggml_cycles();\n        UNUSED(t_start_wall);\n        UNUSED(t_start_cpu);\n\n        {\n            // update the gradient\n            
ggml_opt_get_grad(np, ps, g1);\n\n            // m_t = beta1*m_t-1 + (1 - beta1)*g_t\n            ggml_vec_scale_f32(nx, m, beta1);\n            ggml_vec_mad_f32  (nx, m, g1, 1.0f - beta1);\n\n            // g2 = g1^2\n            ggml_vec_sqr_f32  (nx, g2, g1);\n\n            // v_t = beta2*v_t-1 + (1 - beta2)*g_t^2\n            ggml_vec_scale_f32(nx, v, beta2);\n            ggml_vec_mad_f32  (nx, v, g2, 1.0f - beta2);\n\n            // m^hat = m_t / (1 - beta1^t)\n            // v^hat = v_t / (1 - beta2^t)\n            // x_t = x_t-1 - alpha*m^hat/(sqrt(v^hat) + eps)\n            ggml_vec_cpy_f32  (nx, mh, m);\n            ggml_vec_cpy_f32  (nx, vh, v);\n\n            ggml_vec_scale_f32(nx, mh, alpha/(1.0f - powf(beta1, t + 1)));\n            ggml_vec_scale_f32(nx, vh,  1.0f/(1.0f - powf(beta2, t + 1)));\n\n            ggml_vec_sqrt_f32 (nx, vh, vh);\n            ggml_vec_acc1_f32 (nx, vh, eps);\n\n            ggml_vec_div_f32  (nx, mh, mh, vh);\n            ggml_vec_sub_f32  (nx, x,  x,  mh);\n\n            // update the parameters\n            ggml_opt_set_params(np, ps, x);\n        }\n\n        ggml_graph_reset  (gf);\n        ggml_set_f32      (f->grad, 1.0f);\n        ggml_graph_compute(ctx, gb);\n\n        const float fx = ggml_get_f32_1d(f, 0);\n\n        // check convergence\n        if (fabsf(fx - fx_prev)/fx < params.adam.eps_f) {\n            GGML_PRINT_DEBUG(\"converged\\n\");\n\n            return GGML_OPT_OK;\n        }\n\n        // delta-based convergence test\n        if (pf != NULL) {\n            // need at least params.past iterations to start checking for convergence\n            if (params.past <= t) {\n                const float rate = (pf[t%params.past] - fx)/fx;\n\n                if (fabs(rate) < params.delta) {\n                    return GGML_OPT_OK;\n                }\n            }\n\n            pf[t%params.past] = fx;\n        }\n\n        // check for improvement\n        if (params.max_no_improvement > 0) {\n            if 
(fx_best > fx) {\n                fx_best = fx;\n                n_no_improvement = 0;\n            } else {\n                ++n_no_improvement;\n\n                if (n_no_improvement >= params.max_no_improvement) {\n                    return GGML_OPT_OK;\n                }\n            }\n        }\n\n        fx_prev = fx;\n\n        {\n            const int64_t t_end_cpu = ggml_cycles();\n            GGML_PRINT_DEBUG(\"time iter:      %5.3f s\\n\", ((float)(t_end_cpu - t_start_cpu))/CLOCKS_PER_SEC);\n            UNUSED(t_end_cpu);\n\n            const int64_t t_end_wall = ggml_time_us();\n            GGML_PRINT_DEBUG(\"wall time iter: %5.3f s\\n\", (t_end_wall - t_start_wall)/1e6);\n            UNUSED(t_end_wall);\n        }\n    }\n\n    return GGML_OPT_DID_NOT_CONVERGE;\n}\n\n//\n// L-BFGS\n//\n// the L-BFGS implementation below is based on the following implementation:\n//\n//   https://github.com/chokkan/liblbfgs\n//\n\nstruct ggml_lbfgs_iteration_data {\n    float alpha;\n    float ys;\n    float * s;\n    float * y;\n};\n\nstatic enum ggml_opt_result linesearch_backtracking(\n        struct ggml_context * ctx,\n        const struct ggml_opt_params * params,\n        int nx,\n        float * x,\n        float * fx,\n        float * g,\n        float * d,\n        float * step,\n        const float * xp,\n        struct ggml_tensor * f,\n        struct ggml_cgraph * gf,\n        struct ggml_cgraph * gb,\n        const int np,\n        struct ggml_tensor * ps[]) {\n    int count = 0;\n\n    float width  = 0.0f;\n    float dg     = 0.0f;\n    float finit  = 0.0f;\n    float dginit = 0.0f;\n    float dgtest = 0.0f;\n\n    const float dec = 0.5f;\n    const float inc = 2.1f;\n\n    if (*step <= 0.) 
{\n        return GGML_LINESEARCH_INVALID_PARAMETERS;\n    }\n\n    // compute the initial gradient in the search direction\n    ggml_vec_dot_f32(nx, &dginit, g, d);\n\n    // make sure that d points to a descent direction\n    if (0 < dginit) {\n        return GGML_LINESEARCH_FAIL;\n    }\n\n    // initialize local variables\n    finit = *fx;\n    dgtest = params->lbfgs.ftol*dginit;\n\n    while (true) {\n        ggml_vec_cpy_f32(nx, x, xp);\n        ggml_vec_mad_f32(nx, x, d, *step);\n\n        // evaluate the function and gradient values\n        {\n            ggml_opt_set_params(np, ps, x);\n\n            ggml_graph_reset  (gf);\n            ggml_set_f32      (f->grad, 1.0f);\n            ggml_graph_compute(ctx, gb);\n\n            ggml_opt_get_grad(np, ps, g);\n\n            *fx = ggml_get_f32_1d(f, 0);\n        }\n\n        ++count;\n\n        if (*fx > finit + (*step)*dgtest) {\n            width = dec;\n        } else {\n            // Armijo condition is satisfied\n            if (params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_ARMIJO) {\n                return count;\n            }\n\n            ggml_vec_dot_f32(nx, &dg, g, d);\n\n            // check the Wolfe condition\n            if (dg < params->lbfgs.wolfe * dginit) {\n                width = inc;\n            } else {\n                if(params->lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE) {\n                    // regular Wolfe conditions\n                    return count;\n                }\n\n                if(dg > -params->lbfgs.wolfe*dginit) {\n                    width = dec;\n                } else {\n                    // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)\n                    return count;\n                }\n                return count;\n            }\n        }\n\n        if (*step < params->lbfgs.min_step) {\n            return GGML_LINESEARCH_MINIMUM_STEP;\n        }\n        if (*step > params->lbfgs.max_step) {\n            
return GGML_LINESEARCH_MAXIMUM_STEP;\n        }\n        if (params->lbfgs.max_linesearch <= count) {\n            return GGML_LINESEARCH_MAXIMUM_ITERATIONS;\n        }\n\n        (*step) *= width;\n    }\n\n    return GGML_LINESEARCH_FAIL;\n}\n\nstatic enum ggml_opt_result ggml_opt_lbfgs(\n        struct ggml_context * ctx,\n        struct ggml_opt_params params,\n        struct ggml_tensor * f,\n        struct ggml_cgraph * gf,\n        struct ggml_cgraph * gb) {\n    if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE ||\n        params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) {\n        if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1. <= params.lbfgs.wolfe) {\n            return GGML_OPT_INVALID_WOLFE;\n        }\n    }\n\n    gf->n_threads = params.n_threads;\n    gb->n_threads = params.n_threads;\n\n    const int m = params.lbfgs.m;\n\n    // these will store the parameters we want to optimize\n    struct ggml_tensor * ps[GGML_MAX_PARAMS];\n\n    int np = 0;\n    int nx = 0;\n    for (int i = 0; i < gf->n_nodes; ++i) {\n        if (gf->nodes[i]->is_param) {\n            GGML_PRINT_DEBUG(\"found param %d: grad->op = %d\\n\", np, gf->nodes[i]->grad->op);\n\n            GGML_ASSERT(np < GGML_MAX_PARAMS);\n\n            ps[np++] = gf->nodes[i];\n            nx += ggml_nelements(gf->nodes[i]);\n        }\n    }\n\n    float * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current parameters\n    float * xp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous parameters\n    float * g  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // current gradient\n    float * gp = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // previous gradient\n    float * d  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data; // search direction\n\n    float * pf = params.past > 0 ? 
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, params.past)->data : NULL; // past function values\n\n    float fx    = 0.0f; // cost function value\n    float xnorm = 0.0f; // ||x||\n    float gnorm = 0.0f; // ||g||\n    float step  = 0.0f;\n\n    // initialize x from the graph nodes\n    ggml_opt_get_params(np, ps, x);\n\n    // the L-BFGS memory\n    struct ggml_lbfgs_iteration_data * lm = alloca(sizeof(struct ggml_lbfgs_iteration_data)*m);\n\n    for (int i = 0; i < m; ++i) {\n        lm[i].alpha = 0.0f;\n        lm[i].ys    = 0.0f;\n        lm[i].s     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data;\n        lm[i].y     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx)->data;\n    }\n\n    // evaluate the function value and its gradient\n    {\n        ggml_opt_set_params(np, ps, x);\n\n        ggml_graph_reset  (gf);\n        ggml_set_f32      (f->grad, 1.0f);\n        ggml_graph_compute(ctx, gb);\n\n        ggml_opt_get_grad(np, ps, g);\n\n        fx = ggml_get_f32_1d(f, 0);\n    }\n\n    if (pf) {\n        pf[0] = fx;\n    }\n\n    float fx_best = fx;\n\n    // search direction = -gradient\n    ggml_vec_neg_f32(nx, d, g);\n\n    // ||x||, ||g||\n    ggml_vec_norm_f32(nx, &xnorm, x);\n    ggml_vec_norm_f32(nx, &gnorm, g);\n\n    if (xnorm < 1.0f) {\n        xnorm = 1.0f;\n    }\n\n    // already optimized\n    if (gnorm/xnorm <= params.lbfgs.eps) {\n        return GGML_OPT_OK;\n    }\n\n    // initial step\n    ggml_vec_norm_inv_f32(nx, &step, d);\n\n    int j                = 0;\n    int k                = 1;\n    int ls               = 0;\n    int end              = 0;\n    int bound            = 0;\n    int n_no_improvement = 0;\n\n    float ys   = 0.0f;\n    float yy   = 0.0f;\n    float beta = 0.0f;\n\n    while (true) {\n        // store the current position and gradient vectors\n        ggml_vec_cpy_f32(nx, xp, x);\n        ggml_vec_cpy_f32(nx, gp, g);\n\n        ls = linesearch_backtracking(ctx, &params, nx, x, &fx, g, d, &step, xp, f, gf, gb, np, ps);\n\n     
   if (ls < 0) {\n            // linesearch failed - go back to the previous point and return\n            ggml_vec_cpy_f32(nx, x, xp);\n            ggml_vec_cpy_f32(nx, g, gp);\n\n            return ls;\n        }\n\n        ggml_vec_norm_f32(nx, &xnorm, x);\n        ggml_vec_norm_f32(nx, &gnorm, g);\n\n        GGML_PRINT_DEBUG(\"f = %10.6f\\n\", ggml_get_f32_1d(f, 0));\n\n        if (xnorm < 1.0) {\n            xnorm = 1.0;\n        }\n        if (gnorm/xnorm <= params.lbfgs.eps) {\n            // converged\n            return GGML_OPT_OK;\n        }\n\n        // delta-based convergence test\n        if (pf != NULL) {\n            // need at least params.past iterations to start checking for convergence\n            if (params.past <= k) {\n                const float rate = (pf[k%params.past] - fx)/fx;\n\n                if (fabs(rate) < params.delta) {\n                    return GGML_OPT_OK;\n                }\n            }\n\n            pf[k%params.past] = fx;\n        }\n\n        // check for improvement\n        if (params.max_no_improvement > 0) {\n            if (fx < fx_best) {\n                fx_best = fx;\n                n_no_improvement = 0;\n            } else {\n                n_no_improvement++;\n\n                if (n_no_improvement >= params.max_no_improvement) {\n                    return GGML_OPT_OK;\n                }\n            }\n        }\n\n        if (params.lbfgs.n_iter != 0 && params.lbfgs.n_iter < k + 1) {\n            // reached the maximum number of iterations\n            return GGML_OPT_DID_NOT_CONVERGE;\n        }\n\n        // update vectors s and y:\n        //   s_{k+1} = x_{k+1} - x_{k} = \\step * d_{k}.\n        //   y_{k+1} = g_{k+1} - g_{k}.\n        //\n        ggml_vec_sub_f32(nx, lm[end].s, x, xp);\n        ggml_vec_sub_f32(nx, lm[end].y, g, gp);\n\n        // compute scalars ys and yy:\n        //     ys = y^t \\cdot s    -> 1 / \\rho.\n        //     yy = y^t \\cdot y.\n        //\n        
ggml_vec_dot_f32(nx, &ys, lm[end].y, lm[end].s);\n        ggml_vec_dot_f32(nx, &yy, lm[end].y, lm[end].y);\n\n        lm[end].ys = ys;\n\n        // find new search direction\n        //   ref: https://en.wikipedia.org/wiki/Limited-memory_BFGS\n\n        bound = (m <= k) ? m : k;\n        k++;\n        end = (end + 1)%m;\n\n        // initialize search direction with -g\n        ggml_vec_neg_f32(nx, d, g);\n\n        j = end;\n        for (int i = 0; i < bound; ++i) {\n            j = (j + m - 1) % m;\n            // \\alpha_{j} = \\rho_{j} s^{t}_{j} \\cdot q_{k+1}\n            ggml_vec_dot_f32(nx, &lm[j].alpha, lm[j].s, d);\n            lm[j].alpha /= lm[j].ys;\n            // q_{i} = q_{i+1} - \\alpha_{i} y_{i}\n            ggml_vec_mad_f32(nx, d, lm[j].y, -lm[j].alpha);\n        }\n\n        ggml_vec_scale_f32(nx, d, ys/yy);\n\n        for (int i = 0; i < bound; ++i) {\n            // \\beta_{j} = \\rho_{j} y^t_{j} \\cdot \\gamma_{i}\n            ggml_vec_dot_f32(nx, &beta, lm[j].y, d);\n            beta /= lm[j].ys;\n            // \\gamma_{i+1} = \\gamma_{i} + (\\alpha_{j} - \\beta_{j}) s_{j}\n            ggml_vec_mad_f32(nx, d, lm[j].s, lm[j].alpha - beta);\n            j = (j + 1)%m;\n        }\n\n        step = 1.0;\n    }\n\n    return GGML_OPT_DID_NOT_CONVERGE;\n}\n\nstruct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {\n    struct ggml_opt_params result;\n\n    switch (type) {\n        case GGML_OPT_ADAM:\n            {\n                result = (struct ggml_opt_params) {\n                    .type      = GGML_OPT_ADAM,\n                    .n_threads = 1,\n                    .past      = 0,\n                    .delta     = 1e-5f,\n\n                    .max_no_improvement = 100,\n\n                    .print_forward_graph  = true,\n                    .print_backward_graph = true,\n\n                    .adam = {\n                        .n_iter = 10000,\n                        .alpha  = 0.001f,\n                        .beta1  = 
0.9f,\n                        .beta2  = 0.999f,\n                        .eps    = 1e-8f,\n                        .eps_f  = 1e-5f,\n                        .eps_g  = 1e-3f,\n                    },\n                };\n            } break;\n        case GGML_OPT_LBFGS:\n            {\n                result = (struct ggml_opt_params) {\n                    .type      = GGML_OPT_LBFGS,\n                    .n_threads = 1,\n                    .past      = 0,\n                    .delta     = 1e-5f,\n\n                    .max_no_improvement = 0,\n\n                    .print_forward_graph  = true,\n                    .print_backward_graph = true,\n\n                    .lbfgs = {\n                        .m              = 6,\n                        .n_iter         = 100,\n                        .max_linesearch = 20,\n\n                        .eps      = 1e-5f,\n                        .ftol     = 1e-4f,\n                        .wolfe    = 0.9f,\n                        .min_step = 1e-20f,\n                        .max_step = 1e+20f,\n\n                        .linesearch = GGML_LINESEARCH_DEFAULT,\n                    },\n                };\n            } break;\n    }\n\n    return result;\n}\n\nenum ggml_opt_result ggml_opt(\n        struct ggml_context * ctx,\n        struct ggml_opt_params params,\n        struct ggml_tensor * f) {\n    bool free_ctx = false;\n    if (ctx == NULL) {\n        struct ggml_init_params params_ctx = {\n            .mem_size   = 16*1024*1024,\n            .mem_buffer = NULL,\n        };\n\n        ctx = ggml_init(params_ctx);\n        if (ctx == NULL) {\n            return GGML_OPT_NO_CONTEXT;\n        }\n\n        free_ctx = true;\n    }\n\n    enum ggml_opt_result result = GGML_OPT_OK;\n\n    // build forward + backward compute graphs\n    struct ggml_cgraph gf = ggml_build_forward (f);\n    struct ggml_cgraph gb = ggml_build_backward(ctx, &gf, false);\n\n    switch (params.type) {\n        case GGML_OPT_ADAM:\n            {\n 
               result = ggml_opt_adam(ctx, params, f, &gf, &gb);\n            } break;\n        case GGML_OPT_LBFGS:\n            {\n                result = ggml_opt_lbfgs(ctx, params, f, &gf, &gb);\n            } break;\n    }\n\n    if (params.print_forward_graph) {\n        ggml_graph_print   (&gf);\n        ggml_graph_dump_dot(&gf, NULL, \"opt-forward.dot\");\n    }\n\n    if (params.print_backward_graph) {\n        ggml_graph_print   (&gb);\n        ggml_graph_dump_dot(&gb, &gf, \"opt-backward.dot\");\n    }\n\n    if (free_ctx) {\n        ggml_free(ctx);\n    }\n\n    return result;\n}\n\n////////////////////////////////////////////////////////////////////////////////\n\nint ggml_cpu_has_avx(void) {\n#if defined(__AVX__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_avx2(void) {\n#if defined(__AVX2__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_avx512(void) {\n#if defined(__AVX512F__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_fma(void) {\n#if defined(__FMA__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_neon(void) {\n#if defined(__ARM_NEON)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_arm_fma(void) {\n#if defined(__ARM_FEATURE_FMA)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_f16c(void) {\n#if defined(__F16C__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_fp16_va(void) {\n#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_wasm_simd(void) {\n#if defined(__wasm_simd128__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_blas(void) {\n#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_sse3(void) {\n#if defined(__SSE3__)\n    return 1;\n#else\n    return 0;\n#endif\n}\n\nint ggml_cpu_has_vsx(void) {\n#if defined(__POWER9_VECTOR__)\n    
return 1;\n#else\n    return 0;\n#endif\n}\n\n////////////////////////////////////////////////////////////////////////////////\n"
  },
  {
    "path": "Sources/cpp/ggml.h",
    "content": "#pragma once\n\n//\n// GGML Tensor Library\n//\n// This documentation is still a work in progress.\n// If you wish some specific topics to be covered, feel free to drop a comment:\n//\n//   https://github.com/ggerganov/whisper.cpp/issues/40\n//\n// ## Overview\n//\n// This library implements:\n//\n//  - a set of tensor operations\n//  - automatic differentiation\n//  - basic optimization algorithms\n//\n// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,\n// but is not limited to, the following:\n//\n//  - linear regression\n//  - support vector machines\n//  - neural networks\n//\n// The library allows the user to define a certain function using the available tensor operations. This function\n// definition is represented internally via a computation graph. Each tensor operation in the function definition\n// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the\n// function's value and/or its gradient with respect to the input variables. 
Optionally, the function can be optimized\n// using one of the available optimization algorithms.\n//\n// For example, here we define the function: f(x) = a*x^2 + b\n//\n//   {\n//       struct ggml_init_params params = {\n//           .mem_size   = 16*1024*1024,\n//           .mem_buffer = NULL,\n//       };\n//\n//       // memory allocation happens here\n//       struct ggml_context * ctx = ggml_init(params);\n//\n//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);\n//\n//       ggml_set_param(ctx, x); // x is an input variable\n//\n//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);\n//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);\n//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);\n//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);\n//\n//       ...\n//   }\n//\n// Notice that the function definition above does not involve any actual computation. The computation is performed only\n// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:\n//\n//   {\n//       ...\n//\n//       struct ggml_cgraph gf = ggml_build_forward(f);\n//\n//       // set the input variable and parameter values\n//       ggml_set_f32(x, 2.0f);\n//       ggml_set_f32(a, 3.0f);\n//       ggml_set_f32(b, 4.0f);\n//\n//       ggml_graph_compute(ctx0, &gf);\n//\n//       printf(\"f = %f\\n\", ggml_get_f32_1d(f, 0));\n//\n//       ...\n//   }\n//\n// The actual computation is performed in the ggml_graph_compute() function.\n//\n// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the\n// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know\n// in advance how much memory you need for your computation. 
Alternatively, you can allocate a large enough memory\n// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was\n// actually needed.\n//\n// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic\n// differentiation and optimization algorithms.\n//\n// The described approach allows to define the function graph once and then compute its forward or backward graphs\n// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way\n// the user can avoid the memory allocation overhead at runtime.\n//\n// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class\n// citizens, but in theory the library can be extended to support FP8 and integer data types.\n//\n// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary\n// and binary operations. Most of the available operations fall into one of these two categories. With time, it became\n// clear that the library needs to support more complex operations. The way to support these operations is not clear\n// yet, but a few examples are demonstrated in the following operations:\n//\n//   - ggml_permute()\n//   - ggml_conv_1d_1s()\n//   - ggml_conv_1d_2s()\n//\n// For each tensor operator, the library implements a forward and backward computation function. The forward function\n// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the\n// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a\n// calculus class, or watch the following video:\n//\n//   What is Automatic Differentiation?\n//   https://www.youtube.com/watch?v=wG_nF1awSSY\n//\n//\n// ## Tensor data (struct ggml_tensor)\n//\n// The tensors are stored in memory via the ggml_tensor struct. 
The structure provides information about the size of\n// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains\n// pointers to the \"source\" tensors - i.e. the tensors that were used to compute the current tensor. For example:\n//\n//   {\n//       struct ggml_tensor * c = ggml_add(ctx, a, b);\n//\n//       assert(c->src[0] == a);\n//       assert(c->src[1] == b);\n//   }\n//\n// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the\n// number of elements in each dimension (\"ne\") as well as the number of bytes (\"nb\", a.k.a. stride). This allows\n// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and\n// permutation. All tensor operations have to take the stride into account and not assume that the tensor is\n// contiguous in memory.\n//\n// The data of the tensor is accessed via the \"data\" pointer. For example:\n//\n//   {\n//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);\n//\n//       // a[1, 2] = 1.0f;\n//       *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;\n//\n//       // a[2, 0] = 2.0f;\n//       *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;\n//\n//       ...\n//   }\n//\n// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.\n//\n// ## The matrix multiplication operator (ggml_mul_mat)\n//\n// TODO\n//\n//\n// ## Multi-threading\n//\n// TODO\n//\n//\n// ## Overview of ggml.c\n//\n// TODO\n//\n//\n// ## SIMD optimizations\n//\n// TODO\n//\n//\n// ## Debugging ggml\n//\n// TODO\n//\n//\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n#include <stdint.h>\n#include <stddef.h>\n#include <stdbool.h>\n\n#define GGML_MAX_DIMS     4\n#define GGML_MAX_NODES    4096\n#define GGML_MAX_PARAMS   16\n#define GGML_MAX_CONTEXTS 64\n#define GGML_MAX_OPT      4\n\n#ifdef __ARM_NEON\n// 
we use the built-in 16-bit float type\ntypedef __fp16 ggml_fp16_t;\n#else\ntypedef uint16_t ggml_fp16_t;\n#endif\n\n// convert FP16 <-> FP32\nfloat       ggml_fp16_to_fp32(ggml_fp16_t x);\nggml_fp16_t ggml_fp32_to_fp16(float x);\n\nstruct ggml_object;\nstruct ggml_context;\n\nenum ggml_type {\n    GGML_TYPE_Q4_0,\n    GGML_TYPE_Q4_1,\n    GGML_TYPE_I8,\n    GGML_TYPE_I16,\n    GGML_TYPE_I32,\n    GGML_TYPE_F16,\n    GGML_TYPE_F32,\n    GGML_TYPE_COUNT,\n};\n\n// available tensor operations:\nenum ggml_op {\n    GGML_OP_NONE = 0,\n\n    GGML_OP_DUP,\n    GGML_OP_ADD,\n    GGML_OP_SUB,\n    GGML_OP_MUL,\n    GGML_OP_DIV,\n    GGML_OP_SQR,\n    GGML_OP_SQRT,\n    GGML_OP_SUM,\n    GGML_OP_MEAN,\n    GGML_OP_REPEAT,\n    GGML_OP_ABS,\n    GGML_OP_SGN,\n    GGML_OP_NEG,\n    GGML_OP_STEP,\n    GGML_OP_RELU,\n    GGML_OP_GELU,\n    GGML_OP_SILU,\n    GGML_OP_NORM, // normalize\n\n    GGML_OP_MUL_MAT,\n\n    GGML_OP_SCALE,\n    GGML_OP_CPY,\n    GGML_OP_RESHAPE,\n    GGML_OP_VIEW,\n    GGML_OP_PERMUTE,\n    GGML_OP_TRANSPOSE,\n    GGML_OP_GET_ROWS,\n    GGML_OP_DIAG_MASK_INF,\n    GGML_OP_SOFT_MAX,\n    GGML_OP_ROPE,\n    GGML_OP_CONV_1D_1S,\n    GGML_OP_CONV_1D_2S,\n\n    GGML_OP_FLASH_ATTN,\n    GGML_OP_FLASH_FF,\n\n    GGML_OP_COUNT,\n};\n\n// n-dimensional tensor\nstruct ggml_tensor {\n    enum ggml_type type;\n\n    int    n_dims;\n    int    ne[GGML_MAX_DIMS]; // number of elements\n    size_t nb[GGML_MAX_DIMS]; // stride in bytes:\n                              // nb[0] = sizeof(type)\n                              // nb[1] = nb[0]   * ne[0] + padding\n                              // nb[i] = nb[i-1] * ne[i-1]\n\n    // compute data\n    enum ggml_op op;\n\n    bool is_param;\n\n    struct ggml_tensor * grad;\n    struct ggml_tensor * src0;\n    struct ggml_tensor * src1;\n    struct ggml_tensor * opt[GGML_MAX_OPT];\n\n    // thread scheduling\n    int n_tasks;\n\n    // performance\n    int     perf_runs;\n    int64_t perf_cycles;\n    int64_t perf_time_us;\n\n    
void * data;\n    char padding[8];\n};\n\n// computation graph\nstruct ggml_cgraph {\n    int n_nodes;\n    int n_leafs;\n    int n_threads;\n\n    size_t work_size;\n    struct ggml_tensor * work;\n\n    struct ggml_tensor * nodes[GGML_MAX_NODES];\n    struct ggml_tensor * grads[GGML_MAX_NODES];\n    struct ggml_tensor * leafs[GGML_MAX_NODES];\n\n    // performance\n    int     perf_runs;\n    int64_t perf_cycles;\n    int64_t perf_time_us;\n};\n\n// scratch buffer\nstruct ggml_scratch {\n    size_t offs;\n    size_t size;\n    void * data;\n};\n\nstruct ggml_init_params {\n    // memory pool\n    size_t mem_size;   // bytes\n    void * mem_buffer; // if NULL, memory will be allocated internally\n};\n\nvoid    ggml_time_init(void); // call this once at the beginning of the program\nint64_t ggml_time_ms(void);\nint64_t ggml_time_us(void);\nint64_t ggml_cycles(void);\nint64_t ggml_cycles_per_ms(void);\n\nvoid ggml_print_object (const struct ggml_object * obj);\nvoid ggml_print_objects(const struct ggml_context * ctx);\n\nint    ggml_nelements(const struct ggml_tensor * tensor);\nsize_t ggml_nbytes   (const struct ggml_tensor * tensor);\n\nint    ggml_blck_size (enum ggml_type type);\nsize_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block\nfloat  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float\n\nsize_t ggml_element_size(const struct ggml_tensor * tensor);\n\nstruct ggml_context * ggml_init(struct ggml_init_params params);\nvoid ggml_free(struct ggml_context * ctx);\n\nsize_t ggml_used_mem(const struct ggml_context * ctx);\n\nsize_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);\n\nstruct ggml_tensor * ggml_new_tensor(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    n_dims,\n        const int *ne);\n\nstruct ggml_tensor * ggml_new_tensor_1d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    
ne0);\n\nstruct ggml_tensor * ggml_new_tensor_2d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0,\n        int    ne1);\n\nstruct ggml_tensor * ggml_new_tensor_3d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0,\n        int    ne1,\n        int    ne2);\n\nstruct ggml_tensor * ggml_new_tensor_4d(\n        struct ggml_context * ctx,\n        enum   ggml_type type,\n        int    ne0,\n        int    ne1,\n        int    ne2,\n        int    ne3);\n\nstruct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);\nstruct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);\n\nstruct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);\nstruct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);\n\nstruct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);\nstruct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);\nstruct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);\n\nint32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);\nvoid    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);\n\nfloat ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);\nvoid  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);\n\n void * ggml_get_data    (const struct ggml_tensor * tensor);\nfloat * ggml_get_data_f32(const struct ggml_tensor * tensor);\n\n//\n// operations on tensors with backpropagation\n//\n\nstruct ggml_tensor * ggml_dup(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_add(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct ggml_tensor * ggml_sub(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct 
ggml_tensor * ggml_mul(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct ggml_tensor * ggml_div(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct ggml_tensor * ggml_sqr(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_sqrt(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// return scalar\n// TODO: compute sum along rows\nstruct ggml_tensor * ggml_sum(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// mean along rows\nstruct ggml_tensor * ggml_mean(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// if a is the same shape as b, and a is not parameter, return a\n// otherwise, return a new tensor: repeat(a) to fit in b\nstruct ggml_tensor * ggml_repeat(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct ggml_tensor * ggml_abs(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_sgn(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_neg(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_step(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_relu(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// TODO: double-check this computation is correct\nstruct ggml_tensor * ggml_gelu(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_silu(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// normalize along rows\n// TODO: eps is hardcoded to 1e-5 for now\nstruct ggml_tensor * ggml_norm(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// A: 
m rows, n columns\n// B: p rows, n columns (i.e. we transpose it internally)\n// result is m columns, p rows\nstruct ggml_tensor * ggml_mul_mat(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\n//\n// operations on tensors without backpropagation\n//\n\n// in-place, returns view(a)\nstruct ggml_tensor * ggml_scale(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\n// a -> b, return view(b)\nstruct ggml_tensor * ggml_cpy(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\n// return view(a), b specifies the new shape\n// TODO: when we start computing gradient, make a copy instead of view\nstruct ggml_tensor * ggml_reshape(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\n// return view(a)\n// TODO: when we start computing gradient, make a copy instead of view\nstruct ggml_tensor * ggml_reshape_2d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        int                   ne1);\n\n// return view(a)\n// TODO: when we start computing gradient, make a copy instead of view\nstruct ggml_tensor * ggml_reshape_3d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        int                   ne1,\n        int                   ne2);\n\n// offset in bytes\nstruct ggml_tensor * ggml_view_1d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        size_t                offset);\n\nstruct ggml_tensor * ggml_view_2d(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   ne0,\n        int                   ne1,\n        size_t                nb1, // row stride in bytes\n        size_t                offset);\n\nstruct ggml_tensor 
* ggml_permute(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   axis0,\n        int                   axis1,\n        int                   axis2,\n        int                   axis3);\n\n// alias for ggml_permute(ctx, a, 1, 0, 2, 3)\nstruct ggml_tensor * ggml_transpose(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\nstruct ggml_tensor * ggml_get_rows(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\n// set elements above the diagonal to -INF\n// in-place, returns view(a)\nstruct ggml_tensor * ggml_diag_mask_inf(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   n_past);\n\n// in-place, returns view(a)\nstruct ggml_tensor * ggml_soft_max(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a);\n\n// rotary position embedding\n// in-place, returns view(a)\n// if mode == 1, skip n_past elements\n// TODO: avoid creating a new tensor every time\nstruct ggml_tensor * ggml_rope(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        int                   n_past,\n        int                   n_dims,\n        int                   mode);\n\n// padding = 1\n// TODO: we don't support extra parameters for now\n//       that's why we are hard-coding the stride, padding, and dilation\n//       not great ..\nstruct ggml_tensor * ggml_conv_1d_1s(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct ggml_tensor * ggml_conv_1d_2s(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b);\n\nstruct ggml_tensor * ggml_flash_attn(\n        struct ggml_context * ctx,\n        struct ggml_tensor  * q,\n        struct ggml_tensor  * k,\n        struct ggml_tensor  * v,\n        bool                  masked);\n\nstruct ggml_tensor * ggml_flash_ff(\n      
  struct ggml_context * ctx,\n        struct ggml_tensor  * a,\n        struct ggml_tensor  * b0,\n        struct ggml_tensor  * b1,\n        struct ggml_tensor  * c0,\n        struct ggml_tensor  * c1);\n\n//\n// automatic differentiation\n//\n\nvoid ggml_set_param(\n        struct ggml_context * ctx,\n        struct ggml_tensor * tensor);\n\nvoid ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);\n\nstruct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);\nstruct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);\n\nvoid ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);\nvoid ggml_graph_reset  (struct ggml_cgraph * cgraph);\n\n// print info and performance information for the graph\nvoid ggml_graph_print(const struct ggml_cgraph * cgraph);\n\n// dump the graph into a file using the dot format\nvoid ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);\n\n//\n// optimization\n//\n\n// optimization methods\nenum ggml_opt_type {\n    GGML_OPT_ADAM,\n    GGML_OPT_LBFGS,\n};\n\n// linesearch methods\nenum ggml_linesearch {\n    GGML_LINESEARCH_DEFAULT = 1,\n\n    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,\n    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,\n    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,\n};\n\n// optimization return values\nenum ggml_opt_result {\n    GGML_OPT_OK = 0,\n    GGML_OPT_DID_NOT_CONVERGE,\n    GGML_OPT_NO_CONTEXT,\n    GGML_OPT_INVALID_WOLFE,\n    GGML_OPT_FAIL,\n\n    GGML_LINESEARCH_FAIL = -128,\n    GGML_LINESEARCH_MINIMUM_STEP,\n    GGML_LINESEARCH_MAXIMUM_STEP,\n    GGML_LINESEARCH_MAXIMUM_ITERATIONS,\n    GGML_LINESEARCH_INVALID_PARAMETERS,\n};\n\n// optimization parameters\n//\n//   see ggml.c (ggml_opt_default_params) for default values\n//\nstruct ggml_opt_params {\n    enum ggml_opt_type type;\n\n    int n_threads;\n\n    // delta-based convergence test\n   
 //\n    //   if past == 0 - disabled\n    //   if past > 0:\n    //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)\n    //\n    int past;\n    float delta;\n\n    // maximum number of iterations without improvement\n    //\n    //   if 0 - disabled\n    //   if > 0:\n    //     assume convergence if no cost improvement in this number of iterations\n    //\n    int max_no_improvement;\n\n    bool print_forward_graph;\n    bool print_backward_graph;\n\n    // ADAM parameters\n    struct {\n        int n_iter;\n\n        float alpha; // learning rate\n        float beta1;\n        float beta2;\n        float eps;   // epsilon for numerical stability\n        float eps_f; // epsilon for convergence test\n        float eps_g; // epsilon for convergence test\n    } adam;\n\n    // LBFGS parameters\n    struct {\n        int m; // number of corrections to approximate the inv. Hessian\n        int n_iter;\n        int max_linesearch;\n\n        float eps;      // convergence tolerance\n        float ftol;     // line search tolerance\n        float wolfe;\n        float min_step;\n        float max_step;\n\n        enum ggml_linesearch linesearch;\n    } lbfgs;\n};\n\nstruct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);\n\n// optimize the function defined by the tensor f\nenum ggml_opt_result ggml_opt(\n        struct ggml_context * ctx,\n        struct ggml_opt_params params,\n        struct ggml_tensor * f);\n\n//\n// system info\n//\n\nint ggml_cpu_has_avx(void);\nint ggml_cpu_has_avx2(void);\nint ggml_cpu_has_avx512(void);\nint ggml_cpu_has_fma(void);\nint ggml_cpu_has_neon(void);\nint ggml_cpu_has_arm_fma(void);\nint ggml_cpu_has_f16c(void);\nint ggml_cpu_has_fp16_va(void);\nint ggml_cpu_has_wasm_simd(void);\nint ggml_cpu_has_blas(void);\nint ggml_cpu_has_sse3(void);\nint ggml_cpu_has_vsx(void);\n\n#ifdef  __cplusplus\n}\n#endif\n"
  },
  {
    "path": "Sources/cpp/quantize.cpp",
    "content": "#include \"ggml.h\"\n\n#include \"utils.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include <fstream>\n#include <map>\n#include <string>\n#include <vector>\n#include <regex>\n\n// TODO: move somewhere else\n#define QK 32\n\n// default hparams (LLaMA76B)\nstruct llama_hparams {\n    int32_t n_vocab = 32000;\n    int32_t n_ctx   = 512;   // this is provided as user input?\n    int32_t n_embd  = 4096;\n    int32_t n_mult  = 256;\n    int32_t n_head  = 32;\n    int32_t n_layer = 32;\n    int32_t n_rot   = 64;\n    int32_t f16     = 1;\n};\n\n\n// quantize a model\nbool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {\n    ggml_type type = GGML_TYPE_Q4_1;\n\n    switch (itype) {\n        case 2: type = GGML_TYPE_Q4_0; break;\n        case 3: type = GGML_TYPE_Q4_1; break;\n        default: fprintf(stderr, \"%s: invalid quantization type %d\\n\", __func__, itype); return 1;\n    };\n\n    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {\n        fprintf(stderr, \"%s: invalid quantization type %d\\n\", __func__, type);\n        return false;\n    }\n\n    gpt_vocab vocab;\n\n    printf(\"%s: loading model from '%s'\\n\", __func__, fname_inp.c_str());\n\n    auto finp = std::ifstream(fname_inp, std::ios::binary);\n    if (!finp) {\n        fprintf(stderr, \"%s: failed to open '%s' for reading\\n\", __func__, fname_inp.c_str());\n        return false;\n    }\n\n    auto fout = std::ofstream(fname_out, std::ios::binary);\n    if (!fout) {\n        fprintf(stderr, \"%s: failed to open '%s' for writing\\n\", __func__, fname_out.c_str());\n        return false;\n    }\n\n    // verify magic\n    {\n        uint32_t magic;\n        finp.read((char *) &magic, sizeof(magic));\n        if (magic != 0x67676d6c) {\n            fprintf(stderr, \"%s: invalid model file '%s' (bad magic)\\n\", __func__, fname_inp.c_str());\n            return false;\n        }\n\n        
fout.write((char *) &magic, sizeof(magic));\n    }\n\n    llama_hparams hparams;\n\n    // load hparams\n    {\n        finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));\n        //finp.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));\n        finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));\n        finp.read((char *) &hparams.n_mult,  sizeof(hparams.n_mult));\n        finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));\n        finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));\n        finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));\n        finp.read((char *) &hparams.f16,     sizeof(hparams.f16));\n\n        printf(\"%s: n_vocab = %d\\n\", __func__, hparams.n_vocab);\n        printf(\"%s: n_ctx   = %d\\n\", __func__, hparams.n_ctx);\n        printf(\"%s: n_embd  = %d\\n\", __func__, hparams.n_embd);\n        printf(\"%s: n_mult  = %d\\n\", __func__, hparams.n_mult);\n        printf(\"%s: n_head  = %d\\n\", __func__, hparams.n_head);\n        printf(\"%s: n_layer = %d\\n\", __func__, hparams.n_layer);\n        printf(\"%s: f16     = %d\\n\", __func__, hparams.f16);\n\n        fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));\n        //fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));\n        fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));\n        fout.write((char *) &hparams.n_mult,  sizeof(hparams.n_mult));\n        fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));\n        fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));\n        fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));\n        fout.write((char *) &itype,           sizeof(hparams.f16));\n    }\n\n    // load vocab\n    {\n        const int32_t n_vocab = hparams.n_vocab;\n\n        if (n_vocab != hparams.n_vocab) {\n            fprintf(stderr, \"%s: invalid model file '%s' (bad vocab size %d != %d)\\n\",\n                    __func__, 
fname_inp.c_str(), n_vocab, hparams.n_vocab);\n            return false;\n        }\n\n        std::string word;\n        for (int i = 0; i < n_vocab; i++) {\n            uint32_t len;\n            finp.read ((char *) &len, sizeof(len));\n            fout.write((char *) &len, sizeof(len));\n\n            word.resize(len);\n            finp.read ((char *) word.data(), len);\n            fout.write((char *) word.data(), len);\n\n            vocab.token_to_id[word] = i;\n            vocab.id_to_token[i] = word;\n        }\n    }\n\n    // load weights\n    {\n        size_t total_size_org = 0;\n        size_t total_size_new = 0;\n\n        std::vector<float> work;\n\n        std::vector<uint8_t>     data_u8;\n        std::vector<ggml_fp16_t> data_f16;\n        std::vector<float>       data_f32;\n\n        std::vector<int64_t> hist_all(1 << 4, 0);\n\n        while (true) {\n            int32_t n_dims;\n            int32_t length;\n            int32_t ftype;\n\n            finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));\n            finp.read(reinterpret_cast<char *>(&length), sizeof(length));\n            finp.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));\n\n            if (finp.eof()) {\n                break;\n            }\n\n            int32_t nelements = 1;\n            int32_t ne[2] = { 1, 1 };\n            for (int i = 0; i < n_dims; ++i) {\n                finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));\n                nelements *= ne[i];\n            }\n\n            std::string name(length, 0);\n            finp.read (&name[0], length);\n\n            {\n                static const char * ftype_str[] = { \"f32\", \"f16\", \"q4_0\", \"q4_1\", };\n                printf(\"%48s - [%5d, %5d], type = %6s \", name.data(), ne[0], ne[1], ftype_str[ftype]);\n            }\n\n            // regexes of tensor names to be quantized\n            const std::vector<std::string> k_names = {\n                \".*weight\",\n            
};\n\n            bool quantize = false;\n            for (const auto & s : k_names) {\n                if (std::regex_match(name, std::regex(s))) {\n                    quantize = true;\n                    break;\n                }\n            }\n\n            // quantize only 2D tensors\n            quantize &= (n_dims == 2);\n\n            if (quantize) {\n                if (ftype != 0 && ftype != 1) {\n                    fprintf(stderr, \"%s: unsupported ftype %d for integer quantization\\n\", __func__, ftype);\n                    return false;\n                }\n\n                if (ftype == 1) {\n                    data_f16.resize(nelements);\n                    finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));\n                    data_f32.resize(nelements);\n                    for (int i = 0; i < nelements; ++i) {\n                        data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);\n                    }\n                } else {\n                    data_f32.resize(nelements);\n                    finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));\n                }\n\n                ftype = itype;\n            } else {\n                const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);\n\n                data_u8.resize(nelements*bpe);\n                finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);\n            }\n\n            fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));\n            fout.write(reinterpret_cast<char *>(&length), sizeof(length));\n            fout.write(reinterpret_cast<char *>(&ftype),  sizeof(ftype));\n            for (int i = 0; i < n_dims; ++i) {\n                fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));\n            }\n            fout.write(&name[0], length);\n\n            if (quantize) {\n                printf(\"quantizing .. 
\");\n                work.resize(nelements); // for quantization\n\n                size_t cur_size = 0;\n                std::vector<int64_t> hist_cur(1 << 4, 0);\n\n                switch (type) {\n                    case GGML_TYPE_Q4_0:\n                        {\n                            cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());\n                        } break;\n                    case GGML_TYPE_Q4_1:\n                        {\n                            cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], QK, hist_cur.data());\n                        } break;\n                    default:\n                        {\n                            fprintf(stderr, \"%s: unsupported quantization type %d\\n\", __func__, type);\n                            return false;\n                        }\n                }\n\n                fout.write(reinterpret_cast<char *>(work.data()), cur_size);\n                total_size_new += cur_size;\n\n                printf(\"size = %8.2f MB -> %8.2f MB | hist: \", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);\n                for (int i = 0; i < hist_cur.size(); ++i) {\n                    hist_all[i] += hist_cur[i];\n                }\n\n                for (int i = 0; i < hist_cur.size(); ++i) {\n                    printf(\"%5.3f \", hist_cur[i] / (float)nelements);\n                }\n                printf(\"\\n\");\n            } else {\n                printf(\"size = %8.3f MB\\n\", data_u8.size()/1024.0/1024.0);\n                fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());\n                total_size_new += data_u8.size();\n            }\n\n            total_size_org += nelements * sizeof(float);\n        }\n\n        printf(\"%s: model size  = %8.2f MB\\n\", __func__, total_size_org/1024.0/1024.0);\n        printf(\"%s: quant size  = %8.2f MB\\n\", __func__, 
total_size_new/1024.0/1024.0);\n\n        {\n            int64_t sum_all = 0;\n            for (int i = 0; i < hist_all.size(); ++i) {\n                sum_all += hist_all[i];\n            }\n\n            printf(\"%s: hist: \", __func__);\n            for (int i = 0; i < hist_all.size(); ++i) {\n                printf(\"%5.3f \", hist_all[i] / (float)sum_all);\n            }\n            printf(\"\\n\");\n        }\n    }\n\n    finp.close();\n    fout.close();\n\n    return true;\n}\n\n// usage:\n//  ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type\n//\nint main(int argc, char ** argv) {\n    ggml_time_init();\n    if (argc != 4) {\n        fprintf(stderr, \"usage: %s model-f32.bin model-quant.bin type\\n\", argv[0]);\n        fprintf(stderr, \"  type = 2 - q4_0\\n\");\n        fprintf(stderr, \"  type = 3 - q4_1\\n\");\n        return 1;\n    }\n\n    // needed to initialize f16 tables\n    {\n        struct ggml_init_params params = { 0, NULL };\n        struct ggml_context * ctx = ggml_init(params);\n        ggml_free(ctx);\n    }\n\n    const std::string fname_inp = argv[1];\n    const std::string fname_out = argv[2];\n\n    const int itype = atoi(argv[3]);\n\n    const int64_t t_main_start_us = ggml_time_us();\n\n    int64_t t_quantize_us = 0;\n\n    // load the model\n    {\n        const int64_t t_start_us = ggml_time_us();\n\n        if (!llama_model_quantize(fname_inp, fname_out, itype)) {\n            fprintf(stderr, \"%s: failed to quantize model from '%s'\\n\", __func__, fname_inp.c_str());\n            return 1;\n        }\n\n        t_quantize_us = ggml_time_us() - t_start_us;\n    }\n\n    // report timing\n    {\n        const int64_t t_main_end_us = ggml_time_us();\n\n        printf(\"\\n\");\n        printf(\"%s: quantize time = %8.2f ms\\n\", __func__, t_quantize_us/1000.0f);\n        printf(\"%s:    total time = %8.2f ms\\n\", __func__, (t_main_end_us - t_main_start_us)/1000.0f);\n    }\n\n    return 0;\n}\n"
  },
  {
    "path": "Sources/cpp/utils.cpp",
    "content": "#include \"utils.h\"\n\n#include <cassert>\n#include <cstring>\n#include <fstream>\n#include <regex>\n#include <iostream>\n#include <iterator>\n#include <string>\n#include <math.h>\n\n #if defined(_MSC_VER) || defined(__MINGW32__)\n #include <malloc.h> // using malloc.h with MSC/MINGW\n #elif !defined(__FreeBSD__) && !defined(__NetBSD__)\n #include <alloca.h>\n #endif\n\nbool gpt_params_parse(int argc, char ** argv, gpt_params & params) {\n    for (int i = 1; i < argc; i++) {\n        std::string arg = argv[i];\n\n        if (arg == \"-s\" || arg == \"--seed\") {\n            params.seed = std::stoi(argv[++i]);\n        } else if (arg == \"-t\" || arg == \"--threads\") {\n            params.n_threads = std::stoi(argv[++i]);\n        } else if (arg == \"-p\" || arg == \"--prompt\") {\n            params.prompt = argv[++i];\n        } else if (arg == \"-f\" || arg == \"--file\") {\n\n            std::ifstream file(argv[++i]);\n\n            std::copy(std::istreambuf_iterator<char>(file),\n                    std::istreambuf_iterator<char>(),\n                    back_inserter(params.prompt));\n                \n        } else if (arg == \"-n\" || arg == \"--n_predict\") {\n            params.n_predict = std::stoi(argv[++i]);\n        } else if (arg == \"--top_k\") {\n            params.top_k = std::stoi(argv[++i]);\n        } else if (arg == \"--top_p\") {\n            params.top_p = std::stof(argv[++i]);\n        } else if (arg == \"--temp\") {\n            params.temp = std::stof(argv[++i]);\n        } else if (arg == \"--repeat_last_n\") {\n            params.repeat_last_n = std::stoi(argv[++i]);\n        } else if (arg == \"--repeat_penalty\") {\n            params.repeat_penalty = std::stof(argv[++i]);\n        } else if (arg == \"-b\" || arg == \"--batch_size\") {\n            params.n_batch = std::stoi(argv[++i]);\n        } else if (arg == \"-m\" || arg == \"--model\") {\n            params.model = argv[++i];\n        } else if (arg == \"-i\" 
|| arg == \"--interactive\") {\n            params.interactive = true;\n        } else if (arg == \"--interactive-start\") {\n            params.interactive = true;\n            params.interactive_start = true;\n        } else if (arg == \"--color\") {\n            params.use_color = true;\n        } else if (arg == \"-r\" || arg == \"--reverse-prompt\") {\n            params.antiprompt = argv[++i];\n        } else if (arg == \"-h\" || arg == \"--help\") {\n            gpt_print_usage(argc, argv, params);\n            exit(0);\n        } else {\n            fprintf(stderr, \"error: unknown argument: %s\\n\", arg.c_str());\n            gpt_print_usage(argc, argv, params);\n            exit(0);\n        }\n    }\n\n    return true;\n}\n\nvoid gpt_print_usage(int argc, char ** argv, const gpt_params & params) {\n    fprintf(stderr, \"usage: %s [options]\\n\", argv[0]);\n    fprintf(stderr, \"\\n\");\n    fprintf(stderr, \"options:\\n\");\n    fprintf(stderr, \"  -h, --help            show this help message and exit\\n\");\n    fprintf(stderr, \"  -i, --interactive     run in interactive mode\\n\");\n    fprintf(stderr, \"  --interactive-start   run in interactive mode and poll user input at startup\\n\");\n    fprintf(stderr, \"  -r PROMPT, --reverse-prompt PROMPT\\n\");\n    fprintf(stderr, \"                        in interactive mode, poll user input upon seeing PROMPT\\n\");\n    fprintf(stderr, \"  --color               colorise output to distinguish prompt and user input from generations\\n\");\n    fprintf(stderr, \"  -s SEED, --seed SEED  RNG seed (default: -1)\\n\");\n    fprintf(stderr, \"  -t N, --threads N     number of threads to use during computation (default: %d)\\n\", params.n_threads);\n    fprintf(stderr, \"  -p PROMPT, --prompt PROMPT\\n\");\n    fprintf(stderr, \"                        prompt to start generation with (default: random)\\n\");\n    fprintf(stderr, \"  -f FNAME, --file FNAME\\n\");\n    fprintf(stderr, \"                        
prompt file to start generation.\\n\");\n    fprintf(stderr, \"  -n N, --n_predict N   number of tokens to predict (default: %d)\\n\", params.n_predict);\n    fprintf(stderr, \"  --top_k N             top-k sampling (default: %d)\\n\", params.top_k);\n    fprintf(stderr, \"  --top_p N             top-p sampling (default: %.1f)\\n\", params.top_p);\n    fprintf(stderr, \"  --repeat_last_n N     last n tokens to consider for penalize (default: %d)\\n\", params.repeat_last_n);\n    fprintf(stderr, \"  --repeat_penalty N    penalize repeat sequence of tokens (default: %.1f)\\n\", params.repeat_penalty);\n    fprintf(stderr, \"  --temp N              temperature (default: %.1f)\\n\", params.temp);\n    fprintf(stderr, \"  -b N, --batch_size N  batch size for prompt processing (default: %d)\\n\", params.n_batch);\n    fprintf(stderr, \"  -m FNAME, --model FNAME\\n\");\n    fprintf(stderr, \"                        model path (default: %s)\\n\", params.model.c_str());\n    fprintf(stderr, \"\\n\");\n}\n\nstd::string gpt_random_prompt(std::mt19937 & rng) {\n    const int r = rng() % 10;\n    switch (r) {\n        case 0: return \"So\";\n        case 1: return \"Once upon a time\";\n        case 2: return \"When\";\n        case 3: return \"The\";\n        case 4: return \"After\";\n        case 5: return \"If\";\n        case 6: return \"import\";\n        case 7: return \"He\";\n        case 8: return \"She\";\n        case 9: return \"They\";\n        default: return \"To\";\n    }\n\n    return \"The\";\n}\n\nvoid replace(std::string & str, const std::string & needle, const std::string & replacement) {\n    size_t pos = 0;\n    while ((pos = str.find(needle, pos)) != std::string::npos) {\n        str.replace(pos, needle.length(), replacement);\n        pos += replacement.length();\n    }\n}\n\nstd::map<std::string, int32_t> json_parse(const std::string & fname) {\n    std::map<std::string, int32_t> result;\n\n    // read file into string\n    std::string json;\n    {\n  
      std::ifstream ifs(fname);\n        if (!ifs) {\n            fprintf(stderr, \"Failed to open %s\\n\", fname.c_str());\n            exit(1);\n        }\n\n        json = std::string((std::istreambuf_iterator<char>(ifs)),\n                (std::istreambuf_iterator<char>()));\n    }\n\n    if (json[0] != '{') {\n        return result;\n    }\n\n    // parse json\n    {\n        bool has_key  = false;\n        bool in_token = false;\n\n        std::string str_key = \"\";\n        std::string str_val = \"\";\n\n        int n = json.size();\n        for (int i = 1; i < n; ++i) {\n            if (!in_token) {\n                if (json[i] == ' ') continue;\n                if (json[i] == '\"') {\n                    in_token = true;\n                    continue;\n                }\n            } else {\n                if (json[i] == '\\\\' && i+1 < n) {\n                    if (has_key == false) {\n                        str_key += json[i];\n                    } else {\n                        str_val += json[i];\n                    }\n                    ++i;\n                } else if (json[i] == '\"') {\n                    if (has_key == false) {\n                        has_key = true;\n                        ++i;\n                        while (json[i] == ' ') ++i;\n                        ++i; // :\n                        while (json[i] == ' ') ++i;\n                        if (json[i] != '\\\"') {\n                            while (json[i] != ',' && json[i] != '}') {\n                                str_val += json[i++];\n                            }\n                            has_key = false;\n                        } else {\n                            in_token = true;\n                            continue;\n                        }\n                    } else {\n                        has_key = false;\n                    }\n\n                    ::replace(str_key, \"\\\\u0120\", \" \" ); // \\u0120 -> space\n                    
::replace(str_key, \"\\\\u010a\", \"\\n\"); // \\u010a -> new line\n                    ::replace(str_key, \"\\\\\\\"\",    \"\\\"\"); // \\\\\\\"   -> \"\n\n                    try {\n                        result[str_key] = std::stoi(str_val);\n                    } catch (...) {\n                        //fprintf(stderr, \"%s: ignoring key '%s' with value '%s'\\n\", fname.c_str(), str_key.c_str(), str_val.c_str());\n\n                    }\n                    str_key = \"\";\n                    str_val = \"\";\n                    in_token = false;\n                    continue;\n                }\n                if (has_key == false) {\n                    str_key += json[i];\n                } else {\n                    str_val += json[i];\n                }\n            }\n        }\n    }\n\n    return result;\n}\n\nstd::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {\n    std::vector<std::string> words;\n\n    // first split the text into words\n    {\n        std::string str = text;\n        std::string pat = R\"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s[:alpha:][:digit:]]+|\\s+(?!\\S)|\\s+)\";\n\n        std::regex re(pat);\n        std::smatch m;\n\n        while (std::regex_search(str, m, re)) {\n            for (auto x : m) {\n                words.push_back(x);\n            }\n            str = m.suffix();\n        }\n    }\n\n    // find the longest tokens that form the words:\n    std::vector<gpt_vocab::id> tokens;\n    for (const auto & word : words) {\n        if (word.size() == 0) continue;\n\n        int i = 0;\n        int n = word.size();\n        while (i < n) {\n            int j = n;\n            while (j > i) {\n                auto it = vocab.token_to_id.find(word.substr(i, j-i));\n                if (it != vocab.token_to_id.end()) {\n                    tokens.push_back(it->second);\n                    i = j;\n                    break;\n                }\n                
--j;\n            }\n            if (i == n) {\n                break;\n            }\n            if (j == i) {\n                auto sub = word.substr(i, 1);\n                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {\n                    tokens.push_back(vocab.token_to_id.at(sub));\n                } else {\n                    fprintf(stderr, \"%s: unknown token '%s'\\n\", __func__, sub.data());\n                }\n                ++i;\n            }\n        }\n    }\n\n    return tokens;\n}\n\nstd::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {\n    //auto res = gpt_tokenize(vocab, text);\n\n    //if (bos) {\n    //    res.insert(res.begin(), 1); // TODO: replace with vocab.bos\n    //}\n\n    std::vector<gpt_vocab::id> res;\n\n    if (bos) {\n        res.push_back(1); // TODO: replace with vocab.bos\n    }\n\n     //find the longest token that matches the text\n    int pos = 0;\n    while (true) {\n        int l = 0;\n        int t = 0;\n        for (const auto & kv : vocab.id_to_token) {\n            if (kv.second.size() < l) continue;\n            if (kv.second.size() > text.size() - pos) continue;\n            if (text.substr(pos, kv.second.size()) == kv.second) {\n                l = kv.second.size();\n                t = kv.first;\n            }\n        }\n\n        if (l == 0) {\n            break;\n        }\n\n        res.push_back(t);\n        pos += l;\n    }\n\n    return res;\n}\n\nbool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {\n    printf(\"%s: loading vocab from '%s'\\n\", __func__, fname.c_str());\n\n    vocab.token_to_id = ::json_parse(fname);\n\n    for (const auto & kv : vocab.token_to_id) {\n        vocab.id_to_token[kv.second] = kv.first;\n    }\n\n    printf(\"%s: vocab size = %d\\n\", __func__, (int) vocab.token_to_id.size());\n\n    // print the vocabulary\n    //for (auto kv : vocab.token_to_id) {\n    //    printf(\"'%s' -> %d\\n\", 
kv.first.data(), kv.second);\n    //}\n\n    return true;\n}\n\n\nvoid sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k) {\n    // find the top K tokens\n    std::partial_sort(\n            logits_id.begin(),\n            logits_id.begin() + top_k, logits_id.end(),\n            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {\n        return a.first > b.first;\n    });\n\n    logits_id.resize(top_k);\n}\n\ngpt_vocab::id llama_sample_top_p_top_k(\n        const gpt_vocab & vocab,\n        const float * logits,\n        std::vector<gpt_vocab::id> & last_n_tokens,\n        double repeat_penalty,\n        int top_k,\n        double top_p,\n        double temp,\n        std::mt19937 & rng) {\n    int n_logits = vocab.id_to_token.size();\n\n    std::vector<std::pair<double, gpt_vocab::id>> logits_id;\n    logits_id.reserve(n_logits);\n\n    {\n        const double scale = 1.0/temp;\n        for (int i = 0; i < n_logits; ++i) {\n            // repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858)\n            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main\n            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {\n                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability\n                if (logits[i] < 0.0) {\n                    logits_id.push_back(std::make_pair(logits[i]*scale*repeat_penalty, i));\n                } else {\n                    logits_id.push_back(std::make_pair(logits[i]*scale/repeat_penalty, i));\n                }                \n            } else {\n                logits_id.push_back(std::make_pair(logits[i]*scale, i));\n            }\n        }\n    }\n\n    sample_top_k(logits_id, top_k);\n\n    double maxl = -INFINITY;\n    for (const auto & kv : logits_id) {\n        maxl = std::max(maxl, kv.first);\n    }\n\n    // 
compute probs for the top K tokens\n    std::vector<double> probs;\n    probs.reserve(logits_id.size());\n\n    double sum = 0.0;\n    for (const auto & kv : logits_id) {\n        double p = exp(kv.first - maxl);\n        probs.push_back(p);\n        sum += p;\n    }\n\n    // normalize the probs\n    for (auto & p : probs) {\n        p /= sum;\n    }\n\n    if (top_p < 1.0f) {\n        double cumsum = 0.0f;\n        for (int i = 0; i < (int) probs.size(); i++) {\n            cumsum += probs[i];\n            if (cumsum >= top_p) {\n                probs.resize(i + 1);\n                logits_id.resize(i + 1);\n                break;\n            }\n        }\n\n        cumsum = 1.0/cumsum;\n        for (int i = 0; i < (int) probs.size(); i++) {\n            probs[i] *= cumsum;\n        }\n    }\n\n    //printf(\"\\n\");\n    //for (int i = 0; i < (int) 10; i++) {\n    //    printf(\"%d: '%s' %f\\n\", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);\n    //}\n    //printf(\"\\n\\n\");\n    //exit(0);\n\n    std::discrete_distribution<> dist(probs.begin(), probs.end());\n    int idx = dist(rng);\n\n    return logits_id[idx].second;\n}\n\n\nsize_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist) {\n    const int nb = k / qk;\n    const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2);\n    const size_t row_size = nb*bs;\n\n    assert(k % qk == 0);\n\n    const size_t pp_size = qk / 2;\n    uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));\n\n    char * pdst = (char *) dst;\n\n    for (int j = 0; j < n; j += k) {\n        uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs);\n        uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float));\n\n        for (int i = 0; i < nb; i++) {\n            float amax = 0.0f; // absolute max\n\n            {\n                for (int l = 0; l < qk; l++) {\n                    const float v = src[j + i*qk + l];\n                    amax = std::max(amax, 
fabsf(v));\n                }\n\n                const float d = amax / ((1 << 3) - 1);\n                const float id = d ? 1.0f/d : 0.0f;\n\n                *(float *) pd = d;\n                pd += bs;\n\n                for (int l = 0; l < qk; l += 2) {\n                    const float v0 = (src[j + i*qk + l + 0])*id;\n                    const float v1 = (src[j + i*qk + l + 1])*id;\n\n                    const uint8_t vi0 = ((int8_t) (round(v0))) + 8;\n                    const uint8_t vi1 = ((int8_t) (round(v1))) + 8;\n\n                    assert(vi0 >= 0 && vi0 < 16);\n                    assert(vi1 >= 0 && vi1 < 16);\n\n                    hist[vi0]++;\n                    hist[vi1]++;\n\n                    pp[l/2] = vi0 | (vi1 << 4);\n                }\n\n                memcpy(pb, pp, pp_size);\n                pb += bs;\n            }\n        }\n    }\n\n    return (n/k)*row_size;\n}\n\nsize_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist) {\n    const int nb = k / qk;\n    const size_t row_size = nb*(2*sizeof(float) + sizeof(uint8_t)*qk/2);\n\n    assert(k % qk == 0);\n\n    const size_t pp_size = qk / 2;\n    uint8_t *pp = static_cast<uint8_t*>(alloca(pp_size));\n\n    char * pdst = (char *) dst;\n\n    for (int j = 0; j < n; j += k) {\n        float   * pm = (float *)   (pdst + (j/k)*row_size);\n        float   * pd = (float *)   (pm + nb);\n        uint8_t * pb = (uint8_t *) (pd + nb);\n\n        //printf(\"n = %d, k = %d, nb = %d, row_size = %d, j = %d, pm = %p, pd = %p, pb = %p\\n\", n, k, nb, row_size, j, pm, pd, pb);\n\n        for (int i = 0; i < nb; i++) {\n            float min = std::numeric_limits<float>::max();\n            float max = std::numeric_limits<float>::min();\n\n            {\n                for (int l = 0; l < qk; l++) {\n                    const float v = src[j + i*qk + l];\n                    if (v < min) min = v;\n                    if (v > max) max = v;\n                }\n\n        
        const float d = (max - min) / ((1 << 4) - 1);\n                const float id = d ? 1.0f/d : 0.0f;\n\n                pm[i] = min;\n                pd[i] = d;\n\n                for (int l = 0; l < qk; l += 2) {\n                    const float v0 = (src[j + i*qk + l + 0] - min)*id;\n                    const float v1 = (src[j + i*qk + l + 1] - min)*id;\n\n                    const uint8_t vi0 = round(v0);\n                    const uint8_t vi1 = round(v1);\n\n                    assert(vi0 >= 0 && vi0 < 16);\n                    assert(vi1 >= 0 && vi1 < 16);\n\n                    hist[vi0]++;\n                    hist[vi1]++;\n\n                    pp[l/2] = vi0 | (vi1 << 4);\n                }\n\n                memcpy(pb + i*qk/2, pp, pp_size);\n            }\n        }\n    }\n\n    return (n/k)*row_size;\n}\n"
  },
  {
    "path": "Sources/cpp/utils.h",
    "content": "// Various helper functions and utilities\n\n#pragma once\n\n#include <string>\n#include <map>\n#include <vector>\n#include <random>\n#include <thread>\n\n//\n// CLI argument parsing\n//\n\nstruct gpt_params {\n    int32_t seed      = -1; // RNG seed\n    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());\n    int32_t n_predict = 128; // new tokens to predict\n    int32_t repeat_last_n = 64;  // last n tokens to penalize\n\n    // sampling parameters\n    int32_t top_k = 40;\n    float   top_p = 0.95f;\n    float   temp  = 0.80f;\n    float   repeat_penalty  = 1.30f;\n\n    int32_t n_batch = 8; // batch size for prompt processing\n\n    std::string model = \"models/lamma-7B/ggml-model.bin\"; // model path\n    std::string prompt;\n\n    bool use_color = false; // use color to distinguish generations and inputs\n\n    bool interactive = false; // interactive mode\n    bool interactive_start = false; // reverse prompt immediately\n    std::string antiprompt = \"\"; // string upon seeing which more user input is prompted\n};\n\nbool gpt_params_parse(int argc, char ** argv, gpt_params & params);\n\nvoid gpt_print_usage(int argc, char ** argv, const gpt_params & params);\n\nstd::string gpt_random_prompt(std::mt19937 & rng);\n\n//\n// Vocab utils\n//\n\nstruct gpt_vocab {\n    using id    = int32_t;\n    using token = std::string;\n\n    std::map<token, id> token_to_id;\n    std::map<id, token> id_to_token;\n};\n\nvoid replace(std::string & str, const std::string & needle, const std::string & replacement);\n\n// poor-man's JSON parsing\nstd::map<std::string, int32_t> json_parse(const std::string & fname);\n\n// split text into tokens\n//\n// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53\n//\n// Regex (Python):\n// r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\"\n//\n// Regex (C++):\n// R\"('s|'t|'re|'ve|'m|'ll|'d| 
?[[:alpha:]]+| ?[[:digit:]]+| ?[^\\s[:alpha:][:digit:]]+|\\s+(?!\\S)|\\s+)\"\n//\nstd::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);\n\n// TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..\n// ref: https://github.com/google/sentencepiece\nstd::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);\n\n// load the tokens from encoder.json\nbool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);\n\n// sample next token given probabilities for each embedding\n//\n//   - consider only the top K tokens\n//   - from them, consider only the top tokens with cumulative probability > P\n//\ngpt_vocab::id llama_sample_top_p_top_k(\n        const gpt_vocab & vocab,\n        const float * logits,\n        std::vector<gpt_vocab::id> & last_n_tokens,\n        double repeat_penalty,\n        int top_k,\n        double top_p,\n        double temp,\n        std::mt19937 & rng);\n\n// filer to top K tokens from list of logits\nvoid sample_top_k(std::vector<std::pair<double, gpt_vocab::id>> & logits_id, int top_k);\n\n//\n// Quantization\n//\n\nsize_t ggml_quantize_q4_0(float * src, void * dst, int n, int k, int qk, int64_t * hist);\nsize_t ggml_quantize_q4_1(float * src, void * dst, int n, int k, int qk, int64_t * hist);\n"
  },
  {
    "path": "Sources/llama/LlamaRunner.swift",
    "content": "//\n//  LlamaRunner.swift\n//  llama\n//\n//  Created by Alex Rozanski on 12/03/2023.\n//\n\nimport Foundation\nimport llamaObjCxx\n\npublic class LlamaRunner {\n  public struct Config {\n    public let numThreads: UInt\n    public let numTokens: UInt\n    public let reversePrompt: String?\n\n    public static let `default` = Config(numThreads: 8, numTokens: 512, reversePrompt: nil)\n\n    public init(numThreads: UInt, numTokens: UInt, reversePrompt: String? = nil) {\n      self.numThreads = numThreads\n      self.numTokens = numTokens\n      self.reversePrompt = reversePrompt\n    }\n\n    fileprivate func toBridgeConfig() -> _LlamaRunnerBridgeConfig {\n      let _config = _LlamaRunnerBridgeConfig()\n      _config.numberOfThreads = numThreads\n      _config.numberOfTokens = numTokens\n      _config.reversePrompt = reversePrompt\n      return _config\n    }\n  }\n\n  public enum RunState {\n    case notStarted\n    case initializing\n    case generatingOutput\n    case completed\n    case failed(error: Error?)\n  }\n\n  public let modelURL: URL\n\n  private lazy var bridge = _LlamaRunnerBridge(modelPath: modelURL.path)\n\n  public init(modelURL: URL) {\n    self.modelURL = modelURL\n  }\n\n  // Async-based run() function.\n  public func run(\n    with prompt: String,\n    config: Config = .default,\n    stateChangeHandler: ((RunState) -> Void)? 
= nil\n  ) -> AsyncThrowingStream<String, Error> {\n    return AsyncThrowingStream<String, Error> { continuation in\n      stateChangeHandler?(.notStarted)\n\n      bridge.run(\n        withPrompt: prompt,\n        config: config.toBridgeConfig(),\n        eventHandler: { event in\n          event.match(\n            startedLoadingModel: {\n              stateChangeHandler?(.initializing)\n            },\n            finishedLoadingModel: {},\n            startedGeneratingOutput: {\n              stateChangeHandler?(.generatingOutput)\n            },\n            outputToken: { token in\n              continuation.yield(token)\n            },\n            completed: {\n              stateChangeHandler?(.completed)\n              continuation.finish()\n            },\n            failed: { error in\n              stateChangeHandler?(.failed(error: error))\n              continuation.finish(throwing: error)\n            }\n          )\n        },\n        eventHandlerQueue: DispatchQueue.main\n      )\n    }\n  }\n\n  // Closure-based run() function.\n  public func run(\n    with prompt: String,\n    config: Config = .default,\n    tokenHandler: @escaping (String) -> Void,\n    stateChangeHandler: ((RunState) -> Void)? 
= nil\n  ) {\n    stateChangeHandler?(.notStarted)\n\n    bridge.run(\n      withPrompt: prompt,\n      config: config.toBridgeConfig(),\n      eventHandler: { event in\n        event.match(\n          startedLoadingModel: {\n            stateChangeHandler?(.initializing)\n          },\n          finishedLoadingModel: {},\n          startedGeneratingOutput: {\n            stateChangeHandler?(.generatingOutput)\n          },\n          outputToken: { token in\n            tokenHandler(token)\n          },\n          completed: {\n            stateChangeHandler?(.completed)\n          },\n          failed: { error in\n            stateChangeHandler?(.failed(error: error))\n          }\n        )\n      },\n      eventHandlerQueue: DispatchQueue.main\n    )\n  }\n}\n"
  },
  {
    "path": "Sources/llamaObjCxx/LlamaError.m",
    "content": "//\n//  LlamaError.m\n//  llama\n//\n//  Created by Alex Rozanski on 14/03/2023.\n//\n\n#import \"LlamaError.h\"\n\nNSString *const LlamaErrorDomain = @\"com.alexrozanski.llama.error\";\n"
  },
  {
    "path": "Sources/llamaObjCxx/bridge/LlamaEvent.mm",
    "content": "//\n//  LlamaEvent.mm\n//  llama\n//\n//  Created by Alex Rozanski on 14/03/2023.\n//\n\n#include \"LlamaEvent.h\"\n\ntypedef NS_ENUM(NSUInteger, LlamaEventType) {\n  LlamaEventTypeNone = 0,\n  LlamaEventTypeStartedLoadingModel,\n  LlamaEventTypeFinishedLoadingModel,\n  LlamaEventTypeStartedGeneratingOutput,\n  LlamaEventTypeOutputToken,\n  LlamaEventTypeCompleted,\n  LlamaEventTypeFailed,\n};\n\ntypedef struct LlamaEventData {\n  NSString *outputToken_token;\n  NSError *failed_error;\n} LlamaEventData;\n\n@interface _LlamaEvent () {\n  LlamaEventType _eventType;\n  LlamaEventData _data;\n}\n\n- (instancetype)initWithEventType:(LlamaEventType)eventType data:(LlamaEventData)data;\n\n@end\n\n@implementation _LlamaEvent\n\n- (instancetype)initWithEventType:(LlamaEventType)eventType data:(LlamaEventData)data\n{\n  if ((self = [super init])) {\n    _eventType = eventType;\n    _data = data;\n  }\n\n  return self;\n}\n\n+ (instancetype)startedLoadingModel\n{\n  LlamaEventData data;\n  _LlamaEvent *event = [[_LlamaEvent alloc] initWithEventType:LlamaEventTypeStartedLoadingModel data:{}];\n  return event;\n}\n\n+ (instancetype)finishedLoadingModel\n{\n  LlamaEventData data;\n  _LlamaEvent *event = [[_LlamaEvent alloc] initWithEventType:LlamaEventTypeFinishedLoadingModel data:{}];\n  return event;\n}\n\n+ (instancetype)startedGeneratingOutput\n{\n  LlamaEventData data;\n  _LlamaEvent *event = [[_LlamaEvent alloc] initWithEventType:LlamaEventTypeStartedGeneratingOutput data:{}];\n  return event;\n}\n\n+ (instancetype)outputTokenWithToken:(nonnull NSString *)token\n{\n  _LlamaEvent *event = [[_LlamaEvent alloc] initWithEventType:LlamaEventTypeOutputToken data:{ .outputToken_token = token }];\n  return event;\n}\n\n+ (instancetype)completed\n{\n  _LlamaEvent *event = [[_LlamaEvent alloc] initWithEventType:LlamaEventTypeCompleted data:{}];\n  return event;\n}\n\n+ (instancetype)failedWithError:(nonnull NSError *)error\n{\n  _LlamaEvent *event = [[_LlamaEvent 
alloc] initWithEventType:LlamaEventTypeFailed data:{ .failed_error = error }];\n  return event;\n}\n\n- (void)matchWithStartedLoadingModel:(void (^)(void))startedLoadingModel\n                finishedLoadingModel:(void (^)(void))finishedLoadingModel\n             startedGeneratingOutput:(void (^)(void))startedGeneratingOutput\n                         outputToken:(void (^)(NSString *token))outputToken\n                           completed:(void (^)(void))completed\n                              failed:(void (^)(NSError *error))failed\n{\n  switch (_eventType) {\n    case LlamaEventTypeNone:\n      break;\n    case LlamaEventTypeStartedLoadingModel:\n      startedLoadingModel();\n      break;\n    case LlamaEventTypeFinishedLoadingModel:\n      finishedLoadingModel();\n      break;\n    case LlamaEventTypeStartedGeneratingOutput:\n      startedGeneratingOutput();\n      break;\n    case LlamaEventTypeOutputToken:\n      outputToken(_data.outputToken_token);\n      break;\n    case LlamaEventTypeCompleted:\n      completed();\n      break;\n    case LlamaEventTypeFailed:\n      failed(_data.failed_error);\n      break;\n  }\n}\n\n@end\n"
  },
  {
    "path": "Sources/llamaObjCxx/bridge/LlamaPredictOperation.hh",
    "content": "//\n//  LlamaPredictOperation.h\n//  llama\n//\n//  Created by Alex Rozanski on 13/03/2023.\n//\n\n#import <Foundation/NSOperation.h>\n#import \"utils.h\"\n\n@class _LlamaEvent;\n\nNS_ASSUME_NONNULL_BEGIN\n\ntypedef void (^LlamaPredictOperationEventHandler)(_LlamaEvent *event);\n\n@interface LlamaPredictOperation : NSOperation\n\n- (instancetype)initWithParams:(gpt_params)params\n                  eventHandler:(LlamaPredictOperationEventHandler)eventHandler\n             eventHandlerQueue:(dispatch_queue_t)eventHandlerQueue;\n\n@end\n\nNS_ASSUME_NONNULL_END\n"
  },
  {
    "path": "Sources/llamaObjCxx/bridge/LlamaPredictOperation.mm",
    "content": "//\n//  LlamaPredictOperation.m\n//  llama\n//\n//  Created by Alex Rozanski on 13/03/2023.\n//\n\n#import \"LlamaPredictOperation.hh\"\n\n#import \"LlamaError.h\"\n#import \"LlamaEvent.h\"\n#import \"LlamaRunnerBridgeConfig.h\"\n\n#include \"ggml.h\"\n\n#include \"utils.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include <fstream>\n#include <map>\n#include <string>\n#include <vector>\n\n#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))\n#include <signal.h>\n#include <unistd.h>\n#endif\n\n// determine number of model parts based on the dimension\nstatic const std::map<int, int> LLAMA_N_PARTS = {\n  { 4096, 1 },\n  { 5120, 2 },\n  { 6656, 4 },\n  { 8192, 8 },\n};\n\n// default hparams (LLaMA 7B)\nstruct llama_hparams {\n  int32_t n_vocab = 32000;\n  int32_t n_ctx   = 512;   // this is provided as user input?\n  int32_t n_embd  = 4096;\n  int32_t n_mult  = 256;\n  int32_t n_head  = 32;\n  int32_t n_layer = 32;\n  int32_t n_rot   = 64;\n  int32_t f16     = 1;\n};\n\nstruct llama_layer {\n  // normalization\n  struct ggml_tensor * attention_norm;\n\n  // attention\n  struct ggml_tensor * wq;\n  struct ggml_tensor * wk;\n  struct ggml_tensor * wv;\n  struct ggml_tensor * wo;\n\n  // normalization\n  struct ggml_tensor * ffn_norm;\n\n  // ff\n  struct ggml_tensor * w1;\n  struct ggml_tensor * w2;\n  struct ggml_tensor * w3;\n};\n\nstruct llama_model {\n  llama_hparams hparams;\n\n  struct ggml_tensor * tok_embeddings;\n\n  struct ggml_tensor * norm;\n  struct ggml_tensor * output;\n\n  std::vector<llama_layer> layers;\n\n  // key + value memory\n  struct ggml_tensor * memory_k;\n  struct ggml_tensor * memory_v;\n\n  //\n  struct ggml_context * ctx;\n  std::map<std::string, struct ggml_tensor *> tensors;\n};\n\nNSError *makeLlamaError(LlamaErrorCode errorCode, NSString *description)\n{\n  return [[NSError alloc] initWithDomain:LlamaErrorDomain code:errorCode userInfo:@{\n    
NSLocalizedDescriptionKey: description\n  }];\n}\n\n// load the model's weights from a file\nbool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, NSError **outError) {\n  auto fin = std::ifstream(fname, std::ios::binary);\n  if (!fin) {\n    *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                               [NSString stringWithFormat:@\"failed to open '%s'\", fname.c_str()]);\n    return false;\n  }\n\n  // verify magic\n  {\n    uint32_t magic;\n    fin.read((char *) &magic, sizeof(magic));\n    if (magic != 0x67676d6c) {\n      *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                 [NSString stringWithFormat:@\"invalid model file '%s' (bad magic)\", fname.c_str()]);\n      return false;\n    }\n  }\n\n  int n_ff = 0;\n  int n_parts = 0;\n\n  // load hparams\n  {\n    auto & hparams = model.hparams;\n\n    fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));\n    //fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));\n    fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));\n    fin.read((char *) &hparams.n_mult,  sizeof(hparams.n_mult));\n    fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));\n    fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));\n    fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));\n    fin.read((char *) &hparams.f16,     sizeof(hparams.f16));\n\n    hparams.n_ctx = n_ctx;\n\n    n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;\n    n_parts = LLAMA_N_PARTS.at(hparams.n_embd);\n  }\n\n  // load vocab\n  {\n    const int32_t n_vocab = model.hparams.n_vocab;\n\n    if (n_vocab != model.hparams.n_vocab) {\n      *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                 [NSString stringWithFormat:@\"invalid model file '%s' (bad vocab size %d != %d)\", fname.c_str(), n_vocab, model.hparams.n_vocab]);\n      return 
false;\n    }\n\n    std::string word;\n    for (int i = 0; i < n_vocab; i++) {\n      uint32_t len;\n      fin.read((char *) &len, sizeof(len));\n\n      word.resize(len);\n      fin.read((char *) word.data(), len);\n\n      vocab.token_to_id[word] = i;\n      vocab.id_to_token[i] = word;\n\n      //if (i < 30000) {\n      //    printf(\"%s: vocab[%d] = '%s'\\n\", __func__, i, word.c_str());\n      //}\n    }\n  }\n\n  // for the big tensors, we have the option to store the data in 16-bit floats or quantized\n  // in order to save memory and also to speed up the computation\n  ggml_type wtype = GGML_TYPE_COUNT;\n  switch (model.hparams.f16) {\n    case 0: wtype = GGML_TYPE_F32;  break;\n    case 1: wtype = GGML_TYPE_F16;  break;\n    case 2: wtype = GGML_TYPE_Q4_0; break;\n    case 3: wtype = GGML_TYPE_Q4_1; break;\n    default:\n    {\n      *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                 [NSString stringWithFormat:@\"invalid model file '%s' (bad f16 value %d)\", fname.c_str(), model.hparams.f16]);\n      return false;\n    }\n  }\n\n  const ggml_type wtype2 = GGML_TYPE_F32;\n\n  auto & ctx = model.ctx;\n\n  size_t ctx_size = 0;\n\n  {\n    const auto & hparams = model.hparams;\n\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n    const int n_ctx   = hparams.n_ctx;\n    const int n_vocab = hparams.n_vocab;\n\n    ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // tok_embeddings\n\n    ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // norm\n\n    ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // output\n\n    ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // attention_norm\n\n    ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wq\n    ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wk\n    ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wv\n    ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // wo\n\n   
 ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ffn_norm\n\n    ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w1\n    ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w2\n    ctx_size += n_layer*(n_ff*n_embd*ggml_type_sizef(wtype)); // w3\n\n    ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k\n    ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v\n\n    ctx_size += (5 + 10*n_layer)*256; // object overhead\n  }\n\n  // create the ggml context\n  {\n    struct ggml_init_params params = {\n      /*.mem_size   =*/ ctx_size,\n      /*.mem_buffer =*/ NULL,\n    };\n\n    model.ctx = ggml_init(params);\n    if (!model.ctx) {\n      *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel, [NSString stringWithFormat:@\"ggml_init() failed\"]);\n      return false;\n    }\n  }\n\n  // prepare memory for the weights\n  {\n    const auto & hparams = model.hparams;\n\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n    const int n_ctx   = hparams.n_ctx;\n    const int n_vocab = hparams.n_vocab;\n\n    model.layers.resize(n_layer);\n\n    model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);\n\n    model.norm   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n    model.output = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);\n\n    // map by name\n    model.tensors[\"tok_embeddings.weight\"] = model.tok_embeddings;\n\n    model.tensors[\"norm.weight\"]   = model.norm;\n    model.tensors[\"output.weight\"] = model.output;\n\n    for (int i = 0; i < n_layer; ++i) {\n      auto & layer = model.layers[i];\n\n      layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n\n      layer.wq = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);\n      layer.wk = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);\n      layer.wv = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);\n      layer.wo = 
ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);\n\n      layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);\n\n      layer.w1 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);\n      layer.w2 = ggml_new_tensor_2d(ctx, wtype,   n_ff, n_embd);\n      layer.w3 = ggml_new_tensor_2d(ctx, wtype, n_embd,   n_ff);\n\n      // map by name\n      model.tensors[\"layers.\" + std::to_string(i) + \".attention_norm.weight\"] = layer.attention_norm;\n\n      model.tensors[\"layers.\" + std::to_string(i) + \".attention.wq.weight\"] = layer.wq;\n      model.tensors[\"layers.\" + std::to_string(i) + \".attention.wk.weight\"] = layer.wk;\n      model.tensors[\"layers.\" + std::to_string(i) + \".attention.wv.weight\"] = layer.wv;\n      model.tensors[\"layers.\" + std::to_string(i) + \".attention.wo.weight\"] = layer.wo;\n\n      model.tensors[\"layers.\" + std::to_string(i) + \".ffn_norm.weight\"] = layer.ffn_norm;\n\n      model.tensors[\"layers.\" + std::to_string(i) + \".feed_forward.w1.weight\"] = layer.w1;\n      model.tensors[\"layers.\" + std::to_string(i) + \".feed_forward.w2.weight\"] = layer.w2;\n      model.tensors[\"layers.\" + std::to_string(i) + \".feed_forward.w3.weight\"] = layer.w3;\n    }\n  }\n\n  // key + value memory\n  {\n    const auto & hparams = model.hparams;\n\n    const int n_embd  = hparams.n_embd;\n    const int n_layer = hparams.n_layer;\n    const int n_ctx   = hparams.n_ctx;\n\n    const int n_mem      = n_layer*n_ctx;\n    const int n_elements = n_embd*n_mem;\n\n    model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);\n    model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);\n\n    const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);\n  }\n\n  const size_t file_offset = fin.tellg();\n\n  fin.close();\n\n  std::vector<uint8_t> tmp;\n\n  for (int i = 0; i < n_parts; ++i) {\n    const int part_id = i;\n    //const int part_id = n_parts - i - 1;\n\n    std::string fname_part = 
fname;\n    if (i > 0) {\n      fname_part += \".\" + std::to_string(i);\n    }\n\n    fin = std::ifstream(fname_part, std::ios::binary);\n    fin.seekg(file_offset);\n\n    // load weights\n    {\n      int n_tensors = 0;\n      size_t total_size = 0;\n\n      while (true) {\n        int32_t n_dims;\n        int32_t length;\n        int32_t ftype;\n\n        fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));\n        fin.read(reinterpret_cast<char *>(&length), sizeof(length));\n        fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));\n\n        if (fin.eof()) {\n          break;\n        }\n\n        int32_t nelements = 1;\n        int32_t ne[2] = { 1, 1 };\n        for (int i = 0; i < n_dims; ++i) {\n          fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));\n          nelements *= ne[i];\n        }\n\n        std::string name(length, 0);\n        fin.read(&name[0], length);\n\n        if (model.tensors.find(name.data()) == model.tensors.end()) {\n          *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                     [NSString stringWithFormat:@\"unknown tensor '%s' in model file\", name.data()]);\n          return false;\n        }\n\n        // split_type = 0: split by columns\n        // split_type = 1: split by rows\n        int split_type = 0;\n\n        // split_type = 0:\n        // regex:\n        //   - tok_embeddings.*\n        //   - layers.*.attention.wo.weight\n        //   - layers.*.feed_forward.w2.weight\n\n        // split_type = 1:\n        // regex:\n        //   - output.*\n        //   - layers.*.attention.wq.weight\n        //   - layers.*.attention.wk.weight\n        //   - layers.*.attention.wv.weight\n        //   - layers.*.feed_forward.w1.weight\n        //   - layers.*.feed_forward.w3.weight\n        if (name.find(\"tok_embeddings\") != std::string::npos) {\n          split_type = 0;\n        } else if (name.find(\"layers\") != std::string::npos) {\n          if 
(name.find(\"attention.wo.weight\") != std::string::npos) {\n            split_type = 0;\n          } else if (name.find(\"feed_forward.w2.weight\") != std::string::npos) {\n            split_type = 0;\n          } else {\n            split_type = 1;\n          }\n        } else if (name.find(\"output\") != std::string::npos) {\n          split_type = 1;\n        }\n\n        auto tensor = model.tensors[name.data()];\n\n        if (n_dims == 1) {\n          if (ggml_nelements(tensor) != nelements) {\n            *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                       [NSString stringWithFormat:@\"tensor '%s' has wrong size in model file\", name.data()]);\n            return false;\n          }\n        } else {\n          if (ggml_nelements(tensor)/n_parts != nelements) {\n            *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                       [NSString stringWithFormat:@\"tensor '%s' has wrong size in model file\", name.data()]);\n            return false;\n          }\n        }\n\n        if (n_dims == 1) {\n          if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {\n            *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                       [NSString stringWithFormat:@\"tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\", name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]]);\n            return false;\n          }\n        } else {\n          if (split_type == 0) {\n            if (tensor->ne[0]/n_parts != ne[0] || tensor->ne[1] != ne[1]) {\n              *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                         [NSString stringWithFormat:@\"tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\", name.data(), tensor->ne[0]/n_parts, tensor->ne[1], ne[0], ne[1]]);\n              return false;\n            }\n          } else {\n            if 
(tensor->ne[0] != ne[0] || tensor->ne[1]/n_parts != ne[1]) {\n              *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                         [NSString stringWithFormat:@\"tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\", name.data(), tensor->ne[0], tensor->ne[1]/n_parts, ne[0], ne[1]]);\n              return false;\n            }\n          }\n        }\n\n        if (0) {\n          static const char * ftype_str[] = { \"f32\", \"f16\", \"q4_0\", \"q4_1\", };\n        }\n\n        size_t bpe = 0;\n\n        switch (ftype) {\n          case 0: bpe = ggml_type_size(GGML_TYPE_F32);  break;\n          case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;\n          case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;\n          case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;\n          default:\n          {\n            *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel, [NSString stringWithFormat:@\"unknown ftype %d in model file\", ftype]);\n            return false;\n          }\n        };\n\n        if (n_dims == 1 || n_parts == 1) {\n          if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {\n            *outError = makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                       [NSString stringWithFormat:@\"tensor '%s' has wrong size in model file: got %zu, expected %zu\", name.data(), ggml_nbytes(tensor), nelements*bpe]);\n            return false;\n          }\n\n          if (part_id == 0) {\n            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));\n          } else {\n            fin.seekg(ggml_nbytes(tensor), std::ios::cur);\n          }\n\n          total_size += ggml_nbytes(tensor);\n        } else {\n          if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)/n_parts) {\n            *outError = 
makeLlamaError(LlamaErrorCodeFailedToLoadModel,\n                                       [NSString stringWithFormat:@\"tensor '%s' has wrong size in model file: got %zu, expected %zu\", name.data(), ggml_nbytes(tensor)/n_parts, nelements*bpe]);\n            return false;\n          }\n\n          if (split_type == 0) {\n            const int np0 = ne[0];\n\n            const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);\n            assert(row_size == tensor->nb[1]);\n\n            for (int i1 = 0; i1 < ne[1]; ++i1) {\n              const size_t offset_row = i1*row_size;\n              const size_t offset = offset_row + ((part_id*np0)/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);\n              fin.read(reinterpret_cast<char *>(tensor->data) + offset, row_size/n_parts);\n            }\n          } else {\n            const int np1 = ne[1];\n\n            const size_t row_size = (tensor->ne[0]/ggml_blck_size(tensor->type))*ggml_type_size(tensor->type);\n\n            for (int i1 = 0; i1 < ne[1]; ++i1) {\n              const size_t offset_row = (i1 + part_id*np1)*row_size;\n              fin.read(reinterpret_cast<char *>(tensor->data) + offset_row, row_size);\n            }\n          }\n\n          total_size += ggml_nbytes(tensor)/n_parts;\n        }\n      }\n    }\n\n    fin.close();\n  }\n\n  return true;\n}\n\n// evaluate the transformer\n//\n//   - model:     the model\n//   - n_threads: number of threads to use\n//   - n_past:    the context size so far\n//   - embd_inp:  the embeddings of the tokens in the context\n//   - embd_w:    the predicted logits for the next token\n//\n// The GPT-J model requires about 16MB of memory per input token.\n//\nbool llama_eval(\n                const llama_model & model,\n                const int n_threads,\n                const int n_past,\n                const std::vector<gpt_vocab::id> & embd_inp,\n                std::vector<float>         & embd_w,\n       
         size_t                     & mem_per_token,\n                NSError **outError\n) {\n  const int N = embd_inp.size();\n\n  const auto & hparams = model.hparams;\n\n  const int n_embd  = hparams.n_embd;\n  const int n_layer = hparams.n_layer;\n  const int n_ctx   = hparams.n_ctx;\n  const int n_head  = hparams.n_head;\n  const int n_vocab = hparams.n_vocab;\n  const int n_rot   = hparams.n_embd/hparams.n_head;\n\n  const int d_key = n_embd/n_head;\n\n  static size_t buf_size = 512u*1024*1024;\n  static void * buf = malloc(buf_size);\n\n  if (mem_per_token > 0 && mem_per_token*N > buf_size) {\n    const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead\n    //printf(\"\\n%s: reallocating buffer from %zu to %zu bytes\\n\", __func__, buf_size, buf_size_new);\n\n    // reallocate\n    buf_size = buf_size_new;\n    buf = realloc(buf, buf_size);\n    if (buf == nullptr) {\n      *outError = makeLlamaError(LlamaErrorCodePredictionFailed,\n                                 [NSString stringWithFormat:@\"failed to allocate %zu bytes\", buf_size]);\n      return false;\n    }\n  }\n\n  struct ggml_init_params params = {\n    /*.mem_size   =*/ buf_size,\n    /*.mem_buffer =*/ buf,\n  };\n\n  struct ggml_context * ctx0 = ggml_init(params);\n  ggml_cgraph gf = {};\n  gf.n_threads = n_threads;\n\n  struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);\n  memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));\n\n  struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);\n\n  for (int il = 0; il < n_layer; ++il) {\n    struct ggml_tensor * inpSA = inpL;\n\n    struct ggml_tensor * cur;\n\n    // norm\n    {\n      cur = ggml_norm(ctx0, inpL);\n\n      // cur = attention_norm*cur\n      cur = ggml_mul(ctx0,\n                     ggml_repeat(ctx0, model.layers[il].attention_norm, cur),\n                     cur);\n    }\n\n    // self-attention\n    {\n      struct ggml_tensor * Qcur = 
ggml_mul_mat(ctx0, model.layers[il].wq, cur);\n      struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);\n      struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);\n\n      // store key and value to memory\n      if (N >= 1) {\n        struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));\n        struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));\n\n        ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));\n        ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));\n      }\n\n      // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)\n      struct ggml_tensor * Q =\n      ggml_permute(ctx0,\n                   ggml_rope(ctx0,\n                             ggml_cpy(ctx0,\n                                      Qcur,\n                                      ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),\n                             n_past, n_rot, 0),\n                   0, 2, 1, 3);\n\n      // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)\n      struct ggml_tensor * K =\n      ggml_permute(ctx0,\n                   ggml_rope(ctx0,\n                             ggml_reshape_3d(ctx0,\n                                             ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),\n                                             n_embd/n_head, n_head, n_past + N),\n                             n_past, n_rot, 1),\n                   0, 2, 1, 3);\n\n      // K * Q\n      struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);\n\n      // KQ_scaled = KQ / sqrt(n_embd/n_head)\n      struct ggml_tensor * KQ_scaled =\n      ggml_scale(ctx0,\n                 KQ,\n                 ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))\n            
     );\n\n      // KQ_masked = mask_past(KQ_scaled)\n      struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);\n\n      // KQ = soft_max(KQ_masked)\n      struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);\n\n      // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()\n      struct ggml_tensor * V_trans =\n      ggml_permute(ctx0,\n                   ggml_reshape_3d(ctx0,\n                                   ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),\n                                   n_embd/n_head, n_head, n_past + N),\n                   1, 2, 0, 3);\n\n      // KQV = transpose(V) * KQ_soft_max\n      struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);\n\n      // KQV_merged = KQV.permute(0, 2, 1, 3)\n      struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);\n\n      // cur = KQV_merged.contiguous().view(n_embd, N)\n      cur = ggml_cpy(ctx0,\n                     KQV_merged,\n                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));\n\n      // projection (no bias)\n      cur = ggml_mul_mat(ctx0,\n                         model.layers[il].wo,\n                         cur);\n    }\n\n    struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);\n\n    // feed-forward network\n    {\n      // norm\n      {\n        cur = ggml_norm(ctx0, inpFF);\n\n        // cur = ffn_norm*cur\n        cur = ggml_mul(ctx0,\n                       ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),\n                       cur);\n      }\n\n      struct ggml_tensor * tmp = ggml_mul_mat(ctx0,\n                                              model.layers[il].w3,\n                                              cur);\n\n\n      cur = ggml_mul_mat(ctx0,\n                         model.layers[il].w1,\n                         cur);\n\n      // SILU activation\n      cur = ggml_silu(ctx0, cur);\n\n   
   cur = ggml_mul(ctx0, cur, tmp);\n\n      cur = ggml_mul_mat(ctx0,\n                         model.layers[il].w2,\n                         cur);\n    }\n\n    cur  = ggml_add(ctx0, cur, inpFF);\n\n    // input for next layer\n    inpL = cur;\n  }\n\n  // norm\n  {\n    inpL = ggml_norm(ctx0, inpL);\n\n    // inpL = norm*inpL\n    inpL = ggml_mul(ctx0,\n                    ggml_repeat(ctx0, model.norm, inpL),\n                    inpL);\n  }\n\n  // lm_head\n  {\n    inpL = ggml_mul_mat(ctx0, model.output, inpL);\n  }\n\n  // logits -> probs\n  //inpL = ggml_soft_max(ctx0, inpL);\n\n  // run the computation\n  ggml_build_forward_expand(&gf, inpL);\n  ggml_graph_compute       (ctx0, &gf);\n\n  //if (n_past%100 == 0) {\n  //    ggml_graph_print   (&gf);\n  //    ggml_graph_dump_dot(&gf, NULL, \"gpt-2.dot\");\n  //}\n\n  //embd_w.resize(n_vocab*N);\n  //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);\n\n  // return result for just the last token\n  embd_w.resize(n_vocab);\n  memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);\n\n  if (mem_per_token == 0) {\n    mem_per_token = ggml_used_mem(ctx0)/N;\n  }\n  //printf(\"used_mem = %zu\\n\", ggml_used_mem(ctx0));\n\n  ggml_free(ctx0);\n\n  return true;\n}\n\n#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))\nvoid sigint_handler(int signo) {\n  if (signo == SIGINT) {\n    _exit(130);\n  }\n}\n#endif\n\n@interface LlamaPredictOperation () {\n  gpt_params _params;\n  LlamaPredictOperationEventHandler _eventHandler;\n  dispatch_queue_t _eventHandlerQueue;\n}\n\n@end\n\n@implementation LlamaPredictOperation\n\n- (instancetype)initWithParams:(gpt_params)params\n                  eventHandler:(LlamaPredictOperationEventHandler)eventHandler\n             eventHandlerQueue:(dispatch_queue_t)eventHandlerQueue\n{\n  if ((self = [super init])) {\n    _params = params;\n    _eventHandler = [eventHandler copy];\n    _eventHandlerQueue = 
eventHandlerQueue;\n  }\n\n  return self;\n}\n\n- (void)main\n{\n  ggml_time_init();\n  const int64_t t_main_start_us = ggml_time_us();\n\n  std::mt19937 rng(_params.seed);\n  if (_params.prompt.empty()) {\n    _params.prompt = gpt_random_prompt(rng);\n  }\n\n  int64_t t_load_us = 0;\n\n  gpt_vocab vocab;\n  llama_model model;\n\n  // load the model\n  {\n    [self postEvent:[_LlamaEvent startedLoadingModel]];\n\n    const int64_t t_start_us = ggml_time_us();\n\n    NSError *loadError = nil;\n    if (!llama_model_load(_params.model, model, vocab, 512, &loadError)) {  // TODO: set context from user input ??\n      [self postEvent:[_LlamaEvent failedWithError:loadError]];\n      return;\n    }\n\n    t_load_us = ggml_time_us() - t_start_us;\n\n    [self postEvent:[_LlamaEvent finishedLoadingModel]];\n  }\n\n  [self postEvent:[_LlamaEvent startedGeneratingOutput]];\n\n  int n_past = 0;\n\n  int64_t t_sample_us  = 0;\n  int64_t t_predict_us = 0;\n\n  std::vector<float> logits;\n\n  // tokenize the prompt\n  std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, _params.prompt, true);\n\n  _params.n_predict = std::min(_params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());\n\n  // tokenize the reverse prompt\n  std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, _params.antiprompt, false);\n\n  std::vector<gpt_vocab::id> embd;\n\n  // determine the required inference memory per token:\n  size_t mem_per_token = 0;\n  NSError *error = nil;\n  if (!llama_eval(model, _params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token, &error)) {\n    [self postEvent:[_LlamaEvent failedWithError:error]];\n    return;\n  }\n\n  int last_n_size = _params.repeat_last_n;\n  std::vector<gpt_vocab::id> last_n_tokens(last_n_size);\n  std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);\n\n  int remaining_tokens = _params.n_predict;\n  int input_consumed = 0;\n\n  while (remaining_tokens > 0) {\n    // predict\n    if (embd.size() > 0) {\n      const 
int64_t t_start_us = ggml_time_us();\n\n      NSError *error = nil;\n      if (!llama_eval(model, _params.n_threads, n_past, embd, logits, mem_per_token, &error)) {\n        [self postEvent:[_LlamaEvent failedWithError:error]];\n        return;\n      }\n\n      t_predict_us += ggml_time_us() - t_start_us;\n    }\n\n    n_past += embd.size();\n    embd.clear();\n\n    if (embd_inp.size() <= input_consumed) {\n      // out of user input, sample next token\n      const float top_k = _params.top_k;\n      const float top_p = _params.top_p;\n      const float temp  = _params.temp;\n      const float repeat_penalty = _params.repeat_penalty;\n\n      const int n_vocab = model.hparams.n_vocab;\n\n      gpt_vocab::id id = 0;\n\n      {\n        const int64_t t_start_sample_us = ggml_time_us();\n\n        id = llama_sample_top_p_top_k(vocab, logits.data() + (logits.size() - n_vocab), last_n_tokens, repeat_penalty, top_k, top_p, temp, rng);\n\n        last_n_tokens.erase(last_n_tokens.begin());\n        last_n_tokens.push_back(id);\n\n        t_sample_us += ggml_time_us() - t_start_sample_us;\n      }\n\n      // add it to the context\n      embd.push_back(id);\n\n      // decrement remaining sampling budget\n      --remaining_tokens;\n    } else {\n      // some user input remains from prompt or interaction, forward it to processing\n      while (embd_inp.size() > input_consumed) {\n        embd.push_back(embd_inp[input_consumed]);\n        last_n_tokens.erase(last_n_tokens.begin());\n        last_n_tokens.push_back(embd_inp[input_consumed]);\n        ++input_consumed;\n        if (embd.size() > _params.n_batch) {\n          break;\n        }\n      }\n    }\n\n    // display text\n    for (auto id : embd) {\n      NSString *token = [[NSString alloc] initWithCString:vocab.id_to_token[id].c_str() encoding:NSUTF8StringEncoding];\n      [self postEvent:[_LlamaEvent outputTokenWithToken:token]];\n    }\n  }\n\n  [self postEvent:[_LlamaEvent completed]];\n\n  
ggml_free(model.ctx);\n}\n\n- (void)postEvent:(_LlamaEvent *)event\n{\n  dispatch_async(_eventHandlerQueue, ^() {\n    if (self->_eventHandler != NULL) {\n      self->_eventHandler(event);\n    }\n  });\n}\n\n@end\n"
  },
  {
    "path": "Sources/llamaObjCxx/bridge/LlamaRunnerBridge.mm",
    "content": "//\n//  LlamaRunnerBridge.mm\n//  llama\n//\n//  Created by Alex Rozanski on 12/03/2023.\n//\n\n#import \"LlamaRunnerBridge.h\"\n#import \"LlamaEvent.h\"\n#import \"LlamaRunnerBridgeConfig.h\"\n#import \"LlamaPredictOperation.hh\"\n\n#import \"utils.h\"\n\n@implementation _LlamaRunnerBridge {\n  NSOperationQueue *_operationQueue;\n}\n\n- (instancetype)initWithModelPath:(nonnull NSString *)modelPath\n{\n  if ((self = [super init])) {\n    _modelPath = [modelPath copy];\n    _operationQueue = [[NSOperationQueue alloc] init];\n    _operationQueue.qualityOfService = NSQualityOfServiceUserInitiated;\n  }\n  return self;\n}\n\n- (void)runWithPrompt:(nonnull NSString*)prompt\n               config:(nonnull _LlamaRunnerBridgeConfig *)config\n         eventHandler:(nonnull _LlamaRunnerBridgeEventHandler)eventHandler\n    eventHandlerQueue:(nonnull dispatch_queue_t)eventHandlerQueue\n{\n  gpt_params params;\n  params.model = [_modelPath cStringUsingEncoding:NSUTF8StringEncoding];\n  params.prompt = [prompt cStringUsingEncoding:NSUTF8StringEncoding];\n\n  params.n_threads = (int)config.numberOfThreads;\n  params.n_predict = (int)config.numberOfTokens;\n\n  if (config.reversePrompt != nil) {\n    params.antiprompt = [config.reversePrompt cStringUsingEncoding:NSUTF8StringEncoding];\n  }\n\n  LlamaPredictOperation *operation = [[LlamaPredictOperation alloc] initWithParams:params\n                                                                      eventHandler:eventHandler\n                                                                 eventHandlerQueue:eventHandlerQueue];\n  [_operationQueue addOperation:operation];\n}\n\n@end\n"
  },
  {
    "path": "Sources/llamaObjCxx/bridge/LlamaRunnerBridgeConfig.m",
    "content": "//\n//  LlamaRunnerBridgeConfig.m\n//  llama\n//\n//  Created by Alex Rozanski on 13/03/2023.\n//\n\n#import \"LlamaRunnerBridgeConfig.h\"\n\n@implementation _LlamaRunnerBridgeConfig\n\n@synthesize numberOfThreads = _numberOfThreads;\n@synthesize numberOfTokens = _numberOfTokens;\n@synthesize reversePrompt = _reversePrompt;\n\n@end\n"
  },
  {
    "path": "Sources/llamaObjCxx/headers/LlamaError.h",
    "content": "//\n//  LlamaError.h\n//  llama\n//\n//  Created by Alex Rozanski on 14/03/2023.\n//\n\n#import <Foundation/Foundation.h>\n\nNS_ASSUME_NONNULL_BEGIN\n\nextern NSString *const LlamaErrorDomain;\n\ntypedef NS_ENUM(NSInteger, LlamaErrorCode) {\n  LlamaErrorCodeUnknown = -1,\n\n  LlamaErrorCodeFailedToLoadModel = -1000,\n  LlamaErrorCodePredictionFailed = -1001,\n};\n\nNS_ASSUME_NONNULL_END\n"
  },
  {
    "path": "Sources/llamaObjCxx/headers/LlamaEvent.h",
    "content": "//\n//  LlamaEvent.h\n//  llama\n//\n//  Created by Alex Rozanski on 14/03/2023.\n//\n\n#import <Foundation/Foundation.h>\n\nNS_ASSUME_NONNULL_BEGIN\n\n@interface _LlamaEvent : NSObject\n\n+ (instancetype)startedLoadingModel;\n+ (instancetype)finishedLoadingModel;\n+ (instancetype)startedGeneratingOutput;\n+ (instancetype)outputTokenWithToken:(nonnull NSString *)token;\n+ (instancetype)completed;\n+ (instancetype)failedWithError:(nonnull NSError *)error;\n\n- (void)matchWithStartedLoadingModel:(void (^)(void))startedLoadingModel\n                finishedLoadingModel:(void (^)(void))finishedLoadingModel\n             startedGeneratingOutput:(void (^)(void))startedGeneratingOutput\n                         outputToken:(void (^)(NSString *token))startedLoadingModel\n                           completed:(void (^)(void))startedLoadingModel\n                              failed:(void (^)(NSError *error))startedLoadingModel;\n\n@end\n\nNS_ASSUME_NONNULL_END\n"
  },
  {
    "path": "Sources/llamaObjCxx/headers/LlamaRunnerBridge.h",
    "content": "//\n//  LlamaRunnerBridge.h\n//  llama\n//\n//  Created by Alex Rozanski on 12/03/2023.\n//\n\n#import <Foundation/Foundation.h>\n\n@class _LlamaEvent;\n@class _LlamaRunnerBridgeConfig;\n\nNS_ASSUME_NONNULL_BEGIN\n\ntypedef void (^_LlamaRunnerBridgeEventHandler)(_LlamaEvent *event);\n\n@interface _LlamaRunnerBridge : NSObject\n\n@property (nonnull, readonly, copy) NSString *modelPath;\n\n- (instancetype)initWithModelPath:(nonnull NSString *)modelPath;\n\n- (void)runWithPrompt:(nonnull NSString*)prompt\n               config:(nonnull _LlamaRunnerBridgeConfig *)config\n         eventHandler:(nonnull _LlamaRunnerBridgeEventHandler)eventHandler\n    eventHandlerQueue:(nonnull dispatch_queue_t)eventHandlerQueue;\n@end\n\nNS_ASSUME_NONNULL_END\n"
  },
  {
    "path": "Sources/llamaObjCxx/headers/LlamaRunnerBridgeConfig.h",
    "content": "//\n//  LlamaRunnerBridgeConfig.h\n//  llama\n//\n//  Created by Alex Rozanski on 13/03/2023.\n//\n\n#import <Foundation/Foundation.h>\n\nNS_ASSUME_NONNULL_BEGIN\n\n@interface _LlamaRunnerBridgeConfig : NSObject\n\n@property (nonatomic, assign) NSUInteger numberOfThreads;\n@property (nonatomic, assign) NSUInteger numberOfTokens;\n\n@property (nullable, copy) NSString *reversePrompt;\n\n@end\n\nNS_ASSUME_NONNULL_END\n"
  },
  {
    "path": "Sources/llamaObjCxx/module.modulemap",
    "content": "module llamaObjCxx {\n    umbrella \"headers\"\n    export *\n}\n"
  },
  {
    "path": "llama.xcodeproj/project.pbxproj",
    "content": "// !$*UTF8*$!\n{\n\tarchiveVersion = 1;\n\tclasses = {\n\t};\n\tobjectVersion = 56;\n\tobjects = {\n\n/* Begin PBXBuildFile section */\n\t\t8227D27229C2A844003E3197 /* LlamaEvent.h in Headers */ = {isa = PBXBuildFile; fileRef = 8227D26E29C2A844003E3197 /* LlamaEvent.h */; settings = {ATTRIBUTES = (Private, ); }; };\n\t\t8227D27329C2A844003E3197 /* LlamaRunnerBridge.h in Headers */ = {isa = PBXBuildFile; fileRef = 8227D26F29C2A844003E3197 /* LlamaRunnerBridge.h */; settings = {ATTRIBUTES = (Public, ); }; };\n\t\t8227D27429C2A844003E3197 /* LlamaError.h in Headers */ = {isa = PBXBuildFile; fileRef = 8227D27029C2A844003E3197 /* LlamaError.h */; settings = {ATTRIBUTES = (Public, ); }; };\n\t\t8227D27529C2A844003E3197 /* LlamaRunnerBridgeConfig.h in Headers */ = {isa = PBXBuildFile; fileRef = 8227D27129C2A844003E3197 /* LlamaRunnerBridgeConfig.h */; settings = {ATTRIBUTES = (Public, ); }; };\n\t\t8227D27729C2A87E003E3197 /* ggml.h in Headers */ = {isa = PBXBuildFile; fileRef = 8227D27629C2A87E003E3197 /* ggml.h */; settings = {ATTRIBUTES = (Private, ); }; };\n\t\t8227D27929C2A883003E3197 /* utils.h in Headers */ = {isa = PBXBuildFile; fileRef = 8227D27829C2A883003E3197 /* utils.h */; settings = {ATTRIBUTES = (Private, ); }; };\n\t\t8227D27D29C4F6F8003E3197 /* llama.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 82293E3929BDC4ED00C67BD9 /* llama.framework */; };\n\t\t82293E5B29BDC71700C67BD9 /* main.swift in Sources */ = {isa = PBXBuildFile; fileRef = 82293E5A29BDC71700C67BD9 /* main.swift */; };\n\t\t82293E6529BDC7E200C67BD9 /* LlamaRunner.swift in Sources */ = {isa = PBXBuildFile; fileRef = 82293E6429BDC7E200C67BD9 /* LlamaRunner.swift */; };\n\t\t82819FB229C1DB5400399B7E /* LlamaError.m in Sources */ = {isa = PBXBuildFile; fileRef = 82819F9829C07BC900399B7E /* LlamaError.m */; };\n\t\t82819FB329C1DB5800399B7E /* LlamaEvent.mm in Sources */ = {isa = PBXBuildFile; fileRef = 82819F9329C0526100399B7E /* LlamaEvent.mm */; 
};\n\t\t82819FB429C1DB5800399B7E /* LlamaRunnerBridge.mm in Sources */ = {isa = PBXBuildFile; fileRef = 82293E5129BDC5DE00C67BD9 /* LlamaRunnerBridge.mm */; };\n\t\t82819FB529C1DB5800399B7E /* LlamaRunnerBridgeConfig.m in Sources */ = {isa = PBXBuildFile; fileRef = 82819F8C29BF2F5800399B7E /* LlamaRunnerBridgeConfig.m */; };\n\t\t82819FB629C1DB5800399B7E /* LlamaPredictOperation.mm in Sources */ = {isa = PBXBuildFile; fileRef = 82819F9029BF387400399B7E /* LlamaPredictOperation.mm */; };\n\t\t82819FB729C1DB5800399B7E /* LlamaPredictOperation.hh in Headers */ = {isa = PBXBuildFile; fileRef = 82819F8F29BF387400399B7E /* LlamaPredictOperation.hh */; settings = {ATTRIBUTES = (Private, ); }; };\n\t\t82819FB929C1DB5E00399B7E /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 82819F7D29BF2BFC00399B7E /* ggml.c */; };\n\t\t82819FBA29C1DB5E00399B7E /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 82819F8129BF2BFC00399B7E /* utils.cpp */; };\n\t\t82819FC529C2585700399B7E /* libllamaObjCxx.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 82819FA929C1DB2900399B7E /* libllamaObjCxx.a */; };\n/* End PBXBuildFile section */\n\n/* Begin PBXContainerItemProxy section */\n\t\t82293E5F29BDC72B00C67BD9 /* PBXContainerItemProxy */ = {\n\t\t\tisa = PBXContainerItemProxy;\n\t\t\tcontainerPortal = 82293E3029BDC4ED00C67BD9 /* Project object */;\n\t\t\tproxyType = 1;\n\t\t\tremoteGlobalIDString = 82293E3829BDC4ED00C67BD9;\n\t\t\tremoteInfo = llama;\n\t\t};\n\t\t82819FC129C1DB8B00399B7E /* PBXContainerItemProxy */ = {\n\t\t\tisa = PBXContainerItemProxy;\n\t\t\tcontainerPortal = 82293E3029BDC4ED00C67BD9 /* Project object */;\n\t\t\tproxyType = 1;\n\t\t\tremoteGlobalIDString = 82819FA829C1DB2900399B7E;\n\t\t\tremoteInfo = llamaObjCxx;\n\t\t};\n/* End PBXContainerItemProxy section */\n\n/* Begin PBXCopyFilesBuildPhase section */\n\t\t82293E5629BDC71700C67BD9 /* CopyFiles */ = {\n\t\t\tisa = PBXCopyFilesBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tdstPath = 
/usr/share/man/man1/;\n\t\t\tdstSubfolderSpec = 0;\n\t\t\tfiles = (\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 1;\n\t\t};\n/* End PBXCopyFilesBuildPhase section */\n\n/* Begin PBXFileReference section */\n\t\t8227D26E29C2A844003E3197 /* LlamaEvent.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LlamaEvent.h; sourceTree = \"<group>\"; };\n\t\t8227D26F29C2A844003E3197 /* LlamaRunnerBridge.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LlamaRunnerBridge.h; sourceTree = \"<group>\"; };\n\t\t8227D27029C2A844003E3197 /* LlamaError.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LlamaError.h; sourceTree = \"<group>\"; };\n\t\t8227D27129C2A844003E3197 /* LlamaRunnerBridgeConfig.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = LlamaRunnerBridgeConfig.h; sourceTree = \"<group>\"; };\n\t\t8227D27629C2A87E003E3197 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = Sources/cpp/ggml.h; sourceTree = SOURCE_ROOT; };\n\t\t8227D27829C2A883003E3197 /* utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = utils.h; path = Sources/cpp/utils.h; sourceTree = SOURCE_ROOT; };\n\t\t82293E3929BDC4ED00C67BD9 /* llama.framework */ = {isa = PBXFileReference; explicitFileType = wrapper.framework; includeInIndex = 0; path = llama.framework; sourceTree = BUILT_PRODUCTS_DIR; };\n\t\t82293E5129BDC5DE00C67BD9 /* LlamaRunnerBridge.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LlamaRunnerBridge.mm; sourceTree = \"<group>\"; };\n\t\t82293E5829BDC71700C67BD9 /* llamaTest */ = {isa = PBXFileReference; explicitFileType = \"compiled.mach-o.executable\"; includeInIndex = 0; path = llamaTest; sourceTree = BUILT_PRODUCTS_DIR; };\n\t\t82293E5A29BDC71700C67BD9 /* main.swift */ = 
{isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = main.swift; sourceTree = \"<group>\"; };\n\t\t82293E6429BDC7E200C67BD9 /* LlamaRunner.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaRunner.swift; sourceTree = \"<group>\"; };\n\t\t82819F7B29BDF61E00399B7E /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = \"<group>\"; };\n\t\t82819F7C29BDF7CB00399B7E /* LlamaTest.xcconfig */ = {isa = PBXFileReference; lastKnownFileType = text.xcconfig; path = LlamaTest.xcconfig; sourceTree = \"<group>\"; };\n\t\t82819F7D29BF2BFC00399B7E /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = Sources/llamaObjCxx/cpp/ggml.c; sourceTree = SOURCE_ROOT; };\n\t\t82819F8129BF2BFC00399B7E /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = Sources/llamaObjCxx/cpp/utils.cpp; sourceTree = SOURCE_ROOT; };\n\t\t82819F8C29BF2F5800399B7E /* LlamaRunnerBridgeConfig.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = LlamaRunnerBridgeConfig.m; sourceTree = \"<group>\"; };\n\t\t82819F8F29BF387400399B7E /* LlamaPredictOperation.hh */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.h; path = LlamaPredictOperation.hh; sourceTree = \"<group>\"; };\n\t\t82819F9029BF387400399B7E /* LlamaPredictOperation.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LlamaPredictOperation.mm; sourceTree = \"<group>\"; };\n\t\t82819F9329C0526100399B7E /* LlamaEvent.mm */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.objcpp; path = LlamaEvent.mm; sourceTree = \"<group>\"; };\n\t\t82819F9829C07BC900399B7E /* LlamaError.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = LlamaError.m; sourceTree = \"<group>\"; };\n\t\t82819F9B29C0881800399B7E /* README.md */ = 
{isa = PBXFileReference; lastKnownFileType = net.daringfireball.markdown; path = README.md; sourceTree = \"<group>\"; };\n\t\t82819F9C29C0897900399B7E /* LICENSE */ = {isa = PBXFileReference; lastKnownFileType = text; path = LICENSE; sourceTree = \"<group>\"; };\n\t\t82819F9D29C1CCA300399B7E /* Package.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Package.swift; sourceTree = \"<group>\"; };\n\t\t82819FA929C1DB2900399B7E /* libllamaObjCxx.a */ = {isa = PBXFileReference; explicitFileType = archive.ar; includeInIndex = 0; path = libllamaObjCxx.a; sourceTree = BUILT_PRODUCTS_DIR; };\n\t\t82819FC729C2939100399B7E /* module.modulemap */ = {isa = PBXFileReference; lastKnownFileType = \"sourcecode.module-map\"; path = module.modulemap; sourceTree = \"<group>\"; };\n/* End PBXFileReference section */\n\n/* Begin PBXFrameworksBuildPhase section */\n\t\t82293E3629BDC4ED00C67BD9 /* Frameworks */ = {\n\t\t\tisa = PBXFrameworksBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t\t82819FC529C2585700399B7E /* libllamaObjCxx.a in Frameworks */,\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n\t\t82293E5529BDC71700C67BD9 /* Frameworks */ = {\n\t\t\tisa = PBXFrameworksBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t\t8227D27D29C4F6F8003E3197 /* llama.framework in Frameworks */,\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n\t\t82819FA729C1DB2900399B7E /* Frameworks */ = {\n\t\t\tisa = PBXFrameworksBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n/* End PBXFrameworksBuildPhase section */\n\n/* Begin PBXGroup section */\n\t\t8227D26D29C2A825003E3197 /* headers */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t8227D27029C2A844003E3197 /* LlamaError.h */,\n\t\t\t\t8227D26E29C2A844003E3197 /* LlamaEvent.h */,\n\t\t\t\t8227D26F29C2A844003E3197 /* LlamaRunnerBridge.h 
*/,\n\t\t\t\t8227D27129C2A844003E3197 /* LlamaRunnerBridgeConfig.h */,\n\t\t\t);\n\t\t\tpath = headers;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E2F29BDC4ED00C67BD9 = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82819F9B29C0881800399B7E /* README.md */,\n\t\t\t\t82819F9C29C0897900399B7E /* LICENSE */,\n\t\t\t\t82819F9D29C1CCA300399B7E /* Package.swift */,\n\t\t\t\t82819FC629C289B400399B7E /* Sources */,\n\t\t\t\t82293E5929BDC71700C67BD9 /* llamaTest */,\n\t\t\t\t82293E3A29BDC4ED00C67BD9 /* Products */,\n\t\t\t\t82293E6129BDC73100C67BD9 /* Frameworks */,\n\t\t\t);\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E3A29BDC4ED00C67BD9 /* Products */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82293E3929BDC4ED00C67BD9 /* llama.framework */,\n\t\t\t\t82293E5829BDC71700C67BD9 /* llamaTest */,\n\t\t\t\t82819FA929C1DB2900399B7E /* libllamaObjCxx.a */,\n\t\t\t);\n\t\t\tname = Products;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E3B29BDC4ED00C67BD9 /* llama */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82293E6429BDC7E200C67BD9 /* LlamaRunner.swift */,\n\t\t\t);\n\t\t\tpath = llama;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E4329BDC51A00C67BD9 /* cpp */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82819F7D29BF2BFC00399B7E /* ggml.c */,\n\t\t\t\t8227D27629C2A87E003E3197 /* ggml.h */,\n\t\t\t\t82819F8129BF2BFC00399B7E /* utils.cpp */,\n\t\t\t\t8227D27829C2A883003E3197 /* utils.h */,\n\t\t\t);\n\t\t\tpath = cpp;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E5929BDC71700C67BD9 /* llamaTest */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82293E5A29BDC71700C67BD9 /* main.swift */,\n\t\t\t\t82819F7C29BDF7CB00399B7E /* LlamaTest.xcconfig */,\n\t\t\t\t82819F7B29BDF61E00399B7E /* Info.plist */,\n\t\t\t);\n\t\t\tpath = llamaTest;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E6129BDC73100C67BD9 /* Frameworks */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t);\n\t\t\tname = 
Frameworks;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82293E6329BDC75F00C67BD9 /* bridge */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82819F9329C0526100399B7E /* LlamaEvent.mm */,\n\t\t\t\t82293E5129BDC5DE00C67BD9 /* LlamaRunnerBridge.mm */,\n\t\t\t\t82819F8C29BF2F5800399B7E /* LlamaRunnerBridgeConfig.m */,\n\t\t\t\t82819F8F29BF387400399B7E /* LlamaPredictOperation.hh */,\n\t\t\t\t82819F9029BF387400399B7E /* LlamaPredictOperation.mm */,\n\t\t\t);\n\t\t\tpath = bridge;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82819F9E29C1CE2000399B7E /* llamaObjCxx */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82819FC729C2939100399B7E /* module.modulemap */,\n\t\t\t\t82819F9829C07BC900399B7E /* LlamaError.m */,\n\t\t\t\t82293E6329BDC75F00C67BD9 /* bridge */,\n\t\t\t\t82293E4329BDC51A00C67BD9 /* cpp */,\n\t\t\t\t8227D26D29C2A825003E3197 /* headers */,\n\t\t\t);\n\t\t\tpath = llamaObjCxx;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n\t\t82819FC629C289B400399B7E /* Sources */ = {\n\t\t\tisa = PBXGroup;\n\t\t\tchildren = (\n\t\t\t\t82293E3B29BDC4ED00C67BD9 /* llama */,\n\t\t\t\t82819F9E29C1CE2000399B7E /* llamaObjCxx */,\n\t\t\t);\n\t\t\tpath = Sources;\n\t\t\tsourceTree = \"<group>\";\n\t\t};\n/* End PBXGroup section */\n\n/* Begin PBXHeadersBuildPhase section */\n\t\t82293E3429BDC4ED00C67BD9 /* Headers */ = {\n\t\t\tisa = PBXHeadersBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n\t\t82819FA529C1DB2900399B7E /* Headers */ = {\n\t\t\tisa = PBXHeadersBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t\t82819FB729C1DB5800399B7E /* LlamaPredictOperation.hh in Headers */,\n\t\t\t\t8227D27329C2A844003E3197 /* LlamaRunnerBridge.h in Headers */,\n\t\t\t\t8227D27929C2A883003E3197 /* utils.h in Headers */,\n\t\t\t\t8227D27229C2A844003E3197 /* LlamaEvent.h in Headers */,\n\t\t\t\t8227D27729C2A87E003E3197 /* ggml.h in Headers 
*/,\n\t\t\t\t8227D27529C2A844003E3197 /* LlamaRunnerBridgeConfig.h in Headers */,\n\t\t\t\t8227D27429C2A844003E3197 /* LlamaError.h in Headers */,\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n/* End PBXHeadersBuildPhase section */\n\n/* Begin PBXNativeTarget section */\n\t\t82293E3829BDC4ED00C67BD9 /* llama */ = {\n\t\t\tisa = PBXNativeTarget;\n\t\t\tbuildConfigurationList = 82293E4029BDC4ED00C67BD9 /* Build configuration list for PBXNativeTarget \"llama\" */;\n\t\t\tbuildPhases = (\n\t\t\t\t82293E3429BDC4ED00C67BD9 /* Headers */,\n\t\t\t\t82293E3529BDC4ED00C67BD9 /* Sources */,\n\t\t\t\t82293E3629BDC4ED00C67BD9 /* Frameworks */,\n\t\t\t\t82293E3729BDC4ED00C67BD9 /* Resources */,\n\t\t\t);\n\t\t\tbuildRules = (\n\t\t\t);\n\t\t\tdependencies = (\n\t\t\t\t82819FC229C1DB8B00399B7E /* PBXTargetDependency */,\n\t\t\t);\n\t\t\tname = llama;\n\t\t\tproductName = llama;\n\t\t\tproductReference = 82293E3929BDC4ED00C67BD9 /* llama.framework */;\n\t\t\tproductType = \"com.apple.product-type.framework\";\n\t\t};\n\t\t82293E5729BDC71700C67BD9 /* llamaTest */ = {\n\t\t\tisa = PBXNativeTarget;\n\t\t\tbuildConfigurationList = 82293E5C29BDC71700C67BD9 /* Build configuration list for PBXNativeTarget \"llamaTest\" */;\n\t\t\tbuildPhases = (\n\t\t\t\t82293E5429BDC71700C67BD9 /* Sources */,\n\t\t\t\t82293E5529BDC71700C67BD9 /* Frameworks */,\n\t\t\t\t82293E5629BDC71700C67BD9 /* CopyFiles */,\n\t\t\t);\n\t\t\tbuildRules = (\n\t\t\t);\n\t\t\tdependencies = (\n\t\t\t\t82293E6029BDC72B00C67BD9 /* PBXTargetDependency */,\n\t\t\t);\n\t\t\tname = llamaTest;\n\t\t\tproductName = llamaTest;\n\t\t\tproductReference = 82293E5829BDC71700C67BD9 /* llamaTest */;\n\t\t\tproductType = \"com.apple.product-type.tool\";\n\t\t};\n\t\t82819FA829C1DB2900399B7E /* llamaObjCxx */ = {\n\t\t\tisa = PBXNativeTarget;\n\t\t\tbuildConfigurationList = 82819FAF29C1DB2900399B7E /* Build configuration list for PBXNativeTarget \"llamaObjCxx\" */;\n\t\t\tbuildPhases = 
(\n\t\t\t\t82819FA529C1DB2900399B7E /* Headers */,\n\t\t\t\t82819FA629C1DB2900399B7E /* Sources */,\n\t\t\t\t82819FA729C1DB2900399B7E /* Frameworks */,\n\t\t\t);\n\t\t\tbuildRules = (\n\t\t\t);\n\t\t\tdependencies = (\n\t\t\t);\n\t\t\tname = llamaObjCxx;\n\t\t\tproductName = llamaObjCxx;\n\t\t\tproductReference = 82819FA929C1DB2900399B7E /* libllamaObjCxx.a */;\n\t\t\tproductType = \"com.apple.product-type.library.static\";\n\t\t};\n/* End PBXNativeTarget section */\n\n/* Begin PBXProject section */\n\t\t82293E3029BDC4ED00C67BD9 /* Project object */ = {\n\t\t\tisa = PBXProject;\n\t\t\tattributes = {\n\t\t\t\tBuildIndependentTargetsInParallel = 1;\n\t\t\t\tLastSwiftUpdateCheck = 1410;\n\t\t\t\tLastUpgradeCheck = 1410;\n\t\t\t\tTargetAttributes = {\n\t\t\t\t\t82293E3829BDC4ED00C67BD9 = {\n\t\t\t\t\t\tCreatedOnToolsVersion = 14.1;\n\t\t\t\t\t\tLastSwiftMigration = 1410;\n\t\t\t\t\t};\n\t\t\t\t\t82293E5729BDC71700C67BD9 = {\n\t\t\t\t\t\tCreatedOnToolsVersion = 14.1;\n\t\t\t\t\t\tLastSwiftMigration = 1410;\n\t\t\t\t\t};\n\t\t\t\t\t82819FA829C1DB2900399B7E = {\n\t\t\t\t\t\tCreatedOnToolsVersion = 14.1;\n\t\t\t\t\t};\n\t\t\t\t};\n\t\t\t};\n\t\t\tbuildConfigurationList = 82293E3329BDC4ED00C67BD9 /* Build configuration list for PBXProject \"llama\" */;\n\t\t\tcompatibilityVersion = \"Xcode 14.0\";\n\t\t\tdevelopmentRegion = en;\n\t\t\thasScannedForEncodings = 0;\n\t\t\tknownRegions = (\n\t\t\t\ten,\n\t\t\t\tBase,\n\t\t\t);\n\t\t\tmainGroup = 82293E2F29BDC4ED00C67BD9;\n\t\t\tproductRefGroup = 82293E3A29BDC4ED00C67BD9 /* Products */;\n\t\t\tprojectDirPath = \"\";\n\t\t\tprojectRoot = \"\";\n\t\t\ttargets = (\n\t\t\t\t82293E3829BDC4ED00C67BD9 /* llama */,\n\t\t\t\t82819FA829C1DB2900399B7E /* llamaObjCxx */,\n\t\t\t\t82293E5729BDC71700C67BD9 /* llamaTest */,\n\t\t\t);\n\t\t};\n/* End PBXProject section */\n\n/* Begin PBXResourcesBuildPhase section */\n\t\t82293E3729BDC4ED00C67BD9 /* Resources */ = {\n\t\t\tisa = PBXResourcesBuildPhase;\n\t\t\tbuildActionMask = 
2147483647;\n\t\t\tfiles = (\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n/* End PBXResourcesBuildPhase section */\n\n/* Begin PBXSourcesBuildPhase section */\n\t\t82293E3529BDC4ED00C67BD9 /* Sources */ = {\n\t\t\tisa = PBXSourcesBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t\t82293E6529BDC7E200C67BD9 /* LlamaRunner.swift in Sources */,\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n\t\t82293E5429BDC71700C67BD9 /* Sources */ = {\n\t\t\tisa = PBXSourcesBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t\t82293E5B29BDC71700C67BD9 /* main.swift in Sources */,\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n\t\t82819FA629C1DB2900399B7E /* Sources */ = {\n\t\t\tisa = PBXSourcesBuildPhase;\n\t\t\tbuildActionMask = 2147483647;\n\t\t\tfiles = (\n\t\t\t\t82819FB629C1DB5800399B7E /* LlamaPredictOperation.mm in Sources */,\n\t\t\t\t82819FB529C1DB5800399B7E /* LlamaRunnerBridgeConfig.m in Sources */,\n\t\t\t\t82819FBA29C1DB5E00399B7E /* utils.cpp in Sources */,\n\t\t\t\t82819FB429C1DB5800399B7E /* LlamaRunnerBridge.mm in Sources */,\n\t\t\t\t82819FB329C1DB5800399B7E /* LlamaEvent.mm in Sources */,\n\t\t\t\t82819FB929C1DB5E00399B7E /* ggml.c in Sources */,\n\t\t\t\t82819FB229C1DB5400399B7E /* LlamaError.m in Sources */,\n\t\t\t);\n\t\t\trunOnlyForDeploymentPostprocessing = 0;\n\t\t};\n/* End PBXSourcesBuildPhase section */\n\n/* Begin PBXTargetDependency section */\n\t\t82293E6029BDC72B00C67BD9 /* PBXTargetDependency */ = {\n\t\t\tisa = PBXTargetDependency;\n\t\t\ttarget = 82293E3829BDC4ED00C67BD9 /* llama */;\n\t\t\ttargetProxy = 82293E5F29BDC72B00C67BD9 /* PBXContainerItemProxy */;\n\t\t};\n\t\t82819FC229C1DB8B00399B7E /* PBXTargetDependency */ = {\n\t\t\tisa = PBXTargetDependency;\n\t\t\ttarget = 82819FA829C1DB2900399B7E /* llamaObjCxx */;\n\t\t\ttargetProxy = 82819FC129C1DB8B00399B7E /* PBXContainerItemProxy */;\n\t\t};\n/* End PBXTargetDependency section */\n\n/* 
Begin XCBuildConfiguration section */\n\t\t82293E3E29BDC4ED00C67BD9 /* Debug */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbuildSettings = {\n\t\t\t\tALWAYS_SEARCH_USER_PATHS = NO;\n\t\t\t\tCLANG_ANALYZER_NONNULL = YES;\n\t\t\t\tCLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;\n\t\t\t\tCLANG_CXX_LANGUAGE_STANDARD = \"gnu++20\";\n\t\t\t\tCLANG_ENABLE_MODULES = YES;\n\t\t\t\tCLANG_ENABLE_OBJC_ARC = YES;\n\t\t\t\tCLANG_ENABLE_OBJC_WEAK = YES;\n\t\t\t\tCLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;\n\t\t\t\tCLANG_WARN_BOOL_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_COMMA = YES;\n\t\t\t\tCLANG_WARN_CONSTANT_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;\n\t\t\t\tCLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;\n\t\t\t\tCLANG_WARN_DOCUMENTATION_COMMENTS = YES;\n\t\t\t\tCLANG_WARN_EMPTY_BODY = YES;\n\t\t\t\tCLANG_WARN_ENUM_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_INFINITE_RECURSION = YES;\n\t\t\t\tCLANG_WARN_INT_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;\n\t\t\t\tCLANG_WARN_OBJC_LITERAL_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;\n\t\t\t\tCLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;\n\t\t\t\tCLANG_WARN_RANGE_LOOP_ANALYSIS = YES;\n\t\t\t\tCLANG_WARN_STRICT_PROTOTYPES = YES;\n\t\t\t\tCLANG_WARN_SUSPICIOUS_MOVE = YES;\n\t\t\t\tCLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;\n\t\t\t\tCLANG_WARN_UNREACHABLE_CODE = YES;\n\t\t\t\tCLANG_WARN__DUPLICATE_METHOD_MATCH = YES;\n\t\t\t\tCOPY_PHASE_STRIP = NO;\n\t\t\t\tCURRENT_PROJECT_VERSION = 1;\n\t\t\t\tDEBUG_INFORMATION_FORMAT = dwarf;\n\t\t\t\tENABLE_STRICT_OBJC_MSGSEND = YES;\n\t\t\t\tENABLE_TESTABILITY = YES;\n\t\t\t\tGCC_C_LANGUAGE_STANDARD = gnu11;\n\t\t\t\tGCC_DYNAMIC_NO_PIC = NO;\n\t\t\t\tGCC_NO_COMMON_BLOCKS = YES;\n\t\t\t\tGCC_OPTIMIZATION_LEVEL = 0;\n\t\t\t\tGCC_PREPROCESSOR_DEFINITIONS = 
(\n\t\t\t\t\t\"DEBUG=1\",\n\t\t\t\t\t\"$(inherited)\",\n\t\t\t\t);\n\t\t\t\tGCC_WARN_64_TO_32_BIT_CONVERSION = YES;\n\t\t\t\tGCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;\n\t\t\t\tGCC_WARN_UNDECLARED_SELECTOR = YES;\n\t\t\t\tGCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;\n\t\t\t\tGCC_WARN_UNUSED_FUNCTION = YES;\n\t\t\t\tGCC_WARN_UNUSED_VARIABLE = YES;\n\t\t\t\tMACOSX_DEPLOYMENT_TARGET = 13.0;\n\t\t\t\tMTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;\n\t\t\t\tMTL_FAST_MATH = YES;\n\t\t\t\tONLY_ACTIVE_ARCH = YES;\n\t\t\t\tSDKROOT = macosx;\n\t\t\t\tSWIFT_ACTIVE_COMPILATION_CONDITIONS = DEBUG;\n\t\t\t\tSWIFT_OPTIMIZATION_LEVEL = \"-Onone\";\n\t\t\t\tVERSIONING_SYSTEM = \"apple-generic\";\n\t\t\t\tVERSION_INFO_PREFIX = \"\";\n\t\t\t};\n\t\t\tname = Debug;\n\t\t};\n\t\t82293E3F29BDC4ED00C67BD9 /* Release */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbuildSettings = {\n\t\t\t\tALWAYS_SEARCH_USER_PATHS = NO;\n\t\t\t\tCLANG_ANALYZER_NONNULL = YES;\n\t\t\t\tCLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;\n\t\t\t\tCLANG_CXX_LANGUAGE_STANDARD = \"gnu++20\";\n\t\t\t\tCLANG_ENABLE_MODULES = YES;\n\t\t\t\tCLANG_ENABLE_OBJC_ARC = YES;\n\t\t\t\tCLANG_ENABLE_OBJC_WEAK = YES;\n\t\t\t\tCLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;\n\t\t\t\tCLANG_WARN_BOOL_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_COMMA = YES;\n\t\t\t\tCLANG_WARN_CONSTANT_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;\n\t\t\t\tCLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;\n\t\t\t\tCLANG_WARN_DOCUMENTATION_COMMENTS = YES;\n\t\t\t\tCLANG_WARN_EMPTY_BODY = YES;\n\t\t\t\tCLANG_WARN_ENUM_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_INFINITE_RECURSION = YES;\n\t\t\t\tCLANG_WARN_INT_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;\n\t\t\t\tCLANG_WARN_OBJC_LITERAL_CONVERSION = YES;\n\t\t\t\tCLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;\n\t\t\t\tCLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;\n\t\t\t\tCLANG_WARN_RANGE_LOOP_ANALYSIS = 
YES;\n\t\t\t\tCLANG_WARN_STRICT_PROTOTYPES = YES;\n\t\t\t\tCLANG_WARN_SUSPICIOUS_MOVE = YES;\n\t\t\t\tCLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;\n\t\t\t\tCLANG_WARN_UNREACHABLE_CODE = YES;\n\t\t\t\tCLANG_WARN__DUPLICATE_METHOD_MATCH = YES;\n\t\t\t\tCOPY_PHASE_STRIP = NO;\n\t\t\t\tCURRENT_PROJECT_VERSION = 1;\n\t\t\t\tDEBUG_INFORMATION_FORMAT = \"dwarf-with-dsym\";\n\t\t\t\tENABLE_NS_ASSERTIONS = NO;\n\t\t\t\tENABLE_STRICT_OBJC_MSGSEND = YES;\n\t\t\t\tGCC_C_LANGUAGE_STANDARD = gnu11;\n\t\t\t\tGCC_NO_COMMON_BLOCKS = YES;\n\t\t\t\tGCC_WARN_64_TO_32_BIT_CONVERSION = YES;\n\t\t\t\tGCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;\n\t\t\t\tGCC_WARN_UNDECLARED_SELECTOR = YES;\n\t\t\t\tGCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;\n\t\t\t\tGCC_WARN_UNUSED_FUNCTION = YES;\n\t\t\t\tGCC_WARN_UNUSED_VARIABLE = YES;\n\t\t\t\tMACOSX_DEPLOYMENT_TARGET = 13.0;\n\t\t\t\tMTL_ENABLE_DEBUG_INFO = NO;\n\t\t\t\tMTL_FAST_MATH = YES;\n\t\t\t\tSDKROOT = macosx;\n\t\t\t\tSWIFT_COMPILATION_MODE = wholemodule;\n\t\t\t\tSWIFT_OPTIMIZATION_LEVEL = \"-O\";\n\t\t\t\tVERSIONING_SYSTEM = \"apple-generic\";\n\t\t\t\tVERSION_INFO_PREFIX = \"\";\n\t\t\t};\n\t\t\tname = Release;\n\t\t};\n\t\t82293E4129BDC4ED00C67BD9 /* Debug */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbuildSettings = {\n\t\t\t\tCLANG_ENABLE_MODULES = YES;\n\t\t\t\tCODE_SIGN_STYLE = Automatic;\n\t\t\t\tCOMBINE_HIDPI_IMAGES = YES;\n\t\t\t\tCURRENT_PROJECT_VERSION = 1;\n\t\t\t\tDEFINES_MODULE = YES;\n\t\t\t\tDEVELOPMENT_TEAM = 44847G58BM;\n\t\t\t\tDYLIB_COMPATIBILITY_VERSION = 1;\n\t\t\t\tDYLIB_CURRENT_VERSION = 1;\n\t\t\t\tDYLIB_INSTALL_NAME_BASE = \"@rpath\";\n\t\t\t\tGENERATE_INFOPLIST_FILE = YES;\n\t\t\t\tINFOPLIST_KEY_NSHumanReadableCopyright = \"\";\n\t\t\t\tINSTALL_PATH = \"$(LOCAL_LIBRARY_DIR)/Frameworks\";\n\t\t\t\tIPHONEOS_DEPLOYMENT_TARGET = 13.0;\n\t\t\t\tLD_RUNPATH_SEARCH_PATHS = 
(\n\t\t\t\t\t\"$(inherited)\",\n\t\t\t\t\t\"@executable_path/../Frameworks\",\n\t\t\t\t\t\"@loader_path/Frameworks\",\n\t\t\t\t);\n\t\t\t\tMACOSX_DEPLOYMENT_TARGET = 10.15;\n\t\t\t\tMARKETING_VERSION = 1.0;\n\t\t\t\tOTHER_LDFLAGS = (\n\t\t\t\t\t\"-all_load\",\n\t\t\t\t\t\"-lc++\",\n\t\t\t\t);\n\t\t\t\tPRESERVE_DEAD_CODE_INITS_AND_TERMS = NO;\n\t\t\t\tPRODUCT_BUNDLE_IDENTIFIER = com.alexrozanski.llama;\n\t\t\t\tPRODUCT_NAME = \"$(TARGET_NAME:c99extidentifier)\";\n\t\t\t\tSKIP_INSTALL = YES;\n\t\t\t\tSUPPORTED_PLATFORMS = \"iphoneos iphonesimulator macosx\";\n\t\t\t\tSUPPORTS_MACCATALYST = NO;\n\t\t\t\tSUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO;\n\t\t\t\tSWIFT_EMIT_LOC_STRINGS = YES;\n\t\t\t\tSWIFT_INCLUDE_PATHS = \"$(SRCROOT)/Sources/llamaObjCxx\";\n\t\t\t\tSWIFT_OPTIMIZATION_LEVEL = \"-Onone\";\n\t\t\t\tSWIFT_VERSION = 5.0;\n\t\t\t\tTARGETED_DEVICE_FAMILY = \"1,2\";\n\t\t\t};\n\t\t\tname = Debug;\n\t\t};\n\t\t82293E4229BDC4ED00C67BD9 /* Release */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbuildSettings = {\n\t\t\t\tCLANG_ENABLE_MODULES = YES;\n\t\t\t\tCODE_SIGN_STYLE = Automatic;\n\t\t\t\tCOMBINE_HIDPI_IMAGES = YES;\n\t\t\t\tCURRENT_PROJECT_VERSION = 1;\n\t\t\t\tDEFINES_MODULE = YES;\n\t\t\t\tDEVELOPMENT_TEAM = 44847G58BM;\n\t\t\t\tDYLIB_COMPATIBILITY_VERSION = 1;\n\t\t\t\tDYLIB_CURRENT_VERSION = 1;\n\t\t\t\tDYLIB_INSTALL_NAME_BASE = \"@rpath\";\n\t\t\t\tGENERATE_INFOPLIST_FILE = YES;\n\t\t\t\tINFOPLIST_KEY_NSHumanReadableCopyright = \"\";\n\t\t\t\tINSTALL_PATH = \"$(LOCAL_LIBRARY_DIR)/Frameworks\";\n\t\t\t\tIPHONEOS_DEPLOYMENT_TARGET = 13.0;\n\t\t\t\tLD_RUNPATH_SEARCH_PATHS = (\n\t\t\t\t\t\"$(inherited)\",\n\t\t\t\t\t\"@executable_path/../Frameworks\",\n\t\t\t\t\t\"@loader_path/Frameworks\",\n\t\t\t\t);\n\t\t\t\tMACOSX_DEPLOYMENT_TARGET = 10.15;\n\t\t\t\tMARKETING_VERSION = 1.0;\n\t\t\t\tOTHER_LDFLAGS = (\n\t\t\t\t\t\"-all_load\",\n\t\t\t\t\t\"-lc++\",\n\t\t\t\t);\n\t\t\t\tPRESERVE_DEAD_CODE_INITS_AND_TERMS = NO;\n\t\t\t\tPRODUCT_BUNDLE_IDENTIFIER = 
com.alexrozanski.llama;\n\t\t\t\tPRODUCT_NAME = \"$(TARGET_NAME:c99extidentifier)\";\n\t\t\t\tSKIP_INSTALL = YES;\n\t\t\t\tSUPPORTED_PLATFORMS = \"iphoneos iphonesimulator macosx\";\n\t\t\t\tSUPPORTS_MACCATALYST = NO;\n\t\t\t\tSUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO;\n\t\t\t\tSWIFT_EMIT_LOC_STRINGS = YES;\n\t\t\t\tSWIFT_INCLUDE_PATHS = \"$(SRCROOT)/Sources/llamaObjCxx\";\n\t\t\t\tSWIFT_VERSION = 5.0;\n\t\t\t\tTARGETED_DEVICE_FAMILY = \"1,2\";\n\t\t\t};\n\t\t\tname = Release;\n\t\t};\n\t\t82293E5D29BDC71700C67BD9 /* Debug */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbaseConfigurationReference = 82819F7C29BDF7CB00399B7E /* LlamaTest.xcconfig */;\n\t\t\tbuildSettings = {\n\t\t\t\tCLANG_ENABLE_MODULES = YES;\n\t\t\t\tCODE_SIGN_STYLE = Automatic;\n\t\t\t\tCREATE_INFOPLIST_SECTION_IN_BINARY = YES;\n\t\t\t\tDEVELOPMENT_TEAM = 44847G58BM;\n\t\t\t\tENABLE_HARDENED_RUNTIME = YES;\n\t\t\t\tGENERATE_INFOPLIST_FILE = YES;\n\t\t\t\tINFOPLIST_EXPAND_BUILD_SETTINGS = YES;\n\t\t\t\tINFOPLIST_FILE = llamaTest/Info.plist;\n\t\t\t\tLD_RUNPATH_SEARCH_PATHS = (\n\t\t\t\t\t\"$(inherited)\",\n\t\t\t\t\t\"@executable_path/../Frameworks\",\n\t\t\t\t\t\"@loader_path/../Frameworks\",\n\t\t\t\t);\n\t\t\t\tLLAMA_MODEL_PATH = \"\";\n\t\t\t\tMACOSX_DEPLOYMENT_TARGET = 10.15;\n\t\t\t\tPRODUCT_NAME = \"$(TARGET_NAME)\";\n\t\t\t\tSWIFT_OPTIMIZATION_LEVEL = \"-Onone\";\n\t\t\t\tSWIFT_VERSION = 5.0;\n\t\t\t};\n\t\t\tname = Debug;\n\t\t};\n\t\t82293E5E29BDC71700C67BD9 /* Release */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbaseConfigurationReference = 82819F7C29BDF7CB00399B7E /* LlamaTest.xcconfig */;\n\t\t\tbuildSettings = {\n\t\t\t\tCLANG_ENABLE_MODULES = YES;\n\t\t\t\tCODE_SIGN_STYLE = Automatic;\n\t\t\t\tCREATE_INFOPLIST_SECTION_IN_BINARY = YES;\n\t\t\t\tDEVELOPMENT_TEAM = 44847G58BM;\n\t\t\t\tENABLE_HARDENED_RUNTIME = YES;\n\t\t\t\tGENERATE_INFOPLIST_FILE = YES;\n\t\t\t\tINFOPLIST_EXPAND_BUILD_SETTINGS = YES;\n\t\t\t\tINFOPLIST_FILE = 
llamaTest/Info.plist;\n\t\t\t\tLD_RUNPATH_SEARCH_PATHS = (\n\t\t\t\t\t\"$(inherited)\",\n\t\t\t\t\t\"@executable_path/../Frameworks\",\n\t\t\t\t\t\"@loader_path/../Frameworks\",\n\t\t\t\t);\n\t\t\t\tLLAMA_MODEL_PATH = \"\";\n\t\t\t\tMACOSX_DEPLOYMENT_TARGET = 10.15;\n\t\t\t\tPRODUCT_NAME = \"$(TARGET_NAME)\";\n\t\t\t\tSWIFT_VERSION = 5.0;\n\t\t\t};\n\t\t\tname = Release;\n\t\t};\n\t\t82819FB029C1DB2900399B7E /* Debug */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbuildSettings = {\n\t\t\t\tCODE_SIGN_STYLE = Automatic;\n\t\t\t\tDEVELOPMENT_TEAM = 44847G58BM;\n\t\t\t\tEXECUTABLE_PREFIX = lib;\n\t\t\t\tMODULEMAP_FILE = \"\";\n\t\t\t\tPRODUCT_NAME = \"$(TARGET_NAME)\";\n\t\t\t\tSKIP_INSTALL = YES;\n\t\t\t};\n\t\t\tname = Debug;\n\t\t};\n\t\t82819FB129C1DB2900399B7E /* Release */ = {\n\t\t\tisa = XCBuildConfiguration;\n\t\t\tbuildSettings = {\n\t\t\t\tCODE_SIGN_STYLE = Automatic;\n\t\t\t\tDEVELOPMENT_TEAM = 44847G58BM;\n\t\t\t\tEXECUTABLE_PREFIX = lib;\n\t\t\t\tMODULEMAP_FILE = \"\";\n\t\t\t\tPRODUCT_NAME = \"$(TARGET_NAME)\";\n\t\t\t\tSKIP_INSTALL = YES;\n\t\t\t};\n\t\t\tname = Release;\n\t\t};\n/* End XCBuildConfiguration section */\n\n/* Begin XCConfigurationList section */\n\t\t82293E3329BDC4ED00C67BD9 /* Build configuration list for PBXProject \"llama\" */ = {\n\t\t\tisa = XCConfigurationList;\n\t\t\tbuildConfigurations = (\n\t\t\t\t82293E3E29BDC4ED00C67BD9 /* Debug */,\n\t\t\t\t82293E3F29BDC4ED00C67BD9 /* Release */,\n\t\t\t);\n\t\t\tdefaultConfigurationIsVisible = 0;\n\t\t\tdefaultConfigurationName = Release;\n\t\t};\n\t\t82293E4029BDC4ED00C67BD9 /* Build configuration list for PBXNativeTarget \"llama\" */ = {\n\t\t\tisa = XCConfigurationList;\n\t\t\tbuildConfigurations = (\n\t\t\t\t82293E4129BDC4ED00C67BD9 /* Debug */,\n\t\t\t\t82293E4229BDC4ED00C67BD9 /* Release */,\n\t\t\t);\n\t\t\tdefaultConfigurationIsVisible = 0;\n\t\t\tdefaultConfigurationName = Release;\n\t\t};\n\t\t82293E5C29BDC71700C67BD9 /* Build configuration list for PBXNativeTarget \"llamaTest\" 
*/ = {\n\t\t\tisa = XCConfigurationList;\n\t\t\tbuildConfigurations = (\n\t\t\t\t82293E5D29BDC71700C67BD9 /* Debug */,\n\t\t\t\t82293E5E29BDC71700C67BD9 /* Release */,\n\t\t\t);\n\t\t\tdefaultConfigurationIsVisible = 0;\n\t\t\tdefaultConfigurationName = Release;\n\t\t};\n\t\t82819FAF29C1DB2900399B7E /* Build configuration list for PBXNativeTarget \"llamaObjCxx\" */ = {\n\t\t\tisa = XCConfigurationList;\n\t\t\tbuildConfigurations = (\n\t\t\t\t82819FB029C1DB2900399B7E /* Debug */,\n\t\t\t\t82819FB129C1DB2900399B7E /* Release */,\n\t\t\t);\n\t\t\tdefaultConfigurationIsVisible = 0;\n\t\t\tdefaultConfigurationName = Release;\n\t\t};\n/* End XCConfigurationList section */\n\t};\n\trootObject = 82293E3029BDC4ED00C67BD9 /* Project object */;\n}\n"
  },
  {
    "path": "llama.xcodeproj/project.xcworkspace/contents.xcworkspacedata",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<Workspace\n   version = \"1.0\">\n   <FileRef\n      location = \"self:\">\n   </FileRef>\n</Workspace>\n"
  },
  {
    "path": "llama.xcodeproj/project.xcworkspace/xcshareddata/IDEWorkspaceChecks.plist",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE plist PUBLIC \"-//Apple//DTD PLIST 1.0//EN\" \"http://www.apple.com/DTDs/PropertyList-1.0.dtd\">\n<plist version=\"1.0\">\n<dict>\n\t<key>IDEDidComputeMac32BitWarning</key>\n\t<true/>\n</dict>\n</plist>\n"
  },
  {
    "path": "llamaTest/Info.plist",
    "content": "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<!DOCTYPE plist PUBLIC \"-//Apple//DTD PLIST 1.0//EN\" \"http://www.apple.com/DTDs/PropertyList-1.0.dtd\">\n<plist version=\"1.0\">\n<dict>\n\t<key>LlamaModelPath</key>\n\t<string>${MODEL_PATH}</string>\n</dict>\n</plist>\n"
  },
  {
    "path": "llamaTest/LlamaTest.xcconfig",
    "content": "MODEL_PATH=\n"
  },
  {
    "path": "llamaTest/main.swift",
    "content": "//\n//  main.swift\n//  llamaTest\n//\n//  Created by Alex Rozanski on 12/03/2023.\n//\n\nimport Foundation\nimport llama\n\n// The model location is read from the LlamaModelPath key of the Info.plist.\nguard let pathString = Bundle.main.object(forInfoDictionaryKey: \"LlamaModelPath\") as? String else {\n  print(\"Model path not specified - define in MODEL_PATH\")\n  exit(1)\n}\n\n// The value must parse as a URL whose path exists on disk.\nguard let url = URL(string: pathString), FileManager.default.fileExists(atPath: url.path) else {\n  print(\"Invalid model path, make sure this is a file URL\")\n  exit(1)\n}\n\n// Run Llama\n\n/// Prompt loop: reads a prompt from standard input, prints the generated tokens\n/// as they arrive and repeats until an empty line or end of input is read.\n@Sendable func run() async {\n  while true {\n    print(\"Enter prompt: \")\n    guard let prompt = readLine()?.trimmingCharacters(in: .whitespacesAndNewlines), !prompt.isEmpty else {\n      break\n    }\n\n    let tokenStream = LlamaRunner(modelURL: url).run(\n      with: prompt,\n      stateChangeHandler: { state in\n        switch state {\n        case .notStarted:\n          break\n        case .initializing:\n          print(\"Initializing model... \", terminator: \"\")\n        case .generatingOutput:\n          print(\"Done.\")\n          print(\"\")\n          print(\"Generating output...\")\n          print(\"\\\"\", terminator: \"\")\n        case .completed:\n          print(\"\\\"\")\n          print(\"\")\n        case .failed:\n          // Handle this in the catch {}\n          break\n        }\n      })\n\n    do {\n      for try await token in tokenStream {\n        print(token, terminator: \"\")\n      }\n    } catch let error {\n      print(\"\")\n      print(\"Failed to generate output:\", error.localizedDescription)\n    }\n  }\n}\n\n// Run program.\n// Signalled once run() has returned so that the loop below can stop.\nlet semaphore = DispatchSemaphore(value: 0)\n\nTask.init {\n  await run()\n  // Without this signal the wait below times out forever and the process\n  // never exits after the prompt loop has finished.\n  semaphore.signal()\n}\n\n// Don't block the main thread to ensure that state changes are still called\n// on the main thread.\nwhile semaphore.wait(timeout: .now()) == .timedOut {\n  RunLoop.current.run(mode: .default, before: Date(timeIntervalSinceNow: 0))\n}\n"
  },
  {
    "path": "tools/.gitignore",
    "content": "quantize\n"
  },
  {
    "path": "tools/Makefile",
    "content": "ifndef UNAME_S\nUNAME_S := $(shell uname -s)\nendif\n\nifndef UNAME_P\nUNAME_P := $(shell uname -p)\nendif\n\nifndef UNAME_M\nUNAME_M := $(shell uname -m)\nendif\n\nCCV := $(shell $(CC) --version | head -n 1)\nCXXV := $(shell $(CXX) --version | head -n 1)\n\n# Mac OS + Arm can report x86_64\n# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789\nifeq ($(UNAME_S),Darwin)\n\tifneq ($(UNAME_P),arm)\n\t\tSYSCTL_M := $(shell sysctl -n hw.optional.arm64)\n\t\tifeq ($(SYSCTL_M),1)\n\t\t\t# UNAME_P := arm\n\t\t\t# UNAME_M := arm64\n\t\t\twarn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\\#issuecomment-1282546789)\n\t\tendif\n\tendif\nendif\n\n#\n# Compile flags\n#\n\nCPP_PATH = ../Sources/cpp\nCFLAGS   = -I. -I../Sources/llamaObjCxx/include/private/ -O3 -DNDEBUG -std=c11   -fPIC\nCXXFLAGS = -I. -I../Sources/llamaObjCxx/include/private/ -O3 -DNDEBUG -std=c++11 -fPIC\nLDFLAGS  =\n\n# OS specific\n# TODO: support Windows\nifeq ($(UNAME_S),Linux)\n\tCFLAGS   += -pthread\n\tCXXFLAGS += -pthread\nendif\nifeq ($(UNAME_S),Darwin)\n\tCFLAGS   += -pthread\n\tCXXFLAGS += -pthread\nendif\nifeq ($(UNAME_S),FreeBSD)\n\tCFLAGS   += -pthread\n\tCXXFLAGS += -pthread\nendif\nifeq ($(UNAME_S),NetBSD)\n\tCFLAGS   += -pthread\n\tCXXFLAGS += -pthread\nendif\nifeq ($(UNAME_S),Haiku)\n\tCFLAGS   += -pthread\n\tCXXFLAGS += -pthread\nendif\n\n# Architecture specific\n# TODO: probably these flags need to be tweaked on some architectures\n#       feel free to update the Makefile for your architecture and send a pull request or issue\nifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))\n\tifeq ($(UNAME_S),Darwin)\n\t\tCFLAGS += -mf16c\n\t\tAVX1_M := $(shell sysctl machdep.cpu.features)\n\t\tifneq (,$(findstring FMA,$(AVX1_M)))\n\t\t\tCFLAGS += -mfma\n\t\tendif\n\t\tifneq (,$(findstring 
AVX1.0,$(AVX1_M)))\n\t\t\tCFLAGS += -mavx\n\t\tendif\n\t\tAVX2_M := $(shell sysctl machdep.cpu.leaf7_features)\n\t\tifneq (,$(findstring AVX2,$(AVX2_M)))\n\t\t\tCFLAGS += -mavx2\n\t\tendif\n\telse ifeq ($(UNAME_S),Linux)\n\t\tAVX1_M := $(shell grep \"avx \" /proc/cpuinfo)\n\t\tifneq (,$(findstring avx,$(AVX1_M)))\n\t\t\tCFLAGS += -mavx\n\t\tendif\n\t\tAVX2_M := $(shell grep \"avx2 \" /proc/cpuinfo)\n\t\tifneq (,$(findstring avx2,$(AVX2_M)))\n\t\t\tCFLAGS += -mavx2\n\t\tendif\n\t\tFMA_M := $(shell grep \"fma \" /proc/cpuinfo)\n\t\tifneq (,$(findstring fma,$(FMA_M)))\n\t\t\tCFLAGS += -mfma\n\t\tendif\n\t\tF16C_M := $(shell grep \"f16c \" /proc/cpuinfo)\n\t\tifneq (,$(findstring f16c,$(F16C_M)))\n\t\t\tCFLAGS += -mf16c\n\t\tendif\n\t\tSSE3_M := $(shell grep \"sse3 \" /proc/cpuinfo)\n\t\tifneq (,$(findstring sse3,$(SSE3_M)))\n\t\t\tCFLAGS += -msse3\n\t\tendif\n\telse ifeq ($(UNAME_S),Haiku)\n\t\tAVX1_M := $(shell sysinfo -cpu | grep \"AVX \")\n\t\tifneq (,$(findstring avx,$(AVX1_M)))\n\t\t\tCFLAGS += -mavx\n\t\tendif\n\t\tAVX2_M := $(shell sysinfo -cpu | grep \"AVX2 \")\n\t\tifneq (,$(findstring avx2,$(AVX2_M)))\n\t\t\tCFLAGS += -mavx2\n\t\tendif\n\t\tFMA_M := $(shell sysinfo -cpu | grep \"FMA \")\n\t\tifneq (,$(findstring fma,$(FMA_M)))\n\t\t\tCFLAGS += -mfma\n\t\tendif\n\t\tF16C_M := $(shell sysinfo -cpu | grep \"F16C \")\n\t\tifneq (,$(findstring f16c,$(F16C_M)))\n\t\t\tCFLAGS += -mf16c\n\t\tendif\n\telse\n\t\tCFLAGS += -mfma -mf16c -mavx -mavx2\n\tendif\nendif\nifeq ($(UNAME_M),amd64)\n\tCFLAGS += -mavx -mavx2 -mfma -mf16c\nendif\nifneq ($(filter ppc64%,$(UNAME_M)),)\n\tPOWER9_M := $(shell grep \"POWER9\" /proc/cpuinfo)\n\tifneq (,$(findstring POWER9,$(POWER9_M)))\n\t\tCFLAGS += -mpower9-vector\n\tendif\n\t# Require c++23's std::byteswap for big-endian support.\n\tifeq ($(UNAME_M),ppc64)\n\t\tCXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN\n\tendif\nendif\nifndef LLAMA_NO_ACCELERATE\n\t# Mac M1 - include Accelerate framework\n\tifeq ($(UNAME_S),Darwin)\n\t\tCFLAGS  += 
-DGGML_USE_ACCELERATE\n\t\tLDFLAGS += -framework Accelerate\n\tendif\nendif\nifdef LLAMA_OPENBLAS\n\tCFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas\n\tLDFLAGS += -lopenblas\nendif\nifdef LLAMA_GPROF\n\tCFLAGS   += -pg\n\tCXXFLAGS += -pg\nendif\nifneq ($(filter aarch64%,$(UNAME_M)),)\n\tCFLAGS += -mcpu=native\n\tCXXFLAGS += -mcpu=native\nendif\nifneq ($(filter armv6%,$(UNAME_M)),)\n\t# Raspberry Pi 1, 2, 3\n\tCFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access\nendif\nifneq ($(filter armv7%,$(UNAME_M)),)\n\t# Raspberry Pi 4\n\tCFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations\nendif\nifneq ($(filter armv8%,$(UNAME_M)),)\n\t# Raspberry Pi 4\n\tCFLAGS += -mfp16-format=ieee -mno-unaligned-access\nendif\n\n#\n# Print build information\n#\n\n$(info I llama.cpp build info: )\n$(info I UNAME_S:  $(UNAME_S))\n$(info I UNAME_P:  $(UNAME_P))\n$(info I UNAME_M:  $(UNAME_M))\n$(info I CFLAGS:   $(CFLAGS))\n$(info I CXXFLAGS: $(CXXFLAGS))\n$(info I LDFLAGS:  $(LDFLAGS))\n$(info I CC:       $(CCV))\n$(info I CXX:      $(CXXV))\n$(info )\n\ndefault: quantize\n\n#\n# Build library\n#\n\nggml.o: $(CPP_PATH)/ggml.c $(CPP_PATH)/ggml.h\n\t$(CC)  $(CFLAGS)   -c $(CPP_PATH)/ggml.c -o ggml.o\n\nutils.o: $(CPP_PATH)/utils.cpp $(CPP_PATH)/utils.h\n\t$(CXX) $(CXXFLAGS) -c $(CPP_PATH)/utils.cpp -o utils.o\n\nclean:\n\trm -f *.o quantize\n\nquantize: $(CPP_PATH)/utils.cpp ggml.o utils.o\n\t$(CXX) $(CXXFLAGS) $(CPP_PATH)/quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)\n\n#\n# Tests\n#\n\n.PHONY: tests\ntests:\n\tbash ./tests/run-tests.sh\n"
  },
  {
    "path": "tools/convert-pth-to-ggml.py",
    "content": "# Convert a LLaMA model checkpoint to a ggml compatible file\n#\n# Load the model using Torch\n# Iterate over all variables and write them to a binary file.\n#\n# For each variable, write the following:\n#   - Number of dimensions (int)\n#   - Name length (int)\n#   - Dimensions (int[n_dims])\n#   - Name (char[name_length])\n#   - Data (float[n_dims])\n#\n# By default, the bigger matrices are converted to 16-bit floats.\n# This can be disabled by adding the \"use-f32\" CLI argument.\n#\n# At the start of the ggml file we write the model parameters\n# and vocabulary.\n#\n\nimport sys\nimport json\nimport struct\nimport numpy as np\nimport torch\nfrom sentencepiece import SentencePieceProcessor\n\nif len(sys.argv) < 3:\n    print(\"Usage: convert-ckpt-to-ggml.py dir-model ftype\\n\")\n    print(\"  ftype == 0 -> float32\")\n    print(\"  ftype == 1 -> float16\")\n    sys.exit(1)\n\n# output in the same directory as the model\ndir_model = sys.argv[1]\n\nfname_hparams   = sys.argv[1] + \"/params.json\"\nfname_tokenizer = sys.argv[1] + \"/../tokenizer.model\"\n\ndef get_n_parts(dim):\n    if dim == 4096:\n        return 1\n    elif dim == 5120:\n        return 2\n    elif dim == 6656:\n        return 4\n    elif dim == 8192:\n        return 8\n    else:\n        print(\"Invalid dim: \" + str(dim))\n        sys.exit(1)\n\n# possible data types\n#   ftype == 0 -> float32\n#   ftype == 1 -> float16\n#\n# map from ftype to string\nftype_str = [\"f32\", \"f16\"]\n\nftype = 1\nif len(sys.argv) > 2:\n    ftype = int(sys.argv[2])\n    if ftype < 0 or ftype > 1:\n        print(\"Invalid ftype: \" + str(ftype))\n        sys.exit(1)\n    fname_out = sys.argv[1] + \"/ggml-model-\" + ftype_str[ftype] + \".bin\"\n\nwith open(fname_hparams, \"r\") as f:\n    hparams = json.load(f)\n\ntokenizer = SentencePieceProcessor(fname_tokenizer)\n\nhparams.update({\"vocab_size\": tokenizer.vocab_size()})\n\nn_parts = get_n_parts(hparams[\"dim\"])\n\nprint(hparams)\nprint('n_parts 
= ', n_parts)\n\nfor p in range(n_parts):\n    print('Processing part ', p)\n\n    #fname_model = sys.argv[1] + \"/consolidated.00.pth\"\n    fname_model = sys.argv[1] + \"/consolidated.0\" + str(p) + \".pth\"\n    fname_out = sys.argv[1] + \"/ggml-model-\" + ftype_str[ftype] + \".bin\"\n    if (p > 0):\n        fname_out = sys.argv[1] + \"/ggml-model-\" + ftype_str[ftype] + \".bin\" + \".\" + str(p)\n\n    model = torch.load(fname_model, map_location=\"cpu\")\n\n    fout = open(fname_out, \"wb\")\n\n    fout.write(struct.pack(\"i\", 0x67676d6c)) # magic: ggml in hex\n    fout.write(struct.pack(\"i\", hparams[\"vocab_size\"]))\n    fout.write(struct.pack(\"i\", hparams[\"dim\"]))\n    fout.write(struct.pack(\"i\", hparams[\"multiple_of\"]))\n    fout.write(struct.pack(\"i\", hparams[\"n_heads\"]))\n    fout.write(struct.pack(\"i\", hparams[\"n_layers\"]))\n    fout.write(struct.pack(\"i\", hparams[\"dim\"] // hparams[\"n_heads\"])) # rot (obsolete)\n    fout.write(struct.pack(\"i\", ftype))\n\n    # Is this correct??\n    for i in range(32000):\n        if tokenizer.is_unknown(i):\n            # \"<unk>\" token (translated as ??)\n            text = \" \\u2047 \".encode(\"utf-8\")\n            fout.write(struct.pack(\"i\", len(text)))\n            fout.write(text)\n        elif tokenizer.is_control(i):\n            # \"<s>\"/\"</s>\" tokens\n            fout.write(struct.pack(\"i\", 0))\n        elif tokenizer.is_byte(i):\n            # \"<U+XX>\" tokens (which may be invalid UTF-8)\n            piece = tokenizer.id_to_piece(i)\n            if len(piece) != 6:\n                print(\"Invalid token: \" + piece)\n                sys.exit(1)\n            byte_value = int(piece[3:-1], 16)\n            fout.write(struct.pack(\"i\", 1))\n            fout.write(struct.pack(\"B\", byte_value))\n        else:\n            # normal token. 
Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.\n            text = tokenizer.id_to_piece(i).replace(\"\\u2581\", \" \").encode(\"utf-8\")\n            fout.write(struct.pack(\"i\", len(text)))\n            fout.write(text)\n\n    for k, v in model.items():\n        name = k\n        shape = v.shape\n\n        # skip layers.X.attention.inner_attention.rope.freqs\n        if name[-5:] == \"freqs\":\n            continue\n\n        print(\"Processing variable: \" + name + \" with shape: \", shape, \" and type: \", v.dtype)\n\n        #data = tf.train.load_variable(dir_model, name).squeeze()\n        data = v.numpy().squeeze()\n        n_dims = len(data.shape);\n\n        # for efficiency - transpose some matrices\n        # \"model/h.*/attn/c_attn/w\"\n        # \"model/h.*/attn/c_proj/w\"\n        # \"model/h.*/mlp/c_fc/w\"\n        # \"model/h.*/mlp/c_proj/w\"\n        #if name[-14:] == \"/attn/c_attn/w\" or \\\n        #   name[-14:] == \"/attn/c_proj/w\" or \\\n        #   name[-11:] == \"/mlp/c_fc/w\" or \\\n        #   name[-13:] == \"/mlp/c_proj/w\":\n        #    print(\"  Transposing\")\n        #    data = data.transpose()\n\n        dshape = data.shape\n\n        # default type is fp16\n        ftype_cur = 1\n        if ftype == 0 or n_dims == 1:\n            print(\"  Converting to float32\")\n            data = data.astype(np.float32)\n            ftype_cur = 0\n\n        # header\n        sname = name.encode('utf-8')\n        fout.write(struct.pack(\"iii\", n_dims, len(sname), ftype_cur))\n        for i in range(n_dims):\n            fout.write(struct.pack(\"i\", dshape[n_dims - 1 - i]))\n        fout.write(sname);\n\n        # data\n        data.tofile(fout)\n\n    # I hope this deallocates the memory ..\n    model = None\n\n    fout.close()\n\n    print(\"Done. Output file: \" + fname_out + \", (part \", p, \")\")\n    print(\"\")\n"
  },
  {
    "path": "tools/quantize.sh",
    "content": "#!/usr/bin/env bash\n\nif ! [[ \"$1\" =~ ^[0-9]{1,2}B$ ]]; then\n    echo\n    echo \"Usage: quantize.sh 7B|13B|30B|65B [--remove-f16]\"\n    echo\n    exit 1\nfi\n\nfor i in `ls ../models/$1/ggml-model-f16.bin*`; do\n    ./quantize \"$i\" \"${i/f16/q4_0}\" 2\n    if [[ \"$2\" == \"--remove-f16\" ]]; then\n        rm \"$i\"\n    fi\ndone\n"
  }
]