Full Code of ggml-org/ggml for AI

master c044a8eeae25 cached

1073 files

14.8 MB

3.9M tokens

1 request

Copy disabled (too large) Download .txt

Showing preview only (15,691K chars total). Download the full file to get everything.

Repository: ggml-org/ggml
Branch: master
Commit: c044a8eeae25
Files: 1073
Total size: 14.8 MB

Directory structure:
gitextract_leym123p/

├── .editorconfig
├── .github/
│   ├── pull_request_template.md
│   └── workflows/
│       ├── ci.yml
│       └── release.yml
├── .gitignore
├── .gitmodules
├── AUTHORS
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── ci/
│   └── run.sh
├── cmake/
│   ├── GitVars.cmake
│   ├── common.cmake
│   └── ggml-config.cmake.in
├── docs/
│   └── gguf.md
├── examples/
│   ├── CMakeLists.txt
│   ├── common-ggml.cpp
│   ├── common-ggml.h
│   ├── common.cpp
│   ├── common.h
│   ├── gpt-2/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert-cerebras-to-ggml.py
│   │   ├── convert-ckpt-to-ggml.py
│   │   ├── convert-h5-to-ggml.py
│   │   ├── download-ggml-model.sh
│   │   ├── download-model.sh
│   │   ├── main-alloc.cpp
│   │   ├── main-backend.cpp
│   │   ├── main-batched.cpp
│   │   ├── main-ctx.cpp
│   │   ├── main-sched.cpp
│   │   └── quantize.cpp
│   ├── gpt-j/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert-h5-to-ggml.py
│   │   ├── download-ggml-model.sh
│   │   ├── download-model.sh
│   │   ├── main.cpp
│   │   └── quantize.cpp
│   ├── magika/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert.py
│   │   └── main.cpp
│   ├── mnist/
│   │   ├── .gitignore
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── mnist-common.cpp
│   │   ├── mnist-common.h
│   │   ├── mnist-eval.cpp
│   │   ├── mnist-train-cnn.py
│   │   ├── mnist-train-fc.py
│   │   ├── mnist-train.cpp
│   │   └── server.py
│   ├── perf-metal/
│   │   ├── CMakeLists.txt
│   │   └── perf-metal.cpp
│   ├── prompts/
│   │   ├── dolly-v2.txt
│   │   ├── gpt-2-chinese.txt
│   │   ├── gpt-2.txt
│   │   ├── gpt-j.txt
│   │   ├── gpt-neox-japanese.txt
│   │   ├── gpt-neox.txt
│   │   ├── polyglot-ko.txt
│   │   ├── replit.txt
│   │   ├── starcoder.txt
│   │   ├── test-cases.txt
│   │   ├── tokenize_huggingface.py
│   │   └── whisper.txt
│   ├── python/
│   │   ├── README.md
│   │   ├── api.h
│   │   ├── example_add_quant.py
│   │   ├── example_test_all_quants.py
│   │   ├── ggml/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.pyi
│   │   │   ├── cffi.py
│   │   │   ├── ffi/
│   │   │   │   └── __init__.pyi
│   │   │   └── utils.py
│   │   ├── regenerate.py
│   │   ├── stubs.py
│   │   └── test_tensor.py
│   ├── sam/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert-pth-to-ggml.py
│   │   └── sam.cpp
│   ├── simple/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── simple-backend.cpp
│   │   └── simple-ctx.cpp
│   ├── stb_image.h
│   ├── stb_image_write.h
│   ├── test-cmake/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   └── test-cmake.cpp
│   └── yolo/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── convert-yolov3-tiny.py
│       ├── data/
│       │   └── coco.names
│       ├── yolo-image.cpp
│       ├── yolo-image.h
│       └── yolov3-tiny.cpp
├── ggml.pc.in
├── include/
│   ├── ggml-alloc.h
│   ├── ggml-backend.h
│   ├── ggml-blas.h
│   ├── ggml-cann.h
│   ├── ggml-cpp.h
│   ├── ggml-cpu.h
│   ├── ggml-cuda.h
│   ├── ggml-hexagon.h
│   ├── ggml-metal.h
│   ├── ggml-opencl.h
│   ├── ggml-openvino.h
│   ├── ggml-opt.h
│   ├── ggml-rpc.h
│   ├── ggml-sycl.h
│   ├── ggml-virtgpu.h
│   ├── ggml-vulkan.h
│   ├── ggml-webgpu.h
│   ├── ggml-zdnn.h
│   ├── ggml-zendnn.h
│   ├── ggml.h
│   └── gguf.h
├── requirements.txt
├── scripts/
│   ├── gen-authors.sh
│   ├── release.sh
│   ├── sync-llama-am.sh
│   ├── sync-llama.last
│   ├── sync-llama.sh
│   ├── sync-whisper-am.sh
│   ├── sync-whisper.last
│   └── sync-whisper.sh
├── src/
│   ├── CMakeLists.txt
│   ├── ggml-alloc.c
│   ├── ggml-backend-dl.cpp
│   ├── ggml-backend-dl.h
│   ├── ggml-backend-impl.h
│   ├── ggml-backend-reg.cpp
│   ├── ggml-backend.cpp
│   ├── ggml-blas/
│   │   ├── CMakeLists.txt
│   │   └── ggml-blas.cpp
│   ├── ggml-cann/
│   │   ├── CMakeLists.txt
│   │   ├── acl_tensor.cpp
│   │   ├── acl_tensor.h
│   │   ├── aclnn_ops.cpp
│   │   ├── aclnn_ops.h
│   │   ├── common.h
│   │   └── ggml-cann.cpp
│   ├── ggml-common.h
│   ├── ggml-cpu/
│   │   ├── CMakeLists.txt
│   │   ├── amx/
│   │   │   ├── amx.cpp
│   │   │   ├── amx.h
│   │   │   ├── common.h
│   │   │   ├── mmq.cpp
│   │   │   └── mmq.h
│   │   ├── arch/
│   │   │   ├── arm/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   ├── quants.c
│   │   │   │   └── repack.cpp
│   │   │   ├── loongarch/
│   │   │   │   └── quants.c
│   │   │   ├── powerpc/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   └── quants.c
│   │   │   ├── riscv/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   ├── quants.c
│   │   │   │   └── repack.cpp
│   │   │   ├── s390/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   └── quants.c
│   │   │   ├── wasm/
│   │   │   │   └── quants.c
│   │   │   └── x86/
│   │   │       ├── cpu-feats.cpp
│   │   │       ├── quants.c
│   │   │       └── repack.cpp
│   │   ├── arch-fallback.h
│   │   ├── binary-ops.cpp
│   │   ├── binary-ops.h
│   │   ├── cmake/
│   │   │   └── FindSIMD.cmake
│   │   ├── common.h
│   │   ├── ggml-cpu-impl.h
│   │   ├── ggml-cpu.c
│   │   ├── ggml-cpu.cpp
│   │   ├── hbm.cpp
│   │   ├── hbm.h
│   │   ├── kleidiai/
│   │   │   ├── kernels.cpp
│   │   │   ├── kernels.h
│   │   │   ├── kleidiai.cpp
│   │   │   └── kleidiai.h
│   │   ├── llamafile/
│   │   │   ├── sgemm.cpp
│   │   │   └── sgemm.h
│   │   ├── ops.cpp
│   │   ├── ops.h
│   │   ├── quants.c
│   │   ├── quants.h
│   │   ├── repack.cpp
│   │   ├── repack.h
│   │   ├── simd-gemm.h
│   │   ├── simd-mappings.h
│   │   ├── spacemit/
│   │   │   ├── ime.cpp
│   │   │   ├── ime.h
│   │   │   ├── ime1_kernels.cpp
│   │   │   └── ime_kernels.h
│   │   ├── traits.cpp
│   │   ├── traits.h
│   │   ├── unary-ops.cpp
│   │   ├── unary-ops.h
│   │   ├── vec.cpp
│   │   └── vec.h
│   ├── ggml-cuda/
│   │   ├── CMakeLists.txt
│   │   ├── acc.cu
│   │   ├── acc.cuh
│   │   ├── add-id.cu
│   │   ├── add-id.cuh
│   │   ├── arange.cu
│   │   ├── arange.cuh
│   │   ├── argmax.cu
│   │   ├── argmax.cuh
│   │   ├── argsort.cu
│   │   ├── argsort.cuh
│   │   ├── binbcast.cu
│   │   ├── binbcast.cuh
│   │   ├── clamp.cu
│   │   ├── clamp.cuh
│   │   ├── common.cuh
│   │   ├── concat.cu
│   │   ├── concat.cuh
│   │   ├── conv-transpose-1d.cu
│   │   ├── conv-transpose-1d.cuh
│   │   ├── conv2d-dw.cu
│   │   ├── conv2d-dw.cuh
│   │   ├── conv2d-transpose.cu
│   │   ├── conv2d-transpose.cuh
│   │   ├── conv2d.cu
│   │   ├── conv2d.cuh
│   │   ├── convert.cu
│   │   ├── convert.cuh
│   │   ├── count-equal.cu
│   │   ├── count-equal.cuh
│   │   ├── cp-async.cuh
│   │   ├── cpy-utils.cuh
│   │   ├── cpy.cu
│   │   ├── cpy.cuh
│   │   ├── cross-entropy-loss.cu
│   │   ├── cross-entropy-loss.cuh
│   │   ├── cumsum.cu
│   │   ├── cumsum.cuh
│   │   ├── dequantize.cuh
│   │   ├── diag.cu
│   │   ├── diag.cuh
│   │   ├── diagmask.cu
│   │   ├── diagmask.cuh
│   │   ├── fattn-common.cuh
│   │   ├── fattn-mma-f16.cuh
│   │   ├── fattn-tile.cu
│   │   ├── fattn-tile.cuh
│   │   ├── fattn-vec.cuh
│   │   ├── fattn-wmma-f16.cu
│   │   ├── fattn-wmma-f16.cuh
│   │   ├── fattn.cu
│   │   ├── fattn.cuh
│   │   ├── fill.cu
│   │   ├── fill.cuh
│   │   ├── gated_delta_net.cu
│   │   ├── gated_delta_net.cuh
│   │   ├── getrows.cu
│   │   ├── getrows.cuh
│   │   ├── ggml-cuda.cu
│   │   ├── gla.cu
│   │   ├── gla.cuh
│   │   ├── im2col.cu
│   │   ├── im2col.cuh
│   │   ├── mean.cu
│   │   ├── mean.cuh
│   │   ├── mma.cuh
│   │   ├── mmf.cu
│   │   ├── mmf.cuh
│   │   ├── mmid.cu
│   │   ├── mmid.cuh
│   │   ├── mmq.cu
│   │   ├── mmq.cuh
│   │   ├── mmvf.cu
│   │   ├── mmvf.cuh
│   │   ├── mmvq.cu
│   │   ├── mmvq.cuh
│   │   ├── norm.cu
│   │   ├── norm.cuh
│   │   ├── opt-step-adamw.cu
│   │   ├── opt-step-adamw.cuh
│   │   ├── opt-step-sgd.cu
│   │   ├── opt-step-sgd.cuh
│   │   ├── out-prod.cu
│   │   ├── out-prod.cuh
│   │   ├── pad.cu
│   │   ├── pad.cuh
│   │   ├── pad_reflect_1d.cu
│   │   ├── pad_reflect_1d.cuh
│   │   ├── pool2d.cu
│   │   ├── pool2d.cuh
│   │   ├── quantize.cu
│   │   ├── quantize.cuh
│   │   ├── reduce_rows.cuh
│   │   ├── roll.cu
│   │   ├── roll.cuh
│   │   ├── rope.cu
│   │   ├── rope.cuh
│   │   ├── scale.cu
│   │   ├── scale.cuh
│   │   ├── set-rows.cu
│   │   ├── set-rows.cuh
│   │   ├── set.cu
│   │   ├── set.cuh
│   │   ├── softcap.cu
│   │   ├── softcap.cuh
│   │   ├── softmax.cu
│   │   ├── softmax.cuh
│   │   ├── solve_tri.cu
│   │   ├── solve_tri.cuh
│   │   ├── ssm-conv.cu
│   │   ├── ssm-conv.cuh
│   │   ├── ssm-scan.cu
│   │   ├── ssm-scan.cuh
│   │   ├── sum.cu
│   │   ├── sum.cuh
│   │   ├── sumrows.cu
│   │   ├── sumrows.cuh
│   │   ├── template-instances/
│   │   │   ├── fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_1-ncols2_32.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_32.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_64-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
│   │   │   ├── fattn-tile-instance-dkq112-dv112.cu
│   │   │   ├── fattn-tile-instance-dkq128-dv128.cu
│   │   │   ├── fattn-tile-instance-dkq256-dv256.cu
│   │   │   ├── fattn-tile-instance-dkq40-dv40.cu
│   │   │   ├── fattn-tile-instance-dkq576-dv512.cu
│   │   │   ├── fattn-tile-instance-dkq64-dv64.cu
│   │   │   ├── fattn-tile-instance-dkq72-dv72.cu
│   │   │   ├── fattn-tile-instance-dkq80-dv80.cu
│   │   │   ├── fattn-tile-instance-dkq96-dv96.cu
│   │   │   ├── fattn-vec-instance-f16-f16.cu
│   │   │   ├── fattn-vec-instance-f16-q4_0.cu
│   │   │   ├── fattn-vec-instance-f16-q4_1.cu
│   │   │   ├── fattn-vec-instance-f16-q5_0.cu
│   │   │   ├── fattn-vec-instance-f16-q5_1.cu
│   │   │   ├── fattn-vec-instance-f16-q8_0.cu
│   │   │   ├── fattn-vec-instance-q4_0-f16.cu
│   │   │   ├── fattn-vec-instance-q4_0-q4_0.cu
│   │   │   ├── fattn-vec-instance-q4_0-q4_1.cu
│   │   │   ├── fattn-vec-instance-q4_0-q5_0.cu
│   │   │   ├── fattn-vec-instance-q4_0-q5_1.cu
│   │   │   ├── fattn-vec-instance-q4_0-q8_0.cu
│   │   │   ├── fattn-vec-instance-q4_1-f16.cu
│   │   │   ├── fattn-vec-instance-q4_1-q4_0.cu
│   │   │   ├── fattn-vec-instance-q4_1-q4_1.cu
│   │   │   ├── fattn-vec-instance-q4_1-q5_0.cu
│   │   │   ├── fattn-vec-instance-q4_1-q5_1.cu
│   │   │   ├── fattn-vec-instance-q4_1-q8_0.cu
│   │   │   ├── fattn-vec-instance-q5_0-f16.cu
│   │   │   ├── fattn-vec-instance-q5_0-q4_0.cu
│   │   │   ├── fattn-vec-instance-q5_0-q4_1.cu
│   │   │   ├── fattn-vec-instance-q5_0-q5_0.cu
│   │   │   ├── fattn-vec-instance-q5_0-q5_1.cu
│   │   │   ├── fattn-vec-instance-q5_0-q8_0.cu
│   │   │   ├── fattn-vec-instance-q5_1-f16.cu
│   │   │   ├── fattn-vec-instance-q5_1-q4_0.cu
│   │   │   ├── fattn-vec-instance-q5_1-q4_1.cu
│   │   │   ├── fattn-vec-instance-q5_1-q5_0.cu
│   │   │   ├── fattn-vec-instance-q5_1-q5_1.cu
│   │   │   ├── fattn-vec-instance-q5_1-q8_0.cu
│   │   │   ├── fattn-vec-instance-q8_0-f16.cu
│   │   │   ├── fattn-vec-instance-q8_0-q4_0.cu
│   │   │   ├── fattn-vec-instance-q8_0-q4_1.cu
│   │   │   ├── fattn-vec-instance-q8_0-q5_0.cu
│   │   │   ├── fattn-vec-instance-q8_0-q5_1.cu
│   │   │   ├── fattn-vec-instance-q8_0-q8_0.cu
│   │   │   ├── generate_cu_files.py
│   │   │   ├── mmf-instance-ncols_1.cu
│   │   │   ├── mmf-instance-ncols_10.cu
│   │   │   ├── mmf-instance-ncols_11.cu
│   │   │   ├── mmf-instance-ncols_12.cu
│   │   │   ├── mmf-instance-ncols_13.cu
│   │   │   ├── mmf-instance-ncols_14.cu
│   │   │   ├── mmf-instance-ncols_15.cu
│   │   │   ├── mmf-instance-ncols_16.cu
│   │   │   ├── mmf-instance-ncols_2.cu
│   │   │   ├── mmf-instance-ncols_3.cu
│   │   │   ├── mmf-instance-ncols_4.cu
│   │   │   ├── mmf-instance-ncols_5.cu
│   │   │   ├── mmf-instance-ncols_6.cu
│   │   │   ├── mmf-instance-ncols_7.cu
│   │   │   ├── mmf-instance-ncols_8.cu
│   │   │   ├── mmf-instance-ncols_9.cu
│   │   │   ├── mmq-instance-iq1_s.cu
│   │   │   ├── mmq-instance-iq2_s.cu
│   │   │   ├── mmq-instance-iq2_xs.cu
│   │   │   ├── mmq-instance-iq2_xxs.cu
│   │   │   ├── mmq-instance-iq3_s.cu
│   │   │   ├── mmq-instance-iq3_xxs.cu
│   │   │   ├── mmq-instance-iq4_nl.cu
│   │   │   ├── mmq-instance-iq4_xs.cu
│   │   │   ├── mmq-instance-mxfp4.cu
│   │   │   ├── mmq-instance-q2_k.cu
│   │   │   ├── mmq-instance-q3_k.cu
│   │   │   ├── mmq-instance-q4_0.cu
│   │   │   ├── mmq-instance-q4_1.cu
│   │   │   ├── mmq-instance-q4_k.cu
│   │   │   ├── mmq-instance-q5_0.cu
│   │   │   ├── mmq-instance-q5_1.cu
│   │   │   ├── mmq-instance-q5_k.cu
│   │   │   ├── mmq-instance-q6_k.cu
│   │   │   └── mmq-instance-q8_0.cu
│   │   ├── top-k.cu
│   │   ├── top-k.cuh
│   │   ├── topk-moe.cu
│   │   ├── topk-moe.cuh
│   │   ├── tri.cu
│   │   ├── tri.cuh
│   │   ├── tsembd.cu
│   │   ├── tsembd.cuh
│   │   ├── unary.cu
│   │   ├── unary.cuh
│   │   ├── upscale.cu
│   │   ├── upscale.cuh
│   │   ├── vecdotq.cuh
│   │   ├── vendors/
│   │   │   ├── cuda.h
│   │   │   ├── hip.h
│   │   │   └── musa.h
│   │   ├── wkv.cu
│   │   └── wkv.cuh
│   ├── ggml-hexagon/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-hexagon.cpp
│   │   ├── htp/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── act-ops.c
│   │   │   ├── argsort-ops.c
│   │   │   ├── binary-ops.c
│   │   │   ├── cmake-toolchain.cmake
│   │   │   ├── cpy-ops.c
│   │   │   ├── flash-attn-ops.c
│   │   │   ├── get-rows-ops.c
│   │   │   ├── hex-dma.c
│   │   │   ├── hex-dma.h
│   │   │   ├── hex-dump.h
│   │   │   ├── hex-fastdiv.h
│   │   │   ├── hex-utils.h
│   │   │   ├── htp-ctx.h
│   │   │   ├── htp-msg.h
│   │   │   ├── htp-ops.h
│   │   │   ├── htp_iface.idl
│   │   │   ├── hvx-arith.h
│   │   │   ├── hvx-base.h
│   │   │   ├── hvx-copy.h
│   │   │   ├── hvx-div.h
│   │   │   ├── hvx-dump.h
│   │   │   ├── hvx-exp.h
│   │   │   ├── hvx-floor.h
│   │   │   ├── hvx-inverse.h
│   │   │   ├── hvx-reduce.h
│   │   │   ├── hvx-scale.h
│   │   │   ├── hvx-sigmoid.h
│   │   │   ├── hvx-sqrt.h
│   │   │   ├── hvx-types.h
│   │   │   ├── hvx-utils.h
│   │   │   ├── main.c
│   │   │   ├── matmul-ops.c
│   │   │   ├── rope-ops.c
│   │   │   ├── set-rows-ops.c
│   │   │   ├── softmax-ops.c
│   │   │   ├── ssm-conv.c
│   │   │   ├── sum-rows-ops.c
│   │   │   ├── unary-ops.c
│   │   │   ├── worker-pool.c
│   │   │   └── worker-pool.h
│   │   ├── htp-drv.cpp
│   │   ├── htp-drv.h
│   │   ├── libdl.h
│   │   ├── libggml-htp.inf
│   │   └── op-desc.h
│   ├── ggml-hip/
│   │   └── CMakeLists.txt
│   ├── ggml-impl.h
│   ├── ggml-metal/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-metal-common.cpp
│   │   ├── ggml-metal-common.h
│   │   ├── ggml-metal-context.h
│   │   ├── ggml-metal-context.m
│   │   ├── ggml-metal-device.cpp
│   │   ├── ggml-metal-device.h
│   │   ├── ggml-metal-device.m
│   │   ├── ggml-metal-impl.h
│   │   ├── ggml-metal-ops.cpp
│   │   ├── ggml-metal-ops.h
│   │   ├── ggml-metal.cpp
│   │   └── ggml-metal.metal
│   ├── ggml-musa/
│   │   ├── CMakeLists.txt
│   │   ├── mudnn.cu
│   │   └── mudnn.cuh
│   ├── ggml-opencl/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-opencl.cpp
│   │   └── kernels/
│   │       ├── add.cl
│   │       ├── add_id.cl
│   │       ├── argsort.cl
│   │       ├── clamp.cl
│   │       ├── concat.cl
│   │       ├── conv2d.cl
│   │       ├── conv2d_f16_f32.cl
│   │       ├── cpy.cl
│   │       ├── cumsum.cl
│   │       ├── cvt.cl
│   │       ├── diag.cl
│   │       ├── diag_mask_inf.cl
│   │       ├── div.cl
│   │       ├── embed_kernel.py
│   │       ├── exp.cl
│   │       ├── expm1.cl
│   │       ├── fill.cl
│   │       ├── flash_attn_f16.cl
│   │       ├── flash_attn_f32.cl
│   │       ├── flash_attn_f32_f16.cl
│   │       ├── gelu.cl
│   │       ├── gemm_moe_mxfp4_f32.cl
│   │       ├── gemm_noshuffle_q4_1_f32.cl
│   │       ├── gemv_moe_mxfp4_f32.cl
│   │       ├── gemv_noshuffle.cl
│   │       ├── gemv_noshuffle_general.cl
│   │       ├── gemv_noshuffle_general_q8_0_f32.cl
│   │       ├── gemv_noshuffle_q4_1_f32.cl
│   │       ├── get_rows.cl
│   │       ├── glu.cl
│   │       ├── group_norm.cl
│   │       ├── im2col_f16.cl
│   │       ├── im2col_f32.cl
│   │       ├── l2_norm.cl
│   │       ├── mean.cl
│   │       ├── mul.cl
│   │       ├── mul_mat_Ab_Bi_8x4.cl
│   │       ├── mul_mat_f16_f32.cl
│   │       ├── mul_mm_f16_f32_kq_kqv.cl
│   │       ├── mul_mm_f16_f32_l4_lm.cl
│   │       ├── mul_mm_f32_f32_l4_lm.cl
│   │       ├── mul_mm_q4_0_f32_l4_lm.cl
│   │       ├── mul_mm_q4_1_f32_l4_lm.cl
│   │       ├── mul_mm_q6_k_f32_l4_lm.cl
│   │       ├── mul_mm_q8_0_f32_8x4.cl
│   │       ├── mul_mm_q8_0_f32_l4_lm.cl
│   │       ├── mul_mv_f16_f16.cl
│   │       ├── mul_mv_f16_f32.cl
│   │       ├── mul_mv_f16_f32_1row.cl
│   │       ├── mul_mv_f16_f32_l4.cl
│   │       ├── mul_mv_f32_f32.cl
│   │       ├── mul_mv_id_mxfp4_f32.cl
│   │       ├── mul_mv_id_mxfp4_f32_flat.cl
│   │       ├── mul_mv_id_q4_0_f32_8x_flat.cl
│   │       ├── mul_mv_id_q8_0_f32.cl
│   │       ├── mul_mv_id_q8_0_f32_flat.cl
│   │       ├── mul_mv_mxfp4_f32.cl
│   │       ├── mul_mv_mxfp4_f32_flat.cl
│   │       ├── mul_mv_q4_0_f32.cl
│   │       ├── mul_mv_q4_0_f32_1d_16x_flat.cl
│   │       ├── mul_mv_q4_0_f32_1d_8x_flat.cl
│   │       ├── mul_mv_q4_0_f32_8x_flat.cl
│   │       ├── mul_mv_q4_0_f32_v.cl
│   │       ├── mul_mv_q4_1_f32.cl
│   │       ├── mul_mv_q4_1_f32_flat.cl
│   │       ├── mul_mv_q4_k_f32.cl
│   │       ├── mul_mv_q6_k_f32.cl
│   │       ├── mul_mv_q6_k_f32_flat.cl
│   │       ├── mul_mv_q8_0_f32.cl
│   │       ├── mul_mv_q8_0_f32_flat.cl
│   │       ├── neg.cl
│   │       ├── norm.cl
│   │       ├── pad.cl
│   │       ├── relu.cl
│   │       ├── repeat.cl
│   │       ├── rms_norm.cl
│   │       ├── rope.cl
│   │       ├── scale.cl
│   │       ├── set_rows.cl
│   │       ├── sigmoid.cl
│   │       ├── silu.cl
│   │       ├── softmax_4_f16.cl
│   │       ├── softmax_4_f32.cl
│   │       ├── softmax_f16.cl
│   │       ├── softmax_f32.cl
│   │       ├── softplus.cl
│   │       ├── solve_tri.cl
│   │       ├── sqr.cl
│   │       ├── sqrt.cl
│   │       ├── ssm_conv.cl
│   │       ├── sub.cl
│   │       ├── sum_rows.cl
│   │       ├── tanh.cl
│   │       ├── transpose.cl
│   │       ├── tri.cl
│   │       ├── tsembd.cl
│   │       └── upscale.cl
│   ├── ggml-openvino/
│   │   ├── .clang-format
│   │   ├── CMakeLists.txt
│   │   ├── ggml-decoder.cpp
│   │   ├── ggml-decoder.h
│   │   ├── ggml-openvino-extra.cpp
│   │   ├── ggml-openvino-extra.h
│   │   ├── ggml-openvino.cpp
│   │   ├── ggml-quants.cpp
│   │   ├── ggml-quants.h
│   │   ├── openvino/
│   │   │   ├── decoder.h
│   │   │   ├── frontend.cpp
│   │   │   ├── frontend.h
│   │   │   ├── input_model.cpp
│   │   │   ├── input_model.h
│   │   │   ├── node_context.h
│   │   │   ├── op/
│   │   │   │   ├── cont.cpp
│   │   │   │   ├── cpy.cpp
│   │   │   │   ├── flash_attn_ext.cpp
│   │   │   │   ├── get_rows.cpp
│   │   │   │   ├── glu_geglu.cpp
│   │   │   │   ├── glu_swiglu.cpp
│   │   │   │   ├── mulmat.cpp
│   │   │   │   ├── permute.cpp
│   │   │   │   ├── reshape.cpp
│   │   │   │   ├── rms_norm.cpp
│   │   │   │   ├── rope.cpp
│   │   │   │   ├── scale.cpp
│   │   │   │   ├── set_rows.cpp
│   │   │   │   ├── softmax.cpp
│   │   │   │   ├── transpose.cpp
│   │   │   │   ├── unary_silu.cpp
│   │   │   │   └── view.cpp
│   │   │   ├── op_table.cpp
│   │   │   ├── op_table.h
│   │   │   ├── pass/
│   │   │   │   ├── eliminate_zp.cpp
│   │   │   │   ├── eliminate_zp.h
│   │   │   │   ├── fuse_to_sdpa.cpp
│   │   │   │   ├── fuse_to_sdpa.h
│   │   │   │   ├── mark_decompression_convert_constant_folding.h
│   │   │   │   ├── squeeze_matmul.cpp
│   │   │   │   └── squeeze_matmul.h
│   │   │   ├── translate_session.cpp
│   │   │   ├── translate_session.h
│   │   │   ├── utils.cpp
│   │   │   └── utils.h
│   │   ├── utils.cpp
│   │   └── utils.h
│   ├── ggml-opt.cpp
│   ├── ggml-quants.c
│   ├── ggml-quants.h
│   ├── ggml-rpc/
│   │   ├── CMakeLists.txt
│   │   └── ggml-rpc.cpp
│   ├── ggml-sycl/
│   │   ├── CMakeLists.txt
│   │   ├── add-id.cpp
│   │   ├── add-id.hpp
│   │   ├── backend.hpp
│   │   ├── binbcast.cpp
│   │   ├── binbcast.hpp
│   │   ├── common.cpp
│   │   ├── common.hpp
│   │   ├── concat.cpp
│   │   ├── concat.hpp
│   │   ├── conv.cpp
│   │   ├── conv.hpp
│   │   ├── convert.cpp
│   │   ├── convert.hpp
│   │   ├── count-equal.cpp
│   │   ├── count-equal.hpp
│   │   ├── cpy.cpp
│   │   ├── cpy.hpp
│   │   ├── dequantize.hpp
│   │   ├── dmmv.cpp
│   │   ├── dmmv.hpp
│   │   ├── dpct/
│   │   │   └── helper.hpp
│   │   ├── element_wise.cpp
│   │   ├── element_wise.hpp
│   │   ├── fattn-common.hpp
│   │   ├── fattn-tile.cpp
│   │   ├── fattn-tile.hpp
│   │   ├── fattn-vec.hpp
│   │   ├── fattn.cpp
│   │   ├── fattn.hpp
│   │   ├── gated_delta_net.cpp
│   │   ├── gated_delta_net.hpp
│   │   ├── gemm.hpp
│   │   ├── getrows.cpp
│   │   ├── getrows.hpp
│   │   ├── ggml-sycl.cpp
│   │   ├── gla.cpp
│   │   ├── gla.hpp
│   │   ├── im2col.cpp
│   │   ├── im2col.hpp
│   │   ├── mmq.cpp
│   │   ├── mmq.hpp
│   │   ├── mmvq.cpp
│   │   ├── mmvq.hpp
│   │   ├── norm.cpp
│   │   ├── norm.hpp
│   │   ├── outprod.cpp
│   │   ├── outprod.hpp
│   │   ├── pad.cpp
│   │   ├── pad.hpp
│   │   ├── pad_reflect_1d.cpp
│   │   ├── pad_reflect_1d.hpp
│   │   ├── presets.hpp
│   │   ├── quantize.hpp
│   │   ├── quants.hpp
│   │   ├── repeat_back.cpp
│   │   ├── repeat_back.hpp
│   │   ├── roll.cpp
│   │   ├── roll.hpp
│   │   ├── rope.cpp
│   │   ├── rope.hpp
│   │   ├── set.cpp
│   │   ├── set.hpp
│   │   ├── set_rows.cpp
│   │   ├── set_rows.hpp
│   │   ├── softmax.cpp
│   │   ├── softmax.hpp
│   │   ├── ssm_conv.cpp
│   │   ├── ssm_conv.hpp
│   │   ├── sycl_hw.cpp
│   │   ├── sycl_hw.hpp
│   │   ├── template-instances/
│   │   │   ├── fattn-tile-instance-dkq112-dv112.cpp
│   │   │   ├── fattn-tile-instance-dkq128-dv128.cpp
│   │   │   ├── fattn-tile-instance-dkq256-dv256.cpp
│   │   │   ├── fattn-tile-instance-dkq40-dv40.cpp
│   │   │   ├── fattn-tile-instance-dkq576-dv512.cpp
│   │   │   ├── fattn-tile-instance-dkq64-dv64.cpp
│   │   │   ├── fattn-tile-instance-dkq72-dv72.cpp
│   │   │   ├── fattn-tile-instance-dkq80-dv80.cpp
│   │   │   ├── fattn-tile-instance-dkq96-dv96.cpp
│   │   │   ├── fattn-vec-instance-f16-f16.cpp
│   │   │   ├── fattn-vec-instance-f16-q4_0.cpp
│   │   │   ├── fattn-vec-instance-f16-q4_1.cpp
│   │   │   ├── fattn-vec-instance-f16-q5_0.cpp
│   │   │   ├── fattn-vec-instance-f16-q5_1.cpp
│   │   │   ├── fattn-vec-instance-f16-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q4_0-f16.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q4_1-f16.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q5_0-f16.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q5_1-f16.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q8_0-f16.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q5_1.cpp
│   │   │   └── fattn-vec-instance-q8_0-q8_0.cpp
│   │   ├── tsembd.cpp
│   │   ├── tsembd.hpp
│   │   ├── vecdotq.hpp
│   │   ├── wkv.cpp
│   │   └── wkv.hpp
│   ├── ggml-threading.cpp
│   ├── ggml-threading.h
│   ├── ggml-virtgpu/
│   │   ├── CMakeLists.txt
│   │   ├── apir_cs_ggml-rpc-front.cpp
│   │   ├── backend/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── apir_cs_ggml-rpc-back.cpp
│   │   │   ├── backend-convert.h
│   │   │   ├── backend-dispatched-backend.cpp
│   │   │   ├── backend-dispatched-buffer-type.cpp
│   │   │   ├── backend-dispatched-buffer.cpp
│   │   │   ├── backend-dispatched-device.cpp
│   │   │   ├── backend-dispatched.cpp
│   │   │   ├── backend-dispatched.gen.h
│   │   │   ├── backend-dispatched.h
│   │   │   ├── backend-virgl-apir.h
│   │   │   ├── backend.cpp
│   │   │   └── shared/
│   │   │       ├── api_remoting.h
│   │   │       ├── apir_backend.gen.h
│   │   │       ├── apir_backend.h
│   │   │       ├── apir_cs.h
│   │   │       ├── apir_cs_ggml.h
│   │   │       └── apir_cs_rpc.h
│   │   ├── ggml-backend-buffer-type.cpp
│   │   ├── ggml-backend-buffer.cpp
│   │   ├── ggml-backend-device.cpp
│   │   ├── ggml-backend-reg.cpp
│   │   ├── ggml-backend.cpp
│   │   ├── ggml-remoting.h
│   │   ├── ggmlremoting_functions.yaml
│   │   ├── include/
│   │   │   └── apir_hw.h
│   │   ├── regenerate_remoting.py
│   │   ├── virtgpu-apir.h
│   │   ├── virtgpu-forward-backend.cpp
│   │   ├── virtgpu-forward-buffer-type.cpp
│   │   ├── virtgpu-forward-buffer.cpp
│   │   ├── virtgpu-forward-device.cpp
│   │   ├── virtgpu-forward-impl.h
│   │   ├── virtgpu-forward.gen.h
│   │   ├── virtgpu-shm.cpp
│   │   ├── virtgpu-shm.h
│   │   ├── virtgpu-utils.cpp
│   │   ├── virtgpu-utils.h
│   │   ├── virtgpu.cpp
│   │   └── virtgpu.h
│   ├── ggml-vulkan/
│   │   ├── CMakeLists.txt
│   │   ├── cmake/
│   │   │   └── host-toolchain.cmake.in
│   │   ├── ggml-vulkan.cpp
│   │   └── vulkan-shaders/
│   │       ├── CMakeLists.txt
│   │       ├── abs.comp
│   │       ├── acc.comp
│   │       ├── add.comp
│   │       ├── add1.comp
│   │       ├── add_id.comp
│   │       ├── arange.comp
│   │       ├── argmax.comp
│   │       ├── argsort.comp
│   │       ├── argsort_large.comp
│   │       ├── ceil.comp
│   │       ├── clamp.comp
│   │       ├── concat.comp
│   │       ├── contig_copy.comp
│   │       ├── conv2d_dw.comp
│   │       ├── conv2d_mm.comp
│   │       ├── conv_transpose_1d.comp
│   │       ├── copy.comp
│   │       ├── copy_from_quant.comp
│   │       ├── copy_to_quant.comp
│   │       ├── copy_transpose.comp
│   │       ├── cos.comp
│   │       ├── count_equal.comp
│   │       ├── count_experts.comp
│   │       ├── cumsum.comp
│   │       ├── cumsum_multipass1.comp
│   │       ├── cumsum_multipass2.comp
│   │       ├── dequant_f32.comp
│   │       ├── dequant_funcs.glsl
│   │       ├── dequant_funcs_cm2.glsl
│   │       ├── dequant_head.glsl
│   │       ├── dequant_iq1_m.comp
│   │       ├── dequant_iq1_s.comp
│   │       ├── dequant_iq2_s.comp
│   │       ├── dequant_iq2_xs.comp
│   │       ├── dequant_iq2_xxs.comp
│   │       ├── dequant_iq3_s.comp
│   │       ├── dequant_iq3_xxs.comp
│   │       ├── dequant_iq4_nl.comp
│   │       ├── dequant_iq4_xs.comp
│   │       ├── dequant_mxfp4.comp
│   │       ├── dequant_q2_k.comp
│   │       ├── dequant_q3_k.comp
│   │       ├── dequant_q4_0.comp
│   │       ├── dequant_q4_1.comp
│   │       ├── dequant_q4_k.comp
│   │       ├── dequant_q5_0.comp
│   │       ├── dequant_q5_1.comp
│   │       ├── dequant_q5_k.comp
│   │       ├── dequant_q6_k.comp
│   │       ├── dequant_q8_0.comp
│   │       ├── diag.comp
│   │       ├── diag_mask_inf.comp
│   │       ├── div.comp
│   │       ├── elu.comp
│   │       ├── exp.comp
│   │       ├── feature-tests/
│   │       │   ├── bfloat16.comp
│   │       │   ├── coopmat.comp
│   │       │   ├── coopmat2.comp
│   │       │   └── integer_dot.comp
│   │       ├── fill.comp
│   │       ├── flash_attn.comp
│   │       ├── flash_attn_base.glsl
│   │       ├── flash_attn_cm1.comp
│   │       ├── flash_attn_cm2.comp
│   │       ├── flash_attn_mask_opt.comp
│   │       ├── flash_attn_split_k_reduce.comp
│   │       ├── floor.comp
│   │       ├── gated_delta_net.comp
│   │       ├── geglu.comp
│   │       ├── geglu_erf.comp
│   │       ├── geglu_quick.comp
│   │       ├── gelu.comp
│   │       ├── gelu_erf.comp
│   │       ├── gelu_quick.comp
│   │       ├── generic_binary_head.glsl
│   │       ├── generic_head.glsl
│   │       ├── generic_unary_head.glsl
│   │       ├── get_rows.comp
│   │       ├── get_rows_quant.comp
│   │       ├── glu_head.glsl
│   │       ├── glu_main.glsl
│   │       ├── group_norm.comp
│   │       ├── hardsigmoid.comp
│   │       ├── hardswish.comp
│   │       ├── im2col.comp
│   │       ├── im2col_3d.comp
│   │       ├── l2_norm.comp
│   │       ├── leaky_relu.comp
│   │       ├── log.comp
│   │       ├── mul.comp
│   │       ├── mul_mat_split_k_reduce.comp
│   │       ├── mul_mat_vec.comp
│   │       ├── mul_mat_vec_base.glsl
│   │       ├── mul_mat_vec_iface.glsl
│   │       ├── mul_mat_vec_iq1_m.comp
│   │       ├── mul_mat_vec_iq1_s.comp
│   │       ├── mul_mat_vec_iq2_s.comp
│   │       ├── mul_mat_vec_iq2_xs.comp
│   │       ├── mul_mat_vec_iq2_xxs.comp
│   │       ├── mul_mat_vec_iq3_s.comp
│   │       ├── mul_mat_vec_iq3_xxs.comp
│   │       ├── mul_mat_vec_nc.comp
│   │       ├── mul_mat_vec_p021.comp
│   │       ├── mul_mat_vec_q2_k.comp
│   │       ├── mul_mat_vec_q3_k.comp
│   │       ├── mul_mat_vec_q4_k.comp
│   │       ├── mul_mat_vec_q5_k.comp
│   │       ├── mul_mat_vec_q6_k.comp
│   │       ├── mul_mat_vecq.comp
│   │       ├── mul_mat_vecq_funcs.glsl
│   │       ├── mul_mm.comp
│   │       ├── mul_mm_cm2.comp
│   │       ├── mul_mm_funcs.glsl
│   │       ├── mul_mm_id_funcs.glsl
│   │       ├── mul_mmq.comp
│   │       ├── mul_mmq_funcs.glsl
│   │       ├── mul_mmq_shmem_types.glsl
│   │       ├── multi_add.comp
│   │       ├── neg.comp
│   │       ├── norm.comp
│   │       ├── opt_step_adamw.comp
│   │       ├── opt_step_sgd.comp
│   │       ├── pad.comp
│   │       ├── pool2d.comp
│   │       ├── quantize_q8_1.comp
│   │       ├── reglu.comp
│   │       ├── relu.comp
│   │       ├── repeat.comp
│   │       ├── repeat_back.comp
│   │       ├── rms_norm.comp
│   │       ├── rms_norm_back.comp
│   │       ├── rms_norm_partials.comp
│   │       ├── roll.comp
│   │       ├── rope_funcs.glsl
│   │       ├── rope_head.glsl
│   │       ├── rope_multi.comp
│   │       ├── rope_neox.comp
│   │       ├── rope_norm.comp
│   │       ├── rope_params.glsl
│   │       ├── rope_vision.comp
│   │       ├── round.comp
│   │       ├── rte.glsl
│   │       ├── scale.comp
│   │       ├── sgn.comp
│   │       ├── sigmoid.comp
│   │       ├── silu.comp
│   │       ├── silu_back.comp
│   │       ├── sin.comp
│   │       ├── soft_max.comp
│   │       ├── soft_max_back.comp
│   │       ├── soft_max_large1.comp
│   │       ├── soft_max_large2.comp
│   │       ├── soft_max_large3.comp
│   │       ├── soft_max_large_common.glsl
│   │       ├── softplus.comp
│   │       ├── solve_tri.comp
│   │       ├── sqrt.comp
│   │       ├── square.comp
│   │       ├── ssm_conv.comp
│   │       ├── ssm_scan.comp
│   │       ├── step.comp
│   │       ├── sub.comp
│   │       ├── sum_rows.comp
│   │       ├── sum_rows.glsl
│   │       ├── swiglu.comp
│   │       ├── swiglu_oai.comp
│   │       ├── tanh.comp
│   │       ├── timestep_embedding.comp
│   │       ├── topk_argsort.comp
│   │       ├── topk_moe.comp
│   │       ├── topk_nary_search.comp
│   │       ├── tri.comp
│   │       ├── trunc.comp
│   │       ├── types.glsl
│   │       ├── upscale.comp
│   │       ├── utils.glsl
│   │       ├── vulkan-shaders-gen.cpp
│   │       ├── wkv6.comp
│   │       ├── wkv7.comp
│   │       └── xielu.comp
│   ├── ggml-webgpu/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-webgpu-shader-lib.hpp
│   │   ├── ggml-webgpu.cpp
│   │   ├── pre_wgsl.hpp
│   │   └── wgsl-shaders/
│   │       ├── argmax.wgsl
│   │       ├── argsort.wgsl
│   │       ├── argsort_merge.wgsl
│   │       ├── binary.wgsl
│   │       ├── common_decls.tmpl
│   │       ├── concat.wgsl
│   │       ├── cpy.tmpl.wgsl
│   │       ├── cumsum.wgsl
│   │       ├── embed_wgsl.py
│   │       ├── flash_attn.wgsl
│   │       ├── get_rows.wgsl
│   │       ├── glu.tmpl.wgsl
│   │       ├── memset.wgsl
│   │       ├── mul_mat.wgsl
│   │       ├── mul_mat_decls.tmpl
│   │       ├── mul_mat_reg_tile.wgsl
│   │       ├── mul_mat_subgroup_matrix.wgsl
│   │       ├── mul_mat_vec.wgsl
│   │       ├── pad.wgsl
│   │       ├── repeat.wgsl
│   │       ├── rms_norm.wgsl
│   │       ├── rope.tmpl.wgsl
│   │       ├── scale.wgsl
│   │       ├── set_rows.wgsl
│   │       ├── soft_max.tmpl.wgsl
│   │       ├── sum_rows.wgsl
│   │       └── unary.wgsl
│   ├── ggml-zdnn/
│   │   ├── .gitignore
│   │   ├── CMakeLists.txt
│   │   ├── common.hpp
│   │   ├── ggml-zdnn.cpp
│   │   ├── mmf.cpp
│   │   ├── mmf.hpp
│   │   ├── utils.cpp
│   │   └── utils.hpp
│   ├── ggml-zendnn/
│   │   ├── CMakeLists.txt
│   │   └── ggml-zendnn.cpp
│   ├── ggml.c
│   ├── ggml.cpp
│   └── gguf.cpp
└── tests/
    ├── CMakeLists.txt
    ├── test-arange.cpp
    ├── test-backend-ops.cpp
    ├── test-cont.c
    ├── test-conv-transpose-1d.cpp
    ├── test-conv-transpose.c
    ├── test-conv1d-dw-c1.cpp
    ├── test-conv1d-dw-c2.cpp
    ├── test-conv1d.cpp
    ├── test-conv2d-dw.cpp
    ├── test-conv2d.cpp
    ├── test-customop.c
    ├── test-dup.c
    ├── test-interpolate.cpp
    ├── test-opt.cpp
    ├── test-pad-reflect-1d.cpp
    ├── test-pool.c
    ├── test-quantize-fns.cpp
    ├── test-quantize-perf.cpp
    ├── test-rel-pos.c
    ├── test-roll.cpp
    └── test-timestep_embedding.cpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .editorconfig
================================================
# https://EditorConfig.org

# Top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file, utf-8 charset
[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true
charset = utf-8
indent_style = space
indent_size = 4

[*.md]
indent_size = 2

[Makefile]
indent_style = tab

# Prompt files are fed to models verbatim, so do not force a trailing newline.
# A section name that contains "/" is matched relative to the directory of this
# file, so the prompts shipped under examples/ need their own section.
[prompts/*.txt]
insert_final_newline = unset

[examples/prompts/*.txt]
insert_final_newline = unset


================================================
FILE: .github/pull_request_template.md
================================================
*For changes to the core `ggml` library (including to the CMake build system), please open a PR in https://github.com/ggml-org/llama.cpp. Doing so will make your PR more visible, better tested and more likely to be reviewed.*


================================================
FILE: .github/workflows/ci.yml
================================================
# CI workflow: runs for pushes to master and for pull requests that target master.
name: CI

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

# Runs that share a group cancel each other (cancel-in-progress).
# github.head_ref is only populated for pull_request events, so PR runs are grouped
# by their ref, while every other run gets a unique group through its run_id.
concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Configure, build, test and install on three hosted OSes, each as shared and as static libraries
  # (3 x 2 matrix), then check that the installed package can be consumed from another CMake project.
  build:
    strategy:
      matrix:
        os: [ubuntu-latest, macos-latest, windows-latest]
        libraries: [shared, static]

    runs-on: ${{ matrix.os }}

    steps:
    - name: Clone
      uses: actions/checkout@v6

    - name: Dependencies for Ubuntu
      if: matrix.os == 'ubuntu-latest'
      run: |
        sudo apt-get update
        sudo apt-get install llvm

    - name: Add msbuild to PATH
      if: matrix.os == 'windows-latest'
      uses: microsoft/setup-msbuild@v2

    - name: Create Build Environment
      run: mkdir build

    # Windows: default generator with "-A x64"; other OSes: Ninja with clang/clang++.
    # The install prefix is <workspace>/installed and the Metal backend is switched off on every OS.
    - name: Configure CMake
      working-directory: ./build
      run: cmake ..
        ${{ contains(matrix.os, 'windows') && '-A x64' || '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++' }}
        ${{ matrix.libraries == 'static' && '-DBUILD_SHARED_LIBS=OFF' || '-DBUILD_SHARED_LIBS=ON' }}
        -DCMAKE_INSTALL_PREFIX=${{ github.workspace }}/installed
        -DGGML_METAL=OFF

    # "--config Release" / "--build-config Release" are only passed on Windows in the steps below.
    - name: Build
      working-directory: ./build
      run: cmake --build . ${{ contains(matrix.os, 'windows') && '--config Release' || '' }}

    - name: Test
      working-directory: ./build
      run: ctest --verbose --timeout 900 ${{ contains(matrix.os, 'windows') && '--build-config Release' || '' }}

    - name: Install
      working-directory: ./build
      run: cmake --build . --target install ${{ contains(matrix.os, 'windows') && '--config Release' || '' }}

    # Configures and builds examples/test-cmake with CMAKE_PREFIX_PATH pointing at the
    # install prefix populated by the Install step.
    - name: Test CMake config
      run: |
        mkdir test-cmake
        cmake -S examples/test-cmake -B test-cmake -DCMAKE_PREFIX_PATH=${{ github.workspace }}/installed ${{ contains(matrix.os, 'windows') && '-A x64' || '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++' }}
        cmake --build test-cmake ${{ contains(matrix.os, 'windows') && '--config Release' || '' }}

# TODO: simplify the following jobs using a matrix
  # x64 CPU job on a GitHub-hosted runner: runs ci/run.sh with GG_BUILD_LOW_PERF=1.
  # The meaning of the GG_BUILD_* flags and of the two path arguments is defined in ci/run.sh.
  ggml-ci-x64-cpu-low-perf:
    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      # compiler cache with a per-job key
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-x64-cpu-low-perf
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  # arm64 CPU job on a GitHub-hosted runner: runs ci/run.sh with GG_BUILD_LOW_PERF=1.
  # The meaning of the GG_BUILD_* flags and of the two path arguments is defined in ci/run.sh.
  ggml-ci-arm64-cpu-low-perf:
    runs-on: ubuntu-22.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      # compiler cache with a per-job key
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-arm64-cpu-low-perf
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_LOW_PERF=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  # x64 CPU job on a GitHub-hosted runner: runs ci/run.sh without any GG_BUILD_* flag set.
  ggml-ci-x64-cpu-high-perf:
    runs-on: ubuntu-22.04

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      # compiler cache with a per-job key
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-x64-cpu-high-perf
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
        run: |
          LLAMA_ARG_THREADS=$(nproc) bash ./ci/run.sh ./tmp/results ./tmp/mnt

  # arm64 CPU job on a GitHub-hosted runner: runs ci/run.sh with GG_BUILD_NO_SVE=1,
  # GG_BUILD_NO_BF16=1 and GG_BUILD_EXTRA_TESTS_0=1 (flag semantics are defined in ci/run.sh).
  ggml-ci-arm64-cpu-high-perf:
    runs-on: ubuntu-22.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      # compiler cache with a per-job key
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-arm64-cpu-high-perf
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_SVE=1 GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  # Same as ggml-ci-arm64-cpu-high-perf except that GG_BUILD_NO_SVE is not set
  # (flag semantics are defined in ci/run.sh).
  ggml-ci-arm64-cpu-high-perf-sve:
    runs-on: ubuntu-22.04-arm

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      # compiler cache with a per-job key
      - name: ccache
        uses: ggml-org/ccache-action@v1.2.16
        with:
          key: ggml-ci-arm64-cpu-high-perf-sve
          evict-old-files: 1d

      - name: Dependencies
        id: depends
        run: |
          sudo apt-get update
          sudo apt-get install build-essential libcurl4-openssl-dev

      - name: Test
        id: ggml-ci
        run: |
          LLAMA_ARG_THREADS=$(nproc) GG_BUILD_NO_BF16=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt

  # CUDA job on a self-hosted runner labelled NVIDIA: calls nvidia-smi, then runs ci/run.sh
  # with GG_BUILD_CUDA=1. No ccache or dependency steps; the runner is expected to be provisioned.
  ggml-ci-x64-nvidia-cuda:
    runs-on: [self-hosted, Linux, X64, NVIDIA]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci
        run: |
          nvidia-smi
          GG_BUILD_CUDA=1 bash ./ci/run.sh ~/results/ggml /mnt/ggml

  # Vulkan job on a self-hosted NVIDIA runner with GGML_VK_DISABLE_COOPMAT2=1 set in the
  # environment; the companion job ggml-ci-x64-nvidia-vulkan-cm2 runs without that variable.
  ggml-ci-x64-nvidia-vulkan-cm:
    runs-on: [self-hosted, Linux, X64, NVIDIA]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 GGML_VK_DISABLE_COOPMAT2=1 bash ./ci/run.sh ~/results/ggml /mnt/ggml

  # Vulkan job that additionally requires a runner labelled COOPMAT2 and does not set
  # GGML_VK_DISABLE_COOPMAT2.
  ggml-ci-x64-nvidia-vulkan-cm2:
    runs-on: [self-hosted, Linux, X64, NVIDIA, COOPMAT2]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/ggml /mnt/ggml

  # TODO: provision AMX-compatible machine
  #ggml-ci-x64-cpu-amx:
  #  runs-on: [self-hosted, Linux, X64, CPU, AMX]

  #  steps:
  #    - name: Clone
  #      id: checkout
  #      uses: actions/checkout@v6

  #    - name: Test
  #      id: ggml-ci
  #      run: |
  #        bash ./ci/run.sh ~/results/ggml /mnt/ggml

  # Metal job on a self-hosted macOS ARM64 runner: runs ci/run.sh with GG_BUILD_METAL=1.
  # Note that the second path argument is under the home directory (~/mnt) on the mac runners.
  ggml-ci-mac-metal:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci
        run: |
          GG_BUILD_METAL=1 bash ./ci/run.sh ~/results/ggml ~/mnt/ggml

  # Vulkan job on a self-hosted macOS ARM64 runner: calls vulkaninfo, then runs ci/run.sh
  # with GG_BUILD_VULKAN=1.
  ggml-ci-mac-vulkan:
    runs-on: [self-hosted, macOS, ARM64]

    steps:
      - name: Clone
        id: checkout
        uses: actions/checkout@v6

      - name: Test
        id: ggml-ci
        run: |
          vulkaninfo --summary
          GG_BUILD_VULKAN=1 bash ./ci/run.sh ~/results/ggml ~/mnt/ggml


================================================
FILE: .github/workflows/release.yml
================================================
# Release workflow: creates a GitHub release whenever a tag starting with "v" is pushed.
name: Release

on:
  push:
    tags:
      - 'v*'

jobs:
  release:
    runs-on: ubuntu-latest
    # GITHUB_TOKEN is granted write access to repository contents for this job
    permissions:
      contents: write

    steps:
    - name: Checkout code
      uses: actions/checkout@v6

    - name: Create Release
      id: create_release
      uses: ggml-org/action-create-release@v1
      env:
        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      with:
          # NOTE(review): tag_name uses the short ref name, while release_name uses the full ref
          # (refs/tags/<tag> for a tag push) - confirm the action strips the prefix from the title.
          tag_name: ${{ github.ref_name }}
          release_name: ${{ github.ref }}
          draft: false
          prerelease: false


================================================
FILE: .gitignore
================================================
build/
build-*/
out/
tmp/
models/
models-mnt

compile_commands.json
CMakeSettings.json
.vs/
.vscode/
.idea/
.clangd

.venv/
ggml_env/
.exrc
.cache
.DS_Store
.stablelm
.gpt-2

src/arm_neon.h
tests/arm_neon.h

zig-out/
zig-cache/

*.o
*.d
*.dot

*.sw?

__pycache__/

# Model files
ggml-model-f16.bin
*.bat


================================================
FILE: .gitmodules
================================================


================================================
FILE: AUTHORS
================================================
# date: Tue Feb  4 13:03:51 EET 2025
# this file is auto-generated by scripts/gen-authors.sh

0cc4m <picard12@live.de>
65a <10104049+65a@users.noreply.github.com>
AT <manyoso@users.noreply.github.com>
Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Adam Tazi <52357206+ad1tazi@users.noreply.github.com>
Adrien Gallouët <adrien@gallouet.fr>
Adrien Gallouët <angt@huggingface.co>
Ahmad Tameem <113388789+Tameem-10xE@users.noreply.github.com>
AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com>
AidanBeltonS <aidan.belton@codeplay.com>
Akarshan Biswas <akarshan.biswas@gmail.com>
Akarshan Biswas <akarshanbiswas@fedoraproject.org>
Albert Jin <albert.jin@gmail.com>
Alberto Cabrera Pérez <alberto.cabrera@codeplay.com>
Alberto Cabrera Pérez <alberto.cabrera@intel.com>
Alex Azarov <alex@azarov.by>
Alex O'Connell <35843486+acon96@users.noreply.github.com>
Alex von Gluck IV <kallisti5@unixzen.com>
AmbientL <107641468+AmbientL@users.noreply.github.com>
AmirAli Mirian <37371367+amiralimi@users.noreply.github.com>
Ananta Bastola <anantarajbastola@gmail.com>
Andreas (Andi) Kunar <andreask@msn.com>
Andreas Kieslinger <47689530+aendk@users.noreply.github.com>
Andrei <abetlen@gmail.com>
Andrew Minh Nguyen <40281306+amqdn@users.noreply.github.com>
Andrii Ryzhkov <andriiryzhkov@users.noreply.github.com>
Arjun <ccldarjun@icloud.com>
Ashraful Islam <ashraful.meche@gmail.com>
Astariul <43774355+astariul@users.noreply.github.com>
AsukaMinato <asukaminato@nyan.eu.org>
Avi Lumelsky <avilume@gmail.com>
Bart Pelle <3662930+Velocity-@users.noreply.github.com>
Ben Ashbaugh <ben.ashbaugh@intel.com>
Bernhard M. Wiedemann <githubbmwprimary@lsmod.de>
Borislav Stanimirov <b.stanimirov@abv.bg>
Brad Ito <phlogisticfugu@users.noreply.github.com>
Brad Murray <59848399+bradmurray-dt@users.noreply.github.com>
Brian <mofosyne@gmail.com>
Bryan Lozano <b.lozano.havoc@gmail.com>
Carolinabanana <140120812+Carolinabanana@users.noreply.github.com>
CarterLi999 <664681047@qq.com>
Cebtenzzre <cebtenzzre@gmail.com>
Changyeon Kim <cyzero.kim@samsung.com>
Charles Xu <63788048+chaxu01@users.noreply.github.com>
Charles Xu <charles.xu@arm.com>
Chen Xi <xi2.chen@intel.com>
Chen Xi <xixichen08@foxmail.com>
Chenguang Li <87689256+noemotiovon@users.noreply.github.com>
Chris Elrod <elrodc@gmail.com>
Christian Kastner <ckk@kvr.at>
Clint Herron <hanclinto@gmail.com>
Conrad Kramer <conrad@conradkramer.com>
Cordeiro <1471463+ocordeiro@users.noreply.github.com>
Cristiano Calcagno <cristianoc@users.noreply.github.com>
DAN™ <dranger003@gmail.com>
Dan Forbes <dan@danforbes.dev>
Dan Johansson <164997844+eddnjjn@users.noreply.github.com>
Dan Johansson <dan.johansson@arm.com>
Daniel Bevenius <daniel.bevenius@gmail.com>
Daniel Ziegenberg <daniel@ziegenberg.at>
Daniele <57776841+daniandtheweb@users.noreply.github.com>
Daulet Zhanguzin <daulet@users.noreply.github.com>
Dave <dave-fl@users.noreply.github.com>
Dave Airlie <airlied@gmail.com>
Dave Airlie <airlied@redhat.com>
David Miller <david@patagona.ca>
DavidKorczynski <david@adalogics.com>
Davidson Francis <davidsondfgl@gmail.com>
Dibakar Gope <dibakar.gope@arm.com>
Didzis Gosko <didzis@users.noreply.github.com>
Diego Devesa <slarengh@gmail.com>
Diogo <dgcruz983@gmail.com>
Djip007 <3705339+Djip007@users.noreply.github.com>
Djip007 <djip.perois@free.fr>
Dou Xinpeng <15529241576@163.com>
Dou Xinpeng <81913537+Dou-Git@users.noreply.github.com>
Dr. Tom Murphy VII Ph.D <499244+tom7@users.noreply.github.com>
Ebey Abraham <ebey97@gmail.com>
Eldar Yusupov <eyusupov@gmail.com>
Emmanuel Durand <emmanueldurand@protonmail.com>
Engininja2 <139037756+Engininja2@users.noreply.github.com>
Eric Zhang <34133756+EZForever@users.noreply.github.com>
Erik Scholz <Green-Sky@users.noreply.github.com>
Ettore Di Giacinto <mudler@users.noreply.github.com>
Eve <139727413+netrunnereve@users.noreply.github.com>
F1L1P <78918286+F1L1Pv2@users.noreply.github.com>
Faisal Zaghloul <quic_fzaghlou@quicinc.com>
FantasyGmm <16450052+FantasyGmm@users.noreply.github.com>
Felix <stenbackfelix@gmail.com>
Finn Voorhees <finnvoorhees@gmail.com>
FirstTimeEZ <179362031+FirstTimeEZ@users.noreply.github.com>
Frankie Robertson <frankier@users.noreply.github.com>
GainLee <perfecter.gen@gmail.com>
George Hindle <george@georgehindle.com>
Georgi Gerganov <ggerganov@gmail.com>
Gilad S <7817232+giladgd@users.noreply.github.com>
Gilad S <giladgd@users.noreply.github.com>
Gilad S. <7817232+giladgd@users.noreply.github.com>
Guillaume Wenzek <gwenzek@users.noreply.github.com>
Halalaluyafail3 <55773281+Halalaluyafail3@users.noreply.github.com>
Haus1 <haus.xda@gmail.com>
Herman Semenov <GermanAizek@yandex.ru>
HimariO <dsfhe49854@gmail.com>
Hirochika Matsumoto <git@hkmatsumoto.com>
Hong Bo PENG <penghb@cn.ibm.com>
Hugo Rosenkranz-Costa <hugo.rosenkranz@gmail.com>
Hyunsung Lee <ita9naiwa@gmail.com>
IGUILIZ Salah-Eddine <76955987+salahiguiliz@users.noreply.github.com>
Ian Bull <irbull@eclipsesource.com>
Ihar Hrachyshka <ihrachys@redhat.com>
Ikko Eltociear Ashimine <eltociear@gmail.com>
Ivan <nekotekina@gmail.com>
Ivan Filipov <159561759+vanaka11@users.noreply.github.com>
Ivan Stepanov <ivanstepanovftw@gmail.com>
Ivan Zdane <accounts@ivanzdane.com>
Jack Mousseau <jmousseau@users.noreply.github.com>
Jack Vial <vialjack@gmail.com>
JacobLinCool <jacoblincool@gmail.com>
Jakob Frick <jakob.maria.frick@gmail.com>
Jan Ploski <jpl@plosquare.com>
Jared Van Bortel <jared@nomic.ai>
Jeff Bolz <jbolz@nvidia.com>
Jeffrey Quesnelle <jquesnelle@gmail.com>
Jeroen Mostert <jeroen.mostert@cm.com>
Jiahao Li <liplus17@163.com>
JidongZhang-THU <1119708529@qq.com>
Jiří Podivín <66251151+jpodivin@users.noreply.github.com>
Jo Liss <joliss42@gmail.com>
Joe Todd <joe.todd@codeplay.com>
Johannes Gäßler <johannesg@5d6.de>
John Balis <phobossystems@gmail.com>
Josh Bleecher Snyder <josharian@gmail.com>
Judd <foldl@users.noreply.github.com>
Jun Hee Yoo <contact.jhyoo@gmail.com>
Junil Kim <logyourself@gmail.com>
Justina Cho <justcho5@gmail.com>
Justine Tunney <jtunney@gmail.com>
Justine Tunney <jtunney@mozilla.com>
Karol Kontny <82021046+kkontny@users.noreply.github.com>
Kawrakow <48489457+ikawrakow@users.noreply.github.com>
Kevin Gibbons <bakkot@gmail.com>
Konstantin Zhuravlyov <konstantin.zhuravlyov@amd.com>
Kylin <56434533+KyL0N@users.noreply.github.com>
LoganDark <git@logandark.mozmail.com>
LoganDark <github@logandark.mozmail.com>
LostRuins <39025047+LostRuins@users.noreply.github.com>
Lukas Möller <mail@lukas-moeller.ch>
M Refi D.A <24388107+refinism@users.noreply.github.com>
M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Ma Mingfei <mingfei.ma@intel.com>
Mahesh Madhav <67384846+heshpdx@users.noreply.github.com>
MaiHD <maihd.dev@gmail.com>
Mark Zhuang <zhuangqiubin@gmail.com>
Markus Tavenrath <mtavenrath@users.noreply.github.com>
Masaya, Kato <62578291+msy-kato@users.noreply.github.com>
Mathieu Baudier <mbaudier@argeo.org>
Mathijs de Bruin <mathijs@mathijsfietst.nl>
Matt Stephenson <mstephenson6@users.noreply.github.com>
Max Krasnyansky <max.krasnyansky@gmail.com>
Max Krasnyansky <quic_maxk@quicinc.com>
Mayank Kumar Pal <mynkpl1998@gmail.com>
Meng, Hengyu <hengyu.meng@intel.com>
Mengqing Cao <cmq0113@163.com>
Metal Whale <45712559+metalwhale@users.noreply.github.com>
Michael Klimenko <mklimenko29@gmail.com>
Michael Podvitskiy <podvitskiymichael@gmail.com>
Michael Verrilli <msv@pobox.com>
Molly Sophia <mollysophia379@gmail.com>
Natsu <chino@hotococoa.moe>
Neo Zhang <14088817+arthw@users.noreply.github.com>
Neo Zhang Jianyu <jianyu.zhang@intel.com>
Neuman Vong <neuman.vong@gmail.com>
Nevin <nevinpuri1901@gmail.com>
Nicholai Tukanov <nicholaitukanov@gmail.com>
Nico Bosshard <nico@bosshome.ch>
Nicolò Scipione <nicolo.scipione@codeplay.com>
Nikita Sarychev <42014488+sARY77@users.noreply.github.com>
Nouamane Tazi <nouamane98@gmail.com>
Olivier Chafik <ochafik@google.com>
Olivier Chafik <ochafik@users.noreply.github.com>
Ondřej Čertík <ondrej@certik.us>
Ouadie EL FAROUKI <ouadie.elfarouki@codeplay.com>
PAB <pierreantoine.bannier@gmail.com>
Paul Tsochantaris <ptsochantaris@icloud.com>
Peter <peter277@users.noreply.github.com>
Philpax <me@philpax.me>
Pierre Alexandre SCHEMBRI <pa.schembri@gmail.com>
Plamen Minev <pacominev@gmail.com>
Playdev <josang1204@gmail.com>
Prashant Vithule <119530321+Vithulep@users.noreply.github.com>
Przemysław Pawełczyk <przemoc@gmail.com>
R0CKSTAR <xiaodong.ye@mthreads.com>
R0CKSTAR <yeahdongcn@gmail.com>
Radoslav Gerganov <rgerganov@gmail.com>
Radosław Gryta <radek.gryta@gmail.com>
Ravindra Marella <marella@users.noreply.github.com>
Ray Cromwell <cromwellian@gmail.com>
Reinforce-II <fate@eastal.com>
Rémy Oudompheng <oudomphe@phare.normalesup.org>
Reza Rezvan <reza@rezvan.xyz>
Rick G <26732651+TheFlipbook@users.noreply.github.com>
RiverZhou <riverzhou2000@gmail.com>
Robert Ormandi <52251610+ormandi@users.noreply.github.com>
Romain Biessy <romain.biessy@codeplay.com>
Ronsor <ronsor@ronsor.pw>
Rotem Dan <rotemdan@gmail.com>
Ryan Hitchman <hitchmanr@gmail.com>
SRHMorris <69468379+SRHMorris@users.noreply.github.com>
SXX <sxx1136965276@gmail.com>
Salvatore Mesoraca <s.mesoraca16@gmail.com>
Sam Spilsbury <smspillaz@gmail.com>
Sanchit Gandhi <93869735+sanchit-gandhi@users.noreply.github.com>
Santtu Keskinen <santtu.keskinen@gmail.com>
Sergio López <slp@redhat.com>
Sergio López <slp@sinrega.org>
Shanshan Shen <467638484@qq.com>
Shijie <821898965@qq.com>
Shupei Fan <dymarkfan@outlook.com>
Siddharth Ramakrishnan <srr2141@columbia.edu>
Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
Skyler Celestinian-Sterling <80314197+Celestinian@users.noreply.github.com>
Slava Primenko <primenko.s@gmail.com>
Srihari-mcw <96763064+Srihari-mcw@users.noreply.github.com>
Steward Garcia <57494570+FSSRepo@users.noreply.github.com>
Supreet Sethi <supreet.sethi@gmail.com>
Takuya Takeuchi <takuya.takeuchi.dev@gmail.com>
Tamotsu Takahashi <ttakah+github@gmail.com>
Tanmay <tnmysachan@gmail.com>
Tanmay Sachan <tnmysachan@gmail.com>
Timothy Cronin <40186632+4imothy@users.noreply.github.com>
Tom Bailey <tombailey@users.noreply.github.com>
Tom Jobbins <784313+TheBloke@users.noreply.github.com>
Tony Wasserka <4840017+neobrain@users.noreply.github.com>
Tristan Druyen <tristan@vault81.mozmail.com>
Tyé singwa <92231658+tye-singwa@users.noreply.github.com>
UEXTM.com <84163508+uextm@users.noreply.github.com>
WillCorticesAI <150854901+WillCorticesAI@users.noreply.github.com>
William Tambellini <william.tambellini@gmail.com>
William Tambellini <wtambellini@sdl.com>
XiaotaoChen <chenxiaotao1234@gmail.com>
Xinpeng Dou <81913537+Dou-Git@users.noreply.github.com>
Xuan Son Nguyen <thichthat@gmail.com>
Yavor Ivanov <yivanov@viewray.com>
YavorGIvanov <yivanov@viewray.com>
Yilong Guo <vfirst218@gmail.com>
Yilong Guo <yilong.guo@intel.com>
Yuri Khrustalev <ykhrustalev@users.noreply.github.com>
Zhenwei Jin <109658203+kylo5aby@users.noreply.github.com>
Zhiyuan Li <lizhiyuan@uniartisan.com>
Zhiyuan Li <uniartisan2017@gmail.com>
a3sh <38979186+A3shTnT@users.noreply.github.com>
ag2s20150909 <19373730+ag2s20150909@users.noreply.github.com>
agray3 <agray3@users.noreply.github.com>
amd-dwang <dong.wang@amd.com>
amritahs-ibm <amritahs@linux.vnet.ibm.com>
apcameron <37645737+apcameron@users.noreply.github.com>
appvoid <78444142+appvoid@users.noreply.github.com>
ariez-xyz <41232910+ariez-xyz@users.noreply.github.com>
automaticcat <daogiatuank54@gmail.com>
bandoti <141645996+bandoti@users.noreply.github.com>
bmwl <brian.marshall@tolko.com>
bobqianic <129547291+bobqianic@users.noreply.github.com>
bssrdf <merlintiger@hotmail.com>
chengchi <davesjoewang@gmail.com>
compilade <113953597+compilade@users.noreply.github.com>
compilade <git@compilade.net>
ddpasa <112642920+ddpasa@users.noreply.github.com>
denersc <denerstassun@gmail.com>
dscripka <dscripka@users.noreply.github.com>
fitzsim <fitzsim@fitzsim.org>
fj-y-saito <85871716+fj-y-saito@users.noreply.github.com>
fraxy-v <65565042+fraxy-v@users.noreply.github.com>
gn64 <yukikaze.jp@gmail.com>
goerch <jhr.walter@t-online.de>
goldwaving <77494627+goldwaving@users.noreply.github.com>
haopeng <657407891@qq.com>
hidenorly <hidenorly@users.noreply.github.com>
hipudding <huafengchun@gmail.com>
hydai <z54981220@gmail.com>
issixx <46835150+issixx@users.noreply.github.com>
jaeminSon <woalsdnd@gmail.com>
jdomke <28772296+jdomke@users.noreply.github.com>
jiez <373447296@qq.com>
johnson442 <56517414+johnson442@users.noreply.github.com>
junchao-loongson <68935141+junchao-loongson@users.noreply.github.com>
k.h.lai <adrian.k.h.lai@outlook.com>
katsu560 <118887472+katsu560@users.noreply.github.com>
klosax <131523366+klosax@users.noreply.github.com>
kunnis <kunnis@users.noreply.github.com>
l3utterfly <gc.pthzfoldr@gmail.com>
le.chang <cljs118@126.com>
leejet <31925346+leejet@users.noreply.github.com>
leejet <leejet714@gmail.com>
leo-pony <nengjunma@outlook.com>
lhez <quic_lih@quicinc.com>
liuwei-git <14815172+liuwei-git@users.noreply.github.com>
luoyu-intel <yu.luo@intel.com>
magicse <magicse@users.noreply.github.com>
mahorozte <41834471+mahorozte@users.noreply.github.com>
mashizora <30516315+mashizora@users.noreply.github.com>
matt23654 <matthew.webber@protonmail.com>
matteo <matteogeniaccio@yahoo.it>
ochafik <ochafik@google.com>
otaGran <ujt2h8@gmail.com>
pengxin99 <pengxin.yuan@intel.com>
pikalover6 <49179590+pikalover6@users.noreply.github.com>
postmasters <namnguyen@google.com>
sjinzh <sjinzh@gmail.com>
skirodev <57715494+skirodev@users.noreply.github.com>
slaren <slarengh@gmail.com>
snadampal <87143774+snadampal@users.noreply.github.com>
someone13574 <81528246+someone13574@users.noreply.github.com>
stduhpf <stephduh@live.fr>
taher <8665427+nullhook@users.noreply.github.com>
texmex76 <40733439+texmex76@users.noreply.github.com>
the-crypt-keeper <84680712+the-crypt-keeper@users.noreply.github.com>
thewh1teagle <61390950+thewh1teagle@users.noreply.github.com>
ucag.li <ucag@qq.com>
ulatekh <ulatekh@yahoo.com>
uvos <devnull@uvos.xyz>
uvos <philipp@uvos.xyz>
wangshuai09 <391746016@qq.com>
woachk <24752637+woachk@users.noreply.github.com>
xctan <axunlei@gmail.com>
yangyaofei <yangyaofei@gmail.com>
yuri@FreeBSD <yuri@FreeBSD>
zhentaoyu <zhentao.yu@intel.com>
zhouwg <6889919+zhouwg@users.noreply.github.com>
zhouwg <zhouwg2000@gmail.com>
谢乃闻 <sienaiwun@users.noreply.github.com>
布客飞龙 <562826179@qq.com>
旺旺碎冰冰 <38837039+Cyberhan123@users.noreply.github.com>


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.
project("ggml" C CXX ASM)

### GGML Version
# GGML_VERSION_BASE is "<major>.<minor>.<patch>"; GGML_BUILD_COMMIT is the short git hash of
# the checkout (or "unknown"), with "-dirty" appended when the tree has uncommitted changes.
set(GGML_VERSION_MAJOR 0)
set(GGML_VERSION_MINOR 9)
set(GGML_VERSION_PATCH 8)
set(GGML_VERSION_BASE "${GGML_VERSION_MAJOR}.${GGML_VERSION_MINOR}.${GGML_VERSION_PATCH}")

# NO_CMAKE_FIND_ROOT_PATH: search for git without re-rooting the search paths
find_program(GIT_EXE NAMES git git.exe NO_CMAKE_FIND_ROOT_PATH)
if(GIT_EXE)
    # Get current git commit hash
    execute_process(COMMAND ${GIT_EXE} rev-parse --short HEAD
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        OUTPUT_VARIABLE GGML_BUILD_COMMIT
        OUTPUT_STRIP_TRAILING_WHITESPACE
        ERROR_QUIET
    )

    # Check if the working directory is dirty (i.e., has uncommitted changes)
    # only the exit status is used: it is stored in GGML_GIT_DIRTY and compared against 1 below
    execute_process(COMMAND ${GIT_EXE} diff-index --quiet HEAD -- .
        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
        RESULT_VARIABLE GGML_GIT_DIRTY
        ERROR_QUIET
    )
endif()

set(GGML_VERSION "${GGML_VERSION_BASE}")

# git was not found, or rev-parse produced no output
if(NOT GGML_BUILD_COMMIT)
    set(GGML_BUILD_COMMIT "unknown")
endif()

# Build the commit string with optional dirty flag
# (any other exit status, e.g. a git error, is not treated as dirty)
if(DEFINED GGML_GIT_DIRTY AND GGML_GIT_DIRTY EQUAL 1)
    set(GGML_BUILD_COMMIT "${GGML_BUILD_COMMIT}-dirty")
endif()

include(CheckIncludeFileCXX)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# Default to a Release build when no build type was requested.
# Skipped for Xcode and MSVC; the STRINGS property lists the selectable values for the cache entry.
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

# GGML_STANDALONE is ON when this CMakeLists.txt is the top-level project and OFF when ggml
# is pulled into another project (e.g. through add_subdirectory).
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(GGML_STANDALONE ON)

    # standalone builds always place executables in <build>/bin
    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

    # configure project version
    # TODO
else()
    set(GGML_STANDALONE OFF)

    # keep a runtime output directory that the parent project already selected
    if (NOT CMAKE_RUNTIME_OUTPUT_DIRECTORY)
        set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
    endif()
endif()

# Platform default for BUILD_SHARED_LIBS: static on Emscripten and MinGW, shared elsewhere.
if (EMSCRIPTEN)
    option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)

    set(BUILD_SHARED_LIBS_DEFAULT OFF)
elseif (MINGW)
    set(BUILD_SHARED_LIBS_DEFAULT OFF)
else()
    set(BUILD_SHARED_LIBS_DEFAULT ON)
endif()

# remove the lib prefix on win32 mingw
# (the guard is WIN32, so the empty prefixes are set for every Windows target, not only MinGW)
if (WIN32)
    set(CMAKE_STATIC_LIBRARY_PREFIX "")
    set(CMAKE_SHARED_LIBRARY_PREFIX "")
    set(CMAKE_SHARED_MODULE_PREFIX  "")
endif()

# Library-type switches:
#  - BUILD_SHARED_LIBS defaults to the per-platform BUILD_SHARED_LIBS_DEFAULT
#  - GGML_BACKEND_DL   builds the backends as dynamic libraries
#  - GGML_BACKEND_DIR  cache path the dynamic backends are loaded from (empty by default)
option(BUILD_SHARED_LIBS           "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL             "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
set(GGML_BACKEND_DIR "" CACHE PATH "ggml: directory to load dynamic backends from (requires GGML_BACKEND_DL)")

#
# option list
#

# TODO: mark all options as advanced when not GGML_STANDALONE

# Backend defaults: Metal and BLAS are off with a "Generic" BLAS vendor,
# except on Apple platforms where both are on and the vendor is "Apple".
set(GGML_METAL_DEFAULT OFF)
set(GGML_BLAS_DEFAULT OFF)
set(GGML_BLAS_VENDOR_DEFAULT "Generic")

if (APPLE)
    set(GGML_METAL_DEFAULT ON)
    set(GGML_BLAS_DEFAULT ON)
    set(GGML_BLAS_VENDOR_DEFAULT "Apple")
endif()

# Native optimizations are the default unless we are cross-compiling or the
# SOURCE_DATE_EPOCH environment variable is defined.
set(GGML_NATIVE_DEFAULT ON)

if (CMAKE_CROSSCOMPILING OR DEFINED ENV{SOURCE_DATE_EPOCH})
    message(STATUS "Setting GGML_NATIVE_DEFAULT to OFF")
    set(GGML_NATIVE_DEFAULT OFF)
endif()

# defaults
# Each *_DEFAULT below may be pre-set by an including project; anything unset or false becomes OFF.
foreach(ggml_default_var GGML_LLAMAFILE_DEFAULT GGML_CUDA_GRAPHS_DEFAULT)
    if (NOT ${ggml_default_var})
        set(${ggml_default_var} OFF)
    endif()
endforeach()

# general
# GGML_NATIVE takes its default from GGML_NATIVE_DEFAULT; the other switches have fixed defaults
option(GGML_STATIC "ggml: static link libraries"                     OFF)
option(GGML_NATIVE "ggml: optimize the build for the current system" ${GGML_NATIVE_DEFAULT})
option(GGML_LTO    "ggml: enable link time optimization"             OFF)
option(GGML_CCACHE "ggml: use ccache if available"                   ON)

# debug
option(GGML_ALL_WARNINGS           "ggml: enable all compiler warnings"                   ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF                  "ggml: enable gprof"                                   OFF)

# build
option(GGML_FATAL_WARNINGS    "ggml: enable -Werror flag"    OFF)

# sanitizers
# all three are off by default
option(GGML_SANITIZE_THREAD    "ggml: enable thread sanitizer"    OFF)
option(GGML_SANITIZE_ADDRESS   "ggml: enable address sanitizer"   OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)

# instruction set specific
# INS_ENB is the default value of the explicit x86 instruction-set options: it is ON only
# when GGML_NATIVE is off although the native default would have been on.
if (NOT GGML_NATIVE AND GGML_NATIVE_DEFAULT)
    set(INS_ENB ON)
else()
    set(INS_ENB OFF)
endif()

message(DEBUG "GGML_NATIVE         : ${GGML_NATIVE}")
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
message(DEBUG "INS_ENB             : ${INS_ENB}")

# CPU backend features and x86 instruction-set switches.
# SSE4.2, AVX, AVX2, BMI2 (and FMA/F16C outside MSVC) default to INS_ENB;
# the AVX-VNNI, AVX512 and AMX variants are always opt-in.
option(GGML_CPU_HBM          "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_REPACK       "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_CPU_KLEIDIAI     "ggml: use KleidiAI optimized kernels if applicable" OFF)
option(GGML_SSE42            "ggml: enable SSE 4.2"          ${INS_ENB})
option(GGML_AVX              "ggml: enable AVX"              ${INS_ENB})
option(GGML_AVX_VNNI         "ggml: enable AVX-VNNI"         OFF)
option(GGML_AVX2             "ggml: enable AVX2"             ${INS_ENB})
option(GGML_BMI2             "ggml: enable BMI2"             ${INS_ENB})
option(GGML_AVX512           "ggml: enable AVX512F"          OFF)
option(GGML_AVX512_VBMI      "ggml: enable AVX512-VBMI"      OFF)
option(GGML_AVX512_VNNI      "ggml: enable AVX512-VNNI"      OFF)
option(GGML_AVX512_BF16      "ggml: enable AVX512-BF16"      OFF)
# the options in this branch are not declared at all when building with MSVC
if (NOT MSVC)
    # in MSVC F16C and FMA is implied with AVX2/AVX512
    option(GGML_FMA          "ggml: enable FMA"              ${INS_ENB})
    option(GGML_F16C         "ggml: enable F16C"             ${INS_ENB})
    # MSVC does not seem to support AMX
    option(GGML_AMX_TILE     "ggml: enable AMX-TILE"         OFF)
    option(GGML_AMX_INT8     "ggml: enable AMX-INT8"         OFF)
    option(GGML_AMX_BF16     "ggml: enable AMX-BF16"         OFF)
endif()
option(GGML_LASX             "ggml: enable lasx"             ON)
option(GGML_LSX              "ggml: enable lsx"              ON)
option(GGML_RVV              "ggml: enable rvv"              ON)
option(GGML_RV_ZFH           "ggml: enable riscv zfh"        ON)
option(GGML_RV_ZVFH          "ggml: enable riscv zvfh"       ON)
option(GGML_RV_ZICBOP        "ggml: enable riscv zicbop"     ON)
option(GGML_RV_ZIHINTPAUSE   "ggml: enable riscv zihintpause "  ON)
option(GGML_XTHEADVECTOR     "ggml: enable xtheadvector"     OFF)
option(GGML_VXE              "ggml: enable vxe"              ${GGML_NATIVE})

option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
set(GGML_CPU_ARM_ARCH        "" CACHE STRING "ggml: CPU architecture for ARM")
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")

# ggml core
set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU                             "ggml: enable CPU backend"                        ON)
option(GGML_SCHED_NO_REALLOC                "ggml: disallow reallocations in ggml-alloc (for debugging)" OFF)

# 3rd party libs / backends
#
# option() entries are boolean switches; set(... CACHE STRING/FILEPATH ...)
# entries carry free-form values. Defaults written as ${..._DEFAULT} are
# computed outside this section.
option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
option(GGML_BLAS                            "ggml: use BLAS"                                  ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
                                            "ggml: BLAS library vendor")
option(GGML_LLAMAFILE                       "ggml: use LLAMAFILE"                             ${GGML_LLAMAFILE_DEFAULT})

option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
option(GGML_MUSA                            "ggml: use MUSA"                                  OFF)
option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                            "ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
option(GGML_CUDA_FA                         "ggml: compile ggml FlashAttention CUDA kernels"  ON)
option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
set   (GGML_CUDA_COMPRESSION_MODE "size" CACHE STRING
                                            "ggml: cuda link binary compression mode; requires cuda 12.8+")
# the STRINGS property lists the choices offered by CMake GUIs; it does not validate the value
set_property(CACHE GGML_CUDA_COMPRESSION_MODE PROPERTY STRINGS "none;speed;balance;size")

option(GGML_HIP                             "ggml: use HIP"                                   OFF)
option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
option(GGML_HIP_NO_VMM                      "ggml: do not try to use HIP VMM"                 ON)
option(GGML_HIP_ROCWMMA_FATTN               "ggml: enable rocWMMA for FlashAttention"         OFF)
option(GGML_HIP_MMQ_MFMA                    "ggml: enable MFMA MMA for CDNA in MMQ"           ON)
option(GGML_HIP_EXPORT_METRICS              "ggml: enable kernel perf metrics output"         OFF)
option(GGML_MUSA_GRAPHS                     "ggml: use MUSA graph, experimental, unstable"    OFF)
option(GGML_MUSA_MUDNN_COPY                 "ggml: enable muDNN for accelerated copy"         OFF)
option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
option(GGML_VULKAN_MEMORY_DEBUG             "ggml: enable Vulkan memory debug output"         OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO        "ggml: enable Vulkan shader debug info"           OFF)
option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
option(GGML_WEBGPU                          "ggml: use WebGPU"                                OFF)
option(GGML_WEBGPU_DEBUG                    "ggml: enable WebGPU debug output"                OFF)
option(GGML_WEBGPU_CPU_PROFILE              "ggml: enable WebGPU profiling (CPU)"             OFF)
option(GGML_WEBGPU_GPU_PROFILE              "ggml: enable WebGPU profiling (GPU)"             OFF)
option(GGML_WEBGPU_JSPI                     "ggml: use JSPI for WebGPU"                       ON)
option(GGML_ZDNN                            "ggml: use zDNN"                                  OFF)
option(GGML_VIRTGPU                         "ggml: use the VirtGPU/Virglrenderer API Remoting frontend"     OFF)
option(GGML_VIRTGPU_BACKEND                 "ggml: build the VirtGPU/Virglrenderer API Remoting backend"    OFF)
option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
# defaults to the value GGML_METAL has at this point, so it must stay after the GGML_METAL option
option(GGML_METAL_EMBED_LIBRARY             "ggml: embed Metal library"                       ${GGML_METAL})
set   (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
                                            "ggml: metal minimum macOS version")
set   (GGML_METAL_STD "" CACHE STRING       "ggml: metal standard version (-std flag)")
option(GGML_OPENMP                          "ggml: use OpenMP"                                ON)
option(GGML_RPC                             "ggml: use RPC"                                   OFF)
option(GGML_SYCL                            "ggml: use SYCL"                                  OFF)
option(GGML_SYCL_F16                        "ggml: use 16 bit floats for sycl calculations"   OFF)
option(GGML_SYCL_GRAPH                      "ggml: enable graphs in the SYCL backend"         ON)
option(GGML_SYCL_DNN                        "ggml: enable oneDNN in the SYCL backend"         ON)
set   (GGML_SYCL_TARGET "INTEL" CACHE STRING
                                            "ggml: sycl target device")
set   (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
                                            "ggml: sycl device architecture")

option(GGML_OPENVINO                        "ggml: use OPENVINO"                              OFF)

option(GGML_OPENCL                          "ggml: use OpenCL"                                OFF)
option(GGML_OPENCL_PROFILING                "ggml: use OpenCL profiling (increases overhead)" OFF)
option(GGML_OPENCL_EMBED_KERNELS            "ggml: embed kernels"                             ON)
option(GGML_OPENCL_USE_ADRENO_KERNELS       "ggml: use optimized kernels for Adreno"          ON)
set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                            "ggml: OpenCL API version to target")

option(GGML_HEXAGON                         "ggml: enable Hexagon backend"                    OFF)
set(GGML_HEXAGON_FP32_QUANTIZE_GROUP_SIZE 128 CACHE STRING "ggml: quantize group size (32, 64, or 128)")

# toolchain for vulkan-shaders-gen
set   (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")

option(GGML_ZENDNN                          "ggml: use ZenDNN"                                OFF)
# ZENDNN_ROOT is a directory, not a switch: option() only models booleans (an
# empty default is stored as OFF and GUIs render a checkbox), so declare it as
# a PATH cache entry. The default stays a false value, and a value passed with
# -DZENDNN_ROOT=<dir> or already present in the cache is kept as-is.
set   (ZENDNN_ROOT "" CACHE PATH            "ggml: path to ZenDNN installation")

# extra artifacts
# both default to the value of GGML_STANDALONE, which is computed outside this section
option(GGML_BUILD_TESTS    "ggml: build tests"    ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})

#
# dependencies
#

# C11 and C++17 are required (no fallback to an older standard)
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED true)

# must be set before find_package(Threads) to take effect
set(THREADS_PREFER_PTHREAD_FLAG ON)

find_package(Threads REQUIRED)

# provides the CMAKE_INSTALL_* directory variables used by the install rules
include(GNUInstallDirs)

#
# build the library
#

add_subdirectory(src)

#
# tests and examples
#

if (GGML_BUILD_TESTS)
    enable_testing()
    add_subdirectory(tests)
endif ()

if (GGML_BUILD_EXAMPLES)
    add_subdirectory(examples)
endif ()

#
# install
#

# provides configure_package_config_file() and write_basic_package_version_file()
include(CMakePackageConfigHelpers)

# all public headers
set(GGML_PUBLIC_HEADERS
    include/ggml.h
    include/ggml-cpu.h
    include/ggml-alloc.h
    include/ggml-backend.h
    include/ggml-blas.h
    include/ggml-cann.h
    include/ggml-cpp.h
    include/ggml-cuda.h
    include/ggml-opt.h
    include/ggml-metal.h
    include/ggml-rpc.h
    include/ggml-virtgpu.h
    include/ggml-sycl.h
    include/ggml-vulkan.h
    include/ggml-webgpu.h
    include/ggml-zendnn.h
    include/ggml-openvino.h
    include/gguf.h)

# attach the header list to the ggml target so that the PUBLIC_HEADER install
# rule below copies the headers together with the library
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
#    set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
# no DESTINATION is given, so CMake's default install directories apply
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)

# the pkg-config file is only generated and installed for standalone builds
if (GGML_STANDALONE)
    # @ONLY: substitute @VAR@ references but leave ${VAR} untouched
    configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
        ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
        @ONLY)

    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
        DESTINATION share/pkgconfig)
endif()

#
# Create CMake package
#



# Capture variables prefixed with GGML_.
#
# Every variable visible here whose name starts with GGML_ is serialized as a
# `set(<name> "<value>")` statement. The resulting text is substituted for the
# @GGML_VARIABLES_EXPANDED@ placeholder of cmake/ggml-config.cmake.in, so that
# consumers of the installed package see the configuration ggml was built with.

# banner placed in front of the generated statements; it names the placeholder
# exactly as it is spelled in the template
set(variable_set_statements
"
####### Expanded from @GGML_VARIABLES_EXPANDED@ by configure_package_config_file() #######
####### Any changes to this file will be overwritten by the next CMake run        #######

")

# export the library type under a GGML_ name so that the loop below captures it
set(GGML_SHARED_LIB ${BUILD_SHARED_LIBS})

get_cmake_property(all_variables VARIABLES)
foreach(variable_name IN LISTS all_variables)
    if(variable_name MATCHES "^GGML_")
        # escape list separators so that list values survive the round trip
        string(REPLACE ";" "\\;"
               variable_value "${${variable_name}}")

        set(variable_set_statements
            "${variable_set_statements}set(${variable_name} \"${variable_value}\")\n")
    endif()
endforeach()

set(GGML_VARIABLES_EXPANDED ${variable_set_statements})

# Create the CMake package and set install location.

set(GGML_INSTALL_VERSION ${GGML_VERSION})
# install locations; cache entries so they can be overridden on the command line
set(GGML_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header  files")
set(GGML_LIB_INSTALL_DIR     ${CMAKE_INSTALL_LIBDIR}     CACHE PATH "Location of library files")
set(GGML_BIN_INSTALL_DIR     ${CMAKE_INSTALL_BINDIR}     CACHE PATH "Location of binary  files")

# generate ggml-config.cmake from the template; the PATH_VARS are made available
# to the template as PACKAGE_<name> variables
configure_package_config_file(
        ${CMAKE_CURRENT_SOURCE_DIR}/cmake/ggml-config.cmake.in
        ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
    INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml
    PATH_VARS GGML_INCLUDE_INSTALL_DIR
              GGML_LIB_INSTALL_DIR
              GGML_BIN_INSTALL_DIR)

# version file: a find_package() request is accepted when the major version matches
write_basic_package_version_file(
        ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
    VERSION ${GGML_INSTALL_VERSION}
    COMPATIBILITY SameMajorVersion)

# expose version and commit to the ggml-base sources as string macros
target_compile_definitions(ggml-base PRIVATE
    GGML_VERSION="${GGML_INSTALL_VERSION}"
    GGML_COMMIT="${GGML_BUILD_COMMIT}"
)
message(STATUS "ggml version: ${GGML_INSTALL_VERSION}")
message(STATUS "ggml commit:  ${GGML_BUILD_COMMIT}")

# install both generated files into the directory given as INSTALL_DESTINATION above
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml-config.cmake
              ${CMAKE_CURRENT_BINARY_DIR}/ggml-version.cmake
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/ggml)

if (MSVC)
    # warnings that are switched off for every configured target
    set(MSVC_WARNING_FLAGS
        /wd4005  # Macro redefinition
        /wd4244  # Conversion from one type to another type, possible loss of data
        /wd4267  # Conversion from 'size_t' to a smaller type, possible loss of data
        /wd4305  # Conversion from 'type1' to 'type2', possible loss of data
        /wd4566  # Conversion from 'char' to 'wchar_t', possible loss of data
        /wd4996  # Disable POSIX deprecation warnings
        /wd4702  # Unreachable code warnings
    )
    # /utf-8 for both C and C++ translation units
    set(MSVC_COMPILE_OPTIONS
        "$<$<COMPILE_LANGUAGE:C>:/utf-8>"
        "$<$<COMPILE_LANGUAGE:CXX>:/utf-8>"
    )

    # Apply the MSVC warning suppressions and compile options to <target_name>.
    # Names that do not refer to an existing target are silently ignored.
    function(configure_msvc_target target_name)
        if(NOT TARGET ${target_name})
            return()
        endif()
        target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
        target_compile_options(${target_name} PRIVATE ${MSVC_COMPILE_OPTIONS})
    endfunction()

    # library and CPU-variant targets
    set(ggml_msvc_target_names
        ggml-base
        ggml
        ggml-cpu
        ggml-cpu-x64
        ggml-cpu-sse42
        ggml-cpu-sandybridge
        # __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
        # skipping            ggml-cpu-ivybridge
        # skipping            ggml-cpu-piledriver
        ggml-cpu-haswell
        ggml-cpu-skylakex
        ggml-cpu-cannonlake
        ggml-cpu-cascadelake
        ggml-cpu-icelake
        # MSVC 2022 doesn't support BF16 intrinsics without `/arch:AVX10.1` ?!
        # https://learn.microsoft.com/en-us/cpp/intrinsics/x64-amd64-intrinsics-list?view=msvc-170
        # https://learn.microsoft.com/en-us/cpp/build/reference/arch-x64?view=msvc-170
        # skipping            ggml-cpu-cooperlake
        # skipping            ggml-cpu-zen4
        ggml-cpu-alderlake
        # MSVC doesn't support AMX
        # skipping            ggml-cpu-sapphirerapids
    )

    if (GGML_BUILD_EXAMPLES)
        list(APPEND ggml_msvc_target_names
            common-ggml
            common

            mnist-common
            mnist-eval
            mnist-train

            gpt-2-ctx
            gpt-2-alloc
            gpt-2-backend
            gpt-2-sched
            gpt-2-quantize
            gpt-2-batched

            gpt-j
            gpt-j-quantize

            magika
            yolov3-tiny
            sam

            simple-ctx
            simple-backend
        )
    endif()

    if (GGML_BUILD_TESTS)
        list(APPEND ggml_msvc_target_names
            test-mul-mat
            test-arange
            test-backend-ops
            test-cont
            test-conv-transpose
            test-conv-transpose-1d
            test-conv1d
            test-conv2d
            test-conv2d-dw
            test-customop
            test-dup
            test-opt
            test-pool
        )
    endif ()

    foreach(ggml_msvc_target_name IN LISTS ggml_msvc_target_names)
        configure_msvc_target(${ggml_msvc_target_name})
    endforeach()

    # the helper list is not needed past this point
    unset(ggml_msvc_target_names)
endif()


================================================
FILE: CONTRIBUTING.md
================================================
Please use [llama.cpp's contribution guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) for this project.

*For changes to the core `ggml` library (including to the CMake build system), please open a PR in https://github.com/ggml-org/llama.cpp. Doing so will make your PR more visible, better tested and more likely to be reviewed.*


================================================
FILE: LICENSE
================================================
MIT License

Copyright (c) 2023-2026 The ggml authors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: README.md
================================================
# ggml

[Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205)

Tensor library for machine learning

***Note that this project is under active development. \
Some of the development is currently happening in the [llama.cpp](https://github.com/ggerganov/llama.cpp) and [whisper.cpp](https://github.com/ggerganov/whisper.cpp) repos***

## Features

- Low-level cross-platform implementation
- Integer quantization support
- Broad hardware support
- Automatic differentiation
- ADAM and L-BFGS optimizers
- No third-party dependencies
- Zero memory allocations during runtime

## Build

```bash
git clone https://github.com/ggml-org/ggml
cd ggml

# install python dependencies in a virtual environment
python3.10 -m venv .venv
source .venv/bin/activate
pip install -r requirements.txt

# build the examples
mkdir build && cd build
cmake ..
cmake --build . --config Release -j 8
```

## GPT inference (example)

```bash
# run the GPT-2 small 117M model
../examples/gpt-2/download-ggml-model.sh 117M
./bin/gpt-2-backend -m models/gpt-2-117M/ggml-model.bin -p "This is an example"
```

For more information, check out the corresponding programs in the [examples](examples) folder.

## Resources

- [Introduction to ggml](https://huggingface.co/blog/introduction-to-ggml)
- [The GGUF file format](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md)


================================================
FILE: ci/run.sh
================================================
#!/bin/bash
#
# sample usage:
#
# mkdir tmp
#
# # CPU-only build
# bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # with CUDA support
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#
# # With SYCL support
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
#

# both the output directory ($1) and the mount directory ($2) are mandatory
if [ -z "$2" ]; then
    echo "usage: $0 <output-dir> <mnt-dir>"
    exit 1
fi

mkdir -p "$1"
mkdir -p "$2"

# absolute paths, because the script changes directory below
OUT=$(realpath "$1")
MNT=$(realpath "$2")

# drop the artifacts of a previous run; -f keeps a first run (nothing to
# delete yet) from printing "No such file" errors
rm -fv "$OUT"/*.log
rm -fv "$OUT"/*.exit
rm -fv "$OUT"/*.md

# switch to the repository root, i.e. the parent of the directory holding this script
sd=$(dirname "$0")
cd "$sd/../" || exit 1
SRC=$(pwd)

# Extra arguments passed to cmake / ctest by the gg_run_ctest_* steps.
# CMAKE_EXTRA is extended below according to the GG_BUILD_* environment
# variables; a variable counts as "set" when it is non-empty.
CMAKE_EXTRA=""
CTEST_EXTRA=""

if [ -n "${GG_BUILD_METAL}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=ON"
fi

if [ -n "${GG_BUILD_CUDA}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"

    if command -v nvidia-smi >/dev/null 2>&1; then
        # compute capability of the first GPU with the dot removed, e.g. "8.6" -> "86"
        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
        else
            echo "Warning: Using fallback CUDA architectures"
            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
        fi
    else
        echo "Error: nvidia-smi not found, cannot build with CUDA"
        exit 1
    fi
fi

if [ -n "${GG_BUILD_ROCM}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_HIP=ON"
    if [ -z "${GG_BUILD_AMDGPU_TARGETS}" ]; then
        echo "Missing GG_BUILD_AMDGPU_TARGETS, please set it to your GPU architecture (e.g. gfx90a, gfx1100, etc.)"
        exit 1
    fi

    CMAKE_EXTRA="${CMAKE_EXTRA} -DAMDGPU_TARGETS=${GG_BUILD_AMDGPU_TARGETS}"
fi

if [ -n "${GG_BUILD_SYCL}" ]; then
    if [ -z "${ONEAPI_ROOT}" ]; then
        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:"
        echo "source /opt/intel/oneapi/setvars.sh"
        exit 1
    fi
    # Use only main GPU
    export ONEAPI_DEVICE_SELECTOR="level_zero:0"
    # Enable sysman for correct memory reporting
    export ZES_ENABLE_SYSMAN=1
    # to circumvent precision issues on CPY operations
    export SYCL_PROGRAM_COMPILE_OPTIONS="-cl-fp32-correctly-rounded-divide-sqrt"
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON"
fi

if [ -n "${GG_BUILD_VULKAN}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_VULKAN=1"

    # if on Mac, disable METAL
    if [[ "$OSTYPE" == "darwin"* ]]; then
        CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_METAL=OFF -DGGML_BLAS=OFF"
    fi

fi

if [ -n "${GG_BUILD_WEBGPU}" ]; then
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_WEBGPU=1"
fi

if [ -n "${GG_BUILD_MUSA}" ]; then
    # Use qy1 by default (MTT S80)
    MUSA_ARCH=${MUSA_ARCH:-21}
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_MUSA=ON -DMUSA_ARCHITECTURES=${MUSA_ARCH}"
fi

if [ -n "${GG_BUILD_NO_SVE}" ]; then
    # arm 9 and newer enables sve by default, adjust these flags depending on the cpu used
    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8.5-a+fp16+i8mm"
fi

## helpers

# download a file if it does not exist or if it is outdated
#
#   $1 - directory to download into (created when missing)
#   $2 - URL of the file
#
# Returns the exit status of wget, or non-zero when the directory cannot be
# created or entered.
function gg_wget {
    local out=$1
    local url=$2

    mkdir -p "$out" || return 1

    # Work in a subshell: the caller's working directory is left untouched, and
    # a failing cd can no longer make wget write into the wrong directory.
    (
        cd "$out" || exit 1

        # should not re-download if file is the same
        wget -nv -N "$url"
    )
}

# Append printf-formatted text to the markdown summary ($OUT/README.md).
#   $1   - printf format string
#   $2.. - arguments for the format
gg_printf() {
    # "--" ends option parsing so that formats starting with "-" are accepted
    printf -- "$@" >> $OUT/README.md
}

# Run one CI step and record its result.
#   $1 - step name; the functions gg_run_<name> and gg_sum_<name> must exist
#
# Writes $OUT/<name>.log and $OUT/<name>.exit, lets gg_sum_<name> append to
# the summary, and ORs the step's exit status into the global `ret`.
function gg_run {
    # left global (not `local`): the gg_run_* / gg_sum_* functions read ${ci}
    ci=$1

    # pipefail: the status of the pipeline below must reflect a failure of
    # gg_run_$ci instead of the (succeeding) tee
    set -o pipefail
    set -x

    gg_run_$ci | tee $OUT/$ci.log
    # must directly follow the pipeline so that $? is its status
    cur=$?
    echo "$cur" > $OUT/$ci.exit

    set +x
    set +o pipefail

    gg_sum_$ci

    # bitwise OR: ret becomes non-zero as soon as one step fails
    ret=$((ret | cur))
}

## ci

# ctest_debug

# Configure and build a Debug tree in build-ci-debug and run ctest on it.
# Logs go to $OUT/${ci}-cmake.log, -make.log and -ctest.log.
function gg_run_ctest_debug {
    cd ${SRC}

    # always start from an empty build directory
    rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug

    # from here on the first failing command ends the step; gg_run enables
    # pipefail, so a failure on the left side of `| tee` counts as well
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} ..     ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                      ) 2>&1 | tee -a $OUT/${ci}-make.log

    # -E excludes test-opt and test-backend-ops from the debug run
    (time ctest ${CTEST_EXTRA} --output-on-failure -E "test-opt|test-backend-ops" ) 2>&1 | tee -a $OUT/${ci}-ctest.log

    set +e
}

# Append the summary section of the ctest_debug step to the report:
# heading, description, exit status and the ctest log in a fenced block.
gg_sum_ctest_debug() {
    local step_status step_log

    step_status="$(cat $OUT/${ci}.exit)"
    step_log="$(cat $OUT/${ci}-ctest.log)"

    gg_printf '### %s\n\nRuns ctest in debug mode\n- status: %s\n```\n%s\n```\n\n' \
        "${ci}" "${step_status}" "${step_log}"
}

# ctest_release

# Configure and build a Release tree in build-ci-release and run ctest on it.
# Logs go to $OUT/${ci}-cmake.log, -make.log and -ctest.log. The build tree
# is reused by the later gg_run_* steps, which cd into build-ci-release.
function gg_run_ctest_release {
    cd ${SRC}

    # always start from an empty build directory
    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release

    # from here on the first failing command ends the step
    set -e

    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} ..   ) 2>&1 | tee -a $OUT/${ci}-cmake.log
    (time make -j$(nproc)                                      ) 2>&1 | tee -a $OUT/${ci}-make.log

    # full test set unless GG_BUILD_LOW_PERF is set, in which case test-opt is excluded
    if [ -z $GG_BUILD_LOW_PERF ]; then
        (time ctest ${CTEST_EXTRA} --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    else
        (time ctest ${CTEST_EXTRA} --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log
    fi

    set +e
}

# Append the summary section of the ctest_release step to the report:
# heading, description, exit status and the ctest log in a fenced block.
gg_sum_ctest_release() {
    local step_status step_log

    step_status="$(cat $OUT/${ci}.exit)"
    step_log="$(cat $OUT/${ci}-ctest.log)"

    gg_printf '### %s\n\nRuns ctest in release mode\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "${step_status}" "${step_log}"
}

# gpt_2

# Download the GPT-2 117M ggml model and run short generations with the
# gpt-2-backend, gpt-2-sched and gpt-2-batched examples from build-ci-release.
# All output is appended to $OUT/${ci}-tg.log.
function gg_run_gpt_2 {
    cd ${SRC}

    gg_wget models-mnt/gpt-2 https://huggingface.co/ggerganov/ggml/resolve/main/ggml-model-gpt-2-117M.bin

    # relies on the binaries built by the ctest_release step
    cd build-ci-release

    # from here on the first failing command ends the step
    set -e

    model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
    prompts="../examples/prompts/gpt-2.txt"

    # every run uses the fixed seed 1234 and generates 64 tokens
    (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -tt ${prompts}                       ) 2>&1 | tee -a $OUT/${ci}-tg.log
    (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
    (time ./bin/gpt-2-sched   --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

    (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log

    set +e
}

# Append the summary section of the gpt_2 step to the report:
# heading, description, exit status and the generation log in a fenced block.
gg_sum_gpt_2() {
    local step_status step_log

    step_status="$(cat $OUT/${ci}.exit)"
    step_log="$(cat $OUT/${ci}-tg.log)"

    gg_printf '### %s\n\nRuns short GPT-2 text generation\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "${step_status}" "${step_log}"
}

# TODO: update
## mnist
#
#function gg_run_mnist {
#    cd ${SRC}
#
#    cd build-ci-release
#
#    set -e
#
#    mkdir -p models/mnist
#    python3 ../examples/mnist/convert-h5-to-ggml.py ../examples/mnist/models/mnist/mnist_model.state_dict
#
#    model_f32="./models/mnist/ggml-model-f32.bin"
#    samples="../examples/mnist/models/mnist/t10k-images.idx3-ubyte"
#
#    # first command runs and exports "mnist.ggml", the second command runs the exported model
#
#    (time ./bin/mnist     ${model_f32} ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
#    (time ./bin/mnist-cpu ./mnist.ggml ${samples} ) 2>&1 | tee -a $OUT/${ci}-mnist.log
#
#    set +e
#}
#
#function gg_sum_mnist {
#    gg_printf '### %s\n\n' "${ci}"
#
#    gg_printf 'MNIST\n'
#    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
#    gg_printf '```\n'
#    gg_printf '%s\n' "$(cat $OUT/${ci}-mnist.log)"
#    gg_printf '```\n'
#}

# sam

# Download the SAM ViT-B checkpoint and a test image, convert the checkpoint
# to ggml and run the sam example twice, checking the reported bounding boxes.
# All output is appended to $OUT/${ci}-main.log.
function gg_run_sam {
    cd ${SRC}

    gg_wget models-mnt/sam/ https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth
    gg_wget models-mnt/sam/ https://raw.githubusercontent.com/YavorGIvanov/sam.cpp/ceafb7467bff7ec98e0c4f952e58a9eb8fd0238b/img.jpg

    # relies on the binaries built by the ctest_release step
    cd build-ci-release

    # from here on the first failing command (including a failing grep) ends the step
    set -e

    path_models="../models-mnt/sam/"
    model_f16="${path_models}/ggml-model-f16.bin"
    img_0="${path_models}/img.jpg"

    python3 ../examples/sam/convert-pth-to-ggml.py ${path_models}/sam_vit_b_01ec64.pth ${path_models}/ 1

    # Test default parameters
    (time ./bin/sam -m ${model_f16} -i ${img_0} -st 0.925 ) 2>&1 | tee -a $OUT/${ci}-main.log
    grep -q "point prompt" $OUT/${ci}-main.log
    # either of the two bounding boxes is accepted
    grep -q "bbox (371, 436), (144, 168)" $OUT/${ci}-main.log ||
    grep -q "bbox (370, 439), (144, 168)" $OUT/${ci}-main.log

    # Test box prompt and single mask output
    (time ./bin/sam -m ${model_f16} -i ${img_0} -st 0.925 -b 368,144,441,173 -sm) 2>&1 | tee -a $OUT/${ci}-main.log
    # the log is appended to, so these greps also see the output of the first run
    grep -q "box prompt" $OUT/${ci}-main.log
    grep -q "bbox (370, 439), (144, 169)" $OUT/${ci}-main.log ||
    grep -q "bbox (370, 439), (144, 168)" $OUT/${ci}-main.log

    set +e
}

# Append the summary section of the sam step to the report:
# heading, description, exit status and the run log in a fenced block.
gg_sum_sam() {
    local step_status step_log

    step_status="$(cat $OUT/${ci}.exit)"
    step_log="$(cat $OUT/${ci}-main.log)"

    gg_printf '### %s\n\nRun SAM\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "${step_status}" "${step_log}"
}

# yolo

# Download the YOLOv3-tiny weights and a test image, convert the weights and
# run the yolov3-tiny example, checking the confidence reported per class.
# All output is appended to $OUT/${ci}-main.log.
function gg_run_yolo {
    cd ${SRC}

    gg_wget models-mnt/yolo/ https://huggingface.co/ggml-org/models/resolve/main/yolo/yolov3-tiny.weights
    gg_wget models-mnt/yolo/ https://huggingface.co/ggml-org/models/resolve/main/yolo/dog.jpg

    # relies on the binaries built by the ctest_release step
    cd build-ci-release
    # copy the example's data directory next to the binaries' working directory
    cp -r ../examples/yolo/data .

    # from here on the first failing command (including a failing grep) ends the step
    set -e

    path_models="../models-mnt/yolo/"

    python3 ../examples/yolo/convert-yolov3-tiny.py ${path_models}/yolov3-tiny.weights

    (time ./bin/yolov3-tiny -m yolov3-tiny.gguf -i ${path_models}/dog.jpg ) 2>&1 | tee -a $OUT/${ci}-main.log

    # each class must be reported with a confidence inside a 5-point window
    grep -qE "dog: (55|56|57|58|59)%" $OUT/${ci}-main.log
    grep -qE "car: (50|51|52|53|54)%" $OUT/${ci}-main.log
    grep -qE "truck: (54|55|56|57|58)%" $OUT/${ci}-main.log
    grep -qE "bicycle: (57|58|59|60|61)%" $OUT/${ci}-main.log

    set +e
}

# Append the summary section of the yolo step to the report:
# heading, description, exit status and the run log in a fenced block.
gg_sum_yolo() {
    local step_status step_log

    step_status="$(cat $OUT/${ci}.exit)"
    step_log="$(cat $OUT/${ci}-main.log)"

    gg_printf '### %s\n\nRun YOLO\n- status: %s\n```\n%s\n```\n' \
        "${ci}" "${step_status}" "${step_log}"
}

## main

if true ; then
    # Create symlink: ${SRC}/models-mnt -> ${MNT}/models
    rm -rf ${SRC}/models-mnt
    mnt_models=${MNT}/models
    mkdir -p ${mnt_models}
    ln -sfn ${mnt_models} ${SRC}/models-mnt

    # Create a python3 venv under the mount directory and enter it
    if ! python3 -m venv "$MNT/venv"; then
        echo "Error: Failed to create Python virtual environment at $MNT/venv."
        exit 1
    fi
    source "$MNT/venv/bin/activate"

    pip install -r ${SRC}/requirements.txt --disable-pip-version-check
fi


# accumulated exit status of all steps (gg_run ORs each step's status into it)
ret=0

# each step only runs while every previous step succeeded
test $ret -eq 0 && gg_run ctest_debug
test $ret -eq 0 && gg_run ctest_release

test $ret -eq 0 && gg_run gpt_2
#test $ret -eq 0 && gg_run mnist
test $ret -eq 0 && gg_run sam
test $ret -eq 0 && gg_run yolo

if [ -z $GG_BUILD_LOW_PERF ]; then
    # this branch runs only when GG_BUILD_LOW_PERF is NOT set;
    # it currently does nothing but print the date
    date
fi

# print the collected summary and propagate the overall status
cat $OUT/README.md

exit $ret


================================================
FILE: cmake/GitVars.cmake
================================================
# Query git for information about the checked-out commit of CMAKE_SOURCE_DIR.
# Errors are silenced, so outside of a git checkout the variables end up empty.
find_package(Git)

# GIT_SHA1: hash of the commit, abbreviated to 8 characters
# (presumably the --match pattern is meant to match no tag, so that
#  `describe --always` prints the bare hash)
execute_process(
    COMMAND "${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_SHA1
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)

# GIT_DATE: author date of the commit, in the local time zone
execute_process(
    COMMAND "${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_DATE
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)

# GIT_COMMIT_SUBJECT: first line of the commit message
execute_process(
    COMMAND "${GIT_EXECUTABLE}" log -1 --format=%s
    WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
    OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
    OUTPUT_STRIP_TRAILING_WHITESPACE
    ERROR_QUIET)


================================================
FILE: cmake/common.cmake
================================================
# ggml_get_flags(<compiler id> <compiler version>)
#
# Computes extra warning flags for the given compiler and returns them to the
# caller's scope:
#   GF_C_FLAGS   - flags for C sources
#   GF_CXX_FLAGS - flags for C++ sources
# Both are empty for compilers other than GNU and the Clang family.
function(ggml_get_flags CCID CCVER)
    set(warn_c   "")
    set(warn_cxx "")

    if (CCID STREQUAL "GNU")
        list(APPEND warn_c   -Wdouble-promotion)
        list(APPEND warn_cxx -Wno-array-bounds)

        # -Wextra-semi is only passed to g++ 8.1.0 and newer
        if (NOT CCVER VERSION_LESS 8.1.0)
            list(APPEND warn_cxx -Wextra-semi)
        endif()
    elseif (CCID MATCHES "Clang")
        set(unreachable_warnings -Wunreachable-code-break -Wunreachable-code-return)

        list(APPEND warn_c   ${unreachable_warnings})
        list(APPEND warn_cxx ${unreachable_warnings} -Wmissing-prototypes -Wextra-semi)

        # -Wdouble-promotion is only passed to Clang >= 3.8.0 / AppleClang >= 7.3.0
        set(has_double_promotion OFF)
        if (CCID STREQUAL "Clang" AND NOT CCVER VERSION_LESS 3.8.0)
            set(has_double_promotion ON)
        elseif (CCID STREQUAL "AppleClang" AND NOT CCVER VERSION_LESS 7.3.0)
            set(has_double_promotion ON)
        endif()

        if (has_double_promotion)
            list(APPEND warn_c -Wdouble-promotion)
        endif()
    endif()

    set(GF_C_FLAGS   ${warn_c}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${warn_cxx} PARENT_SCOPE)
endfunction()

# ggml_get_system_arch()
#
# Classifies the target architecture and stores the result in GGML_SYSTEM_ARCH
# in the caller's scope. Possible values: "ARM", "x86", "PowerPC",
# "loongarch64", "riscv64", "s390x" and "UNKNOWN".
#
# For ARM and x86, CMAKE_OSX_ARCHITECTURES and CMAKE_GENERATOR_PLATFORM_LWR
# are consulted first; CMAKE_SYSTEM_PROCESSOR is only used when both are unset.
# The remaining architectures are detected from CMAKE_SYSTEM_PROCESSOR alone.
function(ggml_get_system_arch)
    if (CMAKE_OSX_ARCHITECTURES      STREQUAL "arm64" OR
        CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
        (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
            CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
        set(GGML_SYSTEM_ARCH "ARM" PARENT_SCOPE)
    elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR
            CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
            (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
            CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64|amd64)$"))
        set(GGML_SYSTEM_ARCH "x86" PARENT_SCOPE)
    # The variable is named without ${}: if() dereferences it itself. An
    # expanded, unquoted ${CMAKE_SYSTEM_PROCESSOR} vanishes when the variable
    # is empty (which makes the condition a syntax error) and is dereferenced
    # a second time when its value happens to be the name of another variable.
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|power")
        set(GGML_SYSTEM_ARCH "PowerPC" PARENT_SCOPE)
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
        set(GGML_SYSTEM_ARCH "loongarch64"  PARENT_SCOPE)
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
        set(GGML_SYSTEM_ARCH "riscv64" PARENT_SCOPE)
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "s390x")
        set(GGML_SYSTEM_ARCH "s390x" PARENT_SCOPE)
    else()
        set(GGML_SYSTEM_ARCH "UNKNOWN" PARENT_SCOPE)
    endif()
endfunction()


================================================
FILE: cmake/ggml-config.cmake.in
================================================
# Package configuration template. The first placeholder below is the standard
# one used by CMake's configure_package_config_file(); it supplies the
# set_and_check() and check_required_components() macros used further down.
@PACKAGE_INIT@

# NOTE(review): presumably expands to set() calls for the GGML_* build options
# tested below (GGML_SHARED_LIB, GGML_CUDA, ...) - confirm in the CMakeLists.txt
# that generates this file.
@GGML_VARIABLES_EXPANDED@

# Find all dependencies before creating any target.
include(CMakeFindDependencyMacro)
find_dependency(Threads)
# When GGML_SHARED_LIB is false, collect per-backend link libraries/options in
# GGML_<BACKEND>_INTERFACE_LINK_LIBRARIES / _LINK_OPTIONS. These variables are
# attached to the imported backend targets created later in this file.
if (NOT GGML_SHARED_LIB)
    set(GGML_CPU_INTERFACE_LINK_LIBRARIES "")
    set(GGML_CPU_INTERFACE_LINK_OPTIONS   "")

    if (APPLE AND GGML_ACCELERATE)
        find_library(ACCELERATE_FRAMEWORK Accelerate)
        if(NOT ACCELERATE_FRAMEWORK)
            # Report the package as not found and stop processing this file.
            set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
            return()
        endif()
        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES ${ACCELERATE_FRAMEWORK})
    endif()

    if (GGML_OPENMP_ENABLED)
        find_dependency(OpenMP)
        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
    endif()

    if (GGML_CPU_HBM)
        find_library(memkind memkind)
        if(NOT memkind)
            # Report the package as not found and stop processing this file.
            set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
            return()
        endif()
        # Note: the bare library name is appended here, not the path found above.
        list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES memkind)
    endif()

    if (GGML_BLAS)
        find_dependency(BLAS)
        list(APPEND GGML_BLAS_INTERFACE_LINK_LIBRARIES ${BLAS_LIBRARIES})
        list(APPEND GGML_BLAS_INTERFACE_LINK_OPTIONS   ${BLAS_LINKER_FLAGS})
    endif()

    if (GGML_CUDA)
        set(GGML_CUDA_INTERFACE_LINK_LIBRARIES "")
        find_dependency(CUDAToolkit)
        if (GGML_STATIC)
            # cudart is always the static variant here; cuBLAS/cuBLASLt use the
            # non-static imported targets on WIN32 and the static ones elsewhere.
            list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cudart_static>)
            if (WIN32)
                list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas> $<LINK_ONLY:CUDA::cublasLt>)
            else()
                list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cublas_static> $<LINK_ONLY:CUDA::cublasLt_static>)
            endif()
        endif()
        # The driver library is linked unless GGML_CUDA_NO_VMM is set.
        if (NOT GGML_CUDA_NO_VMM)
            list(APPEND GGML_CUDA_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:CUDA::cuda_driver>)
        endif()
    endif()

    if (GGML_METAL)
        find_library(FOUNDATION_LIBRARY Foundation)
        find_library(METAL_FRAMEWORK    Metal)
        find_library(METALKIT_FRAMEWORK MetalKit)
        if(NOT FOUNDATION_LIBRARY OR NOT METAL_FRAMEWORK OR NOT METALKIT_FRAMEWORK)
            # All three are required; otherwise report the package as not found.
            set(${CMAKE_FIND_PACKAGE_NAME}_FOUND 0)
            return()
        endif()
        set(GGML_METAL_INTERFACE_LINK_LIBRARIES
            ${FOUNDATION_LIBRARY} ${METAL_FRAMEWORK} ${METALKIT_FRAMEWORK})
    endif()

    if (GGML_OPENCL)
        find_dependency(OpenCL)
        set(GGML_OPENCL_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:OpenCL::OpenCL>)
    endif()

    if (GGML_VULKAN)
        find_dependency(Vulkan)
        set(GGML_VULKAN_INTERFACE_LINK_LIBRARIES $<LINK_ONLY:Vulkan::Vulkan>)
    endif()

    if (GGML_HIP)
        find_dependency(hip)
        find_dependency(hipblas)
        find_dependency(rocblas)
        set(GGML_HIP_INTERFACE_LINK_LIBRARIES hip::host roc::rocblas roc::hipblas)
    endif()

    if (GGML_SYCL)
        set(GGML_SYCL_INTERFACE_LINK_LIBRARIES "")
        # DNNL is looked up with plain find_package(), so a missing DNNL does not
        # fail this package; it is only linked when found and the target is INTEL.
        find_package(DNNL)
        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
        endif()
        # The IntelSYCL and MKL dependencies are only resolved on WIN32.
        if (WIN32)
            find_dependency(IntelSYCL)
            find_dependency(MKL)
            list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
        endif()
    endif()
endif()

# Resolve the installed include and library directories. set_and_check() is the
# package-config helper that also verifies that the path exists.
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")

# Create the imported targets only once, even if this file is processed again.
if(NOT TARGET ggml::ggml)
    find_package(Threads REQUIRED)

    # Locate the main library, hinting at the installed library directory.
    find_library(GGML_LIBRARY ggml
        REQUIRED
        HINTS ${GGML_LIB_DIR}
        NO_CMAKE_FIND_ROOT_PATH)

    add_library(ggml::ggml UNKNOWN IMPORTED)
    set_target_properties(ggml::ggml
        PROPERTIES
            IMPORTED_LOCATION "${GGML_LIBRARY}")

    # Same lookup for the base library that every backend target links to.
    find_library(GGML_BASE_LIBRARY ggml-base
        REQUIRED
        HINTS ${GGML_LIB_DIR}
        NO_CMAKE_FIND_ROOT_PATH)

    add_library(ggml::ggml-base UNKNOWN IMPORTED)
    set_target_properties(ggml::ggml-base
        PROPERTIES
            IMPORTED_LOCATION "${GGML_BASE_LIBRARY}")

    # Backend targets are only imported when GGML_BACKEND_DL is false; otherwise
    # the list below stays empty.
    set(_ggml_all_targets "")
    if (NOT GGML_BACKEND_DL)
        foreach(_ggml_backend ${GGML_AVAILABLE_BACKENDS})
            # Derive the variable prefix from the backend name,
            # e.g. ggml-cuda -> GGML_CUDA.
            string(REPLACE "-" "_" _ggml_backend_pfx "${_ggml_backend}")
            string(TOUPPER "${_ggml_backend_pfx}" _ggml_backend_pfx)

            find_library(${_ggml_backend_pfx}_LIBRARY ${_ggml_backend}
                REQUIRED
                HINTS ${GGML_LIB_DIR}
                NO_CMAKE_FIND_ROOT_PATH)

            message(STATUS "Found ${${_ggml_backend_pfx}_LIBRARY}")

            add_library(ggml::${_ggml_backend} UNKNOWN IMPORTED)
            set_target_properties(ggml::${_ggml_backend}
                PROPERTIES
                    INTERFACE_INCLUDE_DIRECTORIES "${GGML_INCLUDE_DIR}"
                    IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
                    IMPORTED_LOCATION "${${_ggml_backend_pfx}_LIBRARY}"
                    INTERFACE_COMPILE_FEATURES c_std_90
                    POSITION_INDEPENDENT_CODE ON)

            # Every backend whose name starts with "ggml-cpu" shares the
            # GGML_CPU_INTERFACE_* variables; other backends use the variables
            # named after their own prefix.
            string(REGEX MATCH "^ggml-cpu" is_cpu_variant "${_ggml_backend}")
            if(is_cpu_variant)
                list(APPEND GGML_CPU_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                set_target_properties(ggml::${_ggml_backend}
                PROPERTIES
                    INTERFACE_LINK_LIBRARIES "${GGML_CPU_INTERFACE_LINK_LIBRARIES}")

                if(GGML_CPU_INTERFACE_LINK_OPTIONS)
                    set_target_properties(ggml::${_ggml_backend}
                        PROPERTIES
                            INTERFACE_LINK_OPTIONS "${GGML_CPU_INTERFACE_LINK_OPTIONS}")
                endif()

            else()
                list(APPEND ${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES "ggml::ggml-base")
                set_target_properties(ggml::${_ggml_backend}
                    PROPERTIES
                        INTERFACE_LINK_LIBRARIES "${${_ggml_backend_pfx}_INTERFACE_LINK_LIBRARIES}")

                if(${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS)
                    set_target_properties(ggml::${_ggml_backend}
                        PROPERTIES
                            INTERFACE_LINK_OPTIONS "${${_ggml_backend_pfx}_INTERFACE_LINK_OPTIONS}")
                endif()
            endif()

            list(APPEND _ggml_all_targets ggml::${_ggml_backend})
        endforeach()
    endif()

    # ggml::ggml links to the base library plus every imported backend target.
    list(APPEND GGML_INTERFACE_LINK_LIBRARIES ggml::ggml-base "${_ggml_all_targets}")
    set_target_properties(ggml::ggml
        PROPERTIES
            INTERFACE_LINK_LIBRARIES "${GGML_INTERFACE_LINK_LIBRARIES}")

    # ggml::all is an interface-only target that links to the backend targets.
    add_library(ggml::all INTERFACE IMPORTED)
    set_target_properties(ggml::all
        PROPERTIES
            INTERFACE_LINK_LIBRARIES "${_ggml_all_targets}")

endif()

check_required_components(ggml)


================================================
FILE: docs/gguf.md
================================================
# GGUF

GGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.

It is a successor file format to GGML, GGMF and GGJT, and is designed to be unambiguous by containing all the information needed to load a model. It is also designed to be extensible, so that new information can be added to models without breaking compatibility.

For more information about the motivation behind GGUF, see [Historical State of Affairs](#historical-state-of-affairs).

## Specification

GGUF is a format based on the existing GGJT, but makes a few changes to the format to make it more extensible and easier to use. The following features are desired:

- Single-file deployment: they can be easily distributed and loaded, and do not require any external files for additional information.
- Extensible: new features can be added to GGML-based executors/new information can be added to GGUF models without breaking compatibility with existing models.
- `mmap` compatibility: models can be loaded using `mmap` for fast loading and saving.
- Easy to use: models can be easily loaded and saved using a small amount of code, with no need for external libraries, regardless of the language used.
- Full information: all information needed to load a model is contained in the model file, and no additional information needs to be provided by the user.

The key difference between GGJT and GGUF is the use of a key-value structure for the hyperparameters (now referred to as metadata), rather than a list of untyped values. This allows for new metadata to be added without breaking compatibility with existing models, and to annotate the model with additional information that may be useful for inference or for identifying the model.

### GGUF Naming Convention

GGUF files follow a naming convention of `<BaseName><SizeLabel><FineTune><Version><Encoding><Type><Shard>.gguf`, where each component is delimited by a `-` if present. Ultimately this is intended to make it easier for humans to get the most important details of a model at a glance. It is not intended to be perfectly parsable in the field due to the diversity of existing gguf filenames.

The components are:
1. **BaseName**: A descriptive name for the model base type or architecture.
    - This can be derived from gguf metadata `general.basename`, replacing spaces with dashes.
1. **SizeLabel**: Parameter weight class (useful for leader boards) represented as `<expertCount>x<count><scale-prefix>`
    - This can be derived from gguf metadata `general.size_label` if available or calculated if missing.
    - A rounded decimal count is supported, followed by a single-letter scale prefix that stands in for the power-of-ten exponent, as shown below:
      - `Q`: Quadrillion parameters.
      - `T`: Trillion parameters.
      - `B`: Billion parameters.
      - `M`: Million parameters.
      - `K`: Thousand parameters.
    - Additional `-<attributes><count><scale-prefix>` can be appended as needed to indicate other attributes of interest
1. **FineTune**: A descriptive name for the model fine tuning goal (e.g. Chat, Instruct, etc...)
    - This can be derived from gguf metadata `general.finetune`, replacing spaces with dashes.
1. **Version**: (Optional) Denotes the model version number, formatted as `v<Major>.<Minor>`
    - If model is missing a version number then assume `v1.0` (First Public Release)
    - This can be derived from gguf metadata `general.version`
1. **Encoding**: Indicates the weights encoding scheme that was applied to the model. Content, type mixture and arrangement however are determined by user code and can vary depending on project needs.
1. **Type**: Indicates the kind of gguf file and the intended purpose for it
    - If missing, then file is by default a typical gguf tensor model file
    - `LoRA` : GGUF file is a LoRA adapter
    - `vocab` : GGUF file with only vocab data and metadata
1. **Shard**: (Optional) Indicates and denotes that the model has been split into multiple shards, formatted as `<ShardNum>-of-<ShardTotal>`.
    - *ShardNum* : Shard position in this model. Must be 5 digits padded by zeros.
      - Shard number always starts from `00001` onwards (e.g. First shard always starts at `00001-of-XXXXX` rather than `00000-of-XXXXX`).
    - *ShardTotal* : Total number of shards in this model. Must be 5 digits padded by zeros.


#### Validating Above Naming Convention

At a minimum, all model files should have BaseName, SizeLabel and Version, so that they can be easily validated as files that are in keeping with the GGUF Naming Convention. An example of why this matters is that it is easy for Encoding to be mistaken for a FineTune if Version is omitted.

To validate you can use this regular expression `^(?<BaseName>[A-Za-z0-9\s]*(?:(?:-(?:(?:[A-Za-z\s][A-Za-z0-9\s]*)|(?:[0-9\s]*)))*))-(?:(?<SizeLabel>(?:\d+x)?(?:\d+\.)?\d+[A-Za-z](?:-[A-Za-z]+(\d+\.)?\d+[A-Za-z]+)?)(?:-(?<FineTune>[A-Za-z0-9\s-]+))?)?-(?:(?<Version>v\d+(?:\.\d+)*))(?:-(?<Encoding>(?!LoRA|vocab)[\w_]+))?(?:-(?<Type>LoRA|vocab))?(?:-(?<Shard>\d{5}-of-\d{5}))?\.gguf$` which will check that you got the minimum BaseName, SizeLabel and Version present in the correct order.

For example:

  * `Mixtral-8x7B-v0.1-KQ2.gguf`:
    - Model Name: Mixtral
    - Expert Count: 8
    - Parameter Count: 7B
    - Version Number: v0.1
    - Weight Encoding Scheme: KQ2

  * `Hermes-2-Pro-Llama-3-8B-F16.gguf`:
    - Model Name: Hermes 2 Pro Llama 3
    - Expert Count: 0
    - Parameter Count: 8B
    - Version Number: v1.0
    - Weight Encoding Scheme: F16
    - Shard: N/A

  * `Grok-100B-v1.0-Q4_0-00003-of-00009.gguf`:
    - Model Name: Grok
    - Expert Count: 0
    - Parameter Count: 100B
    - Version Number: v1.0
    - Weight Encoding Scheme: Q4_0
    - Shard: 3 out of 9 total shards


<details><summary>Example Node.js Regex Function</summary>

```js
#!/usr/bin/env node
// Pattern for the GGUF naming convention; each named group is one filename component.
const ggufRegex = /^(?<BaseName>[A-Za-z0-9\s]*(?:(?:-(?:(?:[A-Za-z\s][A-Za-z0-9\s]*)|(?:[0-9\s]*)))*))-(?:(?<SizeLabel>(?:\d+x)?(?:\d+\.)?\d+[A-Za-z](?:-[A-Za-z]+(\d+\.)?\d+[A-Za-z]+)?)(?:-(?<FineTune>[A-Za-z0-9\s-]+))?)?-(?:(?<Version>v\d+(?:\.\d+)*))(?:-(?<Encoding>(?!LoRA|vocab)[\w_]+))?(?:-(?<Type>LoRA|vocab))?(?:-(?<Shard>\d{5}-of-\d{5}))?\.gguf$/;

/**
 * Split a GGUF filename into its naming-convention components.
 *
 * @param {string} filename - File name to parse, including the `.gguf` suffix.
 * @returns {?Object} An object with the keys BaseName, SizeLabel, FineTune,
 *   Version, Encoding, Type and Shard (in that order), or `null` when the
 *   name does not match the pattern. Components absent from the name are
 *   `null`, except Version, which falls back to "v1.0".
 */
function parseGGUFFilename(filename) {
  const found = ggufRegex.exec(filename);
  if (found === null) {
    return null;
  }

  // Keys are inserted in a fixed order so serialised results compare stably.
  const componentNames = ['BaseName', 'SizeLabel', 'FineTune', 'Version', 'Encoding', 'Type', 'Shard'];
  const parsed = {};
  for (const component of componentNames) {
    const captured = found.groups[component];
    if (captured !== undefined) {
      parsed[component] = captured;
    } else if (component === 'Version') {
      parsed[component] = "v1.0";
    } else {
      parsed[component] = null;
    }
  }
  return parsed;
}

// Sample filenames paired with the object parseGGUFFilename() is expected to
// return for each (null when the filename should be rejected).
const testCases = [
  {filename: 'Mixtral-8x7B-v0.1-KQ2.gguf',                         expected: { BaseName: 'Mixtral',              SizeLabel: '8x7B',     FineTune: null, Version: 'v0.1',   Encoding: 'KQ2',  Type: null, Shard: null}},
  {filename: 'Grok-100B-v1.0-Q4_0-00003-of-00009.gguf',            expected: { BaseName: 'Grok',                 SizeLabel: '100B',     FineTune: null, Version: 'v1.0',   Encoding: 'Q4_0', Type: null, Shard: "00003-of-00009"}},
  {filename: 'Hermes-2-Pro-Llama-3-8B-v1.0-F16.gguf',              expected: { BaseName: 'Hermes-2-Pro-Llama-3', SizeLabel: '8B', FineTune: null, Version: 'v1.0',   Encoding: 'F16',  Type: null, Shard: null}},
  {filename: 'Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf', expected: { BaseName: 'Phi-3-mini',   SizeLabel: '3.8B-ContextLength4k', FineTune: 'instruct', Version: 'v1.0',   Encoding: null,  Type: null, Shard: null}},
  {filename: 'not-a-known-arrangement.gguf',                       expected: null},
];

// Run every case and print PASS/FAIL; on failure also print the actual and
// expected values. Results are compared via their JSON serialisation, so key
// order matters as well as the values.
testCases.forEach(({ filename, expected }) => {
  const result = parseGGUFFilename(filename);
  const passed = JSON.stringify(result) === JSON.stringify(expected);
  console.log(`${filename}: ${passed ? "PASS" : "FAIL"}`);
  if (!passed) {
      console.log(result);
      console.log(expected);
  }
});
```

</details>


### File Structure

![image](https://github.com/ggerganov/ggml/assets/1991296/c3623641-3a1d-408e-bfaf-1b7c4e16aa63)
*diagram by [@mishig25](https://github.com/mishig25) (GGUF v3)*

GGUF files are structured as follows. They use a global alignment specified in the `general.alignment` metadata field, referred to as `ALIGNMENT` below. Where required, the file is padded with `0x00` bytes to the next multiple of `general.alignment`.

Fields, including arrays, are written sequentially without alignment unless otherwise specified.

Models are little-endian by default. They can also come in big-endian for use with big-endian computers; in this case, all values (including metadata values and tensors) will also be big-endian. At the time of writing, there is no way to determine if a model is big-endian; this may be rectified in future versions. If no additional information is provided, assume the model is little-endian.

```c
// Tensor data types, stored as a 32-bit value in `gguf_tensor_info_t.type`.
// Commented-out entries are not defined by this enum; their numeric values
// are left unused by the remaining entries.
enum ggml_type: uint32_t {
    GGML_TYPE_F32     = 0,
    GGML_TYPE_F16     = 1,
    GGML_TYPE_Q4_0    = 2,
    GGML_TYPE_Q4_1    = 3,
    // GGML_TYPE_Q4_2 = 4, support has been removed
    // GGML_TYPE_Q4_3 = 5, support has been removed
    GGML_TYPE_Q5_0    = 6,
    GGML_TYPE_Q5_1    = 7,
    GGML_TYPE_Q8_0    = 8,
    GGML_TYPE_Q8_1    = 9,
    GGML_TYPE_Q2_K    = 10,
    GGML_TYPE_Q3_K    = 11,
    GGML_TYPE_Q4_K    = 12,
    GGML_TYPE_Q5_K    = 13,
    GGML_TYPE_Q6_K    = 14,
    GGML_TYPE_Q8_K    = 15,
    GGML_TYPE_IQ2_XXS = 16,
    GGML_TYPE_IQ2_XS  = 17,
    GGML_TYPE_IQ3_XXS = 18,
    GGML_TYPE_IQ1_S   = 19,
    GGML_TYPE_IQ4_NL  = 20,
    GGML_TYPE_IQ3_S   = 21,
    GGML_TYPE_IQ2_S   = 22,
    GGML_TYPE_IQ4_XS  = 23,
    GGML_TYPE_I8      = 24,
    GGML_TYPE_I16     = 25,
    GGML_TYPE_I32     = 26,
    GGML_TYPE_I64     = 27,
    GGML_TYPE_F64     = 28,
    GGML_TYPE_IQ1_M   = 29,
    GGML_TYPE_BF16    = 30,
    // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
    // GGML_TYPE_Q4_0_4_8 = 32,
    // GGML_TYPE_Q4_0_8_8 = 33,
    GGML_TYPE_TQ1_0   = 34,
    GGML_TYPE_TQ2_0   = 35,
    // GGML_TYPE_IQ4_NL_4_4 = 36,
    // GGML_TYPE_IQ4_NL_4_8 = 37,
    // GGML_TYPE_IQ4_NL_8_8 = 38,
    GGML_TYPE_MXFP4   = 39, // MXFP4 (1 block)
    // One past the highest value listed above.
    GGML_TYPE_COUNT   = 40,
};

// Type tag for a metadata value, stored as a 32-bit value. It selects which
// member of `gguf_metadata_value_t` is present.
enum gguf_metadata_value_type: uint32_t {
    // The value is a 8-bit unsigned integer.
    GGUF_METADATA_VALUE_TYPE_UINT8 = 0,
    // The value is a 8-bit signed integer.
    GGUF_METADATA_VALUE_TYPE_INT8 = 1,
    // The value is a 16-bit unsigned little-endian integer.
    GGUF_METADATA_VALUE_TYPE_UINT16 = 2,
    // The value is a 16-bit signed little-endian integer.
    GGUF_METADATA_VALUE_TYPE_INT16 = 3,
    // The value is a 32-bit unsigned little-endian integer.
    GGUF_METADATA_VALUE_TYPE_UINT32 = 4,
    // The value is a 32-bit signed little-endian integer.
    GGUF_METADATA_VALUE_TYPE_INT32 = 5,
    // The value is a 32-bit IEEE754 floating point number.
    GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,
    // The value is a boolean.
    // 1-byte value where 0 is false and 1 is true.
    // Anything else is invalid, and should be treated as either the model being invalid or the reader being buggy.
    GGUF_METADATA_VALUE_TYPE_BOOL = 7,
    // The value is a UTF-8 non-null-terminated string, with length prepended.
    GGUF_METADATA_VALUE_TYPE_STRING = 8,
    // The value is an array of other values, with the length and type prepended.
    //
    // Arrays can be nested, and the length of the array is the number of elements in the array, not the number of bytes.
    GGUF_METADATA_VALUE_TYPE_ARRAY = 9,
    // The value is a 64-bit unsigned little-endian integer.
    GGUF_METADATA_VALUE_TYPE_UINT64 = 10,
    // The value is a 64-bit signed little-endian integer.
    GGUF_METADATA_VALUE_TYPE_INT64 = 11,
    // The value is a 64-bit IEEE754 floating point number.
    GGUF_METADATA_VALUE_TYPE_FLOAT64 = 12,
};

// A string in GGUF: a 64-bit byte length followed by that many bytes of
// UTF-8 text, with no terminating null byte.
struct gguf_string_t {
    // The length of the string, in bytes.
    uint64_t len;
    // The string as a UTF-8 non-null-terminated string.
    char string[len];
};

// The payload of a metadata value. Exactly one member is present; which one
// is selected by the accompanying `gguf_metadata_value_type` (the `value_type`
// field of `gguf_metadata_kv_t`, or the `type` field below for array elements).
union gguf_metadata_value_t {
    uint8_t uint8;
    int8_t int8;
    uint16_t uint16;
    int16_t int16;
    uint32_t uint32;
    int32_t int32;
    float float32;
    uint64_t uint64;
    int64_t int64;
    double float64;
    bool bool_;
    gguf_string_t string;
    struct {
        // Any value type is valid, including arrays.
        gguf_metadata_value_type type;
        // Number of elements, not bytes
        uint64_t len;
        // The array of values.
        gguf_metadata_value_t array[len];
    } array;
};

// One metadata key-value pair: a string key, a type tag, and the value itself.
struct gguf_metadata_kv_t {
    // The key of the metadata. It is a standard GGUF string, with the following caveats:
    // - It must be a valid ASCII string.
    // - It must be a hierarchical key, where each segment is `lower_snake_case` and separated by a `.`.
    // - It must be at most 2^16-1/65535 bytes long.
    // Any keys that do not follow these rules are invalid.
    gguf_string_t key;

    // The type of the value.
    // Must be one of the `gguf_metadata_value_type` values.
    gguf_metadata_value_type value_type;
    // The value.
    gguf_metadata_value_t value;
};

// The file header: magic, format version, tensor and metadata counts,
// followed by the metadata key-value pairs themselves.
struct gguf_header_t {
    // Magic number to announce that this is a GGUF file.
    // Must be `GGUF` at the byte level: `0x47` `0x47` `0x55` `0x46`.
    // If your executor reads this field as a little-endian 32-bit integer, it
    // can instead check for 0x46554747 and let the endianness cancel out.
    // Consider being *very* explicit about the byte order here.
    uint32_t magic;
    // The version of the format implemented.
    // Must be `3` for version described in this spec, which introduces big-endian support.
    //
    // This version should only be increased for structural changes to the format.
    // Changes that do not affect the structure of the file should instead update the metadata
    // to signify the change.
    uint32_t version;
    // The number of tensors in the file.
    // This is explicit, instead of being included in the metadata, to ensure it is always present
    // for loading the tensors.
    uint64_t tensor_count;
    // The number of metadata key-value pairs.
    uint64_t metadata_kv_count;
    // The metadata key-value pairs.
    gguf_metadata_kv_t metadata_kv[metadata_kv_count];
};

// Rounds `offset` up to the next multiple of `ALIGNMENT`; an offset that is
// already a multiple of `ALIGNMENT` is returned unchanged.
uint64_t align_offset(uint64_t offset) {
    uint64_t remainder = offset % ALIGNMENT;
    if (remainder == 0) {
        return offset;
    }
    return offset + (ALIGNMENT - remainder);
}

// Describes one tensor: its name, shape, data type, and where its data
// starts within `tensor_data`.
struct gguf_tensor_info_t {
    // The name of the tensor. It is a standard GGUF string, with the caveat that
    // it must be at most 64 bytes long.
    gguf_string_t name;
    // The number of dimensions in the tensor.
    // Currently at most 4, but this may change in the future.
    uint32_t n_dimensions;
    // The dimensions of the tensor.
    uint64_t dimensions[n_dimensions];
    // The type of the tensor.
    ggml_type type;
    // The offset of the tensor's data in this file in bytes.
    //
    // This offset is relative to `tensor_data`, not to the start
    // of the file, to make it easier for writers to write the file.
    // Readers should consider exposing this offset relative to the
    // file to make it easier to read the data.
    //
    // Must be a multiple of `ALIGNMENT`. That is, `align_offset(offset) == offset`.
    uint64_t offset;
};

// The complete file layout, in order: header, tensor infos, padding up to
// the next multiple of `ALIGNMENT`, then the tensor data.
struct gguf_file_t {
    // The header of the file.
    gguf_header_t header;

    // Tensor infos, which can be used to locate the tensor data.
    gguf_tensor_info_t tensor_infos[header.tensor_count];

    // Padding to the nearest multiple of `ALIGNMENT`.
    //
    // That is, if `sizeof(header) + sizeof(tensor_infos)` is not a multiple of `ALIGNMENT`,
    // this padding is added to make it so.
    //
    // This can be calculated as `align_offset(position) - position`, where `position` is
    // the position of the end of `tensor_infos` (i.e. `sizeof(header) + sizeof(tensor_infos)`).
    uint8_t _padding[];

    // Tensor data.
    //
    // This is arbitrary binary data corresponding to the weights of the model. This data should be close
    // or identical to the data in the original model file, but may be different due to quantization or
    // other optimizations for inference. Any such deviations should be recorded in the metadata or as
    // part of the architecture definition.
    //
    // Each tensor's data must be stored within this array, and located through its `tensor_infos` entry.
    // The offset of each tensor's data must be a multiple of `ALIGNMENT`, and the space between tensors
    // should be padded to `ALIGNMENT` bytes.
    uint8_t tensor_data[];
};
```

## Standardized key-value pairs

The following key-value pairs are standardized. This list may grow in the future as more use cases are discovered. Where possible, names are shared with the original model definitions to make it easier to map between the two.

Not all of these are required, but they are all recommended. Keys that are required are bolded. For omitted pairs, the reader should assume that the value is unknown and either default or error as appropriate.

The community can develop their own key-value pairs to carry additional data. However, these should be namespaced with the relevant community name to avoid collisions. For example, the `rustformers` community might use `rustformers.` as a prefix for all of their keys.

If a particular community key is widely used, it may be promoted to a standardized key.

By convention, most counts/lengths/etc are `uint64` unless otherwise specified. This is to allow for larger models to be supported in the future. Some models may use `uint32` for their values; it is recommended that readers support both.

### General

#### Required

- **`general.architecture: string`**: describes what architecture this model implements. All lowercase ASCII, with only `[a-z0-9]+` characters allowed. Known values include:
  - `llama`
  - `mpt`
  - `gptneox`
  - `gptj`
  - `gpt2`
  - `bloom`
  - `falcon`
  - `mamba`
  - `rwkv`
- **`general.quantization_version: uint32`**: The version of the quantization format. Not required if the model is not quantized (i.e. no tensors are quantized). If any tensors are quantized, this _must_ be present. This is separate from the quantization scheme of the tensors themselves; the quantization version may change without changing the scheme's name (e.g. the quantization scheme is Q5_K, and the quantization version is 4).
- **`general.alignment: uint32`**: the global alignment to use, as described above. This can vary to allow for different alignment schemes, but it must be a multiple of 8. Some writers may not write the alignment. If the alignment is **not** specified, assume it is `32`.

#### General metadata

- `general.name: string`: The name of the model. This should be a human-readable name that can be used to identify the model. It should be unique within the community that the model is defined in.
- `general.author: string`: The author of the model.
- `general.version: string`: The version of the model.
- `general.organization: string`: The organization of the model.
- `general.basename: string`: The base model name / architecture of the model
- `general.finetune: string`: What has the base model been optimized toward.
- `general.description: string`: free-form description of the model including anything that isn't covered by the other fields
- `general.quantized_by: string`: The name of the individual who quantized the model
- `general.size_label: string`: Size class of the model, such as number of weights and experts. (Useful for leader boards)
- `general.license: string`: License of the model, expressed as a [SPDX license expression](https://spdx.github.io/spdx-spec/v2-draft/SPDX-license-expressions/) (e.g. `"MIT OR Apache-2.0"`). Do not include any other information, such as the license text or the URL to the license.
- `general.license.name: string`: Human friendly license name
- `general.license.link: string`: URL to the license.
- `general.url: string`: URL to the model's homepage. This can be a GitHub repo, a paper, etc.
- `general.doi: string`: Digital Object Identifier (DOI) https://www.doi.org/
- `general.uuid: string`: [Universally unique identifier](https://en.wikipedia.org/wiki/Universally_unique_identifier)
- `general.repo_url: string`: URL to the model's repository such as a GitHub repo or HuggingFace repo
- `general.tags: string[]`: List of tags that can be used as search terms for a search engine or social media
- `general.languages: string[]`: What languages can the model speak. Encoded as [ISO 639](https://en.wikipedia.org/wiki/List_of_ISO_639_language_codes) two letter codes
- `general.datasets: string[]`: Links or references to datasets that the model was trained upon
- `general.file_type: uint32`: An enumerated value describing the type of the majority of the tensors in the file. Optional; can be inferred from the tensor types.
  - `ALL_F32 = 0`
  - `MOSTLY_F16 = 1`
  - `MOSTLY_Q4_0 = 2`
  - `MOSTLY_Q4_1 = 3`
  - `MOSTLY_Q4_1_SOME_F16 = 4`
  - `MOSTLY_Q4_2 = 5` (support removed)
  - `MOSTLY_Q4_3 = 6` (support removed)
  - `MOSTLY_Q8_0 = 7`
  - `MOSTLY_Q5_0 = 8`
  - `MOSTLY_Q5_1 = 9`
  - `MOSTLY_Q2_K = 10`
  - `MOSTLY_Q3_K_S = 11`
  - `MOSTLY_Q3_K_M = 12`
  - `MOSTLY_Q3_K_L = 13`
  - `MOSTLY_Q4_K_S = 14`
  - `MOSTLY_Q4_K_M = 15`
  - `MOSTLY_Q5_K_S = 16`
  - `MOSTLY_Q5_K_M = 17`
  - `MOSTLY_Q6_K = 18`

#### Source metadata

Information about where this model came from. This is useful for tracking the provenance of the model, and for finding the original source if the model is modified. For a model that was converted from GGML, for example, these keys would point to the model that was converted from.

- `general.source.url: string`: URL to the source of the model's homepage. This can be a GitHub repo, a paper, etc.
- `general.source.doi: string`: Source Digital Object Identifier (DOI) https://www.doi.org/
- `general.source.uuid: string`: Source [Universally unique identifier](https://en.wikipedia.org/wiki/Universally_unique_identifier)
- `general.source.repo_url: string`: URL to the source of the model's repository such as a GitHub repo or HuggingFace repo

- `general.base_model.count: uint32`: Number of parent models
- `general.base_model.{id}.name: string`: The name of the parent model.
- `general.base_model.{id}.author: string`: The author of the parent model.
- `general.base_model.{id}.version: string`: The version of the parent model.
- `general.base_model.{id}.organization: string`: The organization of the parent model.
- `general.base_model.{id}.url: string`: URL to the source of the parent model's homepage. This can be a GitHub repo, a paper, etc.
- `general.base_model.{id}.doi: string`: Parent Digital Object Identifier (DOI) https://www.doi.org/
- `general.base_model.{id}.uuid: string`: Parent [Universally unique identifier](https://en.wikipedia.org/wiki/Universally_unique_identifier)
- `general.base_model.{id}.repo_url: string`: URL to the source of the parent model's repository such as a GitHub repo or HuggingFace repo

### LLM

In the following, `[llm]` is used to fill in for the name of a specific LLM architecture. For example, `llama` for LLaMA, `mpt` for MPT, etc. If mentioned in an architecture's section, it is required for that architecture, but not all keys are required for all architectures. Consult the relevant section for more information.

- `[llm].context_length: uint64`: Also known as `n_ctx`. Length of the context (in tokens) that the model was trained on. For most architectures, this is the hard limit on the length of the input. Architectures, like RWKV, that are not reliant on transformer-style attention may be able to handle larger inputs, but this is not guaranteed.
- `[llm].embedding_length: uint64`: Also known as `n_embd`. Embedding layer size.
- `[llm].block_count: uint64`: The number of blocks of attention+feed-forward layers (i.e. the bulk of the LLM). Does not include the input or embedding layers.
- `[llm].feed_forward_length: uint64`: Also known as `n_ff`. The length of the feed-forward layer.
- `[llm].use_parallel_residual: bool`: Whether or not the parallel residual logic should be used.
- `[llm].tensor_data_layout: string`: When a model is converted to GGUF, tensors may be rearranged to improve performance. This key describes the layout of the tensor data. This is not required; if not present, it is assumed to be `reference`.
  - `reference`: tensors are laid out in the same order as the original model
  - further options can be found for each architecture in their respective sections
- `[llm].expert_count: uint32`: Number of experts in MoE models (optional for non-MoE arches).
- `[llm].expert_used_count: uint32`: Number of experts used during each token evaluation (optional for non-MoE arches).

#### Attention

- `[llm].attention.head_count: uint64`: Also known as `n_head`. Number of attention heads.
- `[llm].attention.head_count_kv: uint64`: The number of heads per group used in Grouped-Query-Attention. If not present or if present and equal to `[llm].attention.head_count`, the model does not use GQA.
- `[llm].attention.max_alibi_bias: float32`: The maximum bias to use for ALiBI.
- `[llm].attention.clamp_kqv: float32`: Value (`C`) to clamp the values of the `Q`, `K`, and `V` tensors between (`[-C, C]`).
- `[llm].attention.layer_norm_epsilon: float32`: Layer normalization epsilon.
- `[llm].attention.layer_norm_rms_epsilon: float32`: Layer RMS normalization epsilon.
- `[llm].attention.key_length: uint32`: The optional size of a key head, $d_k$. If not specified, it will be `n_embd / n_head`.
- `[llm].attention.value_length: uint32`: The optional size of a value head, $d_v$. If not specified, it will be `n_embd / n_head`.

#### RoPE

- `[llm].rope.dimension_count: uint64`: The number of rotary dimensions for RoPE.
- `[llm].rope.freq_base: float32`: The base frequency for RoPE.

##### Scaling

The following keys describe RoPE scaling parameters:

- `[llm].rope.scaling.type: string`: Can be `none`, `linear`, or `yarn`.
- `[llm].rope.scaling.factor: float32`: A scale factor for RoPE to adjust the context length.
- `[llm].rope.scaling.original_context_length: uint32`: The original context length of the base model.
- `[llm].rope.scaling.finetuned: bool`: True if model has been finetuned with RoPE scaling.

Note that older models may not have these keys, and may instead use the following key:

- `[llm].rope.scale_linear: float32`: A linear scale factor for RoPE to adjust the context length.

It is recommended that models use the newer keys if possible, as they are more flexible and allow for more complex scaling schemes. Executors will need to support both indefinitely.

#### SSM

- `[llm].ssm.conv_kernel: uint32`: The size of the rolling/shift state.
- `[llm].ssm.inner_size: uint32`: The embedding size of the states.
- `[llm].ssm.state_size: uint32`: The size of the recurrent state.
- `[llm].ssm.time_step_rank: uint32`: The rank of time steps.

#### Models

The following sections describe the metadata for each model architecture. Each key specified _must_ be present.

##### LLaMA

- `llama.context_length`
- `llama.embedding_length`
- `llama.block_count`
- `llama.feed_forward_length`
- `llama.rope.dimension_count`
- `llama.attention.head_count`
- `llama.attention.layer_norm_rms_epsilon`

###### Optional

- `llama.rope.scale`
- `llama.attention.head_count_kv`
- `llama.tensor_data_layout`:
  - `Meta AI original pth`:
    ```python
    def permute(weights: NDArray, n_head: int) -> NDArray:
        return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
                    .swapaxes(1, 2)
                    .reshape(weights.shape))
    ```
- `llama.expert_count`
- `llama.expert_used_count`

##### MPT

- `mpt.context_length`
- `mpt.embedding_length`
- `mpt.block_count`
- `mpt.attention.head_count`
- `mpt.attention.alibi_bias_max`
- `mpt.attention.clip_kqv`
- `mpt.attention.layer_norm_epsilon`

##### GPT-NeoX

- `gptneox.context_length`
- `gptneox.embedding_length`
- `gptneox.block_count`
- `gptneox.use_parallel_residual`
- `gptneox.rope.dimension_count`
- `gptneox.attention.head_count`
- `gptneox.attention.layer_norm_epsilon`

###### Optional

- `gptneox.rope.scale`

##### GPT-J

- `gptj.context_length`
- `gptj.embedding_length`
- `gptj.block_count`
- `gptj.rope.dimension_count`
- `gptj.attention.head_count`
- `gptj.attention.layer_norm_epsilon`

###### Optional

- `gptj.rope.scale`

##### GPT-2

- `gpt2.context_length`
- `gpt2.embedding_length`
- `gpt2.block_count`
- `gpt2.attention.head_count`
- `gpt2.attention.layer_norm_epsilon`

##### BLOOM

- `bloom.context_length`
- `bloom.embedding_length`
- `bloom.block_count`
- `bloom.feed_forward_length`
- `bloom.attention.head_count`
- `bloom.attention.layer_norm_epsilon`

##### Falcon

- `falcon.context_length`
- `falcon.embedding_length`
- `falcon.block_count`
- `falcon.attention.head_count`
- `falcon.attention.head_count_kv`
- `falcon.attention.use_norm`
- `falcon.attention.layer_norm_epsilon`

###### Optional

- `falcon.tensor_data_layout`:

  - `jploski` (author of the original GGML implementation of Falcon):

    ```python
    # The original query_key_value tensor contains n_head_kv "kv groups",
    # each consisting of n_head/n_head_kv query weights followed by one key
    # and one value weight (shared by all query heads in the kv group).
    # This layout makes it a big pain to work with in GGML.
    # So we rearrange them here, so that we have n_head query weights
    # followed by n_head_kv key weights followed by n_head_kv value weights,
    # in contiguous fashion.

    if "query_key_value" in src:
        qkv = model[src].view(
            n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)

        q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
        k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
        v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)

        model[src] = torch.cat((q,k,v)).reshape_as(model[src])
    ```

##### Mamba

- `mamba.context_length`
- `mamba.embedding_length`
- `mamba.block_count`
- `mamba.ssm.conv_kernel`
- `mamba.ssm.inner_size`
- `mamba.ssm.state_size`
- `mamba.ssm.time_step_rank`
- `mamba.attention.layer_norm_rms_epsilon`

##### RWKV

The vocabulary size is the same as the number of rows in the `head` matrix.

- `rwkv.architecture_version: uint32`: The only allowed value currently is 4. Version 5 is expected to appear some time in the future.
- `rwkv.context_length: uint64`: Length of the context used during training or fine-tuning. RWKV is able to handle larger context than this limit, but the output quality may suffer.
- `rwkv.block_count: uint64`
- `rwkv.embedding_length: uint64`
- `rwkv.feed_forward_length: uint64`

##### Whisper

Keys that do not have types defined should be assumed to share definitions with `llm.` keys.
(For example, `whisper.context_length` is equivalent to `llm.context_length`.)
This is because they are both transformer models.

- `whisper.encoder.context_length`
- `whisper.encoder.embedding_length`
- `whisper.encoder.block_count`
- `whisper.encoder.mels_count: uint64`
- `whisper.encoder.attention.head_count`

- `whisper.decoder.context_length`
- `whisper.decoder.embedding_length`
- `whisper.decoder.block_count`
- `whisper.decoder.attention.head_count`

#### Prompting

**TODO**: Include prompt format, and/or metadata about how it should be used (instruction, conversation, autocomplete, etc).

### LoRA

**TODO**: Figure out what metadata is needed for LoRA. Probably desired features:

- match an existing model exactly, so that it can't be misapplied
- be marked as a LoRA so executors won't try to run it by itself

Should this be an architecture, or should it share the details of the original model with additional fields to mark it as a LoRA?

### Tokenizer

The following keys are used to describe the tokenizer of the model. It is recommended that model authors support as many of these as possible, as it will allow for better tokenization quality with supported executors.

#### GGML

GGML supports an embedded vocabulary that enables inference of the model, but implementations of tokenization using this vocabulary (i.e. `llama.cpp`'s tokenizer) may have lower accuracy than the original tokenizer used for the model. When a more accurate tokenizer is available and supported, it should be used instead.

It is not guaranteed to be standardized across models, and may change in the future. It is recommended that model authors use a more standardized tokenizer if possible.

- `tokenizer.ggml.model: string`: The name of the tokenizer model.
  - `llama`: Llama style SentencePiece (tokens and scores extracted from HF `tokenizer.model`)
  - `replit`: Replit style SentencePiece (tokens and scores extracted from HF `spiece.model`)
  - `gpt2`: GPT-2 / GPT-NeoX style BPE (tokens extracted from HF `tokenizer.json`)
  - `rwkv`: RWKV tokenizer
- `tokenizer.ggml.tokens: array[string]`: A list of tokens indexed by the token ID used by the model.
- `tokenizer.ggml.scores: array[float32]`: If present, the score/probability of each token. If not present, all tokens are assumed to have equal probability. If present, it must have the same length and index as `tokens`.
- `tokenizer.ggml.token_type: array[int32]`: The token type (1=normal, 2=unknown, 3=control, 4=user defined, 5=unused, 6=byte). If present, it must have the same length and index as `tokens`.
- `tokenizer.ggml.merges: array[string]`: If present, the merges of the tokenizer. If not present, the tokens are assumed to be atomic.
- `tokenizer.ggml.added_tokens: array[string]`: If present, tokens that were added after training.

##### Special tokens

- `tokenizer.ggml.bos_token_id: uint32`: Beginning of sequence marker
- `tokenizer.ggml.eos_token_id: uint32`: End of sequence marker
- `tokenizer.ggml.unknown_token_id: uint32`: Unknown token
- `tokenizer.ggml.separator_token_id: uint32`: Separator token
- `tokenizer.ggml.padding_token_id: uint32`: Padding token

#### Hugging Face

Hugging Face maintains their own `tokenizers` library that supports a wide variety of tokenizers. If your executor uses this library, it may be able to use the model's tokenizer directly.

- `tokenizer.huggingface.json: string`: the entirety of the HF `tokenizer.json` for a given model (e.g. <https://huggingface.co/mosaicml/mpt-7b-instruct/blob/main/tokenizer.json>). Included for compatibility with executors that support HF tokenizers directly.

#### Other

Other tokenizers may be used, but are not necessarily standardized. They may be executor-specific. They will be documented here as they are discovered/further developed.

- `tokenizer.rwkv.world: string`: a RWKV World tokenizer, like [this](https://github.com/BlinkDL/ChatRWKV/blob/main/tokenizer/rwkv_vocab_v20230424.txt). This text file should be included verbatim.
- `tokenizer.chat_template : string`: a Jinja template that specifies the input format expected by the model. For more details see: <https://huggingface.co/docs/transformers/main/en/chat_templating>

### Computation graph

This is a future extension and still needs to be discussed, and may necessitate a new GGUF version. At the time of writing, the primary blocker is the stabilization of the computation graph format.

A sample computation graph of GGML nodes could be included in the model itself, allowing an executor to run the model without providing its own implementation of the architecture. This would allow for a more consistent experience across executors, and would allow for more complex architectures to be supported without requiring the executor to implement them.

## Standardized tensor names

To minimize complexity and maximize compatibility, it is recommended that models using the transformer architecture use the following naming convention for their tensors:

### Base layers

`AA.weight` `AA.bias`

where `AA` can be:

- `token_embd`: Token embedding layer
- `pos_embd`: Position embedding layer
- `output_norm`: Output normalization layer
- `output`: Output layer

### Attention and feed-forward layer blocks

`blk.N.BB.weight` `blk.N.BB.bias`

where N signifies the block number a layer belongs to, and where `BB` could be:

- `attn_norm`: Attention normalization layer
- `attn_norm_2`: Attention normalization layer
- `attn_qkv`: Attention query-key-value layer
- `attn_q`: Attention query layer
- `attn_k`: Attention key layer
- `attn_v`: Attention value layer
- `attn_output`: Attention output layer

- `ffn_norm`: Feed-forward network normalization layer
- `ffn_up`: Feed-forward network "up" layer
- `ffn_gate`: Feed-forward network "gate" layer
- `ffn_down`: Feed-forward network "down" layer
- `ffn_gate_inp`: Expert-routing layer for the Feed-forward network in MoE models
- `ffn_gate_exp`: Feed-forward network "gate" layer per expert in MoE models
- `ffn_down_exp`: Feed-forward network "down" layer per expert in MoE models
- `ffn_up_exp`: Feed-forward network "up" layer per expert in MoE models

- `ssm_in`: State space model input projections layer
- `ssm_conv1d`: State space model rolling/shift layer
- `ssm_x`: State space model selective parametrization layer
- `ssm_a`: State space model state compression layer
- `ssm_d`: State space model skip connection layer
- `ssm_dt`: State space model time step layer
- `ssm_out`: State space model output projection layer

## Version History

This document is actively updated to describe the current state of the metadata, and these changes are not tracked outside of the commits.

However, the format _itself_ has changed. The following sections describe the changes to the format itself.

### v3

Adds big-endian support.

### v2

Most countable values (lengths, etc) were changed from `uint32` to `uint64` to allow for larger models to be supported in the future.

### v1

Initial version.

## Historical State of Affairs

The following information is provided for context, but is not necessary to understand the rest of this document.

### Overview

At present, there are three GGML file formats floating around for LLMs:

- **GGML** (unversioned): baseline format, with no versioning or alignment.
- **GGMF** (versioned): the same as GGML, but with versioning. Only one version exists.
- **GGJT**: Aligns the tensors to allow for use with `mmap`, which requires alignment. v1, v2 and v3 are identical, but the latter versions use a different quantization scheme that is incompatible with previous versions.

GGML is primarily used by the examples in `ggml`, while GGJT is used by `llama.cpp` models. Other executors may use any of the three formats, but this is not 'officially' supported.

These formats share the same fundamental structure:

- a magic number with an optional version number
- model-specific hyperparameters, including
  - metadata about the model, such as the number of layers, the number of heads, etc.
  - a `ftype` that describes the type of the majority of the tensors,
    - for GGML files, the quantization version is encoded in the `ftype` divided by 1000
- an embedded vocabulary, which is a list of strings with length prepended. The GGMF/GGJT formats embed a float32 score next to the strings.
- finally, a list of tensors with their length-prepended name, type, and (aligned, in the case of GGJT) tensor data

Notably, this structure does not identify what model architecture the model belongs to, nor does it offer any flexibility for changing the structure of the hyperparameters. This means that the only way to add new hyperparameters is to add them to the end of the list, which is a breaking change for existing models.

### Drawbacks

Unfortunately, over the last few months, there are a few issues that have become apparent with the existing models:

- There's no way to identify which model architecture a given model is for, because that information isn't present
  - Similarly, existing programs cannot intelligently fail upon encountering new architectures
- Adding or removing any new hyperparameters is a breaking change, which is impossible for a reader to detect without using heuristics
- Each model architecture requires its own conversion script to their architecture's variant of GGML
- Maintaining backwards compatibility without breaking the structure of the format requires clever tricks, like packing the quantization version into the ftype, which are not guaranteed to be picked up by readers/writers, and are not consistent between the two formats

### Why not other formats?

There are a few other formats that could be used, but issues include:

- requiring additional dependencies to load or save the model, which is complicated in a C environment
- limited or no support for 4-bit quantization
- existing cultural expectations (e.g. whether or not the model is a directory or a file)
- lack of support for embedded vocabularies
- lack of control over direction of future development

Ultimately, it is likely that GGUF will remain necessary for the foreseeable future, and it is better to have a single format that is well-documented and supported by all executors than to contort an existing format to fit the needs of GGML.


================================================
FILE: examples/CMakeLists.txt
================================================
# Extra warning flags for the examples. They are only added when
# GGML_ALL_WARNINGS is set and the compiler is not MSVC.
if (GGML_ALL_WARNINGS)
  if (NOT MSVC)
      set(cxx_flags
          # TODO(marella): Add other warnings.
          -Wpedantic
          -Wunused-variable
          -Wno-unused-function
          -Wno-multichar
      )
      # The generator expression applies the flags to C++ sources only.
      add_compile_options("$<$<COMPILE_LANGUAGE:CXX>:${cxx_flags}>")
  endif()
endif()

# Static helper library built from common.cpp; this directory is exported
# as a public include path so dependents can include its headers.
add_library(common STATIC common.cpp)
target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# Static helper library built from common-ggml.cpp; links against ggml privately.
add_library(common-ggml STATIC common-ggml.cpp)
target_link_libraries(common-ggml PRIVATE ggml)
target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})

# yolo is added unconditionally.
add_subdirectory(yolo)

# These examples are only added when GGML_BACKEND_DL is off.
if (NOT GGML_BACKEND_DL)
    add_subdirectory(gpt-2)
    add_subdirectory(gpt-j)
    add_subdirectory(mnist)
    add_subdirectory(sam)
    add_subdirectory(simple)
    add_subdirectory(magika)
endif()

# perf-metal is only added when GGML_METAL is enabled.
if (GGML_METAL)
    add_subdirectory(perf-metal)
endif()


================================================
FILE: examples/common-ggml.cpp
================================================
#include "common-ggml.h"

#include <regex>
#include <map>

// Quantization type names and the ggml_ftype value each one stands for.
// Used by ggml_parse_ftype() for name lookup and by ggml_print_ftypes() for listing.
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
    {"q2_k", GGML_FTYPE_MOSTLY_Q2_K},
    {"q3_k", GGML_FTYPE_MOSTLY_Q3_K},
    {"q4_k", GGML_FTYPE_MOSTLY_Q4_K},
    {"q5_k", GGML_FTYPE_MOSTLY_Q5_K},
    {"q6_k", GGML_FTYPE_MOSTLY_Q6_K},
};

// Writes one line per entry of GGML_FTYPE_MAP to `fp`, showing the ftype
// name and its numeric enum value.
void ggml_print_ftypes(FILE * fp) {
    for (const auto & entry : GGML_FTYPE_MAP) {
        fprintf(fp, "  type = \"%s\" or %d\n", entry.first.c_str(), entry.second);
    }
}

// Converts a textual ftype into its enum value.
//
// A string starting with 'q' is looked up by name in GGML_FTYPE_MAP; if the
// name is not found, an error is printed to stderr and GGML_FTYPE_UNKNOWN is
// returned. Any other string is converted with atoi() and cast to the enum.
enum ggml_ftype ggml_parse_ftype(const char * str) {
    // not a quantization name: interpret as the numeric enum value
    if (str[0] != 'q') {
        return (enum ggml_ftype) atoi(str);
    }

    const auto found = GGML_FTYPE_MAP.find(str);
    if (found != GGML_FTYPE_MAP.end()) {
        return found->second;
    }

    fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
    return GGML_FTYPE_UNKNOWN;
}

// Copies a stream of tensors from `finp` to `fout`, quantizing the selected ones.
//
// Each tensor record is read as: int32 n_dims, int32 name length, int32 type,
// n_dims x int32 dimensions, the name bytes, then the tensor data. Records are
// processed until end-of-file is reached on `finp`.
//
// A tensor is quantized when its name fully matches at least one regex in
// `to_quant`, matches no regex in `to_skip`, and it has exactly 2 dimensions.
// Quantized tensors must be stored as F32 or F16; they are converted to F32 and
// then quantized to the type implied by `ftype`. All other tensors are copied
// through unchanged (their data is assumed to be F32 when type == 0, otherwise
// 2 bytes per element).
//
// Returns false (after printing a message to stderr) when `ftype` does not map
// to a supported quantized type, when a tensor header is malformed, or when a
// tensor selected for quantization has an unsupported source type.
// Returns true otherwise.
bool ggml_common_quantize_0(
        std::ifstream & finp,
        std::ofstream & fout,
        const ggml_ftype ftype,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip) {

    ggml_type qtype = GGML_TYPE_F32;

    switch (ftype) {
        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
        case GGML_FTYPE_MOSTLY_Q2_K: qtype = GGML_TYPE_Q2_K; break;
        case GGML_FTYPE_MOSTLY_Q3_K: qtype = GGML_TYPE_Q3_K; break;
        case GGML_FTYPE_MOSTLY_Q4_K: qtype = GGML_TYPE_Q4_K; break;
        case GGML_FTYPE_MOSTLY_Q5_K: qtype = GGML_TYPE_Q5_K; break;
        case GGML_FTYPE_MOSTLY_Q6_K: qtype = GGML_TYPE_Q6_K; break;
        case GGML_FTYPE_UNKNOWN:
        case GGML_FTYPE_ALL_F32:
        case GGML_FTYPE_MOSTLY_F16:
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
        case GGML_FTYPE_MOSTLY_IQ2_XXS:
        case GGML_FTYPE_MOSTLY_IQ2_XS:
        case GGML_FTYPE_MOSTLY_IQ2_S:
        case GGML_FTYPE_MOSTLY_IQ3_XXS:
        case GGML_FTYPE_MOSTLY_IQ3_S:
        case GGML_FTYPE_MOSTLY_IQ1_S:
        case GGML_FTYPE_MOSTLY_IQ4_NL:
        case GGML_FTYPE_MOSTLY_IQ4_XS:
        case GGML_FTYPE_MOSTLY_IQ1_M:
        case GGML_FTYPE_MOSTLY_BF16:
        case GGML_FTYPE_MOSTLY_MXFP4:
        case GGML_FTYPE_MOSTLY_NVFP4:
                {
                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
                    return false;
                }
    }

    if (!ggml_is_quantized(qtype)) {
        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
        return false;
    }

    // capacity of the per-tensor dimension array below
    const int32_t max_dims = 4;

    size_t total_size_org = 0;
    size_t total_size_new = 0;

    std::vector<float> work;

    std::vector<uint8_t>     data_u8;
    std::vector<ggml_fp16_t> data_f16;
    std::vector<float>       data_f32;

    while (true) {
        int32_t n_dims;
        int32_t length;
        int32_t ttype;

        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

        if (finp.eof()) {
            break;
        }

        // The header comes straight from the file: reject values that would
        // index past the end of `ne` or ask for a negative-length name.
        if (n_dims < 0 || n_dims > max_dims) {
            fprintf(stderr, "%s: invalid number of dimensions %d (expected 0..%d)\n", __func__, n_dims, max_dims);
            return false;
        }

        if (length < 0) {
            fprintf(stderr, "%s: invalid tensor name length %d\n", __func__, length);
            return false;
        }

        int32_t nelements = 1;
        int32_t ne[max_dims] = { 1, 1, 1, 1 };
        for (int i = 0; i < n_dims; ++i) {
            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
            nelements *= ne[i];
        }

        std::string name(length, 0);
        finp.read (&name[0], length);

        printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));

        bool quantize = false;

        // check if we should quantize this tensor
        for (const auto & s : to_quant) {
            if (std::regex_match(name, std::regex(s))) {
                quantize = true;
                break;
            }
        }

        // check if we should skip this tensor
        for (const auto & s : to_skip) {
            if (std::regex_match(name, std::regex(s))) {
                quantize = false;
                break;
            }
        }

        // quantize only 2D tensors
        quantize &= (n_dims == 2);

        if (quantize) {
            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                return false;
            }

            // the quantizer consumes F32, so F16 input is widened first
            if (ttype == GGML_TYPE_F16) {
                data_f16.resize(nelements);
                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
                data_f32.resize(nelements);
                for (int i = 0; i < nelements; ++i) {
                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
                }
            } else {
                data_f32.resize(nelements);
                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
            }

            // from here on the record is written with the quantized type
            ttype = qtype;
        } else {
            // pass-through: type 0 is read as 4 bytes per element, anything else as 2
            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);

            data_u8.resize(nelements*bpe);
            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
        }

        // write the (possibly retyped) tensor header
        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
        for (int i = 0; i < n_dims; ++i) {
            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
        }
        fout.write(&name[0], length);

        if (quantize) {
            work.resize(nelements); // for quantization

            size_t cur_size = 0;
            switch ((ggml_type) ttype) {
                case GGML_TYPE_Q4_0:
                case GGML_TYPE_Q4_1:
                case GGML_TYPE_Q5_0:
                case GGML_TYPE_Q5_1:
                case GGML_TYPE_Q8_0:
                case GGML_TYPE_Q2_K:
                case GGML_TYPE_Q3_K:
                case GGML_TYPE_Q4_K:
                case GGML_TYPE_Q5_K:
                case GGML_TYPE_Q6_K:
                    {
                        // rows = nelements/ne[0], each of ne[0] elements
                        cur_size = ggml_quantize_chunk((ggml_type) ttype, data_f32.data(), work.data(), 0, nelements/ne[0], ne[0], nullptr);
                    } break;
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_I8:
                case GGML_TYPE_I16:
                case GGML_TYPE_I32:
                case GGML_TYPE_I64:
                case GGML_TYPE_F64:
                case GGML_TYPE_Q8_1:
                case GGML_TYPE_Q8_K:
                case GGML_TYPE_IQ2_XXS:
                case GGML_TYPE_IQ2_XS:
                case GGML_TYPE_IQ2_S:
                case GGML_TYPE_IQ3_XXS:
                case GGML_TYPE_IQ3_S:
                case GGML_TYPE_IQ1_S:
                case GGML_TYPE_IQ4_NL:
                case GGML_TYPE_IQ4_XS:
                case GGML_TYPE_IQ1_M:
                case GGML_TYPE_BF16:
                case GGML_TYPE_TQ1_0:
                case GGML_TYPE_TQ2_0:
                case GGML_TYPE_MXFP4:
                case GGML_TYPE_NVFP4:
                case GGML_TYPE_COUNT:
                    {
                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
                        return false;
                    }
            }

            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
            total_size_new += cur_size;

            printf("size = %8.2f MB -> %8.2f MB\n", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
        } else {
            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
            total_size_new += data_u8.size();
        }

        // the original size is always accounted as if the tensor were F32
        total_size_org += nelements * sizeof(float);
    }

    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));

    return true;
}


================================================
FILE: examples/common-ggml.h
================================================
#pragma once

#include "ggml.h"

#include <fstream>
#include <vector>
#include <string>

enum ggml_ftype ggml_parse_ftype(const char * str);

void ggml_print_ftypes(FILE * fp = stderr);

bool ggml_common_quantize_0(
        std::ifstream & finp,
        std::ofstream & fout,
        const ggml_ftype ftype,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip);


================================================
FILE: examples/common.cpp
================================================
#define _USE_MATH_DEFINES // for M_PI

#include "common.h"

#include <cmath>
#include <codecvt>
#include <cstring>
#include <fstream>
#include <locale>
#include <regex>
#include <sstream>

// Return the value that follows option `flag` on the command line and advance `i` onto it.
//
// The next token is accepted as a value unless it looks like another option, i.e. it
// starts with '-' and is not a negative number ("-1", "-.5"). Negative numbers must be
// accepted because values such as "--seed -1" are legitimate.
//
// If no value is available, an error and the usage text are printed and the process
// exits with a non-zero status so that calling scripts can detect the failure.
static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
    if (i + 1 < argc) {
        const char * next = argv[i + 1];

        // "-<digit>..." or "-.<...>" is a negative number, not an option name
        const bool is_negative_number =
            next[0] == '-' && ((next[1] >= '0' && next[1] <= '9') || next[1] == '.');

        if (next[0] != '-' || is_negative_number) {
            return argv[++i];
        }
    }

    fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
    gpt_print_usage(argc, argv, params);
    exit(1);
}

bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        if (arg == "-s" || arg == "--seed") {
            params.seed = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-t" || arg == "--threads") {
            params.n_threads = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-p" || arg == "--prompt") {
            params.prompt = get_next_arg(i, argc, argv, arg, params);
        } else if (arg == "-n" || arg == "--n_predict") {
            params.n_predict = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-np" || arg == "--n_parallel") {
            params.n_parallel = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--top_k") {
            params.top_k = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--top_p") {
            params.top_p = std::stof(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--temp") {
            params.temp = std::stof(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--repeat-last-n") {
            params.repeat_last_n = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--repeat-penalty") {
            params.repeat_penalty = std::stof(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-b" || arg == "--batch_size") {
            params.n_batch= std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-c" || arg == "--context") {
            params.n_ctx= std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-ngl" || arg == "--gpu-layers" || arg == "--n-gpu-layers") {
            params.n_gpu_layers = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "--ignore-eos") {
            params.ignore_eos = true;
        } else if (arg == "-m" || arg == "--model") {
            params.model = get_next_arg(i, argc, argv, arg, params);
        } else if (arg == "-i" || arg == "--interactive") {
            params.interactive = true;
        } else if (arg == "-ip" || arg == "--interactive-port") {
            params.interactive = true;
            params.interactive_port = std::stoi(get_next_arg(i, argc, argv, arg, params));
        } else if (arg == "-h" || arg == "--help") {
            gpt_print_usage(argc, argv, params);
            exit(0);
        } else if (arg == "-f" || arg == "--file") {
            get_next_arg(i, argc, argv, arg, params);
            std::ifstream file(argv[i]);
            if (!file) {
                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
                break;
            }
            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
            if (params.prompt.back() == '\n') {
                params.prompt.pop_back();
            }
        } else if (arg == "-tt" || arg == "--token_test") {
            params.token_test = get_next_arg(i, argc, argv, arg, params);
        }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            gpt_print_usage(argc, argv, params);
            exit(0);
        }
    }

    return true;
}

void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h, --help            show this help message and exit\n");
    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: -1)\n");
    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
    fprintf(stderr, "  -f FNAME, --file FNAME\n");
    fprintf(stderr, "                        load prompt from a file\n");
    fprintf(stderr, "  -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
    fprintf(stderr, "                        test tokenization\n");
    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
    fprintf(stderr, "  --repeat-last-n N     last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
    fprintf(stderr, "  --repeat-penalty N    penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
    fprintf(stderr, "  -c N, --context N     context / KV cache size (default: %d)\n", params.n_ctx);
    fprintf(stderr, "  --ignore-eos          ignore EOS token during generation\n");
    fprintf(stderr, "  -ngl N, --gpu-layers N  number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
    fprintf(stderr, "  -m FNAME, --model FNAME\n");
    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
    fprintf(stderr, "\n");
}

// Pick one of ten fixed prompt openers using the next value drawn from `rng`.
std::string gpt_random_prompt(std::mt19937 & rng) {
    static const char * const k_openers[10] = {
        "So",
        "Once upon a time",
        "When",
        "The",
        "After",
        "If",
        "import",
        "He",
        "She",
        "They",
    };

    // the draw is unsigned, so the remainder is always a valid index 0..9
    const int pick = rng() % 10;

    return k_openers[pick];
}

// Return a copy of `s` with the leading and the trailing run of whitespace removed.
std::string trim(const std::string & s) {
    // either a whitespace run anchored at the start or one anchored at the end
    const std::regex edge_whitespace("^\\s+|\\s+$");
    const std::string nothing;
    return std::regex_replace(s, edge_whitespace, nothing);
}

// Return a copy of `s` in which every non-overlapping occurrence of `from` is
// replaced by `to`, scanning left to right. Text inserted by a replacement is
// never rescanned.
//
// An empty `from` matches at every position and would never terminate, so in
// that case `s` is returned unchanged.
std::string replace(const std::string & s, const std::string & from, const std::string & to) {
    std::string result = s;
    if (from.empty()) {
        return result;
    }

    size_t pos = 0;
    while ((pos = result.find(from, pos)) != std::string::npos) {
        result.replace(pos, from.length(), to);
        pos += to.length(); // continue after the inserted text
    }
    return result;
}

// Register `token` by appending it to the end of special_tokens.
void gpt_vocab::add_special_token(const std::string & token) {
    special_tokens.insert(special_tokens.end(), token);
}

// Minimal parser for a flat JSON object that maps string keys to integer values
// (used by gpt_vocab_init to load the token -> id table).
//
// Returns the key -> value map. The process exits with status 1 if `fname`
// cannot be opened. If the content does not start with '{' an empty map is
// returned. Entries whose value cannot be converted by std::stoi are skipped.
//
// NOTE(review): the inner index advances below are not checked against the
// input length, so truncated or malformed input looks able to read past the end
// of the buffer - confirm before feeding untrusted files.
std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                (std::istreambuf_iterator<char>()));
    }

    // anything that is not an object is treated as "no entries"
    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key  = false; // true once the key of the current pair has been read
        bool in_token = false; // true while inside a double-quoted string

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                // outside a string: only spaces are skipped explicitly and a quote opens a string;
                // any other character is ignored
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    // escape sequence: keep the backslash here, the escaped character
                    // itself is appended by the code at the bottom of the loop body
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    // closing quote of the key or of a quoted value
                    if (has_key == false) {
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            // unquoted value: take everything up to the next ',' or '}'
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            // quoted value: keep reading it as a string on the next iterations
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    // the pair is complete: undo the escapes that are recognized in keys
                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    str_key = ::replace(str_key, "\\\"",    "\""); // \\\"   -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        // values that std::stoi rejects are dropped silently
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());

                    }
                    // reset the state for the next pair
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                // regular character inside a string: append to the key or to the value
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}

// Split `str` into GPT-2 style "words" and append them to `words`.
//
// The pattern recognizes English contractions, runs of letters, runs of digits
// and runs of other non-space characters (each optionally preceded by a single
// space), and runs of whitespace. Text that matches nothing is dropped.
void gpt_split_words(std::string str, std::vector<std::string>& words) {
    const std::regex splitter(R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");

    // repeatedly take the first match and continue with the text that follows it
    for (std::smatch found; std::regex_search(str, found, splitter); str = found.suffix()) {
        for (size_t k = 0; k < found.size(); ++k) {
            words.push_back(found[k]);
        }
    }
}

// Convert `text` into token ids using the vocabulary in `vocab`.
//
// The text is first cut into words (special tokens registered in the vocabulary
// are kept as single words), then every word is covered greedily from left to
// right with the longest vocabulary entry that matches at the current position.
// A character that starts no vocabulary entry is reported on stderr and skipped.
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> pieces;

    // stage 1: split the text into words
    {
        std::string rest = text;

        if (!vocab.special_tokens.empty()) {
            // alternation of all special tokens, with regex metacharacters escaped
            const std::regex meta(R"([\[\\\^\$\.\|\?\*\+\(\)\{\}])");
            std::string alternation;
            for (const auto & special : vocab.special_tokens) {
                if (!alternation.empty()) {
                    alternation += "|";
                }
                alternation += std::regex_replace(special, meta, R"(\$&)");
            }

            const std::regex special_re(alternation);
            std::smatch hit;
            while (std::regex_search(rest, hit, special_re)) {
                // ordinary text in front of the special token is split into words
                gpt_split_words(hit.prefix(), pieces);
                // the special token itself is kept as one word
                for (size_t k = 0; k < hit.size(); ++k) {
                    pieces.push_back(hit[k]);
                }
                rest = hit.suffix();
            }
        }

        // whatever is left contains no special tokens
        gpt_split_words(rest, pieces);
    }

    // stage 2: greedy longest-match lookup inside every word
    std::vector<gpt_vocab::id> tokens;
    for (const auto & piece : pieces) {
        const int len = (int) piece.size();

        int start = 0;
        while (start < len) {
            bool matched = false;

            // try the longest candidate first; `stop` is one past its last character
            for (int stop = len; stop > start; --stop) {
                const auto it = vocab.token_to_id.find(piece.substr(start, stop - start));
                if (it != vocab.token_to_id.end()) {
                    tokens.push_back(it->second);
                    start   = stop;
                    matched = true;
                    break;
                }
            }

            if (!matched) {
                // not even the single character is in the vocabulary
                fprintf(stderr, "%s: unknown token '%s'\n", __func__, piece.substr(start, 1).data());
                ++start;
            }
        }
    }

    return tokens;
}

static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
    std::vector<gpt_vocab::id> output;
    std::stringstream ss(input);
    std::string token;

    while (std::getline(ss, token, delimiter)) {
        output.push_back(std::stoi(token));
    }

    return output;
}

// Load tokenizer test cases from `fpath_test`.
//
// Every line of the form "<text> => <id>,<id>,..." yields one entry mapping the
// text to the expected ids; lines without " => " are ignored. An empty path
// prints a notice and yields no test cases.
static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> cases;

    if (fpath_test.empty()){
        fprintf(stderr, "%s : No test file found.\n", __func__);
        return cases;
    }

    const std::string arrow = " => ";

    std::ifstream input(fpath_test, std::ios_base::in);
    for (std::string row; std::getline(input, row); ) {
        const size_t at = row.find(arrow);
        if (at == std::string::npos) {
            continue;
        }
        // left of the arrow: the text, right of it: the comma separated ids
        cases[row.substr(0, at)] = parse_tokens_from_string(row.substr(at + arrow.size()), ',');
    }

    return cases;
}

// Run gpt_tokenize on every test case loaded from `fpath_test` and compare the
// result with the expected ids. Every mismatch is printed to stderr with the
// expected and the produced tokens, followed by a summary line with the number
// of failed tests.
//
// `vocab` is not modified: ids are resolved with find() because
// std::map::operator[] would insert an empty entry for every unknown id and grow
// id_to_token, whose size the sampling functions use as the number of logits.
// Unknown ids are printed with an empty token text.
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
    std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);

    // text of a token id, or "" when the id is not in the vocabulary
    const auto token_text = [&vocab](gpt_vocab::id id) -> const char * {
        const auto it = vocab.id_to_token.find(id);
        return it != vocab.id_to_token.end() ? it->second.c_str() : "";
    };

    size_t n_fails = 0;

    for (const auto & test : tests) {
        std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, test.first);

        if (tokens != test.second){
            n_fails++;

            // print out failure cases
            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test.first.c_str());
            fprintf(stderr, "%s : tokens in hf:   ", __func__);
            for (const auto & t : test.second) {
                fprintf(stderr, "%s(%d), ", token_text(t), t);
            }
            fprintf(stderr, "\n");
            fprintf(stderr, "%s : tokens in ggml: ", __func__);
            for (const auto & t : tokens) {
                fprintf(stderr, "%s(%d), ", token_text(t), t);
            }
            fprintf(stderr, "\n");
        }
    }

    fprintf(stderr, "%s : %zu tests failed out of %zu tests.\n", __func__, n_fails, tests.size());
}

// Fill `vocab` from the JSON file `fname`: token_to_id is replaced by the parsed
// table and every pair is mirrored into id_to_token. Always returns true
// (json_parse terminates the process if the file cannot be opened).
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

    vocab.token_to_id = ::json_parse(fname);

    // build the reverse (id -> token) lookup
    for (auto it = vocab.token_to_id.begin(); it != vocab.token_to_id.end(); ++it) {
        vocab.id_to_token[it->second] = it->first;
    }

    const int n_vocab = (int) vocab.token_to_id.size();
    printf("%s: vocab size = %d\n", __func__, n_vocab);

    // print the vocabulary
    //for (auto kv : vocab.token_to_id) {
    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
    //}

    return true;
}

// Sample the next token id from `logits` with temperature, top-k and top-p filtering.
//
//   vocab   only the size of vocab.id_to_token is used: it is the number of
//           entries read from `logits` (the vocabulary must not be empty)
//   logits  one value per vocabulary entry
//   top_k   number of highest-scoring candidates to keep; values <= 0 or larger
//           than the vocabulary keep all candidates
//   top_p   keep the smallest prefix of the candidates whose cumulative
//           probability reaches top_p (no effect when >= 1)
//   temp    the logits are scaled by 1/temp; temp <= 0 selects the token with
//           the highest logit directly, like gpt_sample_top_k_top_p_repeat does
//   rng     random number generator used for the final draw
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng) {
    const int n_logits = vocab.id_to_token.size();

    if (temp <= 0) {
        // scaling by 1/temp is undefined here - pick the best token directly
        gpt_vocab::id max_id = 0;
        for (int i = 1; i < n_logits; ++i) {
            if (logits[i] > logits[max_id]) {
                max_id = i;
            }
        }
        return max_id;
    }

    // begin() + top_k must stay inside the candidate list
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const double scale = 1.0/temp;
        for (int i = 0; i < n_logits; ++i) {
            logits_id.push_back(std::make_pair(logits[i]*scale, i));
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // subtract the maximum before exponentiating to avoid overflow
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        // cut the (sorted) candidates once their cumulative probability reaches top_p
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        // renormalize the remaining candidates
        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}

// Sample the next token id like gpt_sample_top_k_top_p, with an additional
// repetition penalty on recently generated tokens.
//
//   vocab                    only the size of vocab.id_to_token is used: it is the
//                            number of entries read from `logits` (must not be empty)
//   logits                   one value per vocabulary entry
//   last_n_tokens_data/size  previously generated token ids, oldest first
//   top_k                    number of highest-scoring candidates to keep; values <= 0
//                            or larger than the vocabulary keep all candidates
//   top_p                    keep the smallest prefix of the candidates whose
//                            cumulative probability reaches top_p (no effect when >= 1)
//   temp                     the logits are scaled by 1/temp; temp <= 0 returns the
//                            token with the highest raw logit (no penalty applied)
//   repeat_last_n            number of most recent tokens that are penalized; it is
//                            limited to the available history, <= 0 disables the penalty
//   repeat_penalty           scaled logits of penalized tokens are divided by this
//                            value when positive and multiplied by it when negative
//   rng                      random number generator used for the final draw
gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng) {

    const int n_logits = vocab.id_to_token.size();

    const auto * plogits = logits;

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }

    // penalty window = the last repeat_last_n tokens, but never more than the history
    // holds (stepping back further than the first element would be undefined)
    const size_t    n_window     = repeat_last_n > 0 ? std::min((size_t) repeat_last_n, last_n_tokens_data_size) : 0;
    const int32_t * window_end   = last_n_tokens_data + last_n_tokens_data_size;
    const int32_t * window_begin = window_end - n_window;

    // begin() + top_k must stay inside the candidate list
    if (top_k <= 0 || top_k > n_logits) {
        top_k = n_logits;
    }

    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(window_begin, window_end, i) != window_end) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    // subtract the maximum before exponentiating to avoid overflow
    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        // cut the (sorted) candidates once their cumulative probability reaches top_p
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        // renormalize the remaining candidates
        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;

}

// In-place first-order (RC) high-pass filter.
//
//   data         samples, filtered in place; the first sample is left unchanged
//   cutoff       cutoff frequency in Hz, must be > 0
//   sample_rate  sampling rate in Hz, must be > 0
//
// Implements y[i] = alpha * (y[i-1] + x[i] - x[i-1]) with alpha = RC / (RC + dt).
// The previous *input* sample is kept in a local because data[i-1] has already
// been overwritten with the filtered value when sample i is processed.
// An empty buffer is left untouched.
void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    if (data.empty()) {
        return;
    }

    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = rc / (rc + dt);

    float x_prev = data[0]; // previous input sample
    float y      = data[0]; // previous output sample

    for (size_t i = 1; i < data.size(); i++) {
        const float x = data[i];
        y = alpha * (y + x - x_prev);
        x_prev  = x;
        data[i] = y;
    }
}

// Energy based check over the tail of a recording.
//
// Compares the mean absolute amplitude of the last `last_ms` milliseconds with
// the mean over the whole buffer. Returns false when the buffer is not longer
// than the tail, or when the tail energy exceeds vad_thold times the overall
// energy; returns true otherwise. When freq_thold > 0 the buffer is first
// passed through high_pass_filter, which modifies `pcmf32` in place.
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_total = pcmf32.size();
    const int n_tail  = (sample_rate * last_ms) / 1000;

    // not enough samples - assume no speech
    if (n_tail >= n_total) {
        return false;
    }

    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }

    const int tail_begin = n_total - n_tail;

    float energy_all  = 0.0f;
    float energy_last = 0.0f;

    for (int i = 0; i < n_total; i++) {
        const float magnitude = fabsf(pcmf32[i]);

        energy_all += magnitude;
        if (i >= tail_begin) {
            energy_last += magnitude;
        }
    }

    // turn the sums into means
    energy_all  /= n_total;
    energy_last /= n_tail;

    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }

    const bool tail_is_loud = energy_last > vad_thold*energy_all;

    return !tail_is_loud;
}

// Similarity of two strings in [0, 1], derived from their Levenshtein distance:
//
//   1 - distance / max(length of s0, length of s1)
//
// Identical strings give 1, strings with nothing in common give 0. Two empty
// strings are identical and give 1 (the formula alone would divide 0 by 0).
float similarity(const std::string & s0, const std::string & s1) {
    const size_t longest = std::max(s0.size(), s1.size());
    if (longest == 0) {
        return 1.0f;
    }

    const size_t len0 = s0.size() + 1;
    const size_t len1 = s1.size() + 1;

    // two rows of the edit distance table: the previous one and the one being filled
    std::vector<int> col(len1, 0);
    std::vector<int> prevCol(len1, 0);

    // distance between the empty prefix of s0 and every prefix of s1
    for (size_t j = 0; j < len1; j++) {
        prevCol[j] = j;
    }

    for (size_t i = 1; i < len0; i++) {
        col[0] = i;
        for (size_t j = 1; j < len1; j++) {
            const int substitution = prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1);
            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), substitution);
        }
        col.swap(prevCol);
    }

    // after the final swap the last completed row is in prevCol
    const float dist = prevCol[len1 - 1];

    return 1.0f - (dist / longest);
}

// A file counts as existing when it can be opened for reading.
bool is_file_exist(const char * filename) {
    return std::ifstream(filename).good();
}


================================================
FILE: examples/common.h
================================================
// Various helper functions and utilities

#pragma once

#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>
#include <ctime>
#include <fstream>
#include <sstream>

//
// GPT CLI argument parsing
//

// Options shared by the GPT examples; filled from the command line by gpt_params_parse.
struct gpt_params {
    int32_t seed         = -1;   // RNG seed
    // up to 4 threads by default, but at least 1: hardware_concurrency() may
    // return 0 when the number of hardware threads cannot be determined
    int32_t n_threads    = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
    int32_t n_predict    = 200;  // new tokens to predict
    int32_t n_parallel   = 1;    // number of parallel streams
    int32_t n_batch      = 32;   // batch size for prompt processing
    int32_t n_ctx        = 2048; // context size (this is the KV cache max size)
    int32_t n_gpu_layers = 0;    // number of layers to offload to the GPU

    bool ignore_eos = false; // ignore EOS token when generating text

    // sampling parameters
    int32_t top_k          = 40;
    float   top_p          = 0.9f;
    float   temp           = 0.9f;
    int32_t repeat_last_n  = 64;    // 0 = repetition penalty disabled
    float   repeat_penalty = 1.00f; // 1.0 = repetition penalty disabled

    std::string model      = "models/gpt-2-117M/ggml-model.bin"; // model path
    std::string prompt     = "";
    std::string token_test = ""; // value of -tt/--token_test

    bool    interactive      = false; // set by -i and by -ip
    int32_t interactive_port = -1;    // set by -ip
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

std::string trim(const std::string & s);

std::string replace(
        const std::string & s,
        const std::string & from,
        const std::string & to);

// Vocabulary tables used by the GPT examples.
struct gpt_vocab {
    using id    = int32_t;     // numeric token id
    using token = std::string; // token text

    std::map<token, id> token_to_id; // token text -> id
    std::map<id, token> id_to_token; // id -> token text
    std::vector<std::string> special_tokens; // tokens registered through add_special_token

    // append `token` to special_tokens
    void add_special_token(const std::string & token);
};

// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);

std::string convert_to_utf8(const std::wstring & input);

std::wstring convert_to_wstring(const std::string & input);

void gpt_split_words(std::string str, std::vector<std::string>& words);

// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);

// test outputs of gpt_tokenize
//
//   - compare with tokens generated by the huggingface tokenizer
//   - test cases are read from fpath_test, one per line, in the form "text => id,id,..."
//     (they are chosen based on the model's main language, see the 'prompts' directory)
//   - for every sentence that is tokenized differently, print sentence, huggingface tokens, ggml tokens
//   - at the end, print how many tests failed out of the total number of tests
//
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);

// sample next token given the logits for each token in the vocabulary
//
//   - scale the logits by 1/temp
//   - consider only the top K tokens
//   - from them, consider only the top tokens with cumulative probability >= P
//
// TODO: not sure if this implementation is correct
//
gpt_vocab::id gpt_sample_top_k_top_p(
        const gpt_vocab & vocab,
        const float * logits,
        int    top_k,
        double top_p,
        double temp,
        std::mt19937 & rng);

gpt_vocab::id gpt_sample_top_k_top_p_repeat(
        const gpt_vocab & vocab,
        const float * logits,
        const int32_t * last_n_tokens_data,
        size_t last_n_tokens_data_size,
        int    top_k,
        double top_p,
        double temp,
        int repeat_last_n,
        float repeat_penalty,
        std::mt19937 & rng);

//
// Audio utils
//

// Write PCM data into WAV audio file
//
// Usage: open() creates the file and writes the header, write() appends samples
// and keeps the size fields of the header up to date, close() (or the
// destructor) closes the file.
//
// Samples are always stored as 16-bit signed integers, independent of the
// bits_per_sample value written to the header, so callers should pass 16.
class wav_writer {
private:
    std::ofstream file;
    uint32_t dataSize = 0;      // number of sample bytes written so far
    std::string wav_filename;   // name of the file that is currently open

    // Write the 44-byte RIFF/WAVE header with zeroed size fields (they are
    // patched by write_audio). Returns false if the stream reports an error.
    bool write_header(const uint32_t sample_rate,
                      const uint16_t bits_per_sample,
                      const uint16_t channels) {

        file.write("RIFF", 4);
        file.write("\0\0\0\0", 4);    // Placeholder for file size
        file.write("WAVE", 4);
        file.write("fmt ", 4);

        const uint32_t sub_chunk_size = 16;
        const uint16_t audio_format = 1;      // PCM format
        const uint32_t byte_rate = sample_rate * channels * bits_per_sample / 8;
        const uint16_t block_align = channels * bits_per_sample / 8;

        file.write(reinterpret_cast<const char *>(&sub_chunk_size), 4);
        file.write(reinterpret_cast<const char *>(&audio_format), 2);
        file.write(reinterpret_cast<const char *>(&channels), 2);
        file.write(reinterpret_cast<const char *>(&sample_rate), 4);
        file.write(reinterpret_cast<const char *>(&byte_rate), 4);
        file.write(reinterpret_cast<const char *>(&block_align), 2);
        file.write(reinterpret_cast<const char *>(&bits_per_sample), 2);
        file.write("data", 4);
        file.write("\0\0\0\0", 4);    // Placeholder for data size

        return file.good();
    }

    // Append `length` samples and update the size fields of the header.
    //
    // PCM data is expected to be normalized to the range from -1 to 1. Values
    // outside of that range are clamped and NaN is written as silence, because
    // converting an out-of-range float to int16_t is undefined.
    // Returns false if no file is open or the stream reports an error.
    bool write_audio(const float * data, size_t length) {
        if (!file.is_open()) {
            return false;
        }

        for (size_t i = 0; i < length; ++i) {
            float sample = data[i];
            if (sample != sample) {
                sample = 0.0f;   // NaN
            } else if (sample > 1.0f) {
                sample = 1.0f;
            } else if (sample < -1.0f) {
                sample = -1.0f;
            }

            const int16_t intSample = int16_t(sample * 32767);
            file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
            dataSize += sizeof(int16_t);
        }

        // patch the RIFF chunk size (offset 4) and the data chunk size (offset 40),
        // then return to the end of the file for the next write
        file.seekp(4, std::ios::beg);
        uint32_t fileSize = 36 + dataSize;
        file.write(reinterpret_cast<char *>(&fileSize), 4);
        file.seekp(40, std::ios::beg);
        file.write(reinterpret_cast<char *>(&dataSize), 4);
        file.seekp(0, std::ios::end);

        return file.good();
    }

    // Make sure `filename` is the open file: a different file that is still open
    // is closed first, and a newly opened file starts with no sample data.
    bool open_wav(const std::string & filename) {
        if (filename != wav_filename) {
            if (file.is_open()) {
                file.close();
            }
        }
        if (!file.is_open()) {
            file.open(filename, std::ios::binary);
            wav_filename = filename;
            dataSize = 0;
        }
        return file.is_open();
    }

public:
    // Open `filename` for writing and emit the WAV header.
    // Returns false if the file cannot be opened or the header cannot be written.
    bool open(const std::string & filename,
              const    uint32_t   sample_rate,
              const    uint16_t   bits_per_sample,
              const    uint16_t   channels) {

        if (!open_wav(filename)) {
            return false;
        }

        return write_header(sample_rate, bits_per_sample, channels);
    }

    // Close the file; always returns true.
    bool close() {
        file.close();
        return true;
    }

    // Append `length` float samples; see write_audio.
    bool write(const float * data, size_t length) {
        return write_audio(data, length);
    }

    ~wav_writer() {
        if (file.is_open()) {
            file.close();
        }
    }
};


// Apply a high-pass frequency filter to PCM audio
// Suppresses frequencies below cutoff Hz
void high_pass_filter(
        std::vector<float> & data,
        float cutoff,
        float sample_rate);

// Basic voice activity detection (VAD) using audio energy adaptive threshold
bool vad_simple(
        std::vector<float> & pcmf32,
        int   sample_rate,
        int   last_ms,
        float vad_thold,
        float freq_thold,
        bool  verbose);

// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);

//
// Terminal utils
//

// Both macros are fully parenthesized so that they can be used inside larger expressions.
#define SQR(X)    ((X) * (X))
// Index (0..5) of the xterm color cube level for an 8-bit channel value.
#define UNCUBE(x) ((x) < 48 ? 0 : (x) < 115 ? 1 : ((x) - 35) / 40)

/**
 * Quantizes 24-bit RGB to xterm256 code range [16,256).
 *
 * The color is matched against the 6x6x6 color cube (codes 16..231) and against
 * the gray ramp (codes 232..255); the closer of the two candidates is returned.
 * r, g and b are expected to be in 0..255 (larger values would index past the
 * cube table).
 */
static int rgb2xterm256(int r, int g, int b) {
    const unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};

    // gray ramp candidate: luma of the color, its ramp index and the ramp level
    const int av = r * .299 + g * .587 + b * .114 + .5;
    const int il = av > 238 ? 23 : (av - 3) / 10;
    const int ql = il * 10 + 8;

    // color cube candidate: per-channel index and level
    const int ir = UNCUBE(r);
    const int ig = UNCUBE(g);
    const int ib = UNCUBE(b);
    const int qr = cube[ir];
    const int qg = cube[ig];
    const int qb = cube[ib];

    // pick the candidate with the smaller squared distance; ties go to the cube
    if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
        SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
        return ir * 36 + ig * 6 + ib + 020;
    return il + 0350;
}

// Build the escape sequence "\033[38;5;<n>m", where <n> is the xterm-256
// code returned by rgb2xterm256() for the given RGB triple.
static std::string set_xterm256_foreground(int r, int g, int b) {
    std::ostringstream sequence;
    sequence << "\033[38;5;" << rgb2xterm256(r, g, b) << "m";
    return sequence.str();
}

// Lowest is red, middle is yellow, highest is green. Color scheme from
// Paul Tol; it is colorblind friendly https://sronpersonalpages.nl/~pault
//
// Each entry is the foreground escape sequence produced by
// set_xterm256_foreground() for the listed RGB triple, ordered from the
// "lowest" (index 0) to the "highest" (last index) color.
const std::vector<std::string> k_colors = {
    set_xterm256_foreground(220,   5,  12),
    set_xterm256_foreground(232,  96,  28),
    set_xterm256_foreground(241, 147,  45),
    set_xterm256_foreground(246, 193,  65),
    set_xterm256_foreground(247, 240,  86),
    set_xterm256_foreground(144, 201, 135),
    set_xterm256_foreground( 78, 178, 101),
};

// ANSI formatting codes (escape sequences of the form ESC [ <n> m)

// <n> = 7: inverse (swapped foreground/background)
static std::string set_inverse() {
    const std::string sequence = "\033[7m";
    return sequence;
}

// <n> = 4: underline
static std::string set_underline() {
    const std::string sequence = "\033[4m";
    return sequence;
}

// <n> = 2: dim
static std::string set_dim() {
    const std::string sequence = "\033[2m";
    return sequence;
}

// Style scheme for different confidence levels
// Entries are ordered from low (index 0) to high (index 2) confidence and
// hold the escape sequences returned by the helpers above.
const std::vector<std::string> k_styles = {
    set_inverse(),   // Low confidence - inverse (highlighted)
    set_underline(), // Medium confidence - underlined
    set_dim(),       // High confidence - dim
};

//
// Other utils
//

// check if file exists using ifstream
bool is_file_exist(const char * filename);


================================================
FILE: examples/gpt-2/CMakeLists.txt
================================================
#
# gpt-2
#
# Every example links against the same three libraries; the inference
# variants differ only in their entry point (main-<variant>.cpp).

foreach(GPT2_VARIANT ctx alloc backend sched)
    set(TEST_TARGET gpt-2-${GPT2_VARIANT})
    add_executable(${TEST_TARGET} main-${GPT2_VARIANT}.cpp)
    target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
endforeach()

#
# gpt-2-quantize

set(TEST_TARGET gpt-2-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)

#
# gpt-2-batched

set(TEST_TARGET gpt-2-batched)
add_executable(${TEST_TARGET} main-batched.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)


================================================
FILE: examples/gpt-2/README.md
================================================
# gpt-2

This is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library.

The program runs on the CPU - no video card is required.

The [Cerebras-GPT](https://huggingface.co/cerebras) models are also supported.

The example supports the following GPT-2 models:

| Model | Description  | Disk Size |
| ---   | ---          | ---       |
| 117M  | Small model  | 240 MB    |
| 345M  | Medium model | 680 MB    |
| 774M  | Large model  | 1.5 GB    |
| 1558M | XL model     | 3.0 GB    |

Sample performance on MacBook M1 Pro:

| Model | Size  | Time / Token |
| ---   | ---   | ---    |
| GPT-2 |  117M |   5 ms |
| GPT-2 |  345M |  12 ms |
| GPT-2 |  774M |  23 ms |
| GPT-2 | 1558M |  42 ms |

*TODO: add tables for Cerebras-GPT models*

Sample output:

```bash
$ ./bin/gpt-2 -h
usage: ./bin/gpt-2 [options]

options:
  -h, --help            show this help message and exit
  -s SEED, --seed SEED  RNG seed (default: -1)
  -t N, --threads N     number of threads to use during computation (default: 8)
  -p PROMPT, --prompt PROMPT
                        prompt to start generation with (default: random)
  -n N, --n_predict N   number of tokens to predict (default: 200)
  --top_k N             top-k sampling (default: 40)
  --top_p N             top-p sampling (default: 0.9)
  --temp N              temperature (default: 1.0)
  -b N, --batch_size N  batch size for prompt processing (default: 8)
  -m FNAME, --model FNAME
                        model path (default: models/gpt-2-117M/ggml-model.bin)

$ ./bin/gpt-2
gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
gpt2_model_load: n_vocab = 50257
gpt2_model_load: n_ctx   = 1024
gpt2_model_load: n_embd  = 768
gpt2_model_load: n_head  = 12
gpt2_model_load: n_layer = 12
gpt2_model_load: f16     = 1
gpt2_model_load: ggml ctx size = 311.12 MB
gpt2_model_load: memory size =    72.00 MB, n_mem = 12288
gpt2_model_load: model size  =   239.08 MB
main: number of tokens in prompt = 1

So this is going to be the end of the line for us.

If the Dolphins continue to do their business, it's possible that the team could make a bid to bring in new defensive coordinator Scott Linehan.

Linehan's job is a little daunting, but he's a great coach and an excellent coach. I don't believe we're going to make the playoffs.

We're going to have to work hard to keep our heads down and get ready to go.<|endoftext|>

main: mem per token =  2048612 bytes
main:     load time =   106.32 ms
main:   sample time =     7.10 ms
main:  predict time =   506.40 ms / 5.06 ms per token
main:    total time =   629.84 ms
```

## Downloading and converting the original models (GPT-2)

You can download the original model files using the [download-model.sh](download-model.sh) Bash script. The models are
in TensorFlow format, so in order to use them with ggml, you need to convert them to the appropriate format. This is done
via the [convert-ckpt-to-ggml.py](convert-ckpt-to-ggml.py) Python script.

Here is the entire process for the GPT-2 117M model (download from official site + conversion):

```bash
cd ggml/build
../examples/gpt-2/download-model.sh 117M

Downloading model 117M ...
models/gpt-2-117M/checkpoint                      100%[=============================>]      77  --.-KB/s    in 0s
models/gpt-2-117M/encoder.json                    100%[=============================>]   1018K  1.20MB/s    in 0.8s
models/gpt-2-117M/hparams.json                    100%[=============================>]      90  --.-KB/s    in 0s
models/gpt-2-117M/model.ckpt.data-00000-of-00001  100%[=============================>] 474.70M  1.21MB/s    in 8m 39s
models/gpt-2-117M/model.ckpt.index                100%[=============================>]   5.09K  --.-KB/s    in 0s
models/gpt-2-117M/model.ckpt.meta                 100%[=============================>] 460.11K   806KB/s    in 0.6s
models/gpt-2-117M/vocab.bpe                       100%[=============================>] 445.62K   799KB/s    in 0.6s
Done! Model '117M' saved in 'models/gpt-2-117M/'

Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.

  python /Users/john/ggml/examples/gpt-2/convert-ckpt-to-ggml.py models/gpt-2-117M/ 1

```

This conversion requires that you have Python and TensorFlow installed on your computer. If you want to avoid
this, you can download the already converted ggml models as described below.

## Downloading and converting the original models (Cerebras-GPT)

Clone the respective repository from here: https://huggingface.co/cerebras

Use the [convert-cerebras-to-ggml.py](convert-cerebras-to-ggml.py) script to convert the model to `ggml` format:

```bash
cd ggml/build
git clone https://huggingface.co/cerebras/Cerebras-GPT-111M models/Cerebras-GPT-111M
python ../examples/gpt-2/convert-cerebras-to-ggml.py models/Cerebras-GPT-111M/

```

## Downloading the ggml model directly (GPT-2)

For convenience, I will be hosting the converted ggml model files in order to make it easier to run the examples. This
way, you can directly download a single binary file and start using it. No Python or TensorFlow is required.

Here is how to get the 117M ggml model:

```bash
cd ggml/build
../examples/gpt-2/download-ggml-model.sh 117M

Downloading ggml model 117M ...
models/gpt-2-117M/ggml-model.bin         100%[===============================>] 239.58M  8.52MB/s    in 28s
Done! Model '117M' saved in 'models/gpt-2-117M/ggml-model.bin'
You can now use it like this:

  $ ./bin/gpt-2 -m models/gpt-2-117M/ggml-model.bin -p "This is an example"

```

At some point, I might decide to stop hosting these models. So in that case, simply revert to the manual process above.

## Quantizing the models

You can also try to quantize the `ggml` models via 4-bit integer quantization.
Keep in mind that for smaller models, this will render them completely useless.
You generally want to quantize larger models.

```bash
# quantize GPT-2 F16 to Q4_0 (faster but less precise)
./bin/gpt-2-quantize models/gpt-2-1558M/ggml-model-f16.bin models/gpt-2-1558M/ggml-model-q4_0.bin 2
./bin/gpt-2 -m models/gpt-2-1558M/ggml-model-q4_0.bin -p "This is an example"

# quantize Cerebras F16 to Q4_1 (slower but more precise)
./bin/gpt-2-quantize models/Cerebras-GPT-6.7B/ggml-model-f16.bin models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin 3
./bin/gpt-2 -m models/Cerebras-GPT-6.7B/ggml-model-q4_1.bin -p "This is an example"

```

## Batched generation example

You can try the batched generation from a given prompt using the gpt-2-batched binary.

Sample output:

```bash
$ gpt-2-batched -np 5 -m models/gpt-2-117M/ggml-model.bin -p "Hello my name is" -n 50

main: seed = 1697037431
gpt2_model_load: loading model from 'models/gpt-2-117M/ggml-model.bin'
gpt2_model_load: n_vocab = 50257
gpt2_model_load: n_ctx   = 1024
gpt2_model_load: n_embd  = 768
gpt2_model_load: n_head  = 12
gpt2_model_load: n_layer = 12
gpt2_model_load: ftype   = 1
gpt2_model_load: qntvr   = 0
gpt2_model_load: ggml tensor size    = 320 bytes
gpt2_model_load: backend buffer size = 312.72 MB
ggml_init_cublas: found 1 CUDA devices:
  Device 0: NVIDIA GeForce GTX 1660, compute capability 7.5
gpt2_model_load: using CPU backend
gpt2_model_load: memory size =    72.00 MB, n_mem = 12288
gpt2_model_load: model size  =   239.08 MB
extract_tests_from_file : No test file found.
test_gpt_tokenizer : 0 tests failed out of 0 tests.
main: compute buffer size: 3.26 MB


main: generating 5 sequences ...
main: prompt: 'Hello my name is'
main: number of tokens in prompt = 4, first 8 tokens: 15496 616 1438 318


sequence 0:

Hello my name is John. You can call me any way you want, if you want, but for my very first date, I will be on the phone with you. We're both in our early 20s, but I feel like it's all

sequence 1:

Hello my name is Robert, and I want to say that we're proud to have your company here on the world's largest platform for sharing your stories with us. This is a huge opportunity for our community. We have hundreds of people on this team and

sequence 2:

Hello my name is Jack. I'm the one who created you.

Jack is a boy with a big smile and a big heart. He is a handsome guy. He loves the outdoors and loves the people he meets. He wants to be a

sequence 3:

Hello my name is John. I am a Canadian citizen with a large number of family in Quebec and I am interested in studying. My aim is to take up a post in the Journal of the International Academy of Sciences of Canada which I am currently finishing.

sequence 4:

Hello my name is Dan. I am an entrepreneur. I am a great father. I am a great husband. I am a great husband. I am a great dad. And I am a great husband.

I love my life. I love



main:     load time =   880.80 ms
main:   sample time =    91.43 ms
main:  predict time =  2518.29 ms
main:    total time =  3544.32 ms
```


================================================
FILE: examples/gpt-2/convert-cerebras-to-ggml.py
================================================
# Convert Cerebras models to ggml format
#
# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-efficient-large-language-models/
#

import sys
import struct
import json
import torch
import numpy as np
import re

from transformers import AutoModelForCausalLM

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the byte -> unicode character table used by the GPT-2 BPE vocabulary.

    Returns a dict with one entry for each of the 256 byte values. Bytes in
    the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the
    same code point. Every other byte (whitespace, control characters, ...)
    is mapped, in increasing byte order, to the code points 256, 257, ... so
    that no byte is represented by a character the BPE code cannot handle.
    """
    direct = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    direct_set = set(direct)

    byte_values = list(direct)
    code_points = list(direct)

    # remaining bytes get consecutive code points starting right after 255
    shift = 0
    for byte in range(256):
        if byte in direct_set:
            continue
        byte_values.append(byte)
        code_points.append(256 + shift)
        shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

if len(sys.argv) < 2:
    print("Usage: convert-cerebras-to-ggml.py dir-model [use-f32]\n")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = dir_model + "/ggml-model-f16.bin"

with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# use 16-bit or 32-bit floats: the presence of any extra argument selects float32
use_f16 = True
if len(sys.argv) > 2:
    use_f16 = False
    fname_out = dir_model + "/ggml-model-f32.bin"

model = AutoModelForCausalLM.from_pretrained(dir_model, low_cpu_mem_usage=True)
#print (model)

list_vars = model.state_dict()
#print (list_vars)

print(hparams)

# checkpoint name -> ggml name for the tensors that live outside the transformer blocks
TOP_LEVEL_NAMES = {
    "transformer.ln_f.weight": "model/ln_f/g",
    "transformer.ln_f.bias":   "model/ln_f/b",
    "transformer.wte.weight":  "model/wte",
    "transformer.wpe.weight":  "model/wpe",
    "lm_head.weight":          "model/lm_head",
}

# per-block tensor suffix -> ggml suffix ("transformer.h.<i>.<suffix>" -> "model/h<i>/<ggml suffix>")
LAYER_NAMES = {
    "ln_1.weight":        "ln_1/g",
    "ln_1.bias":          "ln_1/b",
    "attn.c_attn.weight": "attn/c_attn/w",
    "attn.c_attn.bias":   "attn/c_attn/b",
    "attn.c_proj.weight": "attn/c_proj/w",
    "attn.c_proj.bias":   "attn/c_proj/b",
    "ln_2.weight":        "ln_2/g",
    "ln_2.bias":          "ln_2/b",
    "mlp.c_fc.weight":    "mlp/c_fc/w",
    "mlp.c_fc.bias":      "mlp/c_fc/b",
    "mlp.c_proj.weight":  "mlp/c_proj/w",
    "mlp.c_proj.bias":    "mlp/c_proj/b",
}

# raw string with escaped dots: matches exactly "transformer.h.<index>.<suffix>"
LAYER_NAME_RE = re.compile(r"transformer\.h\.(\d+)\.(.+)")


def ggml_tensor_name(name):
    """Return the ggml tensor name for a checkpoint tensor name, or None if it is not recognized."""
    if name in TOP_LEVEL_NAMES:
        return TOP_LEVEL_NAMES[name]

    match = LAYER_NAME_RE.fullmatch(name)
    if match and match.group(2) in LAYER_NAMES:
        return f"model/h{match.group(1)}/{LAYER_NAMES[match.group(2)]}"

    return None


# the context manager guarantees the output file is flushed and closed even if
# the conversion fails half-way
with open(fname_out, "wb") as fout:
    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
    fout.write(struct.pack("i", hparams["vocab_size"]))
    fout.write(struct.pack("i", hparams["n_positions"]))
    fout.write(struct.pack("i", hparams["n_embd"]))
    fout.write(struct.pack("i", hparams["n_head"]))
    fout.write(struct.pack("i", hparams["n_layer"]))
    fout.write(struct.pack("i", int(use_f16)))

    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    # vocabulary: token count, then (length, raw bytes) for every token
    fout.write(struct.pack("i", len(encoder)))

    for key in encoder:
        text = bytearray([byte_decoder[c] for c in key])
        fout.write(struct.pack("i", len(text)))
        fout.write(text)

    for name, tensor in list_vars.items():
        data = tensor.squeeze().numpy()
        print("Processing variable: " + name + " with shape: ", data.shape)

        # we don't need these (checked before renaming so that skipped tensors
        # are not reported as unrecognized)
        if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
            print("  Skipping variable: " + name)
            continue

        # rename headers to keep compatibility
        renamed = ggml_tensor_name(name)
        if renamed is None:
            # still written, under its original name
            print("Unrecognized variable name. %s" % name)
        else:
            name = renamed

        n_dims = len(data.shape)

        # ftype == 0 -> float32, ftype == 1 -> float16
        is_big_matrix = n_dims == 2 and (
            name == "model/wte" or name == "model/lm_head" or name.endswith(("/g", "/w"))
        )
        if use_f16 and is_big_matrix:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype = 1
        else:
            # always store float32 when ftype == 0 is written to the header, so
            # the payload matches the header regardless of the checkpoint dtype
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype = 0

        # for efficiency - transpose the projection matrices
        # "model/h.*/attn/c_attn/w"
        # "model/h.*/attn/c_proj/w"
        # "model/h.*/mlp/c_fc/w"
        # "model/h.*/mlp/c_proj/w"
        if name.endswith(("/attn/c_attn/w", "/attn/c_proj/w", "/mlp/c_fc/w", "/mlp/c_proj/w")):
            print("  Transposing")
            data = data.transpose()

        # header: n_dims, name length, ftype, dimensions (innermost first), name
        name_bytes = name.encode("utf-8")
        fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(name_bytes)

        # data
        data.tofile(fout)

print("Done. Output file: " + fname_out)
print("")


================================================
FILE: examples/gpt-2/convert-ckpt-to-ggml.py
================================================
# Convert a model checkpoint to a ggml compatible file
#
# Load the model using TensorFlow.
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import sys
import json
import struct
import numpy as np
import tensorflow as tf

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the byte -> unicode character table used by the GPT-2 BPE vocabulary.

    Returns a dict with one entry for each of the 256 byte values. Bytes in
    the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the
    same code point. Every other byte (whitespace, control characters, ...)
    is mapped, in increasing byte order, to the code points 256, 257, ... so
    that no byte is represented by a character the BPE code cannot handle.
    """
    direct = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    direct_set = set(direct)

    byte_values = list(direct)
    code_points = list(direct)

    # remaining bytes get consecutive code points starting right after 255
    shift = 0
    for byte in range(256):
        if byte in direct_set:
            continue
        byte_values.append(byte)
        code_points.append(256 + shift)
        shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

# helper to convert a numpy array to the float type selected by `ftype`
def convert_to_ftype(data, ftype):
    """
    Return `data` converted to the dtype selected by `ftype`.

    Only ftype == 1 (float16) is supported; any other value trips the
    assertion below.
    """
    if ftype == 1:
        # fp16
        converted = data.astype(np.float16)
        return converted

    assert False, "Invalid ftype: " + str(ftype)

if len(sys.argv) < 3:
    print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]

with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

# a non-numeric argument is reported the same way as an out-of-range one
try:
    ftype = int(sys.argv[2])
except ValueError:
    print("Invalid ftype: " + sys.argv[2])
    sys.exit(1)

if ftype < 0 or ftype > 1:
    print("Invalid ftype: " + str(ftype))
    sys.exit(1)

fname_out = dir_model + "/ggml-model-" + ftype_str[ftype] + ".bin"

list_vars = tf.train.list_variables(dir_model)

# the context manager guarantees the output file is flushed and closed even if
# the conversion fails half-way
with open(fname_out, "wb") as fout:
    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
    fout.write(struct.pack("i", hparams["n_vocab"]))
    fout.write(struct.pack("i", hparams["n_ctx"]))
    fout.write(struct.pack("i", hparams["n_embd"]))
    fout.write(struct.pack("i", hparams["n_head"]))
    fout.write(struct.pack("i", hparams["n_layer"]))
    fout.write(struct.pack("i", ftype))

    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    # vocabulary: token count, then (length, raw bytes) for every token
    fout.write(struct.pack("i", len(encoder)))

    for key in encoder:
        text = bytearray([byte_decoder[c] for c in key])
        fout.write(struct.pack("i", len(text)))
        fout.write(text)

    for name, shape in list_vars:
        print("Processing variable: " + name + " with shape: ", shape)

        data = tf.train.load_variable(dir_model, name).squeeze()
        n_dims = len(data.shape)

        # for efficiency - transpose the projection matrices
        # "model/h.*/attn/c_attn/w"
        # "model/h.*/attn/c_proj/w"
        # "model/h.*/mlp/c_fc/w"
        # "model/h.*/mlp/c_proj/w"
        if name.endswith(("/attn/c_attn/w", "/attn/c_proj/w", "/mlp/c_fc/w", "/mlp/c_proj/w")):
            print("  Transposing")
            data = data.transpose()

        dshape = data.shape

        # match name:
        #  "model/wte"
        #  "model/h.*/attn/c_attn/w"
        #  "model/h.*/attn/c_proj/w"
        #  "model/h.*/mlp/c_fc/w"
        #  "model/h.*/mlp/c_proj/w"
        if ftype != 0 and (name == "model/wte" or name.endswith("/w")):
            print("  Converting to " + ftype_str[ftype])
            data = convert_to_ftype(data, ftype)
            ftype_cur = ftype
        else:
            # always store float32 when ftype 0 is written to the header, so the
            # payload matches the header regardless of the checkpoint dtype
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

        # header: n_dims, name length, ftype, dimensions (innermost first), name
        # (the encoded name must not be called `str`: convert_to_ftype() relies
        # on the builtin of that name)
        name_bytes = name.encode("utf-8")
        fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype_cur))
        for dim in reversed(dshape):
            fout.write(struct.pack("i", dim))
        fout.write(name_bytes)

        # data
        data.tofile(fout)

print("Done. Output file: " + fname_out)
print("")


================================================
FILE: examples/gpt-2/convert-h5-to-ggml.py
================================================
# Convert GPT-2 h5 transformer model to ggml format
#
# Load the model using GPT2Model.
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
#   - Number of dimensions (int)
#   - Name length (int)
#   - Dimensions (int[n_dims])
#   - Name (char[name_length])
#   - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "use-f32" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#

import sys
import struct
import json
import numpy as np
import re

from transformers import GPT2Model

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Build the byte -> unicode character table used by the GPT-2 BPE vocabulary.

    Returns a dict with one entry for each of the 256 byte values. Bytes in
    the ranges "!".."~", "¡".."¬" and "®".."ÿ" map to the character with the
    same code point. Every other byte (whitespace, control characters, ...)
    is mapped, in increasing byte order, to the code points 256, 257, ... so
    that no byte is represented by a character the BPE code cannot handle.
    """
    direct = [
        *range(ord("!"), ord("~") + 1),
        *range(ord("¡"), ord("¬") + 1),
        *range(ord("®"), ord("ÿ") + 1),
    ]
    direct_set = set(direct)

    byte_values = list(direct)
    code_points = list(direct)

    # remaining bytes get consecutive code points starting right after 255
    shift = 0
    for byte in range(256):
        if byte in direct_set:
            continue
        byte_values.append(byte)
        code_points.append(256 + shift)
        shift += 1

    return {byte: chr(point) for byte, point in zip(byte_values, code_points)}

if len(sys.argv) < 2:
    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = dir_model + "/ggml-model.bin"

with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

# added_tokens.json is optional: treat a missing file as "no added tokens"
# instead of aborting the conversion
try:
    with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f:
        encoder_added = json.load(f)
except FileNotFoundError:
    encoder_added = {}

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

# use 16-bit or 32-bit floats: the presence of any extra argument selects float32
use_f16 = True
if len(sys.argv) > 2:
    use_f16 = False
    fname_out = dir_model + "/ggml-model-f32.bin"

model = GPT2Model.from_pretrained(dir_model, low_cpu_mem_usage=True)
#print (model)

list_vars = model.state_dict()
#print (list_vars)

# checkpoint name -> ggml name for the tensors that live outside the transformer blocks
TOP_LEVEL_NAMES = {
    "ln_f.weight": "model/ln_f/g",
    "ln_f.bias":   "model/ln_f/b",
    "wte.weight":  "model/wte",
    "wpe.weight":  "model/wpe",
}

# per-block tensor suffix -> ggml suffix ("h.<i>.<suffix>" -> "model/h<i>/<ggml suffix>")
LAYER_NAMES = {
    "ln_1.weight":        "ln_1/g",
    "ln_1.bias":          "ln_1/b",
    "attn.c_attn.weight": "attn/c_attn/w",
    "attn.c_attn.bias":   "attn/c_attn/b",
    "attn.c_proj.weight": "attn/c_proj/w",
    "attn.c_proj.bias":   "attn/c_proj/b",
    "ln_2.weight":        "ln_2/g",
    "ln_2.bias":          "ln_2/b",
    "mlp.c_fc.weight":    "mlp/c_fc/w",
    "mlp.c_fc.bias":      "mlp/c_fc/b",
    "mlp.c_proj.weight":  "mlp/c_proj/w",
    "mlp.c_proj.bias":    "mlp/c_proj/b",
}

# raw string with escaped dots: matches exactly "h.<index>.<suffix>"
LAYER_NAME_RE = re.compile(r"h\.(\d+)\.(.+)")


def ggml_tensor_name(name):
    """Return the ggml tensor name for a checkpoint tensor name, or None if it is not recognized."""
    if name in TOP_LEVEL_NAMES:
        return TOP_LEVEL_NAMES[name]

    match = LAYER_NAME_RE.fullmatch(name)
    if match and match.group(2) in LAYER_NAMES:
        return f"model/h{match.group(1)}/{LAYER_NAMES[match.group(2)]}"

    return None


# the context manager guarantees the output file is flushed and closed even if
# the conversion fails half-way
with open(fname_out, "wb") as fout:
    fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
    fout.write(struct.pack("i", hparams["vocab_size"]))
    fout.write(struct.pack("i", hparams["n_positions"]))
    fout.write(struct.pack("i", hparams["n_embd"]))
    fout.write(struct.pack("i", hparams["n_head"]))
    fout.write(struct.pack("i", hparams["n_layer"]))
    #fout.write(struct.pack("i", hparams["rotary_dim"]))
    fout.write(struct.pack("i", int(use_f16)))

    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    # vocabulary: token count, then (length, raw bytes) for every base token
    # followed by every added token
    fout.write(struct.pack("i", len(encoder) + len(encoder_added)))

    for vocab in (encoder, encoder_added):
        for key in vocab:
            text = bytearray([byte_decoder[c] for c in key])
            fout.write(struct.pack("i", len(text)))
            fout.write(text)

    for name, tensor in list_vars.items():
        data = tensor.squeeze().numpy()
        print("Processing variable: " + name + " with shape: ", data.shape)

        # we don't need these
        if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
            print("  Skipping variable: " + name)
            continue

        n_dims = len(data.shape)

        # ftype == 0 -> float32, ftype == 1 -> float16
        ftype = 0
        if use_f16:
            if name.endswith(".weight") and n_dims == 2:
                print("  Converting to float16")
                data = data.astype(np.float16)
                ftype = 1
            else:
                print("  Converting to float32")
                data = data.astype(np.float32)
                ftype = 0

        # for efficiency - transpose these matrices:
        #  "transformer.h.*.mlp.c_proj.weight
        if name.endswith(".mlp.c_proj.weight"):
            print("  Transposing")
            data = data.transpose()

        # rename headers to keep compatibility
        renamed = ggml_tensor_name(name)
        if renamed is None:
            # still written, under its original name
            print("Unrecognized variable name. %s" % name)
        else:
            name = renamed

        # header: n_dims, name length, ftype, dimensions (innermost first), name
        name_bytes = name.encode("utf-8")

        fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
        for dim in reversed(data.shape):
            fout.write(struct.pack("i", dim))
        fout.write(name_bytes)

        # data
        data.tofile(fout)

print("Done. Output file: " + fname_out)
print("")


================================================
FILE: examples/gpt-2/download-ggml-model.sh
================================================
#!/bin/bash

# This script downloads GPT-2 model files that have already been converted to ggml format.
# This way you don't have to convert them yourself.
#
# If you want to download the original GPT-2 model files, use the "download-model.sh" script instead.
#
# Usage: download-ggml-model.sh <model>
# The model is saved as models/gpt-2-<model>/ggml-model.bin, relative to the current directory.

#src="https://ggml.ggerganov.com"
#pfx="ggml-model-gpt-2"

src="https://huggingface.co/ggerganov/ggml"
pfx="resolve/main/ggml-model-gpt-2"

# GPT-2 models
models=( "117M" "345M" "774M" "1558M" )

# list available models
function list_models {
    local name
    printf "\n"
    printf "  Available models:"
    for name in "${models[@]}"; do
        printf " %s" "$name"
    done
    printf "\n\n"
}

if [ "$#" -ne 1 ]; then
    printf "Usage: %s <model>\n" "$0"
    list_models

    exit 1
fi

model=$1

# exact match against the list (a substring/regex test would also accept
# inputs such as "117M 345M")
known=0
for name in "${models[@]}"; do
    if [ "$name" = "$model" ]; then
        known=1
        break
    fi
done

if [ "$known" -ne 1 ]; then
    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
fi

# download ggml model

printf "Downloading ggml model %s ...\n" "$model"

out_dir="models/gpt-2-$model"
out_file="$out_dir/ggml-model.bin"
url="$src/$pfx-$model.bin"

mkdir -p "$out_dir"

if [ -x "$(command -v wget)" ]; then
    wget --quiet --show-progress -O "$out_file" "$url"
    status=$?
elif [ -x "$(command -v curl)" ]; then
    # --fail: exit non-zero on HTTP errors instead of saving the error page as the model
    curl -L --fail --output "$out_file" "$url"
    status=$?
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

if [ "$status" -ne 0 ]; then
    printf "Failed to download ggml model %s \n" "$model"
    printf "Please try again later or download the original GPT-2 model files and convert them yourself.\n"
    exit 1
fi

printf "Done! Model '%s' saved in '%s'\n" "$model" "$out_file"
printf "You can now use it like this:\n\n"
printf "  $ ./bin/gpt-2 -m %s -p \"This is an example\"\n" "$out_file"
printf "\n"


================================================
FILE: examples/gpt-2/download-model.sh
================================================
#!/bin/bash

ggml_path=$(dirname $(realpath $0))

# GPT-2 models
models=( "117M" "345M" "774M" "1558M" )

# list available models
function list_models {
    # Prints every entry of the global "models" array on a single line,
    # preceded by one blank line and followed by one blank line.

    # keep the loop variable local so it cannot overwrite the caller's $model
    local name

    printf "\n"
    printf "  Available models:"
    for name in "${models[@]}"; do
        # the name is passed as an argument, never used as the format string
        printf " %s" "$name"
    done
    printf "\n\n"
}

# exactly one argument is required: the name of the model to download
if [ "$#" -ne 1 ]; then
    printf "Usage: %s <model>\n" "$0"
    list_models

    exit 1
fi

model=$1

# Accept only an exact match against the known models. A substring test on the
# space-joined list would also accept inputs such as "117M 345M".
model_ok=0
for known in "${models[@]}"; do
    if [ "$known" = "$model" ]; then
        model_ok=1
        break
    fi
done

if [ "$model_ok" -ne 1 ]; then
    printf "Invalid model: %s\n" "$model"
    list_models

    exit 1
fi

# download model

printf "Downloading model %s ...\n" "$model"

out_dir="models/gpt-2-$model"
base_url="https://openaipublic.blob.core.windows.net/gpt-2/models/$model"

mkdir -p "$out_dir"

# choose the downloader once: wget if present, otherwise curl
if [ -x "$(command -v wget)" ]; then
    downloader=wget
elif [ -x "$(command -v curl)" ]; then
    downloader=curl
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

for file in checkpoint encoder.json hparams.json model.ckpt.data-00000-of-00001 model.ckpt.index model.ckpt.meta vocab.bpe; do
    if [ "$downloader" = "wget" ]; then
        wget --quiet --show-progress -O "$out_dir/$file" "$base_url/$file"
        status=$?
    else
        # --fail makes curl exit non-zero on HTTP errors instead of saving the error page
        curl -L --fail --output "$out_dir/$file" "$base_url/$file"
        status=$?
    fi

    # stop at the first failure instead of reporting success for an incomplete model
    if [ "$status" -ne 0 ]; then
        printf "Failed to download '%s' for model %s\n" "$file" "$model"
        exit 1
    fi
done

printf "Done! Model '%s' saved in '%s/'\n\n" "$model" "$out_dir"
printf "Run the convert-ckpt-to-ggml.py script to convert the model to ggml format.\n"
printf "\n"
printf "  python %s/convert-ckpt-to-ggml.py %s/\n" "$ggml_path" "$out_dir"
printf "\n"


================================================
FILE: examples/gpt-2/main-alloc.cpp
================================================
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#include "common.h"
#include "common-ggml.h"

#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

// Hyper-parameters of a GPT-2 model. The defaults below describe GPT-2 117M.
struct gpt2_hparams {
    int32_t n_vocab{50257};  // vocabulary size
    int32_t n_ctx{1024};     // context length
    int32_t n_embd{768};     // embedding width
    int32_t n_head{12};      // number of attention heads
    int32_t n_layer{12};     // number of layers
    int32_t ftype{1};        // weight file type, mapped to a ggml_type via ggml_ftype_to_ggml_type()
    float   eps{1e-5f};      // epsilon value; not stored in the model file
};

// Weights of a single layer. Every pointer starts out null so that a
// default-constructed layer never exposes indeterminate pointers;
// gpt2_model_load() assigns the real tensors.
struct gpt2_layer {
    // normalization
    struct ggml_tensor * ln_1_g = nullptr;
    struct ggml_tensor * ln_1_b = nullptr;

    struct ggml_tensor * ln_2_g = nullptr;
    struct ggml_tensor * ln_2_b = nullptr;

    // attention
    struct ggml_tensor * c_attn_attn_w = nullptr;
    struct ggml_tensor * c_attn_attn_b = nullptr;

    struct ggml_tensor * c_attn_proj_w = nullptr;
    struct ggml_tensor * c_attn_proj_b = nullptr;

    // mlp
    struct ggml_tensor * c_mlp_fc_w = nullptr;
    struct ggml_tensor * c_mlp_fc_b = nullptr;

    struct ggml_tensor * c_mlp_proj_w = nullptr;
    struct ggml_tensor * c_mlp_proj_b = nullptr;
};

// A loaded GPT-2 model: hyper-parameters, weight tensors and the key/value memory.
// All raw pointers start out null so that a model whose load failed (or never
// ran) does not carry indeterminate pointers.
struct gpt2_model {
    gpt2_hparams hparams;

    // normalization
    struct ggml_tensor * ln_f_g = nullptr;
    struct ggml_tensor * ln_f_b = nullptr;

    struct ggml_tensor * wte     = nullptr; //    token embedding
    struct ggml_tensor * wpe     = nullptr; // position embedding
    struct ggml_tensor * lm_head = nullptr; // language model head

    std::vector<gpt2_layer> layers;

    // key + value memory
    struct ggml_tensor * memory_k = nullptr;
    struct ggml_tensor * memory_v = nullptr;

    // ggml context the tensors are created in, and a name -> tensor lookup
    // that gpt2_model_load() uses to match tensors found in the model file
    struct ggml_context * ctx_w = nullptr;
    std::map<std::string, struct ggml_tensor *> tensors;
};

// load the model's weights from a file
//
// The file is read in this order: magic, the six integer hparams, the vocab
// (count, then length-prefixed tokens), and finally a sequence of tensor records
// (n_dims, name length, type, dims, name, raw data) until end of file.
//
//   fname - path of the model file
//   model - receives the hparams, the ggml context (model.ctx_w) and all tensors
//   vocab - receives the token <-> id mappings
//
// Returns true on success. On failure a message is printed to stderr and false
// is returned; a context already created in model.ctx_w is not freed here.
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
    printf("%s: loading model from '%s'\n", __func__, fname.c_str());

    auto fin = std::ifstream(fname, std::ios::binary);
    if (!fin) {
        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
        return false;
    }

    // verify magic
    {
        // zero-initialized so that a short read cannot leave an indeterminate value
        uint32_t magic = 0;
        fin.read((char *) &magic, sizeof(magic));
        if (!fin || magic != GGML_FILE_MAGIC) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
        }
    }

    // load hparams
    {
        auto & hparams = model.hparams;

        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        if (!fin) {
            fprintf(stderr, "%s: invalid model file '%s' (truncated hparams)\n", __func__, fname.c_str());
            return false;
        }

        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
        printf("%s: qntvr   = %d\n", __func__, qntvr);

        // these values size every allocation below, so reject nonsensical ones up front
        if (hparams.n_vocab <= 0 || hparams.n_ctx <= 0 || hparams.n_embd <= 0 ||
            hparams.n_head  <= 0 || hparams.n_layer <= 0) {
            fprintf(stderr, "%s: invalid model file '%s' (non-positive hparams)\n", __func__, fname.c_str());
            return false;
        }

        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
    }

    // load vocab
    {
        int32_t n_vocab = 0;
        fin.read((char *) &n_vocab, sizeof(n_vocab));

        if (n_vocab != model.hparams.n_vocab) {
            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
            return false;
        }

        std::string word;
        std::vector<char> buf(128);

        for (int i = 0; i < n_vocab; i++) {
            uint32_t len = 0;
            fin.read((char *) &len, sizeof(len));

            buf.resize(len);
            fin.read((char *) buf.data(), len);

            // a truncated file would otherwise fill the vocab with stale buffer contents
            if (!fin) {
                fprintf(stderr, "%s: invalid model file '%s' (failed to read vocab entry %d)\n",
                        __func__, fname.c_str(), i);
                return false;
            }

            word.assign(buf.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
    if (wtype == GGML_TYPE_COUNT) {
        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
                __func__, fname.c_str(), model.hparams.ftype);
        return false;
    }

    auto & ctx = model.ctx_w;

    size_t ctx_size = 0;

    // estimate how much memory the context needs for all tensors created below
    {
        const auto & hparams = model.hparams;

        const int n_embd  = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
        ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

        ctx_size += ggml_row_size(wtype,         n_vocab*n_embd); // wte
        ctx_size += ggml_row_size(GGML_TYPE_F32  , n_ctx*n_embd); // wpe
        ctx_size += ggml_row_size(wtype,         n_vocab*n_embd); // lm_head

        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

        ctx_size += n_layer*(ggml_row_size(wtype,         3*n_embd*n_embd)); // c_attn_attn_w
        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd));        // c_attn_attn_b

        ctx_size += n_layer*(ggml_row_size(wtype,         n_embd*n_embd));   // c_attn_proj_w
        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd));          // c_attn_proj_b

        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_fc_w
        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_fc_b

        ctx_size += n_layer*(ggml_row_size(wtype,         4*n_embd*n_embd)); // c_mlp_proj_w
        // reserves 4*n_embd although the tensor created below has n_embd elements (over-estimate only)
        ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd));        // c_mlp_proj_b

        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
        ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

        ctx_size += (6 + 12*n_layer)*512; // object overhead

        printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }

    // create the ggml context
    {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ctx_size,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };

        model.ctx_w = ggml_init(params);
        if (!model.ctx_w) {
            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
            return false;
        }
    }

    // prepare memory for the weights
    {
        const auto & hparams = model.hparams;

        const int n_embd  = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

        model.layers.resize(n_layer);

        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

        // map by name
        model.tensors["model/ln_f/g"] = model.ln_f_g;
        model.tensors["model/ln_f/b"] = model.ln_f_b;

        model.tensors["model/wte"]     = model.wte;
        model.tensors["model/wpe"]     = model.wpe;
        model.tensors["model/lm_head"] = model.lm_head;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            // map by name
            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;

            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;

            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;

            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;

            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;

            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
        }
    }

    // key + value memory
    {
        const auto & hparams = model.hparams;

        const int n_embd  = hparams.n_embd;
        const int n_layer = hparams.n_layer;
        const int n_ctx   = hparams.n_ctx;

        const int n_mem      = n_layer*n_ctx;
        const int n_elements = n_embd*n_mem;

        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);

        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
    }

    // load weights
    {
        size_t total_size = 0;

        bool has_lm_head = false;

        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
            }

            // The header comes straight from the file: n_dims indexes the two-element
            // ne[] below, length sizes the name string and ttype is passed to the
            // ggml type queries, so all three must be in range before they are used.
            if (n_dims < 0 || n_dims > 2 || length < 0 || ttype < 0 || ttype >= GGML_TYPE_COUNT) {
                fprintf(stderr, "%s: invalid tensor header in model file (n_dims = %d, name length = %d, type = %d)\n",
                        __func__, n_dims, length, ttype);
                return false;
            }

            int32_t nelements = 1;
            int32_t ne[2] = { 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

            std::string name(length, 0);
            fin.read(&name[0], length);

            if (!fin) {
                fprintf(stderr, "%s: unexpected end of model file while reading a tensor header\n", __func__);
                return false;
            }

            if (model.tensors.find(name) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
                return false;
            }

            auto tensor = model.tensors[name];
            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
                return false;
            }

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
                        __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                return false;
            }

            // for debugging
            if (0) {
                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
            }

            const size_t bpe = ggml_type_size(ggml_type(ttype));

            // byte size implied by the file's element count and type
            const size_t file_nbytes = (nelements*bpe)/ggml_blck_size(tensor->type);

            if (file_nbytes != ggml_nbytes(tensor)) {
                // "got" is what the file describes, "expected" is what the model tensor holds
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.c_str(), file_nbytes, ggml_nbytes(tensor));
                return false;
            }

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

            // a short read would leave the tensor partially filled
            if (!fin) {
                fprintf(stderr, "%s: unexpected end of model file while reading tensor '%s'\n", __func__, name.c_str());
                return false;
            }

            // GPT-2 models share the WTE tensor as the LM head
            if (name == "model/wte" && has_lm_head == false) {
                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
            }

            if (name == "model/lm_head") {
                has_lm_head = true;
            }

            total_size += ggml_nbytes(tensor);
        }

        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
    }

    fin.close();

    return true;
}

// build the computation graph
struct ggml_cgraph * gpt2_graph(
        const gpt2_model & model,
        const int n_past,
        const int n_tokens) {
    const int N = n_tokens;

    const auto & hparams = model.hparams;

    const int n_embd  = hparams.n_embd;
    const int n_layer = hparams.n_layer;
    const int n_ctx   = hparams.n_ctx;
    const int n_head  = hparams.n_head;

    // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
    static std::vector<uint8_t> buf(buf_size);

    struct ggml_init_params params = {
        /*.mem_size   =*/ buf_size,
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
    };

    struct ggml_context * ctx

Download .txt

gitextract_leym123p/

├── .editorconfig
├── .github/
│   ├── pull_request_template.md
│   └── workflows/
│       ├── ci.yml
│       └── release.yml
├── .gitignore
├── .gitmodules
├── AUTHORS
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── ci/
│   └── run.sh
├── cmake/
│   ├── GitVars.cmake
│   ├── common.cmake
│   └── ggml-config.cmake.in
├── docs/
│   └── gguf.md
├── examples/
│   ├── CMakeLists.txt
│   ├── common-ggml.cpp
│   ├── common-ggml.h
│   ├── common.cpp
│   ├── common.h
│   ├── gpt-2/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert-cerebras-to-ggml.py
│   │   ├── convert-ckpt-to-ggml.py
│   │   ├── convert-h5-to-ggml.py
│   │   ├── download-ggml-model.sh
│   │   ├── download-model.sh
│   │   ├── main-alloc.cpp
│   │   ├── main-backend.cpp
│   │   ├── main-batched.cpp
│   │   ├── main-ctx.cpp
│   │   ├── main-sched.cpp
│   │   └── quantize.cpp
│   ├── gpt-j/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert-h5-to-ggml.py
│   │   ├── download-ggml-model.sh
│   │   ├── download-model.sh
│   │   ├── main.cpp
│   │   └── quantize.cpp
│   ├── magika/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert.py
│   │   └── main.cpp
│   ├── mnist/
│   │   ├── .gitignore
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── mnist-common.cpp
│   │   ├── mnist-common.h
│   │   ├── mnist-eval.cpp
│   │   ├── mnist-train-cnn.py
│   │   ├── mnist-train-fc.py
│   │   ├── mnist-train.cpp
│   │   └── server.py
│   ├── perf-metal/
│   │   ├── CMakeLists.txt
│   │   └── perf-metal.cpp
│   ├── prompts/
│   │   ├── dolly-v2.txt
│   │   ├── gpt-2-chinese.txt
│   │   ├── gpt-2.txt
│   │   ├── gpt-j.txt
│   │   ├── gpt-neox-japanese.txt
│   │   ├── gpt-neox.txt
│   │   ├── polyglot-ko.txt
│   │   ├── replit.txt
│   │   ├── starcoder.txt
│   │   ├── test-cases.txt
│   │   ├── tokenize_huggingface.py
│   │   └── whisper.txt
│   ├── python/
│   │   ├── README.md
│   │   ├── api.h
│   │   ├── example_add_quant.py
│   │   ├── example_test_all_quants.py
│   │   ├── ggml/
│   │   │   ├── __init__.py
│   │   │   ├── __init__.pyi
│   │   │   ├── cffi.py
│   │   │   ├── ffi/
│   │   │   │   └── __init__.pyi
│   │   │   └── utils.py
│   │   ├── regenerate.py
│   │   ├── stubs.py
│   │   └── test_tensor.py
│   ├── sam/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── convert-pth-to-ggml.py
│   │   └── sam.cpp
│   ├── simple/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   ├── simple-backend.cpp
│   │   └── simple-ctx.cpp
│   ├── stb_image.h
│   ├── stb_image_write.h
│   ├── test-cmake/
│   │   ├── CMakeLists.txt
│   │   ├── README.md
│   │   └── test-cmake.cpp
│   └── yolo/
│       ├── CMakeLists.txt
│       ├── README.md
│       ├── convert-yolov3-tiny.py
│       ├── data/
│       │   └── coco.names
│       ├── yolo-image.cpp
│       ├── yolo-image.h
│       └── yolov3-tiny.cpp
├── ggml.pc.in
├── include/
│   ├── ggml-alloc.h
│   ├── ggml-backend.h
│   ├── ggml-blas.h
│   ├── ggml-cann.h
│   ├── ggml-cpp.h
│   ├── ggml-cpu.h
│   ├── ggml-cuda.h
│   ├── ggml-hexagon.h
│   ├── ggml-metal.h
│   ├── ggml-opencl.h
│   ├── ggml-openvino.h
│   ├── ggml-opt.h
│   ├── ggml-rpc.h
│   ├── ggml-sycl.h
│   ├── ggml-virtgpu.h
│   ├── ggml-vulkan.h
│   ├── ggml-webgpu.h
│   ├── ggml-zdnn.h
│   ├── ggml-zendnn.h
│   ├── ggml.h
│   └── gguf.h
├── requirements.txt
├── scripts/
│   ├── gen-authors.sh
│   ├── release.sh
│   ├── sync-llama-am.sh
│   ├── sync-llama.last
│   ├── sync-llama.sh
│   ├── sync-whisper-am.sh
│   ├── sync-whisper.last
│   └── sync-whisper.sh
├── src/
│   ├── CMakeLists.txt
│   ├── ggml-alloc.c
│   ├── ggml-backend-dl.cpp
│   ├── ggml-backend-dl.h
│   ├── ggml-backend-impl.h
│   ├── ggml-backend-reg.cpp
│   ├── ggml-backend.cpp
│   ├── ggml-blas/
│   │   ├── CMakeLists.txt
│   │   └── ggml-blas.cpp
│   ├── ggml-cann/
│   │   ├── CMakeLists.txt
│   │   ├── acl_tensor.cpp
│   │   ├── acl_tensor.h
│   │   ├── aclnn_ops.cpp
│   │   ├── aclnn_ops.h
│   │   ├── common.h
│   │   └── ggml-cann.cpp
│   ├── ggml-common.h
│   ├── ggml-cpu/
│   │   ├── CMakeLists.txt
│   │   ├── amx/
│   │   │   ├── amx.cpp
│   │   │   ├── amx.h
│   │   │   ├── common.h
│   │   │   ├── mmq.cpp
│   │   │   └── mmq.h
│   │   ├── arch/
│   │   │   ├── arm/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   ├── quants.c
│   │   │   │   └── repack.cpp
│   │   │   ├── loongarch/
│   │   │   │   └── quants.c
│   │   │   ├── powerpc/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   └── quants.c
│   │   │   ├── riscv/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   ├── quants.c
│   │   │   │   └── repack.cpp
│   │   │   ├── s390/
│   │   │   │   ├── cpu-feats.cpp
│   │   │   │   └── quants.c
│   │   │   ├── wasm/
│   │   │   │   └── quants.c
│   │   │   └── x86/
│   │   │       ├── cpu-feats.cpp
│   │   │       ├── quants.c
│   │   │       └── repack.cpp
│   │   ├── arch-fallback.h
│   │   ├── binary-ops.cpp
│   │   ├── binary-ops.h
│   │   ├── cmake/
│   │   │   └── FindSIMD.cmake
│   │   ├── common.h
│   │   ├── ggml-cpu-impl.h
│   │   ├── ggml-cpu.c
│   │   ├── ggml-cpu.cpp
│   │   ├── hbm.cpp
│   │   ├── hbm.h
│   │   ├── kleidiai/
│   │   │   ├── kernels.cpp
│   │   │   ├── kernels.h
│   │   │   ├── kleidiai.cpp
│   │   │   └── kleidiai.h
│   │   ├── llamafile/
│   │   │   ├── sgemm.cpp
│   │   │   └── sgemm.h
│   │   ├── ops.cpp
│   │   ├── ops.h
│   │   ├── quants.c
│   │   ├── quants.h
│   │   ├── repack.cpp
│   │   ├── repack.h
│   │   ├── simd-gemm.h
│   │   ├── simd-mappings.h
│   │   ├── spacemit/
│   │   │   ├── ime.cpp
│   │   │   ├── ime.h
│   │   │   ├── ime1_kernels.cpp
│   │   │   └── ime_kernels.h
│   │   ├── traits.cpp
│   │   ├── traits.h
│   │   ├── unary-ops.cpp
│   │   ├── unary-ops.h
│   │   ├── vec.cpp
│   │   └── vec.h
│   ├── ggml-cuda/
│   │   ├── CMakeLists.txt
│   │   ├── acc.cu
│   │   ├── acc.cuh
│   │   ├── add-id.cu
│   │   ├── add-id.cuh
│   │   ├── arange.cu
│   │   ├── arange.cuh
│   │   ├── argmax.cu
│   │   ├── argmax.cuh
│   │   ├── argsort.cu
│   │   ├── argsort.cuh
│   │   ├── binbcast.cu
│   │   ├── binbcast.cuh
│   │   ├── clamp.cu
│   │   ├── clamp.cuh
│   │   ├── common.cuh
│   │   ├── concat.cu
│   │   ├── concat.cuh
│   │   ├── conv-transpose-1d.cu
│   │   ├── conv-transpose-1d.cuh
│   │   ├── conv2d-dw.cu
│   │   ├── conv2d-dw.cuh
│   │   ├── conv2d-transpose.cu
│   │   ├── conv2d-transpose.cuh
│   │   ├── conv2d.cu
│   │   ├── conv2d.cuh
│   │   ├── convert.cu
│   │   ├── convert.cuh
│   │   ├── count-equal.cu
│   │   ├── count-equal.cuh
│   │   ├── cp-async.cuh
│   │   ├── cpy-utils.cuh
│   │   ├── cpy.cu
│   │   ├── cpy.cuh
│   │   ├── cross-entropy-loss.cu
│   │   ├── cross-entropy-loss.cuh
│   │   ├── cumsum.cu
│   │   ├── cumsum.cuh
│   │   ├── dequantize.cuh
│   │   ├── diag.cu
│   │   ├── diag.cuh
│   │   ├── diagmask.cu
│   │   ├── diagmask.cuh
│   │   ├── fattn-common.cuh
│   │   ├── fattn-mma-f16.cuh
│   │   ├── fattn-tile.cu
│   │   ├── fattn-tile.cuh
│   │   ├── fattn-vec.cuh
│   │   ├── fattn-wmma-f16.cu
│   │   ├── fattn-wmma-f16.cuh
│   │   ├── fattn.cu
│   │   ├── fattn.cuh
│   │   ├── fill.cu
│   │   ├── fill.cuh
│   │   ├── gated_delta_net.cu
│   │   ├── gated_delta_net.cuh
│   │   ├── getrows.cu
│   │   ├── getrows.cuh
│   │   ├── ggml-cuda.cu
│   │   ├── gla.cu
│   │   ├── gla.cuh
│   │   ├── im2col.cu
│   │   ├── im2col.cuh
│   │   ├── mean.cu
│   │   ├── mean.cuh
│   │   ├── mma.cuh
│   │   ├── mmf.cu
│   │   ├── mmf.cuh
│   │   ├── mmid.cu
│   │   ├── mmid.cuh
│   │   ├── mmq.cu
│   │   ├── mmq.cuh
│   │   ├── mmvf.cu
│   │   ├── mmvf.cuh
│   │   ├── mmvq.cu
│   │   ├── mmvq.cuh
│   │   ├── norm.cu
│   │   ├── norm.cuh
│   │   ├── opt-step-adamw.cu
│   │   ├── opt-step-adamw.cuh
│   │   ├── opt-step-sgd.cu
│   │   ├── opt-step-sgd.cuh
│   │   ├── out-prod.cu
│   │   ├── out-prod.cuh
│   │   ├── pad.cu
│   │   ├── pad.cuh
│   │   ├── pad_reflect_1d.cu
│   │   ├── pad_reflect_1d.cuh
│   │   ├── pool2d.cu
│   │   ├── pool2d.cuh
│   │   ├── quantize.cu
│   │   ├── quantize.cuh
│   │   ├── reduce_rows.cuh
│   │   ├── roll.cu
│   │   ├── roll.cuh
│   │   ├── rope.cu
│   │   ├── rope.cuh
│   │   ├── scale.cu
│   │   ├── scale.cuh
│   │   ├── set-rows.cu
│   │   ├── set-rows.cuh
│   │   ├── set.cu
│   │   ├── set.cuh
│   │   ├── softcap.cu
│   │   ├── softcap.cuh
│   │   ├── softmax.cu
│   │   ├── softmax.cuh
│   │   ├── solve_tri.cu
│   │   ├── solve_tri.cuh
│   │   ├── ssm-conv.cu
│   │   ├── ssm-conv.cuh
│   │   ├── ssm-scan.cu
│   │   ├── ssm-scan.cuh
│   │   ├── sum.cu
│   │   ├── sum.cuh
│   │   ├── sumrows.cu
│   │   ├── sumrows.cuh
│   │   ├── template-instances/
│   │   │   ├── fattn-mma-f16-instance-ncols1_1-ncols2_16.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_1-ncols2_32.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_1-ncols2_8.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_16-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_16-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_16-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_16.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_32.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_2-ncols2_8.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_32-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_32-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_16.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_4-ncols2_8.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_64-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_1.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_2.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_4.cu
│   │   │   ├── fattn-mma-f16-instance-ncols1_8-ncols2_8.cu
│   │   │   ├── fattn-tile-instance-dkq112-dv112.cu
│   │   │   ├── fattn-tile-instance-dkq128-dv128.cu
│   │   │   ├── fattn-tile-instance-dkq256-dv256.cu
│   │   │   ├── fattn-tile-instance-dkq40-dv40.cu
│   │   │   ├── fattn-tile-instance-dkq576-dv512.cu
│   │   │   ├── fattn-tile-instance-dkq64-dv64.cu
│   │   │   ├── fattn-tile-instance-dkq72-dv72.cu
│   │   │   ├── fattn-tile-instance-dkq80-dv80.cu
│   │   │   ├── fattn-tile-instance-dkq96-dv96.cu
│   │   │   ├── fattn-vec-instance-f16-f16.cu
│   │   │   ├── fattn-vec-instance-f16-q4_0.cu
│   │   │   ├── fattn-vec-instance-f16-q4_1.cu
│   │   │   ├── fattn-vec-instance-f16-q5_0.cu
│   │   │   ├── fattn-vec-instance-f16-q5_1.cu
│   │   │   ├── fattn-vec-instance-f16-q8_0.cu
│   │   │   ├── fattn-vec-instance-q4_0-f16.cu
│   │   │   ├── fattn-vec-instance-q4_0-q4_0.cu
│   │   │   ├── fattn-vec-instance-q4_0-q4_1.cu
│   │   │   ├── fattn-vec-instance-q4_0-q5_0.cu
│   │   │   ├── fattn-vec-instance-q4_0-q5_1.cu
│   │   │   ├── fattn-vec-instance-q4_0-q8_0.cu
│   │   │   ├── fattn-vec-instance-q4_1-f16.cu
│   │   │   ├── fattn-vec-instance-q4_1-q4_0.cu
│   │   │   ├── fattn-vec-instance-q4_1-q4_1.cu
│   │   │   ├── fattn-vec-instance-q4_1-q5_0.cu
│   │   │   ├── fattn-vec-instance-q4_1-q5_1.cu
│   │   │   ├── fattn-vec-instance-q4_1-q8_0.cu
│   │   │   ├── fattn-vec-instance-q5_0-f16.cu
│   │   │   ├── fattn-vec-instance-q5_0-q4_0.cu
│   │   │   ├── fattn-vec-instance-q5_0-q4_1.cu
│   │   │   ├── fattn-vec-instance-q5_0-q5_0.cu
│   │   │   ├── fattn-vec-instance-q5_0-q5_1.cu
│   │   │   ├── fattn-vec-instance-q5_0-q8_0.cu
│   │   │   ├── fattn-vec-instance-q5_1-f16.cu
│   │   │   ├── fattn-vec-instance-q5_1-q4_0.cu
│   │   │   ├── fattn-vec-instance-q5_1-q4_1.cu
│   │   │   ├── fattn-vec-instance-q5_1-q5_0.cu
│   │   │   ├── fattn-vec-instance-q5_1-q5_1.cu
│   │   │   ├── fattn-vec-instance-q5_1-q8_0.cu
│   │   │   ├── fattn-vec-instance-q8_0-f16.cu
│   │   │   ├── fattn-vec-instance-q8_0-q4_0.cu
│   │   │   ├── fattn-vec-instance-q8_0-q4_1.cu
│   │   │   ├── fattn-vec-instance-q8_0-q5_0.cu
│   │   │   ├── fattn-vec-instance-q8_0-q5_1.cu
│   │   │   ├── fattn-vec-instance-q8_0-q8_0.cu
│   │   │   ├── generate_cu_files.py
│   │   │   ├── mmf-instance-ncols_1.cu
│   │   │   ├── mmf-instance-ncols_10.cu
│   │   │   ├── mmf-instance-ncols_11.cu
│   │   │   ├── mmf-instance-ncols_12.cu
│   │   │   ├── mmf-instance-ncols_13.cu
│   │   │   ├── mmf-instance-ncols_14.cu
│   │   │   ├── mmf-instance-ncols_15.cu
│   │   │   ├── mmf-instance-ncols_16.cu
│   │   │   ├── mmf-instance-ncols_2.cu
│   │   │   ├── mmf-instance-ncols_3.cu
│   │   │   ├── mmf-instance-ncols_4.cu
│   │   │   ├── mmf-instance-ncols_5.cu
│   │   │   ├── mmf-instance-ncols_6.cu
│   │   │   ├── mmf-instance-ncols_7.cu
│   │   │   ├── mmf-instance-ncols_8.cu
│   │   │   ├── mmf-instance-ncols_9.cu
│   │   │   ├── mmq-instance-iq1_s.cu
│   │   │   ├── mmq-instance-iq2_s.cu
│   │   │   ├── mmq-instance-iq2_xs.cu
│   │   │   ├── mmq-instance-iq2_xxs.cu
│   │   │   ├── mmq-instance-iq3_s.cu
│   │   │   ├── mmq-instance-iq3_xxs.cu
│   │   │   ├── mmq-instance-iq4_nl.cu
│   │   │   ├── mmq-instance-iq4_xs.cu
│   │   │   ├── mmq-instance-mxfp4.cu
│   │   │   ├── mmq-instance-q2_k.cu
│   │   │   ├── mmq-instance-q3_k.cu
│   │   │   ├── mmq-instance-q4_0.cu
│   │   │   ├── mmq-instance-q4_1.cu
│   │   │   ├── mmq-instance-q4_k.cu
│   │   │   ├── mmq-instance-q5_0.cu
│   │   │   ├── mmq-instance-q5_1.cu
│   │   │   ├── mmq-instance-q5_k.cu
│   │   │   ├── mmq-instance-q6_k.cu
│   │   │   └── mmq-instance-q8_0.cu
│   │   ├── top-k.cu
│   │   ├── top-k.cuh
│   │   ├── topk-moe.cu
│   │   ├── topk-moe.cuh
│   │   ├── tri.cu
│   │   ├── tri.cuh
│   │   ├── tsembd.cu
│   │   ├── tsembd.cuh
│   │   ├── unary.cu
│   │   ├── unary.cuh
│   │   ├── upscale.cu
│   │   ├── upscale.cuh
│   │   ├── vecdotq.cuh
│   │   ├── vendors/
│   │   │   ├── cuda.h
│   │   │   ├── hip.h
│   │   │   └── musa.h
│   │   ├── wkv.cu
│   │   └── wkv.cuh
│   ├── ggml-hexagon/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-hexagon.cpp
│   │   ├── htp/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── act-ops.c
│   │   │   ├── argsort-ops.c
│   │   │   ├── binary-ops.c
│   │   │   ├── cmake-toolchain.cmake
│   │   │   ├── cpy-ops.c
│   │   │   ├── flash-attn-ops.c
│   │   │   ├── get-rows-ops.c
│   │   │   ├── hex-dma.c
│   │   │   ├── hex-dma.h
│   │   │   ├── hex-dump.h
│   │   │   ├── hex-fastdiv.h
│   │   │   ├── hex-utils.h
│   │   │   ├── htp-ctx.h
│   │   │   ├── htp-msg.h
│   │   │   ├── htp-ops.h
│   │   │   ├── htp_iface.idl
│   │   │   ├── hvx-arith.h
│   │   │   ├── hvx-base.h
│   │   │   ├── hvx-copy.h
│   │   │   ├── hvx-div.h
│   │   │   ├── hvx-dump.h
│   │   │   ├── hvx-exp.h
│   │   │   ├── hvx-floor.h
│   │   │   ├── hvx-inverse.h
│   │   │   ├── hvx-reduce.h
│   │   │   ├── hvx-scale.h
│   │   │   ├── hvx-sigmoid.h
│   │   │   ├── hvx-sqrt.h
│   │   │   ├── hvx-types.h
│   │   │   ├── hvx-utils.h
│   │   │   ├── main.c
│   │   │   ├── matmul-ops.c
│   │   │   ├── rope-ops.c
│   │   │   ├── set-rows-ops.c
│   │   │   ├── softmax-ops.c
│   │   │   ├── ssm-conv.c
│   │   │   ├── sum-rows-ops.c
│   │   │   ├── unary-ops.c
│   │   │   ├── worker-pool.c
│   │   │   └── worker-pool.h
│   │   ├── htp-drv.cpp
│   │   ├── htp-drv.h
│   │   ├── libdl.h
│   │   ├── libggml-htp.inf
│   │   └── op-desc.h
│   ├── ggml-hip/
│   │   └── CMakeLists.txt
│   ├── ggml-impl.h
│   ├── ggml-metal/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-metal-common.cpp
│   │   ├── ggml-metal-common.h
│   │   ├── ggml-metal-context.h
│   │   ├── ggml-metal-context.m
│   │   ├── ggml-metal-device.cpp
│   │   ├── ggml-metal-device.h
│   │   ├── ggml-metal-device.m
│   │   ├── ggml-metal-impl.h
│   │   ├── ggml-metal-ops.cpp
│   │   ├── ggml-metal-ops.h
│   │   ├── ggml-metal.cpp
│   │   └── ggml-metal.metal
│   ├── ggml-musa/
│   │   ├── CMakeLists.txt
│   │   ├── mudnn.cu
│   │   └── mudnn.cuh
│   ├── ggml-opencl/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-opencl.cpp
│   │   └── kernels/
│   │       ├── add.cl
│   │       ├── add_id.cl
│   │       ├── argsort.cl
│   │       ├── clamp.cl
│   │       ├── concat.cl
│   │       ├── conv2d.cl
│   │       ├── conv2d_f16_f32.cl
│   │       ├── cpy.cl
│   │       ├── cumsum.cl
│   │       ├── cvt.cl
│   │       ├── diag.cl
│   │       ├── diag_mask_inf.cl
│   │       ├── div.cl
│   │       ├── embed_kernel.py
│   │       ├── exp.cl
│   │       ├── expm1.cl
│   │       ├── fill.cl
│   │       ├── flash_attn_f16.cl
│   │       ├── flash_attn_f32.cl
│   │       ├── flash_attn_f32_f16.cl
│   │       ├── gelu.cl
│   │       ├── gemm_moe_mxfp4_f32.cl
│   │       ├── gemm_noshuffle_q4_1_f32.cl
│   │       ├── gemv_moe_mxfp4_f32.cl
│   │       ├── gemv_noshuffle.cl
│   │       ├── gemv_noshuffle_general.cl
│   │       ├── gemv_noshuffle_general_q8_0_f32.cl
│   │       ├── gemv_noshuffle_q4_1_f32.cl
│   │       ├── get_rows.cl
│   │       ├── glu.cl
│   │       ├── group_norm.cl
│   │       ├── im2col_f16.cl
│   │       ├── im2col_f32.cl
│   │       ├── l2_norm.cl
│   │       ├── mean.cl
│   │       ├── mul.cl
│   │       ├── mul_mat_Ab_Bi_8x4.cl
│   │       ├── mul_mat_f16_f32.cl
│   │       ├── mul_mm_f16_f32_kq_kqv.cl
│   │       ├── mul_mm_f16_f32_l4_lm.cl
│   │       ├── mul_mm_f32_f32_l4_lm.cl
│   │       ├── mul_mm_q4_0_f32_l4_lm.cl
│   │       ├── mul_mm_q4_1_f32_l4_lm.cl
│   │       ├── mul_mm_q6_k_f32_l4_lm.cl
│   │       ├── mul_mm_q8_0_f32_8x4.cl
│   │       ├── mul_mm_q8_0_f32_l4_lm.cl
│   │       ├── mul_mv_f16_f16.cl
│   │       ├── mul_mv_f16_f32.cl
│   │       ├── mul_mv_f16_f32_1row.cl
│   │       ├── mul_mv_f16_f32_l4.cl
│   │       ├── mul_mv_f32_f32.cl
│   │       ├── mul_mv_id_mxfp4_f32.cl
│   │       ├── mul_mv_id_mxfp4_f32_flat.cl
│   │       ├── mul_mv_id_q4_0_f32_8x_flat.cl
│   │       ├── mul_mv_id_q8_0_f32.cl
│   │       ├── mul_mv_id_q8_0_f32_flat.cl
│   │       ├── mul_mv_mxfp4_f32.cl
│   │       ├── mul_mv_mxfp4_f32_flat.cl
│   │       ├── mul_mv_q4_0_f32.cl
│   │       ├── mul_mv_q4_0_f32_1d_16x_flat.cl
│   │       ├── mul_mv_q4_0_f32_1d_8x_flat.cl
│   │       ├── mul_mv_q4_0_f32_8x_flat.cl
│   │       ├── mul_mv_q4_0_f32_v.cl
│   │       ├── mul_mv_q4_1_f32.cl
│   │       ├── mul_mv_q4_1_f32_flat.cl
│   │       ├── mul_mv_q4_k_f32.cl
│   │       ├── mul_mv_q6_k_f32.cl
│   │       ├── mul_mv_q6_k_f32_flat.cl
│   │       ├── mul_mv_q8_0_f32.cl
│   │       ├── mul_mv_q8_0_f32_flat.cl
│   │       ├── neg.cl
│   │       ├── norm.cl
│   │       ├── pad.cl
│   │       ├── relu.cl
│   │       ├── repeat.cl
│   │       ├── rms_norm.cl
│   │       ├── rope.cl
│   │       ├── scale.cl
│   │       ├── set_rows.cl
│   │       ├── sigmoid.cl
│   │       ├── silu.cl
│   │       ├── softmax_4_f16.cl
│   │       ├── softmax_4_f32.cl
│   │       ├── softmax_f16.cl
│   │       ├── softmax_f32.cl
│   │       ├── softplus.cl
│   │       ├── solve_tri.cl
│   │       ├── sqr.cl
│   │       ├── sqrt.cl
│   │       ├── ssm_conv.cl
│   │       ├── sub.cl
│   │       ├── sum_rows.cl
│   │       ├── tanh.cl
│   │       ├── transpose.cl
│   │       ├── tri.cl
│   │       ├── tsembd.cl
│   │       └── upscale.cl
│   ├── ggml-openvino/
│   │   ├── .clang-format
│   │   ├── CMakeLists.txt
│   │   ├── ggml-decoder.cpp
│   │   ├── ggml-decoder.h
│   │   ├── ggml-openvino-extra.cpp
│   │   ├── ggml-openvino-extra.h
│   │   ├── ggml-openvino.cpp
│   │   ├── ggml-quants.cpp
│   │   ├── ggml-quants.h
│   │   ├── openvino/
│   │   │   ├── decoder.h
│   │   │   ├── frontend.cpp
│   │   │   ├── frontend.h
│   │   │   ├── input_model.cpp
│   │   │   ├── input_model.h
│   │   │   ├── node_context.h
│   │   │   ├── op/
│   │   │   │   ├── cont.cpp
│   │   │   │   ├── cpy.cpp
│   │   │   │   ├── flash_attn_ext.cpp
│   │   │   │   ├── get_rows.cpp
│   │   │   │   ├── glu_geglu.cpp
│   │   │   │   ├── glu_swiglu.cpp
│   │   │   │   ├── mulmat.cpp
│   │   │   │   ├── permute.cpp
│   │   │   │   ├── reshape.cpp
│   │   │   │   ├── rms_norm.cpp
│   │   │   │   ├── rope.cpp
│   │   │   │   ├── scale.cpp
│   │   │   │   ├── set_rows.cpp
│   │   │   │   ├── softmax.cpp
│   │   │   │   ├── transpose.cpp
│   │   │   │   ├── unary_silu.cpp
│   │   │   │   └── view.cpp
│   │   │   ├── op_table.cpp
│   │   │   ├── op_table.h
│   │   │   ├── pass/
│   │   │   │   ├── eliminate_zp.cpp
│   │   │   │   ├── eliminate_zp.h
│   │   │   │   ├── fuse_to_sdpa.cpp
│   │   │   │   ├── fuse_to_sdpa.h
│   │   │   │   ├── mark_decompression_convert_constant_folding.h
│   │   │   │   ├── squeeze_matmul.cpp
│   │   │   │   └── squeeze_matmul.h
│   │   │   ├── translate_session.cpp
│   │   │   ├── translate_session.h
│   │   │   ├── utils.cpp
│   │   │   └── utils.h
│   │   ├── utils.cpp
│   │   └── utils.h
│   ├── ggml-opt.cpp
│   ├── ggml-quants.c
│   ├── ggml-quants.h
│   ├── ggml-rpc/
│   │   ├── CMakeLists.txt
│   │   └── ggml-rpc.cpp
│   ├── ggml-sycl/
│   │   ├── CMakeLists.txt
│   │   ├── add-id.cpp
│   │   ├── add-id.hpp
│   │   ├── backend.hpp
│   │   ├── binbcast.cpp
│   │   ├── binbcast.hpp
│   │   ├── common.cpp
│   │   ├── common.hpp
│   │   ├── concat.cpp
│   │   ├── concat.hpp
│   │   ├── conv.cpp
│   │   ├── conv.hpp
│   │   ├── convert.cpp
│   │   ├── convert.hpp
│   │   ├── count-equal.cpp
│   │   ├── count-equal.hpp
│   │   ├── cpy.cpp
│   │   ├── cpy.hpp
│   │   ├── dequantize.hpp
│   │   ├── dmmv.cpp
│   │   ├── dmmv.hpp
│   │   ├── dpct/
│   │   │   └── helper.hpp
│   │   ├── element_wise.cpp
│   │   ├── element_wise.hpp
│   │   ├── fattn-common.hpp
│   │   ├── fattn-tile.cpp
│   │   ├── fattn-tile.hpp
│   │   ├── fattn-vec.hpp
│   │   ├── fattn.cpp
│   │   ├── fattn.hpp
│   │   ├── gated_delta_net.cpp
│   │   ├── gated_delta_net.hpp
│   │   ├── gemm.hpp
│   │   ├── getrows.cpp
│   │   ├── getrows.hpp
│   │   ├── ggml-sycl.cpp
│   │   ├── gla.cpp
│   │   ├── gla.hpp
│   │   ├── im2col.cpp
│   │   ├── im2col.hpp
│   │   ├── mmq.cpp
│   │   ├── mmq.hpp
│   │   ├── mmvq.cpp
│   │   ├── mmvq.hpp
│   │   ├── norm.cpp
│   │   ├── norm.hpp
│   │   ├── outprod.cpp
│   │   ├── outprod.hpp
│   │   ├── pad.cpp
│   │   ├── pad.hpp
│   │   ├── pad_reflect_1d.cpp
│   │   ├── pad_reflect_1d.hpp
│   │   ├── presets.hpp
│   │   ├── quantize.hpp
│   │   ├── quants.hpp
│   │   ├── repeat_back.cpp
│   │   ├── repeat_back.hpp
│   │   ├── roll.cpp
│   │   ├── roll.hpp
│   │   ├── rope.cpp
│   │   ├── rope.hpp
│   │   ├── set.cpp
│   │   ├── set.hpp
│   │   ├── set_rows.cpp
│   │   ├── set_rows.hpp
│   │   ├── softmax.cpp
│   │   ├── softmax.hpp
│   │   ├── ssm_conv.cpp
│   │   ├── ssm_conv.hpp
│   │   ├── sycl_hw.cpp
│   │   ├── sycl_hw.hpp
│   │   ├── template-instances/
│   │   │   ├── fattn-tile-instance-dkq112-dv112.cpp
│   │   │   ├── fattn-tile-instance-dkq128-dv128.cpp
│   │   │   ├── fattn-tile-instance-dkq256-dv256.cpp
│   │   │   ├── fattn-tile-instance-dkq40-dv40.cpp
│   │   │   ├── fattn-tile-instance-dkq576-dv512.cpp
│   │   │   ├── fattn-tile-instance-dkq64-dv64.cpp
│   │   │   ├── fattn-tile-instance-dkq72-dv72.cpp
│   │   │   ├── fattn-tile-instance-dkq80-dv80.cpp
│   │   │   ├── fattn-tile-instance-dkq96-dv96.cpp
│   │   │   ├── fattn-vec-instance-f16-f16.cpp
│   │   │   ├── fattn-vec-instance-f16-q4_0.cpp
│   │   │   ├── fattn-vec-instance-f16-q4_1.cpp
│   │   │   ├── fattn-vec-instance-f16-q5_0.cpp
│   │   │   ├── fattn-vec-instance-f16-q5_1.cpp
│   │   │   ├── fattn-vec-instance-f16-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q4_0-f16.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q4_0-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q4_1-f16.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q4_1-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q5_0-f16.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q5_0-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q5_1-f16.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q5_1.cpp
│   │   │   ├── fattn-vec-instance-q5_1-q8_0.cpp
│   │   │   ├── fattn-vec-instance-q8_0-f16.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q4_0.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q4_1.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q5_0.cpp
│   │   │   ├── fattn-vec-instance-q8_0-q5_1.cpp
│   │   │   └── fattn-vec-instance-q8_0-q8_0.cpp
│   │   ├── tsembd.cpp
│   │   ├── tsembd.hpp
│   │   ├── vecdotq.hpp
│   │   ├── wkv.cpp
│   │   └── wkv.hpp
│   ├── ggml-threading.cpp
│   ├── ggml-threading.h
│   ├── ggml-virtgpu/
│   │   ├── CMakeLists.txt
│   │   ├── apir_cs_ggml-rpc-front.cpp
│   │   ├── backend/
│   │   │   ├── CMakeLists.txt
│   │   │   ├── apir_cs_ggml-rpc-back.cpp
│   │   │   ├── backend-convert.h
│   │   │   ├── backend-dispatched-backend.cpp
│   │   │   ├── backend-dispatched-buffer-type.cpp
│   │   │   ├── backend-dispatched-buffer.cpp
│   │   │   ├── backend-dispatched-device.cpp
│   │   │   ├── backend-dispatched.cpp
│   │   │   ├── backend-dispatched.gen.h
│   │   │   ├── backend-dispatched.h
│   │   │   ├── backend-virgl-apir.h
│   │   │   ├── backend.cpp
│   │   │   └── shared/
│   │   │       ├── api_remoting.h
│   │   │       ├── apir_backend.gen.h
│   │   │       ├── apir_backend.h
│   │   │       ├── apir_cs.h
│   │   │       ├── apir_cs_ggml.h
│   │   │       └── apir_cs_rpc.h
│   │   ├── ggml-backend-buffer-type.cpp
│   │   ├── ggml-backend-buffer.cpp
│   │   ├── ggml-backend-device.cpp
│   │   ├── ggml-backend-reg.cpp
│   │   ├── ggml-backend.cpp
│   │   ├── ggml-remoting.h
│   │   ├── ggmlremoting_functions.yaml
│   │   ├── include/
│   │   │   └── apir_hw.h
│   │   ├── regenerate_remoting.py
│   │   ├── virtgpu-apir.h
│   │   ├── virtgpu-forward-backend.cpp
│   │   ├── virtgpu-forward-buffer-type.cpp
│   │   ├── virtgpu-forward-buffer.cpp
│   │   ├── virtgpu-forward-device.cpp
│   │   ├── virtgpu-forward-impl.h
│   │   ├── virtgpu-forward.gen.h
│   │   ├── virtgpu-shm.cpp
│   │   ├── virtgpu-shm.h
│   │   ├── virtgpu-utils.cpp
│   │   ├── virtgpu-utils.h
│   │   ├── virtgpu.cpp
│   │   └── virtgpu.h
│   ├── ggml-vulkan/
│   │   ├── CMakeLists.txt
│   │   ├── cmake/
│   │   │   └── host-toolchain.cmake.in
│   │   ├── ggml-vulkan.cpp
│   │   └── vulkan-shaders/
│   │       ├── CMakeLists.txt
│   │       ├── abs.comp
│   │       ├── acc.comp
│   │       ├── add.comp
│   │       ├── add1.comp
│   │       ├── add_id.comp
│   │       ├── arange.comp
│   │       ├── argmax.comp
│   │       ├── argsort.comp
│   │       ├── argsort_large.comp
│   │       ├── ceil.comp
│   │       ├── clamp.comp
│   │       ├── concat.comp
│   │       ├── contig_copy.comp
│   │       ├── conv2d_dw.comp
│   │       ├── conv2d_mm.comp
│   │       ├── conv_transpose_1d.comp
│   │       ├── copy.comp
│   │       ├── copy_from_quant.comp
│   │       ├── copy_to_quant.comp
│   │       ├── copy_transpose.comp
│   │       ├── cos.comp
│   │       ├── count_equal.comp
│   │       ├── count_experts.comp
│   │       ├── cumsum.comp
│   │       ├── cumsum_multipass1.comp
│   │       ├── cumsum_multipass2.comp
│   │       ├── dequant_f32.comp
│   │       ├── dequant_funcs.glsl
│   │       ├── dequant_funcs_cm2.glsl
│   │       ├── dequant_head.glsl
│   │       ├── dequant_iq1_m.comp
│   │       ├── dequant_iq1_s.comp
│   │       ├── dequant_iq2_s.comp
│   │       ├── dequant_iq2_xs.comp
│   │       ├── dequant_iq2_xxs.comp
│   │       ├── dequant_iq3_s.comp
│   │       ├── dequant_iq3_xxs.comp
│   │       ├── dequant_iq4_nl.comp
│   │       ├── dequant_iq4_xs.comp
│   │       ├── dequant_mxfp4.comp
│   │       ├── dequant_q2_k.comp
│   │       ├── dequant_q3_k.comp
│   │       ├── dequant_q4_0.comp
│   │       ├── dequant_q4_1.comp
│   │       ├── dequant_q4_k.comp
│   │       ├── dequant_q5_0.comp
│   │       ├── dequant_q5_1.comp
│   │       ├── dequant_q5_k.comp
│   │       ├── dequant_q6_k.comp
│   │       ├── dequant_q8_0.comp
│   │       ├── diag.comp
│   │       ├── diag_mask_inf.comp
│   │       ├── div.comp
│   │       ├── elu.comp
│   │       ├── exp.comp
│   │       ├── feature-tests/
│   │       │   ├── bfloat16.comp
│   │       │   ├── coopmat.comp
│   │       │   ├── coopmat2.comp
│   │       │   └── integer_dot.comp
│   │       ├── fill.comp
│   │       ├── flash_attn.comp
│   │       ├── flash_attn_base.glsl
│   │       ├── flash_attn_cm1.comp
│   │       ├── flash_attn_cm2.comp
│   │       ├── flash_attn_mask_opt.comp
│   │       ├── flash_attn_split_k_reduce.comp
│   │       ├── floor.comp
│   │       ├── gated_delta_net.comp
│   │       ├── geglu.comp
│   │       ├── geglu_erf.comp
│   │       ├── geglu_quick.comp
│   │       ├── gelu.comp
│   │       ├── gelu_erf.comp
│   │       ├── gelu_quick.comp
│   │       ├── generic_binary_head.glsl
│   │       ├── generic_head.glsl
│   │       ├── generic_unary_head.glsl
│   │       ├── get_rows.comp
│   │       ├── get_rows_quant.comp
│   │       ├── glu_head.glsl
│   │       ├── glu_main.glsl
│   │       ├── group_norm.comp
│   │       ├── hardsigmoid.comp
│   │       ├── hardswish.comp
│   │       ├── im2col.comp
│   │       ├── im2col_3d.comp
│   │       ├── l2_norm.comp
│   │       ├── leaky_relu.comp
│   │       ├── log.comp
│   │       ├── mul.comp
│   │       ├── mul_mat_split_k_reduce.comp
│   │       ├── mul_mat_vec.comp
│   │       ├── mul_mat_vec_base.glsl
│   │       ├── mul_mat_vec_iface.glsl
│   │       ├── mul_mat_vec_iq1_m.comp
│   │       ├── mul_mat_vec_iq1_s.comp
│   │       ├── mul_mat_vec_iq2_s.comp
│   │       ├── mul_mat_vec_iq2_xs.comp
│   │       ├── mul_mat_vec_iq2_xxs.comp
│   │       ├── mul_mat_vec_iq3_s.comp
│   │       ├── mul_mat_vec_iq3_xxs.comp
│   │       ├── mul_mat_vec_nc.comp
│   │       ├── mul_mat_vec_p021.comp
│   │       ├── mul_mat_vec_q2_k.comp
│   │       ├── mul_mat_vec_q3_k.comp
│   │       ├── mul_mat_vec_q4_k.comp
│   │       ├── mul_mat_vec_q5_k.comp
│   │       ├── mul_mat_vec_q6_k.comp
│   │       ├── mul_mat_vecq.comp
│   │       ├── mul_mat_vecq_funcs.glsl
│   │       ├── mul_mm.comp
│   │       ├── mul_mm_cm2.comp
│   │       ├── mul_mm_funcs.glsl
│   │       ├── mul_mm_id_funcs.glsl
│   │       ├── mul_mmq.comp
│   │       ├── mul_mmq_funcs.glsl
│   │       ├── mul_mmq_shmem_types.glsl
│   │       ├── multi_add.comp
│   │       ├── neg.comp
│   │       ├── norm.comp
│   │       ├── opt_step_adamw.comp
│   │       ├── opt_step_sgd.comp
│   │       ├── pad.comp
│   │       ├── pool2d.comp
│   │       ├── quantize_q8_1.comp
│   │       ├── reglu.comp
│   │       ├── relu.comp
│   │       ├── repeat.comp
│   │       ├── repeat_back.comp
│   │       ├── rms_norm.comp
│   │       ├── rms_norm_back.comp
│   │       ├── rms_norm_partials.comp
│   │       ├── roll.comp
│   │       ├── rope_funcs.glsl
│   │       ├── rope_head.glsl
│   │       ├── rope_multi.comp
│   │       ├── rope_neox.comp
│   │       ├── rope_norm.comp
│   │       ├── rope_params.glsl
│   │       ├── rope_vision.comp
│   │       ├── round.comp
│   │       ├── rte.glsl
│   │       ├── scale.comp
│   │       ├── sgn.comp
│   │       ├── sigmoid.comp
│   │       ├── silu.comp
│   │       ├── silu_back.comp
│   │       ├── sin.comp
│   │       ├── soft_max.comp
│   │       ├── soft_max_back.comp
│   │       ├── soft_max_large1.comp
│   │       ├── soft_max_large2.comp
│   │       ├── soft_max_large3.comp
│   │       ├── soft_max_large_common.glsl
│   │       ├── softplus.comp
│   │       ├── solve_tri.comp
│   │       ├── sqrt.comp
│   │       ├── square.comp
│   │       ├── ssm_conv.comp
│   │       ├── ssm_scan.comp
│   │       ├── step.comp
│   │       ├── sub.comp
│   │       ├── sum_rows.comp
│   │       ├── sum_rows.glsl
│   │       ├── swiglu.comp
│   │       ├── swiglu_oai.comp
│   │       ├── tanh.comp
│   │       ├── timestep_embedding.comp
│   │       ├── topk_argsort.comp
│   │       ├── topk_moe.comp
│   │       ├── topk_nary_search.comp
│   │       ├── tri.comp
│   │       ├── trunc.comp
│   │       ├── types.glsl
│   │       ├── upscale.comp
│   │       ├── utils.glsl
│   │       ├── vulkan-shaders-gen.cpp
│   │       ├── wkv6.comp
│   │       ├── wkv7.comp
│   │       └── xielu.comp
│   ├── ggml-webgpu/
│   │   ├── CMakeLists.txt
│   │   ├── ggml-webgpu-shader-lib.hpp
│   │   ├── ggml-webgpu.cpp
│   │   ├── pre_wgsl.hpp
│   │   └── wgsl-shaders/
│   │       ├── argmax.wgsl
│   │       ├── argsort.wgsl
│   │       ├── argsort_merge.wgsl
│   │       ├── binary.wgsl
│   │       ├── common_decls.tmpl
│   │       ├── concat.wgsl
│   │       ├── cpy.tmpl.wgsl
│   │       ├── cumsum.wgsl
│   │       ├── embed_wgsl.py
│   │       ├── flash_attn.wgsl
│   │       ├── get_rows.wgsl
│   │       ├── glu.tmpl.wgsl
│   │       ├── memset.wgsl
│   │       ├── mul_mat.wgsl
│   │       ├── mul_mat_decls.tmpl
│   │       ├── mul_mat_reg_tile.wgsl
│   │       ├── mul_mat_subgroup_matrix.wgsl
│   │       ├── mul_mat_vec.wgsl
│   │       ├── pad.wgsl
│   │       ├── repeat.wgsl
│   │       ├── rms_norm.wgsl
│   │       ├── rope.tmpl.wgsl
│   │       ├── scale.wgsl
│   │       ├── set_rows.wgsl
│   │       ├── soft_max.tmpl.wgsl
│   │       ├── sum_rows.wgsl
│   │       └── unary.wgsl
│   ├── ggml-zdnn/
│   │   ├── .gitignore
│   │   ├── CMakeLists.txt
│   │   ├── common.hpp
│   │   ├── ggml-zdnn.cpp
│   │   ├── mmf.cpp
│   │   ├── mmf.hpp
│   │   ├── utils.cpp
│   │   └── utils.hpp
│   ├── ggml-zendnn/
│   │   ├── CMakeLists.txt
│   │   └── ggml-zendnn.cpp
│   ├── ggml.c
│   ├── ggml.cpp
│   └── gguf.cpp
└── tests/
    ├── CMakeLists.txt
    ├── test-arange.cpp
    ├── test-backend-ops.cpp
    ├── test-cont.c
    ├── test-conv-transpose-1d.cpp
    ├── test-conv-transpose.c
    ├── test-conv1d-dw-c1.cpp
    ├── test-conv1d-dw-c2.cpp
    ├── test-conv1d.cpp
    ├── test-conv2d-dw.cpp
    ├── test-conv2d.cpp
    ├── test-customop.c
    ├── test-dup.c
    ├── test-interpolate.cpp
    ├── test-opt.cpp
    ├── test-pad-reflect-1d.cpp
    ├── test-pool.c
    ├── test-quantize-fns.cpp
    ├── test-quantize-perf.cpp
    ├── test-rel-pos.c
    ├── test-roll.cpp
    └── test-timestep_embedding.cpp

Copy disabled (too large) Download .json

Condensed preview — 1073 files, each showing path, character count, and a content snippet. Download the .json file for the full structured content (16,002K chars).

[
  {
    "path": ".editorconfig",
    "chars": 375,
    "preview": "# https://EditorConfig.org\n\n# Top-most EditorConfig file\nroot = true\n\n# Unix-style newlines with a newline ending every "
  },
  {
    "path": ".github/pull_request_template.md",
    "chars": 226,
    "preview": "*For changes to the core `ggml` library (including to the CMake build system), please open a PR in https://github.com/gg"
  },
  {
    "path": ".github/workflows/ci.yml",
    "chars": 7101,
    "preview": "name: CI\n\non:\n  push:\n    branches: [ master ]\n  pull_request:\n    branches: [ master ]\n\nconcurrency:\n  group: ${{ githu"
  },
  {
    "path": ".github/workflows/release.yml",
    "chars": 512,
    "preview": "name: Release\n\non:\n  push:\n    tags:\n      - 'v*'\n\njobs:\n  release:\n    runs-on: ubuntu-latest\n    permissions:\n      co"
  },
  {
    "path": ".gitignore",
    "chars": 304,
    "preview": "build/\nbuild-*/\nout/\ntmp/\nmodels/\nmodels-mnt\n\ncompile_commands.json\nCMakeSettings.json\n.vs/\n.vscode/\n.idea/\n.clangd\n\n.ve"
  },
  {
    "path": ".gitmodules",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "AUTHORS",
    "chars": 14308,
    "preview": "# date: Tue Feb  4 13:03:51 EET 2025\n# this file is auto-generated by scripts/gen-authors.sh\n\n0cc4m <picard12@live.de>\n6"
  },
  {
    "path": "CMakeLists.txt",
    "chars": 21000,
    "preview": "cmake_minimum_required(VERSION 3.14...3.28) # for add_link_options and implicit target directories.\nproject(\"ggml\" C CXX"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 361,
    "preview": "Please use [llama.cpp's contribution guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) for "
  },
  {
    "path": "LICENSE",
    "chars": 1078,
    "preview": "MIT License\n\nCopyright (c) 2023-2026 The ggml authors\n\nPermission is hereby granted, free of charge, to any person obtai"
  },
  {
    "path": "README.md",
    "chars": 1362,
    "preview": "# ggml\n\n[Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205)\n\nTensor library for machine learning\n\n***Not"
  },
  {
    "path": "ci/run.sh",
    "chars": 10587,
    "preview": "#/bin/bash\n#\n# sample usage:\n#\n# mkdir tmp\n#\n# # CPU-only build\n# bash ./ci/run.sh ./tmp/results ./tmp/mnt\n#\n# # with CU"
  },
  {
    "path": "cmake/GitVars.cmake",
    "chars": 717,
    "preview": "find_package(Git)\n\n# the commit's SHA1\nexecute_process(COMMAND\n    \"${GIT_EXECUTABLE}\" describe --match=NeVeRmAtCh --alw"
  },
  {
    "path": "cmake/common.cmake",
    "chars": 2125,
    "preview": "function(ggml_get_flags CCID CCVER)\n    set(C_FLAGS \"\")\n    set(CXX_FLAGS \"\")\n\n    if (CCID MATCHES \"Clang\")\n        set"
  },
  {
    "path": "cmake/ggml-config.cmake.in",
    "chars": 6914,
    "preview": "@PACKAGE_INIT@\n\n@GGML_VARIABLES_EXPANDED@\n\n# Find all dependencies before creating any target.\ninclude(CMakeFindDependen"
  },
  {
    "path": "docs/gguf.md",
    "chars": 41317,
    "preview": "# GGUF\n\nGGUF is a file format for storing models for inference with GGML and executors based on GGML. GGUF is a binary f"
  },
  {
    "path": "examples/CMakeLists.txt",
    "chars": 871,
    "preview": "if (GGML_ALL_WARNINGS)\n  if (NOT MSVC)\n      set(cxx_flags\n          # TODO(marella): Add other warnings.\n          -Wpe"
  },
  {
    "path": "examples/common-ggml.cpp",
    "chars": 8695,
    "preview": "#include \"common-ggml.h\"\n\n#include <regex>\n#include <map>\n\nstatic const std::map<std::string, enum ggml_ftype> GGML_FTYP"
  },
  {
    "path": "examples/common-ggml.h",
    "chars": 410,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n\n#include <fstream>\n#include <vector>\n#include <string>\n\nenum ggml_ftype ggml_parse_ftyp"
  },
  {
    "path": "examples/common.cpp",
    "chars": 23275,
    "preview": "#define _USE_MATH_DEFINES // for M_PI\n\n#include \"common.h\"\n\n#include <cmath>\n#include <codecvt>\n#include <cstring>\n#incl"
  },
  {
    "path": "examples/common.h",
    "chars": 9629,
    "preview": "// Various helper functions and utilities\n\n#pragma once\n\n#include <string>\n#include <map>\n#include <vector>\n#include <ra"
  },
  {
    "path": "examples/gpt-2/CMakeLists.txt",
    "chars": 930,
    "preview": "#\n# gpt-2\n\nset(TEST_TARGET gpt-2-ctx)\nadd_executable(${TEST_TARGET} main-ctx.cpp)\ntarget_link_libraries(${TEST_TARGET} P"
  },
  {
    "path": "examples/gpt-2/README.md",
    "chars": 8795,
    "preview": "# gpt-2\n\nThis is a C++ example running GPT-2 inference using the [ggml](https://github.com/ggerganov/ggml) library.\n\nThe"
  },
  {
    "path": "examples/gpt-2/convert-cerebras-to-ggml.py",
    "chars": 6315,
    "preview": "# Convert Cerebras models to ggml format\n#\n# ref: https://www.cerebras.net/blog/cerebras-gpt-a-family-of-open-compute-ef"
  },
  {
    "path": "examples/gpt-2/convert-ckpt-to-ggml.py",
    "chars": 4869,
    "preview": "# Convert a model checkpoint to a ggml compatible file\n#\n# Load the model using TensorFlow.\n# Iterate over all variables"
  },
  {
    "path": "examples/gpt-2/convert-h5-to-ggml.py",
    "chars": 6432,
    "preview": "# Convert GPT-2 h5 transformer model to ggml format\n#\n# Load the model using GPT2Model.\n# Iterate over all variables and"
  },
  {
    "path": "examples/gpt-2/download-ggml-model.sh",
    "chars": 1752,
    "preview": "#!/bin/bash\n\n# This script downloads GPT-2 model files that have already been converted to ggml format.\n# This way you d"
  },
  {
    "path": "examples/gpt-2/download-model.sh",
    "chars": 1117,
    "preview": "#!/bin/bash\n\nggml_path=$(dirname $(realpath $0))\n\n# GPT-2 models\nmodels=( \"117M\" \"345M\" \"774M\" \"1558M\" )\n\n# list availab"
  },
  {
    "path": "examples/gpt-2/main-alloc.cpp",
    "chars": 30900,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n\n#include \"common.h\"\n#include "
  },
  {
    "path": "examples/gpt-2/main-backend.cpp",
    "chars": 32384,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n\n#ifdef GGML_USE_CUDA\n#include"
  },
  {
    "path": "examples/gpt-2/main-batched.cpp",
    "chars": 41298,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n\n#ifdef GGML_USE_CUDA\n#include"
  },
  {
    "path": "examples/gpt-2/main-ctx.cpp",
    "chars": 29475,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n#include \"common.h\"\n#include \"common-ggml.h\"\n\n#include <cassert>\n#include <cmath"
  },
  {
    "path": "examples/gpt-2/main-sched.cpp",
    "chars": 38558,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n\n#ifdef GGML_USE_CUDA\n#include"
  },
  {
    "path": "examples/gpt-2/quantize.cpp",
    "chars": 5854,
    "preview": "#include \"ggml.h\"\n\n#include \"common.h\"\n#include \"common-ggml.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#"
  },
  {
    "path": "examples/gpt-j/CMakeLists.txt",
    "chars": 311,
    "preview": "#\n# gpt-j\n\nset(TEST_TARGET gpt-j)\nadd_executable(${TEST_TARGET} main.cpp)\ntarget_link_libraries(${TEST_TARGET} PRIVATE g"
  },
  {
    "path": "examples/gpt-j/README.md",
    "chars": 10957,
    "preview": "# gpt-j\n\nLocal GPT-J inference on your computer using C/C++\n\nNo video card required. You just need to have 16 GB of RAM."
  },
  {
    "path": "examples/gpt-j/convert-h5-to-ggml.py",
    "chars": 5505,
    "preview": "# Convert GPT-J-6B h5 transformer model to ggml format\n#\n# Load the model using GPTJForCausalLM.\n# Iterate over all vari"
  },
  {
    "path": "examples/gpt-j/download-ggml-model.sh",
    "chars": 1728,
    "preview": "#!/bin/bash\n\n# This script downloads GPT-J model files that have already been converted to ggml format.\n# This way you d"
  },
  {
    "path": "examples/gpt-j/download-model.sh",
    "chars": 571,
    "preview": "#!/bin/bash\n\nprintf \"To obtain the GPT-J 6B model files, please visit: https://huggingface.co/EleutherAI/gpt-j-6B\\n\\n\"\n\n"
  },
  {
    "path": "examples/gpt-j/main.cpp",
    "chars": 26185,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n\n#include \"common.h\"\n#include \"common-ggml.h\"\n\n#include <cassert>\n#include <cmat"
  },
  {
    "path": "examples/gpt-j/quantize.cpp",
    "chars": 5856,
    "preview": "#include \"ggml.h\"\n\n#include \"common.h\"\n#include \"common-ggml.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#"
  },
  {
    "path": "examples/magika/CMakeLists.txt",
    "chars": 306,
    "preview": "#\n# magika\n\nset(TEST_TARGET magika)\nadd_executable(${TEST_TARGET} main.cpp)\ntarget_link_libraries(${TEST_TARGET} PRIVATE"
  },
  {
    "path": "examples/magika/README.md",
    "chars": 1436,
    "preview": "# Google Magika inference\n\nSimple example that shows how to use GGML for inference with the [Google Magika](https://gith"
  },
  {
    "path": "examples/magika/convert.py",
    "chars": 938,
    "preview": "import sys\nfrom tensorflow import keras\nimport gguf\n\ndef convert(model_name):\n    model = keras.models.load_model(model_"
  },
  {
    "path": "examples/magika/main.cpp",
    "chars": 13721,
    "preview": "#include \"ggml.h\"\n#include \"gguf.h\"\n#include \"ggml-cpu.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n\n#include <a"
  },
  {
    "path": "examples/mnist/.gitignore",
    "chars": 20,
    "preview": "data/\n*.gguf\n*.ggml\n"
  },
  {
    "path": "examples/mnist/CMakeLists.txt",
    "chars": 1785,
    "preview": "#\n# mnist-common\n\nset(TEST_TARGET mnist-common)\nadd_library(${TEST_TARGET} STATIC mnist-common.cpp)\ntarget_link_librarie"
  },
  {
    "path": "examples/mnist/README.md",
    "chars": 9633,
    "preview": "# MNIST Examples for GGML\n\nThis directory contains simple examples of how to use GGML for training and inference using t"
  },
  {
    "path": "examples/mnist/mnist-common.cpp",
    "chars": 20278,
    "preview": "#include \"ggml.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n#include \"ggml-opt.h\"\n\n#include \"mnist-common.h\"\n\n#i"
  },
  {
    "path": "examples/mnist/mnist-common.h",
    "chars": 7098,
    "preview": "#include <algorithm>\n#include <cstdint>\n#include <random>\n#include <string>\n#include <thread>\n#include <vector>\n\n#includ"
  },
  {
    "path": "examples/mnist/mnist-eval.cpp",
    "chars": 2066,
    "preview": "#include \"ggml.h\"\n#include \"ggml-opt.h\"\n\n#include \"mnist-common.h\"\n\n#include <cmath>\n#include <cstdint>\n#include <cstdio"
  },
  {
    "path": "examples/mnist/mnist-train-cnn.py",
    "chars": 3296,
    "preview": "#!/usr/bin/env python3\nimport sys\nfrom time import time\nimport gguf\nimport numpy as np\nimport tensorflow as tf\nfrom tens"
  },
  {
    "path": "examples/mnist/mnist-train-fc.py",
    "chars": 4537,
    "preview": "import gguf\nimport numpy as np\nimport torch\nimport torch.nn as nn\nimport torchvision.datasets as dsets\nimport torchvisio"
  },
  {
    "path": "examples/mnist/mnist-train.cpp",
    "chars": 1434,
    "preview": "#include \"ggml-opt.h\"\n#include \"mnist-common.h\"\n\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#include <ctime>\n"
  },
  {
    "path": "examples/mnist/server.py",
    "chars": 1200,
    "preview": "import http.server\nimport socketserver\nimport os\nimport sys\n\nDIRECTORY = os.path.abspath(os.path.join(os.path.dirname(__"
  },
  {
    "path": "examples/perf-metal/CMakeLists.txt",
    "chars": 142,
    "preview": "#\n# perf-metal\n\nset(TEST_TARGET perf-metal)\nadd_executable(${TEST_TARGET} perf-metal.cpp)\ntarget_link_libraries(${TEST_T"
  },
  {
    "path": "examples/perf-metal/perf-metal.cpp",
    "chars": 4459,
    "preview": "// basic tool to experiment with the Metal backend\n//\n// 1. Get GPU trace of a dummy graph:\n//\n//   rm -rf /tmp/perf-met"
  },
  {
    "path": "examples/prompts/dolly-v2.txt",
    "chars": 8221,
    "preview": "Hello World! => 12092,3645,2\nI can't believe it's already Friday!\" => 42,476,626,2868,352,434,2168,6794,1476\nThe URL for"
  },
  {
    "path": "examples/prompts/gpt-2-chinese.txt",
    "chars": 58,
    "preview": "请问洗手间在哪里？ => 6435,7309,3819,2797,7313,1762,1525,7027,8043\n"
  },
  {
    "path": "examples/prompts/gpt-2.txt",
    "chars": 8175,
    "preview": "Hello World! => 15496,2159,0\nI can't believe it's already Friday!\" => 40,460,470,1975,340,338,1541,3217,2474\nThe URL for"
  },
  {
    "path": "examples/prompts/gpt-j.txt",
    "chars": 8175,
    "preview": "Hello World! => 15496,2159,0\nI can't believe it's already Friday!\" => 40,460,470,1975,340,338,1541,3217,2474\nThe URL for"
  },
  {
    "path": "examples/prompts/gpt-neox-japanese.txt",
    "chars": 54,
    "preview": "明日の天気はどうですか。 => 263,7353,268,18461,271,1722,18405,265\n"
  },
  {
    "path": "examples/prompts/gpt-neox.txt",
    "chars": 8221,
    "preview": "Hello World! => 12092,3645,2\nI can't believe it's already Friday!\" => 42,476,626,2868,352,434,2168,6794,1476\nThe URL for"
  },
  {
    "path": "examples/prompts/polyglot-ko.txt",
    "chars": 124,
    "preview": "이것은 테스트 이다. => 12271,296,6474,28037,17\n걱정할 필요 없다. => 18311,482,1062,550,267,17\n버그는 언젠가 고쳐진다. => 6904,272,8575,10381,1765"
  },
  {
    "path": "examples/prompts/replit.txt",
    "chars": 10110,
    "preview": "Hello World! => 6466,147,2317,350\nI can't believe it's already Friday!\" => 286,512,172,185,13392,393,172,155,3239,147,29"
  },
  {
    "path": "examples/prompts/starcoder.txt",
    "chars": 9133,
    "preview": "Hello World! => 8279,10896,19\nI can't believe it's already Friday!\" => 59,883,1330,13710,561,1182,3425,506,25674,11555\nT"
  },
  {
    "path": "examples/prompts/test-cases.txt",
    "chars": 4990,
    "preview": "# test case format\n# <language>: <sentence>\n\nEnglish: Hello World!\nEnglish: I can't believe it's already Friday!\"\nEnglis"
  },
  {
    "path": "examples/prompts/tokenize_huggingface.py",
    "chars": 3285,
    "preview": "import os\nfrom transformers import AutoTokenizer\n\nos.environ['TOKENIZERS_PARALLELISM'] = \"false\"\n\nlist_repo_hf  = [\"data"
  },
  {
    "path": "examples/prompts/whisper.txt",
    "chars": 8726,
    "preview": "Hello World! => 15947,3937,0\nI can't believe it's already Friday!\" => 40,393,380,1697,309,311,1217,6984,2963\nThe URL for"
  },
  {
    "path": "examples/python/README.md",
    "chars": 4813,
    "preview": "# Simple autogenerated Python bindings for ggml\n\nThis folder contains:\n\n- Scripts to generate full Python bindings from "
  },
  {
    "path": "examples/python/api.h",
    "chars": 411,
    "preview": "/*\n  List here all the headers you want to expose in the Python bindings,\n  then run `python regenerate.py` (see details"
  },
  {
    "path": "examples/python/example_add_quant.py",
    "chars": 853,
    "preview": "from ggml import lib, ffi\nfrom ggml.utils import init, copy, numpy\nimport numpy as np\n\nctx = init(mem_size=12*1024*1024)"
  },
  {
    "path": "examples/python/example_test_all_quants.py",
    "chars": 1945,
    "preview": "from ggml import ffi, lib\nfrom ggml.utils import init, numpy, copy\nimport numpy as np\nfrom math import pi, cos, sin, cei"
  },
  {
    "path": "examples/python/ggml/__init__.py",
    "chars": 1914,
    "preview": "\"\"\"\n  Python bindings for the ggml library.\n\n  Usage example:\n\n      from ggml import lib, ffi\n      from ggml.utils imp"
  },
  {
    "path": "examples/python/ggml/__init__.pyi",
    "chars": 94309,
    "preview": "# auto-generated file\nimport ggml.ffi as ffi\nimport numpy as np\nclass lib:\n  @property\n  def GGML_BACKEND_CPU(self) -> i"
  },
  {
    "path": "examples/python/ggml/cffi.py",
    "chars": 51506,
    "preview": "# auto-generated file\nimport _cffi_backend\n\nffi = _cffi_backend.FFI('ggml.cffi',\n    _version = 0x2601,\n    _types = b'\\"
  },
  {
    "path": "examples/python/ggml/ffi/__init__.pyi",
    "chars": 60,
    "preview": "# Phony stubs.\n\nclass CData:\n    pass\n\nclass CType:\n    pass"
  },
  {
    "path": "examples/python/ggml/utils.py",
    "chars": 8965,
    "preview": "\"\"\"\n  Common helpers for working with ggml + numpy\n\"\"\"\nfrom ggml import ffi, lib\nfrom typing import Union, Optional\nimpo"
  },
  {
    "path": "examples/python/regenerate.py",
    "chars": 2144,
    "preview": "# Generates bindings for the ggml library.\n#\n# cffi requires prior C preprocessing of the headers, and it uses pycparser"
  },
  {
    "path": "examples/python/stubs.py",
    "chars": 4516,
    "preview": "\"\"\"\n  This generates .pyi stubs for the cffi Python bindings generated by regenerate.py\n\"\"\"\nimport sys, re, itertools\nsy"
  },
  {
    "path": "examples/python/test_tensor.py",
    "chars": 9504,
    "preview": "import pytest\nfrom pytest import raises\n\nfrom ggml import lib, ffi\nfrom ggml.utils import init, copy, numpy\nimport numpy"
  },
  {
    "path": "examples/sam/CMakeLists.txt",
    "chars": 281,
    "preview": "#\n# sam\n\nset(TEST_TARGET sam)\nadd_executable(${TEST_TARGET} sam.cpp)\ntarget_link_libraries(${TEST_TARGET} PRIVATE ggml c"
  },
  {
    "path": "examples/sam/README.md",
    "chars": 3223,
    "preview": "# SAM.cpp\n\nInference of Meta's [Segment Anything Model](https://github.com/facebookresearch/segment-anything/) in pure C"
  },
  {
    "path": "examples/sam/convert-pth-to-ggml.py",
    "chars": 3910,
    "preview": "# Convert a SAM model checkpoint to a ggml compatible file\n#\n\nimport sys\nimport torch\nimport struct\nimport numpy as np\n\n"
  },
  {
    "path": "examples/sam/sam.cpp",
    "chars": 102822,
    "preview": "#define _USE_MATH_DEFINES // for M_PI\n#define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous \"unsafe\" warnigns on Windo"
  },
  {
    "path": "examples/simple/CMakeLists.txt",
    "chars": 431,
    "preview": "#\n# simple-ctx\n\nset(TEST_TARGET simple-ctx)\nadd_executable(${TEST_TARGET} simple-ctx.cpp)\ntarget_link_libraries(${TEST_T"
  },
  {
    "path": "examples/simple/README.md",
    "chars": 1078,
    "preview": "## Simple\n\nThis example simply performs a matrix multiplication, solely for the purpose of demonstrating a basic usage o"
  },
  {
    "path": "examples/simple/simple-backend.cpp",
    "chars": 4248,
    "preview": "#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#i"
  },
  {
    "path": "examples/simple/simple-ctx.cpp",
    "chars": 3654,
    "preview": "#include \"ggml.h\"\n#include \"ggml-cpu.h\"\n\n#include <cassert>\n#include <cmath>\n#include <cstdio>\n#include <cstring>\n#inclu"
  },
  {
    "path": "examples/stb_image.h",
    "chars": 284733,
    "preview": "/* stb_image - v2.28 - public domain image loader - http://nothings.org/stb\n                                  no warrant"
  },
  {
    "path": "examples/stb_image_write.h",
    "chars": 71221,
    "preview": "/* stb_image_write - v1.16 - public domain - http://nothings.org/stb\n   writes out PNG/BMP/TGA/JPEG/HDR images to C stdi"
  },
  {
    "path": "examples/test-cmake/CMakeLists.txt",
    "chars": 246,
    "preview": "cmake_minimum_required(VERSION 3.14)\nproject(ggml-simple)\n\nset(CMAKE_CXX_STANDARD 17)\n\nfind_package(ggml CONFIG REQUIRED"
  },
  {
    "path": "examples/test-cmake/README.md",
    "chars": 89,
    "preview": "## cmake-test\n\nThis directory can be built as a separate project with an installed ggml.\n"
  },
  {
    "path": "examples/test-cmake/test-cmake.cpp",
    "chars": 89,
    "preview": "#include \"ggml-backend.h\"\n\nint main(void) {\n    ggml_backend_load_all();\n    return 0;\n}\n"
  },
  {
    "path": "examples/yolo/CMakeLists.txt",
    "chars": 166,
    "preview": "#\n# yolov3-tiny\n\nset(TEST_TARGET yolov3-tiny)\nadd_executable(${TEST_TARGET} yolov3-tiny.cpp yolo-image.cpp)\ntarget_link_"
  },
  {
    "path": "examples/yolo/README.md",
    "chars": 2080,
    "preview": "This example shows how to implement YOLO object detection with ggml using pretrained model.\n\n# YOLOv3-tiny\n\nDownload the"
  },
  {
    "path": "examples/yolo/convert-yolov3-tiny.py",
    "chars": 2471,
    "preview": "#!/usr/bin/env python3\nimport sys\nimport gguf\nimport numpy as np\n\ndef save_conv2d_layer(f, gguf_writer, prefix, inp_c, f"
  },
  {
    "path": "examples/yolo/data/coco.names",
    "chars": 625,
    "preview": "person\nbicycle\ncar\nmotorbike\naeroplane\nbus\ntrain\ntruck\nboat\ntraffic light\nfire hydrant\nstop sign\nparking meter\nbench\nbir"
  },
  {
    "path": "examples/yolo/yolo-image.cpp",
    "chars": 6099,
    "preview": "#define STB_IMAGE_IMPLEMENTATION\n#include \"stb_image.h\"\n#define STB_IMAGE_WRITE_IMPLEMENTATION\n#include \"stb_image_write"
  },
  {
    "path": "examples/yolo/yolo-image.h",
    "chars": 1346,
    "preview": "#pragma once\n\n#include <string>\n#include <vector>\n#include <cassert>\n\nstruct yolo_image {\n    int w, h, c;\n    std::vect"
  },
  {
    "path": "examples/yolo/yolov3-tiny.cpp",
    "chars": 24340,
    "preview": "#include \"ggml.h\"\n#include \"gguf.h\"\n#include \"ggml-alloc.h\"\n#include \"ggml-backend.h\"\n\n#include \"yolo-image.h\"\n\n#include"
  },
  {
    "path": "ggml.pc.in",
    "chars": 283,
    "preview": "prefix=@CMAKE_INSTALL_PREFIX@\nexec_prefix=${prefix}\nincludedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@\nlibdir=${prefix}/@CM"
  },
  {
    "path": "include/ggml-alloc.h",
    "chars": 3624,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\ntypedef struct ggml_backend_buffer_type * ggml"
  },
  {
    "path": "include/ggml-backend.h",
    "chars": 21052,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-alloc.h\"\n\n#ifdef GGML_BACKEND_SHARED\n#    if defined(_WIN32) && !defined("
  },
  {
    "path": "include/ggml-blas.h",
    "chars": 586,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n// backend API\nGGML"
  },
  {
    "path": "include/ggml-cann.h",
    "chars": 4553,
    "preview": "/*\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "include/ggml-cpp.h",
    "chars": 1652,
    "preview": "#pragma once\n\n#ifndef __cplusplus\n#error \"This header is for C++ only\"\n#endif\n\n#include \"ggml.h\"\n#include \"ggml-alloc.h\""
  },
  {
    "path": "include/ggml-cpu.h",
    "chars": 7696,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n    // the compute p"
  },
  {
    "path": "include/ggml-cuda.h",
    "chars": 1585,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n#ifdef GGML_USE_HIP\n"
  },
  {
    "path": "include/ggml-hexagon.h",
    "chars": 351,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n// backend API\nGGML_"
  },
  {
    "path": "include/ggml-metal.h",
    "chars": 2141,
    "preview": "// Note: this description is outdated\n//\n// An interface allowing to compute ggml_cgraph with Metal\n//\n// This is a full"
  },
  {
    "path": "include/ggml-opencl.h",
    "chars": 581,
    "preview": "#ifndef GGML_OPENCL_H\n#define GGML_OPENCL_H\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\""
  },
  {
    "path": "include/ggml-openvino.h",
    "chars": 1034,
    "preview": "#pragma once\n\n#include \"ggml-backend.h\"\n\n#include <cstring>\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n#define GGML_OPENVI"
  },
  {
    "path": "include/ggml-opt.h",
    "chars": 14038,
    "preview": "// This file contains functionality for training models using GGML.\n// It is not strictly needed vs. just vanilla GGML b"
  },
  {
    "path": "include/ggml-rpc.h",
    "chars": 1183,
    "preview": "#pragma once\n\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n#define RPC_PROTO_MAJOR_VERSION    3\n#"
  },
  {
    "path": "include/ggml-sycl.h",
    "chars": 1763,
    "preview": "//\n//  MIT license\n//  Copyright (C) 2024 Intel Corporation\n//  SPDX-License-Identifier: MIT\n//\n\n#pragma once\n\n#include "
  },
  {
    "path": "include/ggml-virtgpu.h",
    "chars": 194,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\nGGML_BACKEND_API ggm"
  },
  {
    "path": "include/ggml-vulkan.h",
    "chars": 952,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n#define GGML_VK_NAME"
  },
  {
    "path": "include/ggml-webgpu.h",
    "chars": 328,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n#define GGML_WEBGPU_"
  },
  {
    "path": "include/ggml-zdnn.h",
    "chars": 292,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-backend.h\"\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n// device buffer\nGGML"
  },
  {
    "path": "include/ggml-zendnn.h",
    "chars": 498,
    "preview": "#pragma once\n\n#include \"ggml-backend.h\"\n#include \"ggml.h\"\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\n// backend API\nGGML_B"
  },
  {
    "path": "include/ggml.h",
    "chars": 104115,
    "preview": "#pragma once\n\n//\n// GGML Tensor Library\n//\n// This documentation is still a work in progress.\n// If you wish some specif"
  },
  {
    "path": "include/gguf.h",
    "chars": 10222,
    "preview": "// This file contains functionality related to \"GGUF\" files, the binary file format used by ggml.\n// GGUF files have the"
  },
  {
    "path": "requirements.txt",
    "chars": 215,
    "preview": "accelerate==0.19.0\nnumpy>=2.0.2\nsentencepiece~=0.1.98\ntorchvision>=0.15.2\ntransformers>=4.35.2,<5.0.0\ngguf>=0.1.0\nkeras="
  },
  {
    "path": "scripts/gen-authors.sh",
    "chars": 345,
    "preview": "#!/usr/bin/env bash\n\nprintf \"# date: $(date)\\n\" > AUTHORS\nprintf \"# this file is auto-generated by scripts/gen-authors.s"
  },
  {
    "path": "scripts/release.sh",
    "chars": 9394,
    "preview": "#!/bin/bash\n#\n# Automated release script for ggml.\n#\n# Note: Sync from llama.cpp should be done separately via PR proces"
  },
  {
    "path": "scripts/sync-llama-am.sh",
    "chars": 5422,
    "preview": "#!/bin/bash\n#\n# Synchronize llama.cpp changes to ggml\n#\n# Usage:\n#\n#   $ cd /path/to/ggml\n#   $ ./scripts/sync-llama-am."
  },
  {
    "path": "scripts/sync-llama.last",
    "chars": 41,
    "preview": "ae40cd27c85aa30b9cd56033da1d6a954290f7ea\n"
  },
  {
    "path": "scripts/sync-llama.sh",
    "chars": 877,
    "preview": "#!/bin/bash\n\ncp -rpv ../llama.cpp/ggml/CMakeLists.txt       CMakeLists.txt\ncp -rpv ../llama.cpp/ggml/src/CMakeLists.txt "
  },
  {
    "path": "scripts/sync-whisper-am.sh",
    "chars": 3968,
    "preview": "#!/bin/bash\n#\n# Synchronize whisper.cpp changes to ggml\n#\n# Usage:\n#\n#   $ cd /path/to/ggml\n#   $ ./scripts/sync-whisper"
  },
  {
    "path": "scripts/sync-whisper.last",
    "chars": 41,
    "preview": "79218f51d02ffe70575ef7fba3496dfc7adda027\n"
  },
  {
    "path": "scripts/sync-whisper.sh",
    "chars": 682,
    "preview": "#!/bin/bash\n\ncp -rpv ../whisper.cpp/ggml/CMakeLists.txt       CMakeLists.txt\ncp -rpv ../whisper.cpp/ggml/src/CMakeLists."
  },
  {
    "path": "src/CMakeLists.txt",
    "chars": 19035,
    "preview": "include(CheckCXXCompilerFlag)\ninclude(\"../cmake/common.cmake\")\n\nadd_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCH"
  },
  {
    "path": "src/ggml-alloc.c",
    "chars": 47692,
    "preview": "#include \"ggml-alloc.h\"\n#include \"ggml-backend-impl.h\"\n#include \"ggml.h\"\n#include \"ggml-impl.h\"\n#include <assert.h>\n#inc"
  },
  {
    "path": "src/ggml-backend-dl.cpp",
    "chars": 1051,
    "preview": "#include \"ggml-backend-dl.h\"\n\n#ifdef _WIN32\n\ndl_handle * dl_load_library(const fs::path & path) {\n    // suppress error "
  },
  {
    "path": "src/ggml-backend-dl.h",
    "chars": 791,
    "preview": "#pragma once\n\n#ifdef _WIN32\n#   define WIN32_LEAN_AND_MEAN\n#   ifndef NOMINMAX\n#       define NOMINMAX\n#   endif\n#   inc"
  },
  {
    "path": "src/ggml-backend-impl.h",
    "chars": 12068,
    "preview": "#pragma once\n\n// ggml-backend internal header\n\n#include \"ggml-backend.h\"\n\n#ifdef  __cplusplus\nextern \"C\" {\n#endif\n\n    #"
  },
  {
    "path": "src/ggml-backend-reg.cpp",
    "chars": 17635,
    "preview": "#include \"ggml-backend-impl.h\"\n#include \"ggml-backend.h\"\n#include \"ggml-backend-dl.h\"\n#include \"ggml-impl.h\"\n#include <a"
  },
  {
    "path": "src/ggml-backend.cpp",
    "chars": 89034,
    "preview": "// Note: porting this file to C++ is a work in progress\n\n#ifdef _WIN32\n#define WIN32_LEAN_AND_MEAN\n#ifndef NOMINMAX\n#   "
  },
  {
    "path": "src/ggml-blas/CMakeLists.txt",
    "chars": 4168,
    "preview": "if (GGML_STATIC)\n    set(BLA_STATIC ON)\nendif()\n#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)\n#    set(BLA_SIZEOF_INTEG"
  },
  {
    "path": "src/ggml-blas/ggml-blas.cpp",
    "chars": 17104,
    "preview": "#include \"ggml-impl.h\"\n#include \"ggml-blas.h\"\n#include \"ggml-backend-impl.h\"\n\n#include <future>\n#include <vector>\n#inclu"
  },
  {
    "path": "src/ggml-cann/CMakeLists.txt",
    "chars": 3476,
    "preview": "if (\"cann${CANN_INSTALL_DIR}\" STREQUAL \"cann\" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})\n    set(CANN_INSTALL_DIR $ENV{ASCEND"
  },
  {
    "path": "src/ggml-cann/acl_tensor.cpp",
    "chars": 8227,
    "preview": "/*\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "src/ggml-cann/acl_tensor.h",
    "chars": 17486,
    "preview": "/*\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "src/ggml-cann/aclnn_ops.cpp",
    "chars": 195581,
    "preview": "/*\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "src/ggml-cann/aclnn_ops.h",
    "chars": 51308,
    "preview": "/**\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtain"
  },
  {
    "path": "src/ggml-cann/common.h",
    "chars": 23029,
    "preview": "/*\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "src/ggml-cann/ggml-cann.cpp",
    "chars": 108601,
    "preview": "/*\n * Copyright (c) 2023-2026 The ggml authors\n *\n * Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "src/ggml-common.h",
    "chars": 134501,
    "preview": "#ifndef GGML_COMMON_DECL\n\n#if defined(GGML_COMMON_DECL_C)\n#include <stdint.h>\n\ntypedef uint16_t ggml_half;\ntypedef uint3"
  },
  {
    "path": "src/ggml-cpu/CMakeLists.txt",
    "chars": 33162,
    "preview": "function(ggml_add_cpu_backend_features cpu_name arch)\n    # The feature detection code is compiled as a separate target "
  },
  {
    "path": "src/ggml-cpu/amx/amx.cpp",
    "chars": 8189,
    "preview": "#include \"amx.h\"\n#include \"common.h\"\n#include \"mmq.h\"\n#include \"ggml-backend-impl.h\"\n#include \"ggml-backend.h\"\n#include "
  },
  {
    "path": "src/ggml-cpu/amx/amx.h",
    "chars": 202,
    "preview": "#include \"ggml-backend.h\"\n#include \"ggml-cpu-impl.h\"\n\n// GGML internal header\n\n#if defined(__AMX_INT8__) && defined(__AV"
  },
  {
    "path": "src/ggml-cpu/amx/common.h",
    "chars": 2675,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"ggml-cpu-impl.h\"\n\n#include <algorithm>\n#include <memory>\n#include <type_traits"
  },
  {
    "path": "src/ggml-cpu/amx/mmq.cpp",
    "chars": 108025,
    "preview": "#if defined(__GNUC__)\n#pragma GCC diagnostic ignored \"-Wpedantic\"\n#pragma GCC diagnostic ignored \"-Wunused-local-typedef"
  },
  {
    "path": "src/ggml-cpu/amx/mmq.h",
    "chars": 397,
    "preview": "#pragma once\n#include \"common.h\"\n\nsize_t ggml_backend_amx_desired_wsize(const struct ggml_tensor * dst);\n\nsize_t ggml_ba"
  },
  {
    "path": "src/ggml-cpu/arch/arm/cpu-feats.cpp",
    "chars": 2387,
    "preview": "#include \"ggml-backend-impl.h\"\n\n#if defined(__aarch64__)\n\n#if defined(__linux__)\n#include <sys/auxv.h>\n#elif defined(__A"
  },
  {
    "path": "src/ggml-cpu/arch/arm/quants.c",
    "chars": 209967,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/arm/repack.cpp",
    "chars": 252225,
    "preview": "#define GGML_COMMON_IMPL_CPP\n#define GGML_COMMON_DECL_CPP\n#include \"ggml-common.h\"\n#include \"ggml-backend-impl.h\"\n\n#incl"
  },
  {
    "path": "src/ggml-cpu/arch/loongarch/quants.c",
    "chars": 87168,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/powerpc/cpu-feats.cpp",
    "chars": 2074,
    "preview": "# include \"ggml-backend-impl.h\"\n\n#if defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__)\n\n#if defined(__l"
  },
  {
    "path": "src/ggml-cpu/arch/powerpc/quants.c",
    "chars": 97986,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/riscv/cpu-feats.cpp",
    "chars": 807,
    "preview": "#include \"ggml-backend-impl.h\"\n\n#if defined(__riscv) && __riscv_xlen == 64\n#include <asm/hwprobe.h>\n#include <asm/unistd"
  },
  {
    "path": "src/ggml-cpu/arch/riscv/quants.c",
    "chars": 155184,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/riscv/repack.cpp",
    "chars": 98765,
    "preview": "#define GGML_COMMON_IMPL_CPP\n#define GGML_COMMON_DECL_CPP\n#include \"ggml-common.h\"\n#include \"ggml-backend-impl.h\"\n\n#incl"
  },
  {
    "path": "src/ggml-cpu/arch/s390/cpu-feats.cpp",
    "chars": 1017,
    "preview": "#include \"ggml-backend-impl.h\"\n\n#if defined(__s390x__)\n#include <sys/auxv.h>\n\n// find hwcap bits in asm/elf.h\n#ifndef HW"
  },
  {
    "path": "src/ggml-cpu/arch/s390/quants.c",
    "chars": 54048,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/wasm/quants.c",
    "chars": 44290,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/x86/cpu-feats.cpp",
    "chars": 10727,
    "preview": "#include \"ggml-backend-impl.h\"\n\n#if defined(__x86_64__) || (defined(_MSC_VER) && defined(_M_AMD64))\n\n#ifdef _MSC_VER\n#in"
  },
  {
    "path": "src/ggml-cpu/arch/x86/quants.c",
    "chars": 183439,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n#include \"ggml-quants.h\"\n#include \"ggml-impl.h\"\n#include \"ggml-cpu.h"
  },
  {
    "path": "src/ggml-cpu/arch/x86/repack.cpp",
    "chars": 665014,
    "preview": "#define GGML_COMMON_IMPL_CPP\n#define GGML_COMMON_DECL_CPP\n#include \"ggml-common.h\"\n#include \"ggml-backend-impl.h\"\n\n#incl"
  },
  {
    "path": "src/ggml-cpu/arch-fallback.h",
    "chars": 21036,
    "preview": "\n#pragma once\n\n// Rename `_generic` functions if no native implementation is available.\n// This effectively selects the "
  },
  {
    "path": "src/ggml-cpu/binary-ops.cpp",
    "chars": 6709,
    "preview": "#include \"binary-ops.h\"\n\n#if defined(GGML_USE_ACCELERATE)\n#include <Accelerate/Accelerate.h>\n\nusing vDSP_fn_t = void (*)"
  },
  {
    "path": "src/ggml-cpu/binary-ops.h",
    "chars": 518,
    "preview": "#pragma once\n\n#include \"common.h\"\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nvoid ggml_compute_forward_add_non_quantized(c"
  },
  {
    "path": "src/ggml-cpu/cmake/FindSIMD.cmake",
    "chars": 2651,
    "preview": "include(CheckCSourceRuns)\n\nset(AVX_CODE \"\n    #include <immintrin.h>\n    int main()\n    {\n        __m256 a;\n        a = "
  },
  {
    "path": "src/ggml-cpu/common.h",
    "chars": 2333,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n#include \"traits.h\"\n#include \"ggml-cpu-impl.h\"\n#include \"ggml-impl.h\"\n#include \"simd-map"
  },
  {
    "path": "src/ggml-cpu/ggml-cpu-impl.h",
    "chars": 13154,
    "preview": "#pragma once\n\n// GGML CPU internal header\n\n#include \"ggml.h\"\n#include \"ggml-impl.h\"\n\n#include <stdlib.h> // load `stdlib"
  },
  {
    "path": "src/ggml-cpu/ggml-cpu.c",
    "chars": 124849,
    "preview": "#define _CRT_SECURE_NO_DEPRECATE // Disables \"unsafe\" warnings on Windows\n#define _USE_MATH_DEFINES // For M_PI on MSVC\n"
  },
  {
    "path": "src/ggml-cpu/ggml-cpu.cpp",
    "chars": 24042,
    "preview": "#include \"ggml-backend.h\"\n#include \"ggml-backend-impl.h\"\n#include \"ggml-cpu.h\"\n#include \"repack.h\"\n#include \"traits.h\"\n#"
  },
  {
    "path": "src/ggml-cpu/hbm.cpp",
    "chars": 2003,
    "preview": "#ifdef GGML_USE_CPU_HBM\n\n#include \"ggml-backend.h\"\n#include \"ggml-backend-impl.h\"\n#include \"ggml-cpu.h\"\n#include \"ggml-i"
  },
  {
    "path": "src/ggml-cpu/hbm.h",
    "chars": 155,
    "preview": "#pragma once\n\n#include \"ggml-backend.h\"\n#include \"ggml.h\"\n\n// GGML CPU internal header\n\nggml_backend_buffer_type_t ggml_"
  },
  {
    "path": "src/ggml-cpu/kleidiai/kernels.cpp",
    "chars": 61882,
    "preview": "// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates <open-source-office@arm.com>\n// SPDX-Li"
  },
  {
    "path": "src/ggml-cpu/kleidiai/kernels.h",
    "chars": 3105,
    "preview": "// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>\n// SPDX-License"
  },
  {
    "path": "src/ggml-cpu/kleidiai/kleidiai.cpp",
    "chars": 60563,
    "preview": "// SPDX-FileCopyrightText: Copyright 2025-2026 Arm Limited and/or its affiliates <open-source-office@arm.com>\n// SPDX-Li"
  },
  {
    "path": "src/ggml-cpu/kleidiai/kleidiai.h",
    "chars": 323,
    "preview": "// SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its affiliates <open-source-office@arm.com>\n// SPDX-License"
  },
  {
    "path": "src/ggml-cpu/llamafile/sgemm.cpp",
    "chars": 156949,
    "preview": "// Copyright 2024 Mozilla Foundation\n//\n// Permission is hereby granted, free of charge, to any person obtaining\n// a co"
  },
  {
    "path": "src/ggml-cpu/llamafile/sgemm.h",
    "chars": 524,
    "preview": "#pragma once\n#include <stdint.h>\n#include <stdbool.h>\n\n#if defined(__VXE__) || defined(__VXE2__)\n#include <vecintrin.h>\n"
  },
  {
    "path": "src/ggml-cpu/ops.cpp",
    "chars": 380697,
    "preview": "#include \"ops.h\"\n\n#include \"ggml-cpu.h\"\n#include \"ggml-impl.h\"\n#include \"binary-ops.h\"\n#include \"simd-gemm.h\"\n#include \""
  },
  {
    "path": "src/ggml-cpu/ops.h",
    "chars": 9302,
    "preview": "#pragma once\n\n#include \"ggml.h\"\n\n//\n// cache line\n//\n\n#if defined(__cpp_lib_hardware_interference_size)\n#define CACHE_LI"
  },
  {
    "path": "src/ggml-cpu/quants.c",
    "chars": 41704,
    "preview": "#define GGML_COMMON_IMPL_C\n#include \"ggml-common.h\"\n\n#include \"ggml-cpu-impl.h\"\n#include \"simd-mappings.h\"\n#include \"ggm"
  },
  {
    "path": "src/ggml-cpu/quants.h",
    "chars": 10006,
    "preview": "#pragma once\n\n#define GGML_COMMON_DECL_C\n#include \"ggml-common.h\"\n\n#include \"ggml.h\"\n\n// GGML CPU internal header\n\n#ifde"
  },
  {
    "path": "src/ggml-cpu/repack.cpp",
    "chars": 195836,
    "preview": "#define GGML_COMMON_IMPL_CPP\n#define GGML_COMMON_DECL_CPP\n#include \"ggml-common.h\"\n#include \"ggml-backend-impl.h\"\n\n#incl"
  },
  {
    "path": "src/ggml-cpu/repack.h",
    "chars": 19905,
    "preview": "#pragma once\n\n#define GGML_COMMON_DECL_CPP\n#include \"ggml-common.h\"\n\n#include \"traits.h\"\n#include \"ggml.h\"\n\n// GGML inte"
  },
  {
    "path": "src/ggml-cpu/simd-gemm.h",
    "chars": 3767,
    "preview": "#pragma once\n\n// Computes C[M x N] += A[M x K] * B[K x N]\n\n#include \"simd-mappings.h\"\n\n// TODO: add support for sizeless"
  },
  {
    "path": "src/ggml-cpu/simd-mappings.h",
    "chars": 54372,
    "preview": "#pragma once\n\n#include \"ggml-cpu-impl.h\"\n\n#ifdef __ARM_FEATURE_SVE\n#include <arm_sve.h>\n#endif // __ARM_FEATURE_SVE\n\n#if"
  },
  {
    "path": "src/ggml-cpu/spacemit/ime.cpp",
    "chars": 42943,
    "preview": "#define GGML_COMMON_IMPL_CPP\n#define GGML_COMMON_DECL_CPP\n\n#include \"ime.h\"\n\n#include \"ggml-backend-impl.h\"\n#include \"gg"
  },
  {
    "path": "src/ggml-cpu/spacemit/ime.h",
    "chars": 188,
    "preview": "#pragma once\n\n#include \"ggml-alloc.h\"\n\n#ifdef __cplusplus\nextern \"C\" {\n#endif\n\nggml_backend_buffer_type_t ggml_backend_c"
  },
  {
    "path": "src/ggml-cpu/spacemit/ime1_kernels.cpp",
    "chars": 176791,
    "preview": "#include \"ggml.h\"\n#include \"ime_kernels.h\"\n\n#include <algorithm>\n#include <cmath>\n\n// clang-format off\n#if defined(__GNU"
  },
  {
    "path": "src/ggml-cpu/spacemit/ime_kernels.h",
    "chars": 1040,
    "preview": "#pragma once\n\n#include <cstddef>\n\nnamespace sqnbitgemm_spacemit_ime {\nnamespace ime1 {\nsize_t gemm_kernel_i8i4(size_t   "
  },
  {
    "path": "src/ggml-cpu/traits.cpp",
    "chars": 1231,
    "preview": "#include \"traits.h\"\n\n#include \"ggml-backend-impl.h\"\n#include \"ggml-backend.h\"\n\nnamespace ggml::cpu {\ntensor_traits::~ten"
  },
  {
    "path": "src/ggml-cpu/traits.h",
    "chars": 1157,
    "preview": "#pragma once\n#include \"ggml-backend-impl.h\"\n#include \"ggml-cpu-impl.h\"\n#include \"ggml.h\"\n\n#ifdef __cplusplus\n#    includ"
  }
]

// ... and 873 more files (download for full content)

About this extraction

This page contains the full source code of the ggml-org/ggml GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 1073 files (14.8 MB), approximately 3.9M tokens. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo