[
  {
    "path": ".gitattributes",
    "content": "* text=auto eol=lf\n\n**/go.sum linguist-generated=true\n**/zz_generated.*.go linguist-generated=true\n"
  },
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: ci\n\npermissions:\n  contents: read\n  pull-requests: read\n  actions: read\n\ndefaults:\n  run:\n    shell: bash\n\non:\n  push:\n    branches:\n      - 'main'\n      - 'branch-v*.*'\n    paths-ignore:\n      - \"docs/**\"\n      - \"**.md\"\n      - \"**.mdx\"\n      - \"**.png\"\n      - \"**.jpg\"\n      - \".github/workflows/cmd.yml\"\n      - \".github/workflows/prune.yml\"\n      - \".github/workflows/sync.yml\"\n  pull_request:\n    branches:\n      - 'main'\n    paths-ignore:\n      - \"docs/**\"\n      - \"**.md\"\n      - \"**.mdx\"\n      - \"**.png\"\n      - \"**.jpg\"\n      - \".github/workflows/cmd.yml\"\n      - \".github/workflows/prune.yml\"\n      - \".github/workflows/sync.yml\"\n\njobs:\n  ci:\n    timeout-minutes: 15\n    runs-on: ubuntu-22.04\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 1\n          persist-credentials: false\n      - name: Setup Go\n        timeout-minutes: 15\n        uses: actions/setup-go@v5\n        with:\n          go-version: \"1.22.9\"\n          cache-dependency-path: |\n            **/go.sum\n      - name: Setup Toolbox\n        timeout-minutes: 5\n        uses: actions/cache@v4\n        with:\n          key: toolbox-${{ runner.os }}\n          path: |\n            ${{ github.workspace }}/.sbin\n      - name: Make\n        run: make ci\n        env:\n          LINT_DIRTY: \"true\"\n"
  },
  {
    "path": ".github/workflows/cmd.yml",
    "content": "name: cmd\n\npermissions:\n  contents: write\n  actions: read\n  id-token: write\n\ndefaults:\n  run:\n    shell: bash\n\non:\n  push:\n    branches:\n      - 'main'\n      - 'branch-v*.*'\n    paths-ignore:\n      - \"docs/**\"\n      - \"**.md\"\n      - \"**.mdx\"\n      - \"**.png\"\n      - \"**.jpg\"\n      - \".github/workflows/ci.yml\"\n      - \".github/workflows/prune.yml\"\n      - \".github/workflows/sync.yml\"\n    tags:\n      - \"v*.*.*\"\n\njobs:\n  build:\n    timeout-minutes: 15\n    runs-on: ubuntu-22.04\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 1\n          persist-credentials: false\n      - name: Setup Go\n        timeout-minutes: 15\n        uses: actions/setup-go@v5\n        with:\n          go-version: \"1.22.9\"\n          cache-dependency-path: |\n            cmd/**/go.sum\n      - name: Make\n        run: make build\n        env:\n          VERSION: \"${{ github.ref_name }}\"\n      - name: Upload Artifact\n        uses: actions/upload-artifact@v4\n        with:\n          include-hidden-files: true\n          path: ${{ github.workspace }}/.dist/*\n      - name: Release\n        if: ${{ startsWith(github.ref, 'refs/tags/') }}\n        uses: softprops/action-gh-release@v2\n        with:\n          fail_on_unmatched_files: true\n          tag_name: \"${{ github.ref_name }}\"\n          prerelease: ${{ contains(github.ref, 'rc') }}\n          files: ${{ github.workspace }}/.dist/*\n\n  publish:\n    needs:\n      - build\n    permissions:\n      contents: write\n      actions: read\n      id-token: write\n    timeout-minutes: 15\n    runs-on: ubuntu-22.04\n    env:\n      PACKAGE_REGISTRY: \"gpustack\"\n      PACKAGE_IMAGE: \"gguf-parser\"\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 1\n          persist-credentials: false\n      - name: Setup QEMU\n        uses: docker/setup-qemu-action@v3\n   
     with:\n          image: tonistiigi/binfmt:qemu-v9.2.2\n          platforms: \"arm64\"\n      - name: Setup Buildx\n        uses: docker/setup-buildx-action@v3\n      - name: Login DockerHub\n        uses: docker/login-action@v3\n        with:\n          username: ${{ secrets.CI_DOCKERHUB_USERNAME }}\n          password: ${{ secrets.CI_DOCKERHUB_PASSWORD }}\n      - name: Download Artifact\n        uses: actions/download-artifact@v4\n        with:\n          path: ${{ github.workspace }}/.dist\n          merge-multiple: true\n      - name: Get Metadata\n        id: metadata\n        uses: docker/metadata-action@v5\n        with:\n          images: \"${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}\"\n      - name: Package\n        uses: docker/build-push-action@v6\n        with:\n          push: true\n          file: ${{ github.workspace }}/Dockerfile\n          context: ${{ github.workspace }}\n          platforms: \"linux/amd64,linux/arm64\"\n          tags: ${{ steps.metadata.outputs.tags }}\n          labels: ${{ steps.metadata.outputs.labels }}\n          cache-from: |\n            type=registry,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache\n          cache-to: |\n            type=registry,mode=max,compression=gzip,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache,ignore-error=true\n          provenance: true\n          sbom: true\n"
  },
  {
    "path": ".github/workflows/prune.yml",
    "content": "name: prune\n\npermissions:\n  contents: write\n  pull-requests: write\n  actions: write\n  issues: write\n\ndefaults:\n  run:\n    shell: bash\n\non:\n  workflow_dispatch:\n    inputs:\n      prune:\n        description: 'Prune all caches'\n        required: false\n        type: boolean\n        default: false\n  schedule:\n    - cron: \"0 0 * * *\" # every day at 00:00 UTC\n\njobs:\n  close-stale-issues-and-prs:\n    uses: gpustack/.github/.github/workflows/close-stale-issues-and-prs.yml@main\n\n  clean-stale-caches:\n    uses: gpustack/.github/.github/workflows/clean-stale-caches.yml@main\n    with:\n      # allow to prune all caches on demand\n      prune: ${{ github.event_name != 'schedule' && inputs.prune || false }}\n"
  },
  {
    "path": ".github/workflows/sync.yml",
    "content": "name: sync\n\npermissions:\n  contents: read\n  pull-requests: read\n  actions: read\n\ndefaults:\n  run:\n    shell: bash\n\non:\n  workflow_dispatch:\n    inputs:\n      max_releases:\n        description: \"Maximum number of latest releases to sync\"\n        required: false\n        default: 1\n        type: number\n      specific_release_tag:\n        description: \"Specific release tag to sync\"\n        required: false\n        default: \"\"\n        type: string\n      dry_run:\n        description: \"Skip the actual sync\"\n        required: false\n        default: false\n        type: boolean\n  schedule:\n    - cron: \"0 */12 * * *\" # every 12 hours\n\njobs:\n  gitcode:\n    runs-on: ubuntu-22.04\n    timeout-minutes: 240\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n          persist-credentials: false\n      - name: Sync\n        uses: gpustack/.github/.github/actions/mirror-release-gitcode@main\n        with:\n          gitcode-username: \"${{ secrets.CI_GITCODE_USERNAME }}\"\n          gitcode-password: \"${{ secrets.CI_GITCODE_PASSWORD }}\"\n          gitcode-token: \"${{ secrets.CI_GITCODE_TOKEN }}\"\n          max-releases: \"${{ inputs.max_releases && inputs.max_releases || '1' }}\"\n          specific-release-tag: \"${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}\"\n          code-only: true\n          dry-run: \"${{ inputs.dry_run && inputs.dry_run || 'false' }}\"\n\n  gitee:\n    runs-on: ubuntu-22.04\n    timeout-minutes: 120\n    steps:\n      - name: Checkout\n        uses: actions/checkout@v4\n        with:\n          fetch-depth: 0\n          persist-credentials: false\n      - name: Sync\n        uses: gpustack/.github/.github/actions/mirror-release-gitee@main\n        with:\n          gitee-username: \"${{ secrets.CI_GITEE_USERNAME }}\"\n          gitee-token: \"${{ secrets.CI_GITEE_TOKEN }}\"\n          max-releases: \"${{ 
inputs.max_releases && inputs.max_releases || '1' }}\"\n          specific-release-tag: \"${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}\"\n          code-only: true\n          dry-run: \"${{ inputs.dry_run && inputs.dry_run || 'false' }}\"\n\n  tencent-cos:\n    runs-on: ubuntu-22.04\n    timeout-minutes: 120\n    steps:\n      - name: Sync\n        uses: gpustack/.github/.github/actions/mirror-release-tencent-cos@main\n        with:\n          tencent-secret-id: \"${{ secrets.CI_TECENTCOS_SECRET_ID }}\"\n          tencent-secret-key: \"${{ secrets.CI_TECENTCOS_SECRET_KEY }}\"\n          tencent-cos-region: \"ap-guangzhou\"\n          tencent-cos-bucket: \"gpustack-1303613262\"\n          max-releases: \"${{ inputs.max_releases && inputs.max_releases || '1' }}\"\n          specific-release-tag: \"${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}\"\n          dry-run: \"${{ inputs.dry_run && inputs.dry_run || 'false' }}\"\n"
  },
  {
    "path": ".gitignore",
    "content": "# Files\n.DS_Store\n*.lock\n*.test\n*.out\n*.swp\n*.swo\n*.db\n*.exe\n*.exe~\n*.dll\n*.so\n*.dylib\n*.log\ngo.work\ngo.work.*\n\n# Dirs\n/.idea\n/.vscode\n/.kube\n/.terraform\n/.vagrant\n/.bundle\n/.cache\n/.docker\n/.entc\n/.sbin\n/.dist\n/log\n/certs\n"
  },
  {
    "path": ".golangci.yaml",
    "content": "version: \"1\"\n\nrun:\n  timeout: 10m\n  tests: true\n  modules-download-mode: readonly\n  go: \"1.22\"\n\n# output configuration options\noutput:\n  print-issued-lines: true\n  print-linter-name: true\n  path-prefix: \"\"\n  sort-results: true\n\nlinters:\n  disable-all: true\n  enable:\n    - asciicheck\n    - bidichk\n    - decorder\n    - durationcheck\n    - errcheck\n    - errname\n    - errorlint\n    - copyloopvar\n    - godot\n    - goconst\n    - gocritic\n    - gosimple\n    - gosec\n    - govet\n    - gofumpt\n    - gofmt\n    - ineffassign\n    - importas\n    - lll\n    - makezero\n    - misspell\n    - nakedret\n    - nilerr\n    - prealloc\n    - predeclared\n    - revive\n    - staticcheck\n    - stylecheck\n    - typecheck\n    - unconvert\n    - unparam\n    - unused\n    - usestdlibvars\n    - whitespace\n\nlinters-settings:\n  decorder:\n    dec-order:\n      - const\n      - var\n      - func\n    disable-init-func-first-check: false\n    disable-dec-order-check: true\n  errorlint:\n    errorf: true\n    asserts: true\n    comparison: true\n  godot:\n    scope: all\n    exclude:\n      - \"(?i)^ FIXME:\"\n      - \"(?i)^ TODO:\"\n      - \"(?i)^ SPDX\\\\-License\\\\-Identifier:\"\n      - \"(?i)^ +\"\n    period: true\n    capital: false\n  goconst:\n    min-len: 3\n    min-occurrences: 10\n  gosimple:\n    checks: [ \"all\" ]\n  gosec:\n    severity: \"low\"\n    confidence: \"low\"\n    excludes:\n      - G101\n      - G107\n      - G112\n      - G115\n      - G404\n  gofumpt:\n    extra-rules: true\n  gofmt:\n    simplify: true\n    rewrite-rules:\n      - pattern: 'interface{}'\n        replacement: 'any'\n      - pattern: 'a[b:len(a)]'\n        replacement: 'a[b:]'\n  importas:\n    no-unaliased: true\n  lll:\n    line-length: 150\n    tab-width: 1\n  makezero:\n    always: false\n  misspell:\n    locale: US\n  nakedret:\n    max-func-lines: 60\n  revive:\n    rules:\n      - name: var-naming\n        disabled: true\n     
   arguments:\n          - [ \"HTTP\", \"ID\", \"TLS\", \"TCP\", \"UDP\", \"API\", \"CA\", \"URL\", \"DNS\" ]\n  staticcheck:\n    checks: [ \"all\", \"-SA1019\", \"-SA2002\", \"-SA5008\" ]\n  stylecheck:\n    checks: [ \"all\", \"-ST1003\" ]\n  unparam:\n    check-exported: false\n  unused:\n    field-writes-are-uses: true\n    post-statements-are-reads: true\n    exported-fields-are-used: true\n    parameters-are-used: true\n    local-variables-are-used: true\n    generated-is-used: true\n  usestdlibvars:\n    http-method: true\n    http-status-code: true\n    time-weekday: true\n    time-month: true\n    time-layout: true\n    crypto-hash: true\n\nissues:\n  uniq-by-line: true\n  exclude-files:\n    - \"doc.go\"\n    - \"zz_generated.*.go\"\n    - \"gen.*.go\"\n  exclude-rules:\n    - path: _test\\.go\n      linters:\n        - errcheck\n        - gosec\n        - makezero\n        - lll\n"
  },
  {
    "path": "Dockerfile",
    "content": "FROM scratch\nARG TARGETOS\nARG TARGETARCH\nCOPY --chmod=755 .dist/gguf-parser-${TARGETOS}-${TARGETARCH} /bin/gguf-parser\nENTRYPOINT [\"/bin/gguf-parser\"]\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2024 gguf-parser-go authors\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE."
  },
  {
    "path": "Makefile",
    "content": ".SILENT:\n.DEFAULT_GOAL := ci\n\nSHELL := /bin/bash\n\nSRCDIR := $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))\nGOOS := $(shell go env GOOS)\nGOARCH := $(shell go env GOARCH)\nLINT_DIRTY ?= false\nVERSION ?= $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '[:upper:]' '[:lower:]' || echo \"unknown\")\n\nDEPS_UPDATE ?= false\ndeps:\n\t@echo \"+++ $@ +++\"\n\n\tcd $(SRCDIR) && go mod tidy && go mod download\n\tcd $(SRCDIR)/cmd/gguf-parser && go mod tidy && go mod download\n\n\tif [[ \"$(DEPS_UPDATE)\" == \"true\" ]]; then \\\n\t\tcd $(SRCDIR) && go get -u -v ./...; \\\n\t\tcd $(SRCDIR)/cmd/gguf-parser && go get -u -v ./...; \\\n\tfi\n\n\t@echo \"--- $@ ---\"\n\ngenerate:\n\t@echo \"+++ $@ +++\"\n\n\tcd $(SRCDIR) && go generate ./...\n\tcd $(SRCDIR)/cmd/gguf-parser && go generate ./...\n\n\t@echo \"--- $@ ---\"\n\nlint:\n\t@echo \"+++ $@ +++\"\n\n\t[[ -d \"$(SRCDIR)/.sbin\" ]] || mkdir -p \"$(SRCDIR)/.sbin\"\n\n\t[[ -f \"$(SRCDIR)/.sbin/goimports-reviser\" ]] || \\\n\t\tcurl --retry 3 --retry-all-errors --retry-delay 3 -sSfL \"https://github.com/incu6us/goimports-reviser/releases/download/v3.8.2/goimports-reviser_3.8.2_$(GOOS)_$(GOARCH).tar.gz\" \\\n\t\t| tar -zxvf - --directory \"$(SRCDIR)/.sbin\" --no-same-owner --exclude ./LICENSE --exclude ./README.md && chmod +x \"$(SRCDIR)/.sbin/goimports-reviser\"\n\tcd $(SRCDIR) && \\\n\t\tgo list -f \"{{.Dir}}\" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \\\n\t\t| xargs -I {} \"$(SRCDIR)/.sbin/goimports-reviser\" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1\n\tcd $(SRCDIR)/cmd/gguf-parser && \\\n\t\tgo list -f \"{{.Dir}}\" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! 
-name 'zz_generated.*' \\\n\t\t| xargs -I {} \"$(SRCDIR)/.sbin/goimports-reviser\" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1\n\n\t[[ -f \"$(SRCDIR)/.sbin/golangci-lint\" ]] || \\\n\t\tcurl --retry 3 --retry-all-errors --retry-delay 3 -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \\\n\t\t| sh -s -- -b \"$(SRCDIR)/.sbin\" \"v1.63.4\"\n\tcd $(SRCDIR) && \\\n\t\t\"$(SRCDIR)/.sbin/golangci-lint\" run --fix ./...\n\tcd $(SRCDIR)/cmd/gguf-parser && \\\n\t\t\"$(SRCDIR)/.sbin/golangci-lint\" run --fix ./...\n\n\tif [[ \"$(LINT_DIRTY)\" == \"true\" ]]; then \\\n\t\tif [[ -n $$(git status --porcelain) ]]; then \\\n\t\t\techo \"Code tree is dirty.\"; \\\n\t\t\tgit diff --exit-code; \\\n\t\tfi; \\\n\tfi\n\n\t@echo \"--- $@ ---\"\n\ntest:\n\t@echo \"+++ $@ +++\"\n\n\tgo test -v -failfast -race -cover -timeout=30m $(SRCDIR)/...\n\n\t@echo \"--- $@ ---\"\n\nbenchmark:\n\t@echo \"+++ $@ +++\"\n\n\tgo test -v -failfast -run=\"^Benchmark[A-Z]+\" -bench=. 
-benchmem -timeout=30m $(SRCDIR)/...\n\n\t@echo \"--- $@ ---\"\n\ngguf-parser:\n\t[[ -d \"$(SRCDIR)/.dist\" ]] || mkdir -p \"$(SRCDIR)/.dist\"\n\n\tcd \"$(SRCDIR)/cmd/gguf-parser\" && for os in darwin linux windows; do \\\n  \t\ttags=\"netgo\"; \\\n  \t\tif [[ $$os == \"windows\" ]]; then \\\n\t\t  suffix=\".exe\"; \\\n\t\t  tags=\"netcgo\"; \\\n\t\telse \\\n\t\t  suffix=\"\"; \\\n\t\tfi; \\\n\t\tfor arch in amd64 arm64; do \\\n\t\t  \techo \"Building gguf-parser for $$os-$$arch $(VERSION)\"; \\\n\t\t\tGOOS=\"$$os\" GOARCH=\"$$arch\" CGO_ENABLED=1 go build \\\n\t\t\t\t-trimpath \\\n\t\t\t\t-ldflags=\"-w -s -X main.Version=$(VERSION)\" \\\n\t\t\t\t-tags=\"urfave_cli_no_docs $$tags\" \\\n\t\t\t\t-o $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix; \\\n\t\tdone; \\\n\t\tif [[ $$os == \"darwin\" ]]; then \\\n\t\t  [[ -d \"$(SRCDIR)/.sbin\" ]] || mkdir -p \"$(SRCDIR)/.sbin\"; \\\n\t\t  [[ -f \"$(SRCDIR)/.sbin/lipo\" ]] || \\\n\t\t\tGOBIN=\"$(SRCDIR)/.sbin\" go install github.com/konoui/lipo@v0.9.2; \\\n\t\t  \t\"$(SRCDIR)/.sbin/lipo\" -create -output $(SRCDIR)/.dist/gguf-parser-darwin-universal $(SRCDIR)/.dist/gguf-parser-darwin-amd64 $(SRCDIR)/.dist/gguf-parser-darwin-arm64; \\\n\t\tfi;\\\n\t\tif [[ $$os == \"$(GOOS)\" ]] && [[ $$arch == \"$(GOARCH)\" ]]; then \\\n\t\t\tcp -rf $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix $(SRCDIR)/.dist/gguf-parser$$suffix; \\\n\t\tfi; \\\n\tdone\n\nbuild: gguf-parser\n\nPACKAGE_PUBLISH ?= false\nPACKAGE_REGISTRY ?= \"gpustack\"\nPACKAGE_IMAGE ?= \"gguf-parser\"\npackage: build\n\t@echo \"+++ $@ +++\"\n\n\tif [[ -z $$(command -v docker) ]]; then \\\n  \t\techo \"Docker is not installed.\"; \\\n\t\texit 1; \\\n\tfi; \\\n\tplatform=\"linux/amd64,linux/arm64\"; \\\n\timage=\"$(PACKAGE_IMAGE):$(VERSION)\"; \\\n\tif [[ -n \"$(PACKAGE_REGISTRY)\" ]]; then \\\n\t\timage=\"$(PACKAGE_REGISTRY)/$$image\"; \\\n\tfi; \\\n\tif [[ \"$(PACKAGE_PUBLISH)\" == \"true\" ]]; then \\\n\t  \tif [[ -z $$(docker buildx inspect --builder \"gguf-parser\") ]]; 
then \\\n      \t\tdocker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2 --install $$platform; \\\n      \t\tdocker buildx create --name \"gguf-parser\" --driver \"docker-container\" --buildkitd-flags \"--allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host\" --bootstrap; \\\n      \tfi; \\\n\t\tdocker buildx build --progress=plain --platform=$$platform --builder=\"gguf-parser\" --output=\"type=image,name=$$image,push=true\" \"$(SRCDIR)\"; \\\n\telse \\\n\t  \tplatform=\"linux/$(GOARCH)\"; \\\n  \t\tdocker buildx build --progress=plain --platform=$$platform --output=\"type=docker,name=$$image\" \"$(SRCDIR)\"; \\\n\tfi\n\n\t@echo \"--- $@ ---\"\n\nci: deps generate lint test build\n"
  },
  {
    "path": "README.md",
    "content": "# GGUF Parser\n\n> tl;dr, Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files and estimate the memory\n> usage.\n\n[![Go Report Card](https://goreportcard.com/badge/github.com/gpustack/gguf-parser-go)](https://goreportcard.com/report/github.com/gpustack/gguf-parser-go)\n[![CI](https://img.shields.io/github/actions/workflow/status/gpustack/gguf-parser-go/cmd.yml?label=ci)](https://github.com/gpustack/gguf-parser-go/actions)\n[![License](https://img.shields.io/github/license/gpustack/gguf-parser-go?label=license)](https://github.com/gpustack/gguf-parser-go#license)\n[![Download](https://img.shields.io/github/downloads/gpustack/gguf-parser-go/total)](https://github.com/gpustack/gguf-parser-go/releases)\n[![Docker Pulls](https://img.shields.io/docker/pulls/gpustack/gguf-parser)](https://hub.docker.com/r/gpustack/gguf-parser)\n[![Release](https://img.shields.io/github/v/release/gpustack/gguf-parser-go)](https://github.com/gpustack/gguf-parser-go/releases/latest)\n\n[GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a file format for storing models for inference\nwith GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models,\nand for ease of reading. 
Models are traditionally developed using PyTorch or another framework, and then converted to\nGGUF for use in GGML.\n\nGGUF Parser helps in reviewing and estimating the usage and maximum tokens per second of a GGUF format model without\ndownloading it.\n\n## Key Features\n\n- **No File Required**: GGUF Parser uses chunked reading to parse the metadata of a remote GGUF file, which means you\n  don't need to download the entire file and load it.\n- **Accurate Prediction**: The evaluation results of GGUF Parser usually deviate from the actual usage by about 100MiB.\n- **Quick Verification**: You can provide device metrics to calculate the maximum tokens per second (TPS) without\n  running the model.\n- **Type Screening**: GGUF Parser can distinguish what the GGUF file is used for, such as Embedding, Reranking, LoRA, etc.\n- **Fast**: GGUF Parser is written in Go, which is fast and efficient.\n\n## Agenda\n\n- [Notes](#notes)\n- [Installation](#installation)\n- [Overview](#overview)\n    + [Parse](#parse)\n        * [Local File](#parse-local-file)\n        * [Remote File](#parse-remote-file)\n        * [From HuggingFace](#parse-from-huggingface)\n        * [From ModelScope](#parse-from-modelscope)\n        * [From Ollama Library](#parse-from-ollama-library)\n        * [Others](#others)\n            * [Image Model](#parse-image-model)\n            * [None Model](#parse-none-model)\n    + [Estimate](#estimate)\n        * [Across Multiple GPU devices](#across-multiple-gpu-devices)\n        * [Maximum Tokens Per Second](#maximum-tokens-per-second)\n        * [Full Layers Offload (default)](#full-layers-offload-default)\n        * [Zero Layers Offload](#zero-layers-offload)\n        * [Specific Layers Offload](#specific-layers-offload)\n        * [Specific Context Size](#specific-context-size)\n        * [Enable Flash Attention](#enable-flash-attention)\n        * [Disable MMap](#disable-mmap)\n        * [With Adapter](#with-adapter)\n        * [Get Proper Offload 
Layers](#get-proper-offload-layers)\n\n## Notes\n\n- **Since v0.20.0**, GGUF Parser supports leveraging `--override-tensor` to indicate how to place the model tensors.\n- **Since v0.19.0**, GGUF Parser supports estimating Audio projector model file, like Ultravox series, Qwen2 Audio\n  series, etc.\n- **Since v0.18.0**, GGUF Parser supports estimating SWA-supported (sliding window attention) model file, like LLaMA 4\n  series, Gemma2/3 series, etc.\n- **Since v0.17.0**, GGUF Parser aligns the `QUANTIZATION`(\n  aka. [`general.file_type`](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#general-metadata))\n  to [HuggingFace processing](https://github.com/huggingface/huggingface.js/blob/2475d6d316135c0a4fceff6b3fe2aed0dde36ac1/packages/gguf/src/types.ts#L11-L48),\n  but there are still many model files whose naming does not fully follow `general.file_type`.\n- **Since v0.16.0**, GGUF Parser supports estimating MLA-supported model file, like DeepSeek series.\n- **Since v0.14.0 (BREAKING CHANGE)**, GGUF Parser parses `*.feed_forward_length` metadata as `[]uint64`,\n  which means the architecture `feedForwardLength` is a list of integers.\n- **Since v0.13.0 (BREAKING CHANGE)**, GGUF Parser can parse files\n  for [StableDiffusion.Cpp](https://github.com/leejet/stable-diffusion.cpp) or StableDiffusion.Cpp-like application.\n    + [LLaMA Box](https://github.com/gpustack/llama-box) is able to offload different components of the all-in-one model\n      to different devices, e.g. 
with `-ts 1,1,1`, GGUF Parser returns the usage of Text Encoder Models in 1st device,\n      VAE Model in 2nd device, and Diffusion Model in 3rd device.\n- Experimentally, GGUF Parser can estimate the maximum tokens per second (`MAX TPS`) for a (V)LM model according to the\n  `--device-metric` options.\n- GGUF Parser distinguishes the remote devices from `--tensor-split` via `--rpc`.\n    + For one host multiple GPU devices, you can use `--tensor-split` to get the estimated memory usage of each GPU.\n    + For multiple hosts multiple GPU devices, you can use `--tensor-split` and `--rpc` to get the estimated memory\n      usage of each GPU. Since v0.11.0, `--rpc` flag masks the devices specified by `--tensor-split` in front.\n- Table result usage:\n    + `DISTRIBUTABLE` indicates the GGUF file supports distribution inference or not, if the file doesn't support\n      distribution inference, you can not offload it\n      with [RPC servers](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc).\n    + `RAM` indicates the system memory usage.\n    + `VRAM *` indicates the local GPU memory usage.\n    + `RPC * (V)RAM` indicates the remote memory usage. The kind of memory is determined by which backend the RPC server\n      uses, check the running logs for more details.\n    + `UMA` indicates the memory usage of Apple macOS only. `NONUMA` adapts to other cases, including non-GPU devices.\n    + `LAYERS`(`I`/`T`/`O`) indicates the count for input layers, transformer layers, and output layers. 
Input layers\n      are not offloaded at present.\n\n## Installation\n\nInstall from [releases](https://github.com/gpustack/gguf-parser-go/releases).\n\n## Overview\n\n### Parse\n\n#### Parse Local File\n\n```shell\n$ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf\n+-----------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                  |\n+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+\n|  TYPE |           NAME          |  ARCH | QUANTIZATION | LITTLE ENDIAN |   SIZE   | PARAMETERS |    BPW   |\n+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+\n| model | DeepSeek R1 Distill ... | qwen2 |    Q4_K_M    |      true     | 4.36 GiB |   7.62 B   | 4.91 bpw |\n+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      131072     |      3584     |       true       |         28         |   28   |       18944      |      0     |     152064     
|\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |   2.47 MiB  |   152064   |        N/A       |   151646  |   151643  |    N/A    |    N/A    |      N/A      |       N/A       |     151654    |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                        |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE 
(L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                      RAM                     |                 VRAM 0                |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+----------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |   NONUMA  |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n| qwen2 |    131072    |     2048 / 512     |     Disabled    |  Enabled  |       No       | Unsupported |   Supported   |   29 (28 + 1)  |       Yes      |      1 + 0 + 0     | 677.44 MiB | 827.44 MiB |     28 + 1     | 7.30 GiB | 18.89 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n\n$ # Retrieve the model's metadata via split file,\n$ # which needs all split files has been downloaded.\n$  gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q8_0-00001-of-00003.gguf \n+-------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                              |\n+-------+---------------------+-------+--------------+---------------+----------+------------+----------+\n|  TYPE |    
     NAME        |  ARCH | QUANTIZATION | LITTLE ENDIAN |   SIZE   | PARAMETERS |    BPW   |\n+-------+---------------------+-------+--------------+---------------+----------+------------+----------+\n| model | qwen2.5-7b-instruct | qwen2 |     Q8_0     |      true     | 7.54 GiB |   7.62 B   | 8.50 bpw |\n+-------+---------------------+-------+--------------+---------------+----------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      131072     |      3584     |       true       |         28         |   28   |       18944      |      0     |     152064     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN 
|\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |   2.47 MiB  |   152064   |        N/A       |   151643  |   151645  |    N/A    |    N/A    |      N/A      |       N/A       |     151643    |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                        |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                      RAM                     |                 VRAM 0                |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+----------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |   NONUMA  
|\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n| qwen2 |    131072    |     2048 / 512     |     Disabled    |  Enabled  |       No       | Unsupported |   Supported   |   29 (28 + 1)  |       Yes      |      1 + 0 + 0     | 677.44 MiB | 827.44 MiB |     28 + 1     | 7.30 GiB | 21.82 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n```\n\n#### Parse Remote File\n\n```shell\n$ gguf-parser --url=\"https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/resolve/main/Qwen2.5-72B-Instruct-Q4_K_M.gguf\"\n+---------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                |\n+-------+----------------------+-------+--------------+---------------+-----------+------------+----------+\n|  TYPE |         NAME         |  ARCH | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW   |\n+-------+----------------------+-------+--------------+---------------+-----------+------------+----------+\n| model | Qwen2.5 72B Instruct | qwen2 |    Q4_K_M    |      true     | 44.15 GiB |   72.71 B  | 5.22 bpw |\n+-------+----------------------+-------+--------------+---------------+-----------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      
|\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      32768      |      8192     |       true       |         64         |   80   |       29568      |      0     |     152064     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |   2.47 MiB  |   152064   |        N/A       |   151643  |   151645  |    N/A    |    N/A    |      N/A      |       N/A       |     151643    
|\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                         |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+----------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                      RAM                     |                 VRAM 0                 |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+-----------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA    |   NONUMA  |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+\n| qwen2 |     32768    |     2048 / 512     |     Disabled    |  Enabled  |       No       | Unsupported |   
Supported   |   81 (80 + 1)  |       Yes      |      1 + 0 + 0     | 426.57 MiB | 576.57 MiB |     80 + 1     | 10.31 GiB | 58.18 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+\n\n$ # Retrieve the model's metadata via split file\n\n$ gguf-parser --url=\"https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/main/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf\"\n+----------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                 |\n+-------+------------------+-----------+--------------+---------------+------------+------------+----------+\n|  TYPE |       NAME       |    ARCH   | QUANTIZATION | LITTLE ENDIAN |    SIZE    | PARAMETERS |    BPW   |\n+-------+------------------+-----------+--------------+---------------+------------+------------+----------+\n| model | DeepSeek R1 BF16 | deepseek2 |     IQ1_S    |      true     | 130.60 GiB |  671.03 B  | 1.67 bpw |\n+-------+------------------+-----------+--------------+---------------+------------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN 
|\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      163840     |      7168     |       true       |         N/A        |   61   |       18432      |     256    |     129280     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |   2.21 MiB  |   129280   |        N/A       |     0     |     1     |    N/A    |    N/A    |      N/A      |       N/A       |     128815    |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                         
|\n+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------------------------------+--------------------------------------+\n|    ARCH   | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                     RAM                    |                VRAM 0                |\n|           |              |                    |                 |           |                |             |               |                |                +--------------------+-----------+-----------+----------------+------------+--------+\n|           |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |    UMA    |   NONUMA  | LAYERS (T + O) |     UMA    | NONUMA |\n+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+\n| deepseek2 |    163840    |     2048 / 512     |     Disabled    |  Enabled  |       No       | Unsupported |   Supported   |   62 (61 + 1)  |       Yes      |      1 + 0 + 0     | 13.03 GiB | 13.18 GiB |     61 + 1     | 762.76 GiB |  1 TB  |\n+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+\n```\n\n#### Parse From HuggingFace\n\n> [!NOTE]\n>\n> Allow using `HF_ENDPOINT` to override the default HuggingFace endpoint: `https://huggingface.co`.\n\n```shell\n$ gguf-parser --hf-repo=\"bartowski/Qwen2-VL-2B-Instruct-GGUF\" --hf-file=\"Qwen2-VL-2B-Instruct-f16.gguf\" 
--hf-mmproj-file=\"mmproj-Qwen2-VL-2B-Instruct-f32.gguf\" --visual-max-image-size 1344\n+-----------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                  |\n+-------+----------------------+---------+--------------+---------------+----------+------------+-----------+\n|  TYPE |         NAME         |   ARCH  | QUANTIZATION | LITTLE ENDIAN |   SIZE   | PARAMETERS |    BPW    |\n+-------+----------------------+---------+--------------+---------------+----------+------------+-----------+\n| model | Qwen2 VL 2B Instruct | qwen2vl |      F16     |      true     | 2.88 GiB |   1.54 B   | 16.00 bpw |\n+-------+----------------------+---------+--------------+---------------+----------+------------+-----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      32768      |      1536     |       true       |         12         |   28   |       8960       |      0     |     151936     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                    
                                                                                                                         |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |   2.47 MiB  |   151936   |        N/A       |   151643  |   151645  |    N/A    |    N/A    |      N/A      |       N/A       |     151643    |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                          |\n+---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+\n|   ARCH  | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                      RAM                     |                 VRAM 0                |\n|         |              |                    |                 |           |                |             |   
            |                |                +--------------------+------------+------------+----------------+----------+-----------+\n|         |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |   NONUMA  |\n+---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n| qwen2vl |     32768    |     2048 / 512     |     Disabled    |  Enabled  |       No       | Unsupported |   Supported   |   29 (28 + 1)  |       Yes      |      1 + 0 + 0     | 236.87 MiB | 386.87 MiB |     28 + 1     | 3.65 GiB | 12.86 GiB |\n+---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n\n$ # Retrieve the model's metadata via split file\n\n$ gguf-parser --hf-repo=\"bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF\" --hf-file=\"openbuddy-llama3.3-70b-v24.1-131k-Q4_0.gguf\"\n+------------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                   |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n|  TYPE |           NAME          |  ARCH | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW   |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n| model | Openbuddy Llama3.3 7... 
| llama |     Q4_0     |      true     | 37.35 GiB |   70.55 B  | 4.55 bpw |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      131072     |      8192     |       true       |         64         |   80   |       28672      |      0     |     128256     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |    2 MiB    |   128256   |        N/A       |   128000  |   128048  |    
N/A    |    N/A    |      N/A      |       N/A       |     128044    |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                    |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                   RAM                   |                 VRAM 0                 |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+---------+----------+----------------+-----------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |   UMA   |  NONUMA  | LAYERS (T + O) |    UMA    |   NONUMA  |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+\n| llama |    131072    |     2048 / 512     |     Disabled    |  Enabled  |       
No       | Unsupported |   Supported   |   81 (80 + 1)  |       Yes      |      1 + 0 + 0     | 1.06 GB | 1.13 GiB |     80 + 1     | 40.26 GiB | 93.62 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+\n```\n\n#### Parse From ModelScope\n\n> [!NOTE]\n>\n> Allow using `MS_ENDPOINT` to override the default ModelScope endpoint: `https://modelscope.cn`.\n\n```shell\n$ gguf-parser --ms-repo=\"unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF\" --ms-file=\"DeepSeek-R1-Distill-Qwen-7B-F16.gguf\"\n+-------------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                    |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+\n|  TYPE |           NAME          |  ARCH | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW    |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+\n| model | DeepSeek R1 Distill ... 
| qwen2 |      F16     |      true     | 14.19 GiB |   7.62 B   | 16.00 bpw |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      131072     |      3584     |       true       |         28         |   28   |       18944      |      0     |     152064     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |   2.47 MiB  |   152064   |        N/A       |   151646  |   151643  |    
N/A    |    N/A    |      N/A      |       N/A       |     151654    |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                        |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                      RAM                     |                 VRAM 0                |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+----------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |   NONUMA  |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n| qwen2 |    131072    |     2048 / 512     |     
Disabled    |  Enabled  |       No       | Unsupported |   Supported   |   29 (28 + 1)  |       Yes      |      1 + 0 + 0     | 677.44 MiB | 827.44 MiB |     28 + 1     | 7.30 GiB | 27.99 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+\n```\n\n#### Parse From Ollama Library\n\n> [!NOTE]\n>\n> Allow using `--ol-base-url` to override the default Ollama registry endpoint: `https://registry.ollama.ai`.\n\n```shell\n$ gguf-parser --ol-model=\"llama3.3\"\n+------------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                   |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n|  TYPE |           NAME          |  ARCH | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW   |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n| model | Llama 3.1 70B Instru... 
| llama |    Q4_K_M    |      true     | 39.59 GiB |   70.55 B  | 4.82 bpw |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      131072     |      8192     |       true       |         64         |   80   |       28672      |      0     |     128256     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |    2 MiB    |   128256   |        N/A       |   128000  |   128009  |    
N/A    |    N/A    |      N/A      |       N/A       |      N/A      |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                    |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                   RAM                   |                 VRAM 0                 |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+---------+----------+----------------+-----------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |   UMA   |  NONUMA  | LAYERS (T + O) |    UMA    |   NONUMA  |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+\n| llama |    131072    |     2048 / 512     |     Disabled    |  Enabled  |       
No       | Unsupported |   Supported   |   81 (80 + 1)  |       Yes      |      1 + 0 + 0     | 1.06 GB | 1.13 GiB |     80 + 1     | 40.26 GiB | 95.86 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+\n\n$ # Ollama Model includes the preset params and other artifacts, like multimodal projectors or LoRA adapters, \n$ # you can get the usage of Ollama running by using `--ol-usage` option.\n\n+------------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                                   |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n|  TYPE |           NAME          |  ARCH | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW   |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n| model | Llama 3.1 70B Instru... 
| llama |    Q4_K_M    |      true     | 39.59 GiB |   70.55 B  | 4.82 bpw |\n+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                                      |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n|      131072     |      8192     |       true       |         64         |   80   |       28672      |      0     |     128256     |\n+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+\n\n+-------------------------------------------------------------------------------------------------------------------------------------------------------+\n| TOKENIZER                                                                                                                                             |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n|  gpt2 |    2 MiB    |   128256   |        N/A       |   128000  |   128009  |    
N/A    |    N/A    |      N/A      |       N/A       |      N/A      |\n+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+\n\n+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                          |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-----------------------------------------+\n|  ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                      RAM                     |                  VRAM 0                 |\n|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+------------+-----------+\n|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |     UMA    |   NONUMA  |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+\n| llama |     2048     |     2048 / 512   
  |     Disabled    |  Enabled  |       No       | Unsupported |   Supported   |   81 (80 + 1)  |       Yes      |      1 + 0 + 0     | 255.27 MiB | 405.27 MiB |     80 + 1     | 906.50 MiB | 40.49 GiB |\n+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+\n```\n\n#### Others\n\n##### Parse Image Model\n\n```shell\n$ # Parse FLUX.1-dev Model\n$ gguf-parser --hf-repo=\"gpustack/FLUX.1-dev-GGUF\" --hf-file=\"FLUX.1-dev-FP16.gguf\"\n+----------------------------------------------------------------------------------------------+\n| METADATA                                                                                     |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n|  TYPE | NAME |    ARCH   | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW    |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n| model |  N/A | diffusion |      F16     |      true     | 31.79 GiB |    17 B    | 16.06 bpw |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n\n+----------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                             |\n+----------------+---------------------------------------------------------------+-------------------------+\n| DIFFUSION ARCH |                          CONDITIONERS                         |       AUTOENCODER       |\n+----------------+---------------------------------------------------------------+-------------------------+\n|     FLUX.1     | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) 
|\n+----------------+---------------------------------------------------------------+-------------------------+\n\n+---------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                  |\n+--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+\n|  ARCH  | FLASH ATTENTION |  MMAP LOAD  | DISTRIBUTABLE | FULL OFFLOADED |           RAM           |         VRAM 0        |\n|        |                 |             |               |                +------------+------------+-----------+-----------+\n|        |                 |             |               |                |     UMA    |   NONUMA   |    UMA    |   NONUMA  |\n+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+\n| flux_1 |     Disabled    | Unsupported |   Supported   |       Yes      | 343.89 MiB | 493.89 MiB | 31.89 GiB | 41.15 GiB |\n+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+\n\n$ # Parse FLUX.1-dev Model without offload Conditioner and Autoencoder\n$ gguf-parser --hf-repo=\"gpustack/FLUX.1-dev-GGUF\" --hf-file=\"FLUX.1-dev-FP16.gguf\" --clip-on-cpu --vae-on-cpu\n+----------------------------------------------------------------------------------------------+\n| METADATA                                                                                     |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n|  TYPE | NAME |    ARCH   | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW    |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n| model |  N/A | diffusion |      F16     |      true     | 
31.79 GiB |    17 B    | 16.06 bpw |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n\n+----------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                             |\n+----------------+---------------------------------------------------------------+-------------------------+\n| DIFFUSION ARCH |                          CONDITIONERS                         |       AUTOENCODER       |\n+----------------+---------------------------------------------------------------+-------------------------+\n|     FLUX.1     | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |\n+----------------+---------------------------------------------------------------+-------------------------+\n\n+-------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                |\n+--------+-----------------+-------------+---------------+----------------+-----------------------+-----------------------+\n|  ARCH  | FLASH ATTENTION |  MMAP LOAD  | DISTRIBUTABLE | FULL OFFLOADED |          RAM          |         VRAM 0        |\n|        |                 |             |               |                +-----------+-----------+-----------+-----------+\n|        |                 |             |               |                |    UMA    |   NONUMA  |    UMA    |   NONUMA  |\n+--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+\n| flux_1 |     Disabled    | Unsupported |   Supported   |       Yes      | 16.44 GiB | 16.59 GiB | 22.29 GiB | 25.05 GiB 
|\n+--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+\n\n$ # Parse FLUX.1-dev Model with Autoencoder tiling\n$ gguf-parser --hf-repo=\"gpustack/FLUX.1-dev-GGUF\" --hf-file=\"FLUX.1-dev-FP16.gguf\" --vae-tiling\n+----------------------------------------------------------------------------------------------+\n| METADATA                                                                                     |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n|  TYPE | NAME |    ARCH   | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW    |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n| model |  N/A | diffusion |      F16     |      true     | 31.79 GiB |    17 B    | 16.06 bpw |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n\n+----------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                             |\n+----------------+---------------------------------------------------------------+-------------------------+\n| DIFFUSION ARCH |                          CONDITIONERS                         |       AUTOENCODER       |\n+----------------+---------------------------------------------------------------+-------------------------+\n|     FLUX.1     | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |\n+----------------+---------------------------------------------------------------+-------------------------+\n\n+---------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                  
|\n+--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+\n|  ARCH  | FLASH ATTENTION |  MMAP LOAD  | DISTRIBUTABLE | FULL OFFLOADED |           RAM           |         VRAM 0        |\n|        |                 |             |               |                +------------+------------+-----------+-----------+\n|        |                 |             |               |                |     UMA    |   NONUMA   |    UMA    |   NONUMA  |\n+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+\n| flux_1 |     Disabled    | Unsupported |   Supported   |       Yes      | 343.89 MiB | 493.89 MiB | 31.89 GiB | 36.28 GiB |\n+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+\n\n$ # Parse FLUX.1-dev Model with multiple devices offloading\n$ # Support by LLaMA Box v0.0.106+, https://github.com/gpustack/llama-box.\n$ gguf-parser --hf-repo=\"gpustack/FLUX.1-dev-GGUF\" --hf-file=\"FLUX.1-dev-FP16.gguf\" --tensor-split=\"1,1,1\"\n+----------------------------------------------------------------------------------------------+\n| METADATA                                                                                     |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n|  TYPE | NAME |    ARCH   | QUANTIZATION | LITTLE ENDIAN |    SIZE   | PARAMETERS |    BPW    |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n| model |  N/A | diffusion |      F16     |      true     | 31.79 GiB |    17 B    | 16.06 bpw |\n+-------+------+-----------+--------------+---------------+-----------+------------+-----------+\n\n+----------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                           
                                                  |\n+----------------+---------------------------------------------------------------+-------------------------+\n| DIFFUSION ARCH |                          CONDITIONERS                         |       AUTOENCODER       |\n+----------------+---------------------------------------------------------------+-------------------------+\n|     FLUX.1     | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |\n+----------------+---------------------------------------------------------------+-------------------------+\n\n+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                              |\n+--------+-----------------+-------------+---------------+----------------+-------------------------+---------------------+---------------------+-----------------------+\n|  ARCH  | FLASH ATTENTION |  MMAP LOAD  | DISTRIBUTABLE | FULL OFFLOADED |           RAM           |        VRAM 0       |        VRAM 1       |         VRAM 2        |\n|        |                 |             |               |                +------------+------------+----------+----------+------------+--------+-----------+-----------+\n|        |                 |             |               |                |     UMA    |   NONUMA   |    UMA   |  NONUMA  |     UMA    | NONUMA |    UMA    |   NONUMA  |\n+--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+\n| flux_1 |     Disabled    | Unsupported |   Supported   |       Yes      | 343.89 MiB | 493.89 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB |  7 GiB | 22.29 GiB | 25.05 GiB 
|\n+--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+\n```\n\n##### Parse None Model\n\n```shell\n$ # Parse Multi-Modal Projector\n$ gguf-parser --hf-repo=\"unsloth/Qwen2.5-Omni-3B-GGUF\" --hf-file=\"mmproj-F32.gguf\"                                                                        \n+-------------------------------------------------------------------------------------------------------+\n| METADATA                                                                                              |\n+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+\n|    TYPE   |       NAME      | ARCH | QUANTIZATION | LITTLE ENDIAN |   SIZE   | PARAMETERS |    BPW    |\n+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+\n| projector | Qwen2.5-Omni-3B | clip |      F32     |      true     | 4.86 GiB |   1.31 B   | 31.93 bpw |\n+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+\n\n+-------------------------------------------------------------------------------------------------------------------------+\n| ARCHITECTURE                                                                                                            |\n+----------------+-------------------------------+-----------------+-------------------------------------+----------------+\n| PROJECTOR TYPE |         EMBEDDING LEN         |      LAYERS     |           FEED FORWARD LEN          |     ENCODER    |\n|                +---------------+---------------+--------+--------+------------------+------------------+                |\n|                |     VISION    |     AUDIO     | VISION |  AUDIO |      VISION      |       AUDIO      |                
|\n+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+\n|    qwen2.5o    |      1280     |      1280     |   32   |   32   |       1280       |       5120       | Vision & Audio |\n+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+\n\n$ # Parse LoRA Adapter\n$ gguf-parser --hf-repo=\"ngxson/test_gguf_lora_adapter\" --hf-file=\"lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf\"\n+---------------------------------------------------------------------------------------------+\n| METADATA                                                                                    |\n+---------+------+-------+--------------+---------------+------------+------------+-----------+\n|   TYPE  | NAME |  ARCH | QUANTIZATION | LITTLE ENDIAN |    SIZE    | PARAMETERS |    BPW    |\n+---------+------+-------+--------------+---------------+------------+------------+-----------+\n| adapter |  N/A | llama |      F16     |      true     | 168.08 MiB |   88.12 M  | 16.00 bpw |\n+---------+------+-------+--------------+---------------+------------+------------+-----------+\n\n+---------------------------+\n| ARCHITECTURE              |\n+--------------+------------+\n| ADAPTER TYPE | LORA ALPHA |\n+--------------+------------+\n|     lora     |     32     |\n+--------------+------------+\n```\n\n### Estimate\n\n#### Across Multiple GPU Devices\n\nImaging you're preparing to run\nthe [hierholzer/Llama-3.1-70B-Instruct-GGUF](https://huggingface.co/hierholzer/Llama-3.1-70B-Instruct-GGUF) model file\nacross several hosts in your local network. 
Some of these hosts are equipped with GPU devices, while others do not have\nany GPU capabilities.\n\n```mermaid\nflowchart TD\n    subgraph host4[\"Windows 11 (host4)\"]\n        ram40([\"11GiB RAM remaining\"])\n    end\n    subgraph host3[\"Apple macOS (host3)\"]\n        gpu10[\"Apple M1 Max (6GiB VRAM remaining)\"]\n    end\n    subgraph host2[\"Windows 11 (host2)\"]\n        gpu20[\"NVIDIA 4090 (12GiB VRAM remaining)\"]\n    end\n    subgraph host1[\"Ubuntu (host1)\"]\n        gpu30[\"NVIDIA 4080 0 (8GiB VRAM remaining)\"]\n        gpu31[\"NVIDIA 4080 1 (10GiB VRAM remaining)\"]\n    end\n```\n\n##### Single Host Multiple GPU Devices\n\nLet's assume you plan to run the model on `host1` only.\n\n```mermaid\nflowchart TD\n    subgraph host1[\"Ubuntu (host1)\"]\n        gpu30[\"NVIDIA 4080 0 (8GiB VRAM remaining)\"]\n        gpu31[\"NVIDIA 4080 1 (10GiB VRAM remaining)\"]\n    end\n```\n\n```shell\n$ gguf-parser --hf-repo=\"hierholzer/Llama-3.1-70B-Instruct-GGUF\" --hf-file=\"Llama-3.1-70B-Instruct-Q4_K_M.gguf\" --ctx-size=1024 --tensor-split=\"8,10\" --estimate --in-short\n+------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                     |\n+----------------------------------------------+--------------------------------------+----------------------------------------+\n|                      RAM                     |                VRAM 0                |                 VRAM 1                 |\n+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |   UMA   |   NONUMA  | LAYERS (T + O) |    UMA    |   NONUMA  
|\n+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+\n|      1 + 0 + 0     | 249.27 MiB | 399.27 MiB |     36 + 0     | 144 MiB | 17.83 GiB |     44 + 1     | 22.27 GiB | 22.83 GiB |\n+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+\n```\n\nBased on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following\nresource consumption:\n\n| Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |\n|-----------------------|---------------|-------------|----------------|--------------|------------|\n| host1                 | ENOUGH        | 399.27 MiB  |                |              | :thumbsup: |\n| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 17.83 GiB    |            |\n| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 22.83 GiB    |            |\n\nIt appears that running the model on `host1` alone is not feasible.\n\n##### Multiple Hosts Multiple GPU Devices\n\nNext, let's consider the scenario where you plan to run the model on `host4`, while offloading all layers to `host1`,\n`host2`,\nand `host3`.\n\n```mermaid\nflowchart TD\n    host4 -->|TCP| gpu10\n    host4 -->|TCP| gpu20\n    host4 -->|TCP| gpu30\n    host4 -->|TCP| gpu31\n\n    subgraph host4[\"Windows 11 (host4)\"]\n        ram40([\"11GiB RAM remaining\"])\n    end\n    subgraph host3[\"Apple macOS (host3)\"]\n        gpu10[\"Apple M1 Max (6GiB VRAM remaining)\"]\n    end\n    subgraph host2[\"Windows 11 (host2)\"]\n        gpu20[\"NVIDIA 4090 (12GiB VRAM remaining)\"]\n    end\n    subgraph host1[\"Ubuntu (host1)\"]\n        gpu30[\"NVIDIA 4080 0 (8GiB VRAM remaining)\"]\n        gpu31[\"NVIDIA 4080 1 (10GiB VRAM remaining)\"]\n    end\n```\n\n```shell\n$ gguf-parser 
--hf-repo=\"hierholzer/Llama-3.1-70B-Instruct-GGUF\" --hf-file=\"Llama-3.1-70B-Instruct-Q4_K_M.gguf\" --ctx-size=1024 --tensor-split=\"8,10,12,6\" --rpc=\"host1:50052,host1:50053,host2:50052,host3:50052\" --estimate --in-short\n+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                 |\n+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+\n|                      RAM                     |                 RPC 0 (V)RAM                 |                 RPC 1 (V)RAM                 |                 RPC 2 (V)RAM                 |                 RPC 3 (V)RAM                 |\n+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |      UMA     |    NONUMA    |\n+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+\n|      1 + 0 + 0     | 249.27 MiB | 399.27 MiB |     18 + 0     |   8.85 GiB   |   9.28 GiB   |     23 + 
0     |   10.88 GiB  |   11.32 GiB  |     27 + 0     |   12.75 GiB  |   13.19 GiB  |     12 + 1     |   7.13 GiB   |   7.64 GiB   |\n+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+\n```\n\nAccording to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the\nfollowing resource consumption:\n\n| Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |\n|-----------------------|---------------|-------------|----------------|--------------|------------|\n| host4                 | 11 GiB        | 399.27 MiB  |                |              | :thumbsup: |\n| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 9.28 GiB     |            |\n| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 11.32 GiB    |            |\n| host2 (NVIDIA 4090)   |               |             | 12 GiB         | 13.19 GiB    |            |\n| host3 (Apple M1 Max)  | ENOUGH        |             | 6 GiB          | 7.13 GiB     |            |\n\nIt seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`.\n\nWe should consider a different approach: running the model on `host3` while offloading all layers to `host1`, `host2`,\nand `host4`.\n\n```mermaid\nflowchart TD\n    host3 -->|TCP| ram40\n    host3 -->|TCP| gpu20\n    host3 -->|TCP| gpu30\n    host3 -->|TCP| gpu31\n\n    subgraph host4[\"Windows 11 (host4)\"]\n        ram40([\"11GiB RAM remaining\"])\n    end\n    subgraph host3[\"Apple macOS (host3)\"]\n        gpu10[\"Apple M1 Max (6GiB VRAM remaining)\"]\n    end\n    subgraph host2[\"Windows 11 (host2)\"]\n        gpu20[\"NVIDIA 4090 (12GiB VRAM remaining)\"]\n    end\n    subgraph host1[\"Ubuntu (host1)\"]\n        
gpu30[\"NVIDIA 4080 0 (8GiB VRAM remaining)\"]\n        gpu31[\"NVIDIA 4080 1 (10GiB VRAM remaining)\"]\n    end\n```\n\n```shell\n$ gguf-parser --hf-repo=\"hierholzer/Llama-3.1-70B-Instruct-GGUF\" --hf-file=\"Llama-3.1-70B-Instruct-Q4_K_M.gguf\" --ctx-size=1024 --tensor-split=\"11,12,8,10,6\" --rpc=\"host4:50052,host2:50052,host1:50052,host1:50053\" --estimate --in-short\n+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                                                                                                                          |\n+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------+\n|                      RAM                     |                 RPC 0 (V)RAM                 |                 RPC 1 (V)RAM                 |                 RPC 2 (V)RAM                 |                 RPC 3 (V)RAM                 |                 VRAM 0                 |\n+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |      UMA     |    
NONUMA    | LAYERS (T + O) |     UMA    |  NONUMA  |\n+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+\n|      1 + 0 + 0     | 249.27 MiB | 399.27 MiB |     19 + 0     |   9.36 GiB   |   9.79 GiB   |     21 + 0     |   9.92 GiB   |   10.35 GiB  |     14 + 0     |   6.57 GiB   |   7.01 GiB   |     17 + 0     |   8.11 GiB   |   8.54 GiB   |      9 + 1     | 302.50 MiB | 6.16 GiB |\n+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+\n```\n\nAccording to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the\nfollowing resource consumption:\n\n| Host                  | Available RAM | Request RAM | Available VRAM | Request VRAM | Result     |\n|-----------------------|---------------|-------------|----------------|--------------|------------|\n| host3 (Apple M1 Max)  | ENOUGH        | 249.27 MiB  |                |              | :thumbsup: |\n| host4                 | 11 GiB        | 9.79 GiB    |                |              | :thumbsup: |\n| host2 (NVIDIA 4090)   |               |             | 12 GiB         | 10.35 GiB    | :thumbsup: |\n| host1 (NVIDIA 4080 0) |               |             | 8 GiB          | 7.01 GiB     | :thumbsup: |\n| host1 (NVIDIA 4080 1) |               |             | 10 GiB         | 8.54 GiB     | :thumbsup: |\n| host3 (Apple M1 Max)  |               |             | 6 GiB          | 302.50 MiB   | :thumbsup: |\n\nNow, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.\n\n#### 
Maximum Tokens Per Second\n\nThe maximum TPS estimation for the GGUF Parser is determined by the model's parameter size, context size, model\noffloaded layers, and devices on which the model runs. Among these factors, the device's specifications are particularly\nimportant.\n\nInspired\nby [LLM inference speed of light](https://zeux.io/2024/03/15/llm-inference-sol/), GGUF Parser uses the **FLOPS** and\n**bandwidth** of the device as evaluation metrics:\n\n- When the device is a CPU, FLOPS refers to the performance of that CPU, while bandwidth corresponds to the DRAM\n  bandwidth.\n- When the device is a (i)GPU, FLOPS indicates the performance of that (i)GPU, and bandwidth corresponds to the VRAM\n  bandwidth.\n- When the device is a specific host, FLOPS depends on whether the CPU or (i)GPU of that host is being used, while\n  bandwidth corresponds to the bandwidth connecting the main node to that host. **After all, a chain is only as strong\n  as\n  its weakest link.** If the connection bandwidth between the\n  main node and the host is equal to or greater than the *RAM bandwidth, then the bandwidth should be taken as the *RAM\n  bandwidth value.\n\n##### CPU FLOPS Calculation\n\nThe peak floating-point performance of a CPU can be calculated using the following formula:\n\n$$ CPU\\ FLOPS = Number\\ of\\ Cores \\times Core\\ Frequency \\times Floating\\ Point\\ Operations\\ per\\ Cycle $$\n\nThe Apple M1 Max CPU features a total of 10 cores, consisting of 8 performance cores and 2 efficiency cores. The\nperformance cores operate at a clock speed of 3.2 GHz, while the efficiency cores run at 2.2 GHz. All cores support\nthe [ARM NEON instruction set](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)), which\nenables 128-bit SIMD operations, allowing multiple floating-point numbers to be processed simultaneously within a\nsingle CPU cycle. 
Specifically, using single-precision (32-bit) floating-point numbers, each cycle can handle 4\nfloating-point operations.\n\nThe peak floating-point performance for a single performance core is calculated as follows:\n\n$$ Peak\\ Performance = 3.2\\ GHz \\times 4\\ FLOPS = 12.8\\ GFLOPS $$\n\nFor a single efficiency core, the calculation is:\n\n$$ Peak\\ Performance = 2.2\\ GHz \\times 4\\ FLOPS = 8.8\\ GFLOPS $$\n\nThus, the overall peak floating-point performance of the entire CPU can be determined by combining the contributions\nfrom both types of cores:\n\n$$ Peak\\ Performance = 8\\ Cores \\times 12.8\\ GFLOPS + 2\\ Cores \\times 8.8\\ GFLOPS = 120\\ GFLOPS $$\n\n> This results in an average performance of 12 GFLOPS per core. It is evident that the average performance achieved by\n> utilizing both performance and efficiency cores is lower than that obtained by exclusively using performance cores.\n\n##### Run LLaMA2-7B-Chat with Apple Silicon M-series\n\nTaking [TheBloke/Llama-2-7B-Chat-GGUF](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) as an\nexample and estimate the maximum tokens per second for Apple Silicon M-series using the GGUF Parser.\n\n```shell\n$ # Estimate full offloaded Q8_0 model\n$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --estimate --in-short \\\n  -c 512 \\\n  --device-metric \"<CPU FLOPS>;<RAM BW>,<iGPU FLOPS>;<VRAM BW>\"\n\n$ # Estimate full offloaded Q4_0 model\n$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --estimate --in-short \\\n  -c 512 \\\n  --device-metric \"<CPU FLOPS>;<RAM BW>,<iGPU FLOPS>;<VRAM BW>\"\n```\n\n| Variant  | CPU FLOPS (Performance Core) | iGPU FLOPS             | (V)RAM Bandwidth | Q8_0 Max TPS | Q4_0 Max TPS |\n|----------|------------------------------|------------------------|------------------|--------------|--------------|\n| M1       | 51.2 GFLOPS  (4 cores)       | 2.6 TFLOPS (8 cores)   | 68.3 GBps        | 8.68         | 14.56        |\n| 
M1 Pro   | 102.4 GFLOPS  (8 cores)      | 5.2 TFLOPS (16 cores)  | 204.8 GBps       | 26.04        | 43.66        |\n| M1 Max   | 102.4 GFLOPS  (8 cores)      | 10.4 TFLOPS (32 cores) | 409.6 GBps       | 52.08        | 87.31        |\n| M1 Ultra | 204.8 GFLOPS (16 cores)      | 21 TFLOPS (64 cores)   | 819.2 GBps       | 104.16       | 174.62       |\n| M2       | 56 GFLOPS (4 cores)          | 3.6 TFLOPS (10 cores)  | 102.4 GBps       | 13.02        | 21.83        |\n| M2 Pro   | 112 GFLOPS (8 cores)         | 6.8 TFLOPS (19 cores)  | 204.8 GBps       | 26.04        | 43.66        |\n| M2 Max   | 112 GFLOPS (8 cores)         | 13.6 TFLOPS (38 cores) | 409.6 GBps       | 52.08        | 87.31        |\n| M2 Ultra | 224 GFLOPS (16 cores)        | 27.2 TFLOPS (76 cores) | 819.2 GBps       | 104.16       | 174.62       |\n| M3       | 64.96 GFLOPS (4 cores)       | 4.1 TFLOPS (10 cores)  | 102.4 GBps       | 13.02        | 21.83        |\n| M3 Pro   | 97.44 GFLOPS (6 cores)       | 7.4 TFLOPS (18 cores)  | 153.6 GBps       | 19.53        | 32.74        |\n| M3 Max   | 194.88 GFLOPS (12 cores)     | 16.4 TFLOPS (40 cores) | 409.6 GBps       | 52.08        | 87.31        |\n| M4       | 70.56 GFLOPS (4 cores)       | 4.1 TFLOPS             | 120 GBps         | 15.26        | 25.58        |\n\n> References:\n> - https://www.cpu-monkey.com/en/cpu_family-apple_m_series\n> - https://nanoreview.net/\n> - https://en.wikipedia.org/wiki/Apple_M1#Variants\n> - https://en.wikipedia.org/wiki/Apple_M2#Variants\n> - https://en.wikipedia.org/wiki/Apple_M3#Variants\n> - https://en.wikipedia.org/wiki/Apple_M4#Variants\n\nYou can further verify the above results in [Performance of llama.cpp on Apple Silicon M-series\n](https://github.com/ggerganov/llama.cpp/discussions/4167#user-content-fn-1-e9a4caf2848534167e450e18fc4ede7f).\n\n##### Run LLaMA3.1-405B-Instruct with Apple Mac Studio devices combined with Thunderbolt\n\nExample\nby 
[leafspark/Meta-Llama-3.1-405B-Instruct-GGUF](https://huggingface.co/leafspark/Meta-Llama-3.1-405B-Instruct-GGUF)\nand estimate the maximum tokens per second for three Apple Mac Studio devices combined with Thunderbolt.\n\n| Device                        | CPU FLOPS (Performance Core) | iGPU FLOPS             | (V)RAM Bandwidth | Thunderbolt Bandwidth | Role       |\n|-------------------------------|------------------------------|------------------------|------------------|-----------------------|------------|\n| Apple Mac Studio (M2 Ultra) 0 | 224 GFLOPS (16 cores)        | 27.2 TFLOPS (76 cores) | 819.2 GBps       | 40 Gbps               | Main       |\n| Apple Mac Studio (M2 Ultra) 1 | 224 GFLOPS (16 cores)        | 27.2 TFLOPS (76 cores) | 819.2 GBps       | 40 Gbps               | RPC Server |\n| Apple Mac Studio (M2 Ultra) 2 | 224 GFLOPS (16 cores)        | 27.2 TFLOPS (76 cores) | 819.2 GBps       | 40 Gbps               | RPC Server |\n\nGet the maximum tokens per second with the following command:\n\n```shell\n$ # Explain the command:\n$ # --device-metric \"224GFLOPS;819.2GBps\"         <-- Apple Mac Studio 0 CPU FLOPS and RAM Bandwidth\n$ # --device-metric \"27.2TFLOPS;819.2GBps;40Gbps\" <-- Apple Mac Studio 1 (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth\n$ # --device-metric \"27.2TFLOPS;819.2GBps;40Gbps\" <-- Apple Mac Studio 2 (RPC 1) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth\n$ # --device-metric \"27.2TFLOPS;819.2GBps\"        <-- Apple Mac Studio 0 iGPU FLOPS and VRAM Bandwidth\n$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --estimate --in-short \\\n  --no-mmap \\\n  -c 512 \\\n  --rpc host1:port,host2:port \\\n  --tensor-split \"<Proportions>\" \\\n  --device-metric \"224GFLOPS;819.2GBps\" \\\n  --device-metric \"27.2TFLOPS;819.2GBps;40Gbps\" \\\n  --device-metric \"27.2TFLOPS;819.2GBps;40Gbps\" \\\n  
--device-metric \"27.2TFLOPS;819.2GBps\"\n```\n\n| Tensor Split | Apple Mac Studio 0 RAM | Apple Mac Studio 1 VRAM (RPC 0) | Apple Mac Studio 2 VRAM  (RPC 1) | Apple Mac Studio 0 VRAM | Q4_0 Max TPS |\n|--------------|------------------------|---------------------------------|----------------------------------|-------------------------|--------------|\n| 1,1,1        | 1.99 GiB               | 72.74 GiB                       | 71.04 GiB                        | 70.96 GiB               | 10.71        |\n| 2,1,1        | 1.99 GiB               | 108.26 GiB                      | 54.13 GiB                        | 52.35 GiB               | 11.96        |\n| 3,1,1        | 1.99 GiB               | 130.25 GiB                      | 42.29 GiB                        | 42.20 GiB               | 9.10         |\n| 4,1,1        | 1.99 GiB               | 143.78 GiB                      | 35.52 GiB                        | 35.44 GiB               | 7.60         |\n\n##### Run Qwen2.5-72B-Instruct with NVIDIA RTX 4080 and remote RPC by Apple Mac Studio (M2)\n\nExample by [Qwen/Qwen2.5-72B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF) and estimate the\nmaximum tokens per second for NVIDIA RTX 4080.\n\n| Hardware                                    | FLOPS        | Bandwidth  |\n|---------------------------------------------|--------------|------------|\n| Intel i5-14600k                             | 510.4 GFLOPS |            |\n| 2 x Corsair Vengeance RGB DDR5-6000 (32GiB) |              | 96 GBps    |\n| 2 x NVIDIA GeForce RTX 4080                 | 48.74 TFLOPS | 736.3 GBps |\n| Apple Mac Studio (M2)                       | 27.2 TFLOPS  | 819.2 GBps |\n\n```shell\n$ # Explain the command:\n$ # --tensor-split 20369,12935,13325               <-- Available Memory in MiB for each device\n$ # --device-metric \"510.4GFLOPS;96GBps\"           <-- Intel i5-14600k CPU FLOPS and RAM Bandwidth\n$ # --device-metric \"27.2TFLOPS;819.2GBps;40Gbps\"  <-- Apple Mac 
Studio (M2) (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth\n$ # --device-metric \"48.74TFLOPS;736.3GBps;64GBps\" <-- NVIDIA GeForce RTX 0 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 5.0 x16 Bandwidth\n$ # --device-metric \"48.74TFLOPS;736.3GBps;8GBps\"  <-- NVIDIA GeForce RTX 1 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 4.0 x4 Bandwidth\n$ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --estimate --in-short \\\n  --no-mmap \\\n  -c 8192 \\\n  --rpc host:port \\\n  --tensor-split 20369,12935,13325 \\\n  --device-metric \"510.4GFLOPS;96GBps\" \\\n  --device-metric \"27.2TFLOPS;819.2GBps;40Gbps\" \\\n  --device-metric \"48.74TFLOPS;736.3GBps;64GBps\" \\\n  --device-metric \"48.74TFLOPS;736.3GBps;8GBps\"\n+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                                                                                                              |\n+-----------+------------------------------------------+----------------------------------------------+----------------------------------------+----------------------------------------+\n|  MAX TPS  |                    RAM                   |                 RPC 0 (V)RAM                 |                 VRAM 0                 |                 VRAM 1                 |\n|           +--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+\n|           | LAYERS (I + T + O) |    UMA   |  NONUMA  | LAYERS (T + O) |      UMA     |    NONUMA    | LAYERS (T + O) |    UMA    |   NONUMA  | LAYERS (T + O) |    UMA    |   NONUMA  
|\n+-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+\n| 51.82 tps |      1 + 0 + 0     | 1.19 GiB | 1.34 GiB |     36 + 0     |   18.85 GiB  |   20.17 GiB  |     22 + 0     | 11.34 GiB | 12.66 GiB |     22 + 1     | 12.65 GiB | 13.97 GiB |\n+-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+\n```\n\n#### Full Layers Offload (default)\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --estimate --in-short\n+-------------------------------------------------------------------------------------+\n| ESTIMATE                                                                            |\n+------------------------------------------+------------------------------------------+\n|                    RAM                   |                  VRAM 0                  |\n+--------------------+----------+----------+----------------+------------+------------+\n| LAYERS (I + T + O) |    UMA   |  NONUMA  | LAYERS (T + O) |     UMA    |   NONUMA   |\n+--------------------+----------+----------+----------------+------------+------------+\n|      1 + 0 + 0     | 1.63 GiB | 1.78 GiB |     126 + 1    | 126.28 GiB | 246.86 GiB |\n+--------------------+----------+----------+----------------+------------+------------+\n```\n\n#### Zero Layers Offload\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --gpu-layers=0 --estimate --in-short\n+------------------------------------------------------------------------------------+\n| ESTIMATE                                                                           
|\n+----------------------------------------------+-------------------------------------+\n|                      RAM                     |                VRAM 0               |\n+--------------------+------------+------------+----------------+--------+-----------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |   UMA  |   NONUMA  |\n+--------------------+------------+------------+----------------+--------+-----------+\n|     1 + 126 + 1    | 127.64 GiB | 127.79 GiB |      0 + 0     |   0 B  | 33.62 GiB |\n+--------------------+------------+------------+----------------+--------+-----------+\n```\n\n#### Specific Layers Offload\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --gpu-layers=10 --estimate --in-short\n+----------------------------------------------------------------------------------+\n| ESTIMATE                                                                         |\n+----------------------------------------------+-----------------------------------+\n|                      RAM                     |               VRAM 0              |\n+--------------------+------------+------------+----------------+--------+---------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |   UMA  |  NONUMA |\n+--------------------+------------+------------+----------------+--------+---------+\n|     1 + 126 + 1    | 127.64 GiB | 127.79 GiB |      0 + 0     |   0 B  | 250 MiB |\n+--------------------+------------+------------+----------------+--------+---------+\n```\n\n#### Specific Context Size\n\nBy default, the context size retrieved from the model's metadata.\n\nUse `--ctx-size` to specify the context size.\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --ctx-size=4096 --estimate 
--in-short\n+--------------------------------------------------------------------------------------+\n| ESTIMATE                                                                             |\n+----------------------------------------------+---------------------------------------+\n|                      RAM                     |                 VRAM 0                |\n+--------------------+------------+------------+----------------+----------+-----------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |   NONUMA  |\n+--------------------+------------+------------+----------------+----------+-----------+\n|      1 + 0 + 0     | 404.53 MiB | 554.53 MiB |     126 + 1    | 3.94 GiB | 93.28 GiB |\n+--------------------+------------+------------+----------------+----------+-----------+\n```\n\n#### Enable Flash Attention\n\nBy default, LLaMA.cpp disables the Flash Attention.\n\nEnable Flash Attention will reduce the VRAM usage, but it also increases the GPU/CPU usage.\n\nUse `--flash-attention` to enable the Flash Attention.\n\nPlease note that not all models support Flash Attention, if the model does not support, the \"FLASH ATTENTION\" shows \"\nDisabled\" even if you enable it.\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --flash-attention --estimate --in-short\n+-------------------------------------------------------------------------------------+\n| ESTIMATE                                                                            |\n+------------------------------------------+------------------------------------------+\n|                    RAM                   |                  VRAM 0                  |\n+--------------------+----------+----------+----------------+------------+------------+\n| LAYERS (I + T + O) |    UMA   |  NONUMA  | LAYERS (T + O) |     UMA    |   NONUMA   
|\n+--------------------+----------+----------+----------------+------------+------------+\n|      1 + 0 + 0     | 1.63 GiB | 1.78 GiB |     126 + 1    | 126.28 GiB | 215.98 GiB |\n+--------------------+----------+----------+----------------+------------+------------+\n```\n\n#### Disable MMap\n\nBy default, LLaMA.cpp loads the model via Memory-Mapped.\n\nFor Apple MacOS, Memory-Mapped is an efficient way to load the model, and results in a lower VRAM usage.\nFor other platforms, Memory-Mapped affects the first-time model loading speed only.\n\nUse `--no-mmap` to disable loading the model via Memory-Mapped.\n\nPlease note that some models require loading the whole weight into memory, if the model does not support MMap, the \"MMAP\nLOAD\" shows \"Not Supported\".\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --no-mmap --estimate --in-short\n+-------------------------------------------------------------------------------------+\n| ESTIMATE                                                                            |\n+------------------------------------------+------------------------------------------+\n|                    RAM                   |                  VRAM 0                  |\n+--------------------+----------+----------+----------------+------------+------------+\n| LAYERS (I + T + O) |    UMA   |  NONUMA  | LAYERS (T + O) |     UMA    |   NONUMA   |\n+--------------------+----------+----------+----------------+------------+------------+\n|      1 + 0 + 0     | 2.97 GiB | 3.12 GiB |     126 + 1    | 214.24 GiB | 246.86 GiB |\n+--------------------+----------+----------+----------------+------------+------------+\n```\n\n#### With Adapter\n\nUse `--lora`/`--control-vector` to estimate the usage when loading a model with adapters.\n\n```shell\n$ gguf-parser --hf-repo=\"QuantFactory/Meta-Llama-3-8B-Instruct-GGUF\" --hf-file=\"Meta-Llama-3-8B-Instruct.Q5_K_M.gguf\" --estimate 
--in-short\n+-------------------------------------------------------------------------------------+\n| ESTIMATE                                                                            |\n+----------------------------------------------+--------------------------------------+\n|                      RAM                     |                VRAM 0                |\n+--------------------+------------+------------+----------------+----------+----------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |  NONUMA  |\n+--------------------+------------+------------+----------------+----------+----------+\n|      1 + 0 + 0     | 210.80 MiB | 360.80 MiB |     32 + 1     | 1.25 GiB | 7.04 GiB |\n+--------------------+------------+------------+----------------+----------+----------+\n\n$ # With a LoRA adapter.\n$ gguf-parser --hf-repo=\"QuantFactory/Meta-Llama-3-8B-Instruct-GGUF\" --hf-file=\"Meta-Llama-3-8B-Instruct.Q5_K_M.gguf\" --lora-url=\"https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf\" --estimate --in-short\n+-------------------------------------------------------------------------------------+\n| ESTIMATE                                                                            |\n+----------------------------------------------+--------------------------------------+\n|                      RAM                     |                VRAM 0                |\n+--------------------+------------+------------+----------------+----------+----------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |    UMA   |  NONUMA  |\n+--------------------+------------+------------+----------------+----------+----------+\n|      1 + 0 + 0     | 223.91 MiB | 373.91 MiB |     32 + 1     | 1.42 GiB | 7.20 GiB |\n+--------------------+------------+------------+----------------+----------+----------+\n```\n\n#### Get Proper Offload Layers\n\nUse `--gpu-layers-step` to get the 
proper offload layers number when the model is too large to fit into the GPUs memory.\n\n```shell\n$ gguf-parser --hf-repo=\"etemiz/Llama-3.1-405B-Inst-GGUF\" --hf-file=\"llama-3.1-405b-IQ1_M-00019-of-00019.gguf\" --gpu-layers-step=6 --estimate --in-short\n+-----------------------------------------------------------------------------------------+\n| ESTIMATE                                                                                |\n+----------------------------------------------+------------------------------------------+\n|                      RAM                     |                  VRAM 0                  |\n+--------------------+------------+------------+----------------+------------+------------+\n| LAYERS (I + T + O) |     UMA    |   NONUMA   | LAYERS (T + O) |     UMA    |   NONUMA   |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 126 + 1    | 127.64 GiB | 127.79 GiB |      0 + 0     |     0 B    |   250 MiB  |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 120 + 1    | 121.90 GiB | 122.05 GiB |      6 + 0     |    6 GiB   |  44.68 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 114 + 1    | 115.90 GiB | 116.05 GiB |     12 + 0     |   12 GiB   |  54.74 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 108 + 1    | 109.90 GiB | 110.05 GiB |     18 + 0     |   18 GiB   |  64.80 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 102 + 1    | 103.90 GiB | 104.05 GiB |     24 + 0     |   24 GiB   |  74.86 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 96 + 1     |  97.90 GiB |  98.05 GiB |     30 + 0     |   30 GiB   |  84.93 GiB 
|\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 90 + 1     |  91.90 GiB |  92.05 GiB |     36 + 0     |   36 GiB   |  94.99 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 84 + 1     |  85.90 GiB |  86.05 GiB |     42 + 0     |   42 GiB   | 105.05 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 78 + 1     |  79.90 GiB |  80.05 GiB |     48 + 0     |   48 GiB   | 115.11 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 72 + 1     |  73.90 GiB |  74.05 GiB |     54 + 0     |   54 GiB   | 125.17 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 66 + 1     |  67.90 GiB |  68.05 GiB |     60 + 0     |   60 GiB   | 135.23 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 60 + 1     |  61.90 GiB |  62.05 GiB |     66 + 0     |   66 GiB   | 145.29 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 54 + 1     |  55.90 GiB |  56.05 GiB |     72 + 0     |   72 GiB   | 155.35 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 48 + 1     |  49.90 GiB |  50.05 GiB |     78 + 0     |   78 GiB   | 165.42 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 42 + 1     |  43.90 GiB |  44.05 GiB |     84 + 0     |   84 GiB   | 175.48 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 36 + 1     |  37.90 GiB |  38.05 GiB |     90 + 0     |   90 GiB   | 185.54 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 30 + 1     |  31.90 GiB |  32.05 
GiB |     96 + 0     |   96 GiB   | 195.60 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 24 + 1     |  25.90 GiB |  26.05 GiB |     102 + 0    |   102 GiB  | 205.66 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 18 + 1     |  19.90 GiB |  20.05 GiB |     108 + 0    |   108 GiB  | 215.72 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|     1 + 12 + 1     |  13.90 GiB |  14.05 GiB |     114 + 0    |   114 GiB  | 226.05 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|      1 + 6 + 1     |  7.90 GiB  |  8.05 GiB  |     120 + 0    |   120 GiB  | 236.64 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|      1 + 0 + 1     |  1.90 GiB  |  2.05 GiB  |     126 + 0    |   126 GiB  | 246.24 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n|      1 + 0 + 0     |  1.63 GiB  |  1.78 GiB  |     126 + 1    | 126.28 GiB | 246.86 GiB |\n+--------------------+------------+------------+----------------+------------+------------+\n```\n\n## License\n\nMIT\n"
  },
  {
    "path": "cache.go",
    "content": "package gguf_parser\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/json\"\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n\t\"github.com/gpustack/gguf-parser-go/util/stringx\"\n)\n\nvar (\n\tErrGGUFFileCacheDisabled  = errors.New(\"GGUF file cache disabled\")\n\tErrGGUFFileCacheMissed    = errors.New(\"GGUF file cache missed\")\n\tErrGGUFFileCacheCorrupted = errors.New(\"GGUF file cache corrupted\")\n)\n\ntype GGUFFileCache string\n\nfunc (c GGUFFileCache) getKeyPath(key string) string {\n\tk := stringx.SumByFNV64a(key)\n\tp := filepath.Join(string(c), k[:1], k)\n\treturn p\n}\n\nfunc (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, error) {\n\tif c == \"\" {\n\t\treturn nil, ErrGGUFFileCacheDisabled\n\t}\n\n\tif key == \"\" {\n\t\treturn nil, ErrGGUFFileCacheMissed\n\t}\n\n\tp := c.getKeyPath(key)\n\tif !osx.Exists(p, func(stat os.FileInfo) bool {\n\t\tif !stat.Mode().IsRegular() {\n\t\t\treturn false\n\t\t}\n\t\treturn exp == 0 || time.Since(stat.ModTime()) < exp\n\t}) {\n\t\treturn nil, ErrGGUFFileCacheMissed\n\t}\n\n\tvar gf GGUFFile\n\t{\n\t\tbs, err := os.ReadFile(p)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"GGUF file cache get: %w\", err)\n\t\t}\n\t\tif err = json.Unmarshal(bs, &gf); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"GGUF file cache get: %w\", err)\n\t\t}\n\t}\n\n\tif len(gf.TensorInfos) == 0 {\n\t\t_ = os.Remove(p)\n\t\treturn nil, ErrGGUFFileCacheCorrupted\n\t}\n\n\treturn &gf, nil\n}\n\nfunc (c GGUFFileCache) Put(key string, gf *GGUFFile) error {\n\tif c == \"\" {\n\t\treturn ErrGGUFFileCacheDisabled\n\t}\n\n\tif key == \"\" || gf == nil {\n\t\treturn nil\n\t}\n\n\tbs, err := json.Marshal(gf)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"GGUF file cache put: %w\", err)\n\t}\n\n\tp := c.getKeyPath(key)\n\tif err = osx.WriteFile(p, bs, 0o600); err != nil {\n\t\treturn fmt.Errorf(\"GGUF file cache put: %w\", 
err)\n\t}\n\treturn nil\n}\n\nfunc (c GGUFFileCache) Delete(key string) error {\n\tif c == \"\" {\n\t\treturn ErrGGUFFileCacheDisabled\n\t}\n\n\tif key == \"\" {\n\t\treturn ErrGGUFFileCacheMissed\n\t}\n\n\tp := c.getKeyPath(key)\n\tif !osx.ExistsFile(p) {\n\t\treturn ErrGGUFFileCacheMissed\n\t}\n\n\tif err := os.Remove(p); err != nil {\n\t\treturn fmt.Errorf(\"GGUF file cache delete: %w\", err)\n\t}\n\treturn nil\n}\n"
  },
  {
    "path": "cmd/gguf-parser/README.md",
    "content": "# GGUF Parser\n\nReview/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files,\nestimate the memory usage\nfor [llama.cpp](https://github.com/ggerganov/llama.cpp), [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)\nand [llama-box](https://github.com/gpustack/llama-box).\n\nSee [GGUF Parser detail introduction](https://github.com/gpustack/gguf-parser-go) for more information.\n\n## Usage\n\n```shell\n$ gguf-parser --help\nNAME:\n   gguf-parser - Review/Check GGUF files and estimate the memory usage.\n\nUSAGE:\n   gguf-parser [GLOBAL OPTIONS]\n\nVERSION:\n   ...\n\nGLOBAL OPTIONS:\n   --debug        Enable debugging, verbosity. (default: false)\n   --help, -h     Print the usage.\n   --version, -v  Print the version.\n\n   Estimate\n\n   --device-metric value [ --device-metric value ]                              Specify the device metrics, which is used to estimate the throughput, in form of \"FLOPS;Up Bandwidth[;Down Bandwidth]\". The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. For example, \"--device-metric 10TFLOPS;400GBps\" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, \"--device-metric 10TFLOPS;400GBps;5000MBps\" means the device has 5000MBps Down bandwidth. 
If the quantity specified by \"--device-metric\" is less than the number of estimation devices(determined by \"--tensor-split\" and \"--rpc\" to infer the device count), then replicate the last \"--device-metric\" to meet the required number of evaluation devices.\n   --flash-attention, --flash-attn, --fa, --diffusion-fa                        Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false)\n   --gpu-layers value, --ngl value, --n-gpu-layers value                        Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default: -1)\n   --main-gpu value, --mg value                                                 Specify the GPU to use for the model (with \"--split-mode=none\") or for intermediate results and KV (with \"--split-mode=row\"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, \"--main-gpu\" only works when \"--tensor-split\" is set. (default: 0)\n   --no-flash-attention, --no-flash-attn                                        Specify disabling Flash Attention. (default: false)\n   --override-tensor value, --ot value [ --override-tensor value, --ot value ]  Override tensor buffer type, for example, use --override-tensor \"[2-9][0-9]\\.ffn_.*_exps\\.=CPU\" to keep experts of layers 20-99 in the CPU\n   --parallel-size value, --parallel value, --np value, --threads-http value    Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1)\n   --platform-footprint value                                                   Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is \"150,250\". 
Different platform always gets different RAM and VRAM footprints, for example, within CUDA, \"cudaMemGetInfo\" or \"cudaSetDevice\" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: \"150,250\")\n   --rpc value                                                                  Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Works with \"--tensor-split\".\n   --tensor-split value, --ts value                                             Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integers. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set \"--tensor-split\" to indicate how many devices are used. To declare the devices belong to RPC servers, set \"--rpc\" please.\n\n   Estimate/LLaMACpp\n\n   --batch-size value, -b value                                        Specify the logical batch size, which is used to estimate the usage. (default: 2048)\n   --cache-type-k value, --ctk value                                   Specify the type of Key cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: \"f16\")\n   --cache-type-v value, --ctv value                                   Specify the type of Value cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: \"f16\")\n   --ctx-size value, -c value                                          Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default: 0)\n   --gpu-layers-draft value, --ngld value, --n-gpu-layers-draft value  Specify how many layers of the draft model to offload, which is used to estimate the usage, default is full offloaded. 
(default: -1)\n   --gpu-layers-step value                                             Specify the step of layers to offload, works with \"--gpu-layers\". (default: 0)\n   --in-max-ctx-size                                                   Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size. (default: false)\n   --max-projected-cache value, --visual-max-image-cache value         Specify how many projected embedding to be cached. (default: 0)\n   --mmap                                                              Specify enabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false)\n   --no-kv-offload, --nkvo                                             Specify disabling Key-Value offloading, which is used to estimate the usage. Disable Key-Value offloading can reduce the usage of VRAM. (default: false)\n   --no-mmap                                                           Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false)\n   --rope-freq-base value                                              RoPE base frequency, used by NTK-aware scaling. (default: 0)\n   --rope-freq-scale value                                             RoPE frequency scaling factor, expands context by a factor of 1/N. (default: 0)\n   --rope-scale value                                                  RoPE context scaling factor, expands context by a factor of N. 
(default: 0)\n   --rope-scaling value                                                RoPE frequency scaling method, defaults to linear unless specified by the model, select from [none, linear, yarn].\n   --split-mode value, --sm value                                      Specify how to split the model across multiple devices, which is used to estimate the usage, select from [layer, row, none]. Since gguf-parser always estimates the usage of VRAM, \"none\" is meaningless here, keep for compatibility. (default: \"layer\")\n   --swa-full                                                          Specify using full-size SWA cache. (default: false)\n   --ubatch-size value, --ub value                                     Specify the physical maximum batch size, which is used to estimate the usage. (default: 512)\n   --visual-max-image-size value                                       Specify maximum image size when completion with vision model. (default: 0)\n   --yarn-orig-ctx value                                               YaRN original context size of model, defaults to model training context size. (default: 0)\n\n   Estimate/StableDiffusionCpp\n\n   --image-autoencoder-tiling, --vae-tiling, --image-vae-tiling                             Specify to enable tiling for the vae model. (default: false)\n   --image-batch-count value, --batch-count value, --image-max-batch value                  Specify the batch(generation) count of the image. (default: 1)\n   --image-free-compute-memory-immediately                                                  Specify to free the compute memory immediately after the generation, which burst using VRAM. (default: false)\n   --image-height value, --height value, --image-max-height value                           Specify the (maximum) height of the image. (default: 1024)\n   --image-no-autoencoder-offload, --vae-on-cpu, --image-no-vae-model-offload               Specify to offload the vae model to CPU. 
(default: false)\n   --image-no-autoencoder-tiling, --image-no-vae-tiling                                     Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling. (default: false)\n   --image-no-conditioner-offload, --clip-on-cpu, --image-no-text-encoder-model-offload     Specify to offload the text encoder model to CPU. (default: false)\n   --image-no-control-net-offload, --control-net-cpu, --image-no-control-net-model-offload  Specify to offload the control net model to CPU. (default: false)\n   --image-width value, --width value, --image-max-width value                              Specify the (maximum) width of the image. (default: 1024)\n\n   Load\n\n   --cache-expiration value      Specify the expiration of cache, works with \"--url/--hf-*/--ms-*/--ol-*\". (default: 24h0m0s)\n   --cache-path value            Cache the read result to the path, works with \"--url/--hf-*/--ms-*/--ol-*\". (default: \"~/.cache/gguf-parser\")\n   --skip-cache                  Skip cache, works with \"--url/--hf-*/--ms-*/--ol-*\", default is caching the read result. (default: false) [$SKIP_CACHE]\n   --skip-dns-cache              Skip DNS cache, works with \"--url/--hf-*/--ms-*/--ol-*\", default is caching the DNS lookup result. (default: false) [$SKIP_DNS_CACHE]\n   --skip-proxy                  Skip proxy settings, works with \"--url/--hf-*/--ms-*/--ol-*\", default is respecting the environment variables \"HTTP_PROXY/HTTPS_PROXY/NO_PROXY\". (default: false) [$SKIP_PROXY]\n   --skip-range-download-detect  Skip range download detect, works with \"--url/--hf-*/--ms-*/--ol-*\", default is detecting the range download support. (default: false) [$SKIP_RANGE_DOWNLOAD_DETECT]\n   --skip-tls-verify             Skip TLS verification, works with \"--url/--hf-*/--ms-*/--ol-*\", default is verifying the TLS certificate on HTTPs request. 
(default: false) [$SKIP_TLS_VERIFY]\n\n   Model/Local\n\n   --control-net-path value, --control-net value, --image-control-net-model value                               Path where the GGUF file to load for the Control Net model, optional.\n   --control-vector-path value, --control-vector value [ --control-vector-path value, --control-vector value ]  Path where the GGUF file to load for the Control Vector adapter, optional.\n   --draft-path value, --model-draft value, --md value                                                          Path where the GGUF file to load for the draft model, optional, e.g. \"~/.cache/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF/Qwen2-1.5B-Instruct.Q5_K_M.gguf\".\n   --lora-path value, --lora value [ --lora-path value, --lora value ]                                          Path where the GGUF file to load for the LoRA adapter, optional.\n   --mmproj-path value, --mmproj value                                                                          Path where the GGUF file to load for the multimodal projector, optional.\n   --path value, --model value, -m value                                                                        Path where the GGUF file to load for the main model, e.g. \"~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf\".\n   --upscale-path value, --upscale-model value, --image-upscale-model value                                     Path where the GGUF file to load for the Upscale model, optional.\n\n   Model/Remote\n\n   --control-net-url value                                    Url where the GGUF file to load for the Control Net model, optional.\n   --control-vector-url value [ --control-vector-url value ]  Url where the GGUF file to load for the Control Vector adapter, optional.\n   --draft-url value                                          Url where the GGUF file to load for the draft model, optional, e.g. 
\"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf\". Note that gguf-parser does not need to download the entire GGUF file.\n   --header value [ --header value ]                          Custom HTTP header in \"Key: Value\" format, works with \"--url/--draft-url\".\n   --lora-url value [ --lora-url value ]                      Url where the GGUF file to load for the LoRA adapter, optional.\n   --mmproj-url value                                         Url where the GGUF file to load for the multimodal projector, optional.\n   --token value                                              Bearer auth token to load GGUF file, optional, works with \"--url/--draft-url\". [$TOKEN]\n   --upscale-url value                                        Url where the GGUF file to load for the Upscale model, optional.\n   --url value, --model-url value, --mu value                 Url where the GGUF file to load for the main model, e.g. \"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf\". Note that gguf-parser does not need to download the entire GGUF file.\n\n   Model/Remote/HuggingFace\n\n   --hf-control-net-file value                                        Model file below the \"--hf-control-net-repo\", optional.\n   --hf-control-net-repo value                                        Repository of HuggingFace which the GGUF file store for the Control Net model, optional, works with \"--hf-control-net-file\".\n   --hf-control-vector-file value [ --hf-control-vector-file value ]  Control Vector adapter file below the \"--hf-repo\".\n   --hf-draft-file value                                              Model file below the \"--hf-draft-repo\", optional, e.g. \"Qwen2-1.5B-Instruct.Q5_K_M.gguf\".\n   --hf-draft-repo value                                              Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. 
\"QuantFactory/Qwen2-1.5B-Instruct-GGUF\", works with \"--hf-draft-file\".\n   --hf-file value, --hff value                                       Model file below the \"--hf-repo\", e.g. \"Qwen2-7B-Instruct.Q5_K_M.gguf\".\n   --hf-lora-file value [ --hf-lora-file value ]                      LoRA adapter file below the \"--hf-repo\".\n   --hf-mmproj-file value                                             Multimodal projector file below the \"--hf-repo\".\n   --hf-repo value, --hfr value                                       Repository of HuggingFace which the GGUF file store for the main model, e.g. \"QuantFactory/Qwen2-7B-Instruct-GGUF\", works with \"--hf-file\".\n   --hf-token value, --hft value                                      User access token of HuggingFace, optional, works with \"--hf-repo/--hf-file pair\" or \"--hf-draft-repo/--hf-draft-file\" pair. See https://huggingface.co/settings/tokens. [$HF_TOKEN]\n   --hf-upscale-file value                                            Model file below the \"--hf-upscale-repo\", optional.\n   --hf-upscale-repo value                                            Repository of HuggingFace which the GGUF file store for the Upscale model, optional, works with \"--hf-upscale-file\".\n\n   Model/Remote/ModelScope\n\n   --ms-control-net-file value                                        Model file below the \"--ms-control-net-repo\", optional.\n   --ms-control-net-repo value                                        Repository of ModelScope which the GGUF file store for the Control Net model, optional, works with \"--ms-control-net-file\".\n   --ms-control-vector-file value [ --ms-control-vector-file value ]  Control Vector adapter file below the \"--ms-repo\".\n   --ms-draft-file value                                              Model file below the \"--ms-draft-repo\", optional, e.g. 
\"qwen1_5-1_8b-chat-q5_k_m.gguf\".\n   --ms-draft-repo value                                              Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. \"qwen/Qwen1.5-1.8B-Chat-GGUF\", works with \"--ms-draft-file\".\n   --ms-file value                                                    Model file below the \"--ms-repo\", e.g. \"qwen1_5-7b-chat-q5_k_m.gguf\".\n   --ms-lora-file value [ --ms-lora-file value ]                      LoRA adapter file below the \"--ms-repo\".\n   --ms-mmproj-file value                                             Multimodal projector file below the \"--ms-repo\".\n   --ms-repo value                                                    Repository of ModelScope which the GGUF file store for the main model, e.g. \"qwen/Qwen1.5-7B-Chat-GGUF\", works with \"--ms-file\".\n   --ms-token value                                                   Git access token of ModelScope, optional, works with \"--ms-repo/--ms-file\" pair or \"--ms-draft-repo/--ms-draft-file\" pair. See https://modelscope.cn/my/myaccesstoken. [$HF_TOKEN, $MS_TOKEN]\n   --ms-upscale-file value                                            Model file below the \"--ms-upscale-repo\", optional.\n   --ms-upscale-repo value                                            Repository of ModelScope which the GGUF file store for the Upscale model, optional, works with \"--ms-upscale-file\".\n\n   Model/Remote/Ollama\n\n   --ol-base-url value  Model base URL of Ollama, e.g. https://registry.ollama.ai. (default: \"https://registry.ollama.ai\")\n   --ol-model value     Model name of Ollama, e.g. \"gemma2\".\n   --ol-usage           Specify respecting the extending layers introduced by Ollama, works with \"--ol-model\", which affects the usage estimation. (default: false)\n\n   Output\n\n   --estimate           Skip all the information except the estimate result. (default: false)\n   --in-mib             Display the estimated result in table with MiB. 
(default: false)\n   --in-short           Display the estimated result in table in short form. (default: false)\n   --json               Output as JSON. (default: false)\n   --json-pretty        Works with \"--json\", to output pretty format JSON. (default: true)\n   --raw                Output the GGUF file information as JSON only, skip anything. (default: false)\n   --raw-output value   Works with \"--raw\", to save the result to the file\n   --skip-architecture  Skip to display architecture. (default: false)\n   --skip-estimate      Skip to estimate. By default, gguf-parser always estimates the file which types with \"model\". (default: false)\n   --skip-metadata      Skip to display metadata. (default: false)\n   --skip-tokenizer     Skip to display tokenizer. By default, gguf-parser always displays the tokenizer of the file which types with \"model\". (default: false)\n\n```\n\n### Environment Variables Support\n\n- `TOKEN`: The bearer auth token to load GGUF file, works with `--url/--draft-url`.\n- `HF_ENDPOINT`: The HuggingFace endpoint, default is `https://huggingface.co`.\n- `HF_TOKEN`: The HuggingFace token, see [HuggingFace token](https://huggingface.co/settings/tokens).\n- `MS_ENDPOINT`: The ModelScope endpoint, default is `https://modelscope.cn`.\n- `MS_TOKEN`: The ModelScope token, see [ModelScope token](https://modelscope.cn/my/myaccesstoken).\n\n## License\n\nMIT\n"
  },
  {
    "path": "cmd/gguf-parser/go.mod",
    "content": "module github.com/gpustack/gguf-parser-go/cmd/gguf-parser\n\ngo 1.22.0\n\ntoolchain go1.22.9\n\nreplace github.com/gpustack/gguf-parser-go => ../../\n\nrequire (\n\tgithub.com/gpustack/gguf-parser-go v0.6.0\n\tgithub.com/jedib0t/go-pretty/v6 v6.6.1\n\tgithub.com/urfave/cli/v2 v2.27.5\n)\n\nrequire (\n\tgithub.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect\n\tgithub.com/henvic/httpretty v0.1.4 // indirect\n\tgithub.com/json-iterator/go v1.1.12 // indirect\n\tgithub.com/mattn/go-runewidth v0.0.16 // indirect\n\tgithub.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect\n\tgithub.com/modern-go/reflect2 v1.0.2 // indirect\n\tgithub.com/rivo/uniseg v0.4.7 // indirect\n\tgithub.com/russross/blackfriday/v2 v2.1.0 // indirect\n\tgithub.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect\n\tgithub.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect\n\tgolang.org/x/crypto v0.29.0 // indirect\n\tgolang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect\n\tgolang.org/x/mod v0.22.0 // indirect\n\tgolang.org/x/sync v0.9.0 // indirect\n\tgolang.org/x/sys v0.27.0 // indirect\n\tgolang.org/x/tools v0.27.0 // indirect\n\tgonum.org/v1/gonum v0.15.1 // indirect\n)\n"
  },
  {
    "path": "cmd/gguf-parser/go.sum",
    "content": "github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=\ngithub.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=\ngithub.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=\ngithub.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=\ngithub.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU=\ngithub.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM=\ngithub.com/jedib0t/go-pretty/v6 v6.6.1 h1:iJ65Xjb680rHcikRj6DSIbzCex2huitmc7bDtxYVWyc=\ngithub.com/jedib0t/go-pretty/v6 v6.6.1/go.mod h1:zbn98qrYlh95FIhwwsbIip0LYpwSG8SUOScs+v9/t0E=\ngithub.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=\ngithub.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=\ngithub.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=\ngithub.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=\ngithub.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=\ngithub.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=\ngithub.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=\ngithub.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=\ngithub.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=\ngithub.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=\ngithub.com/pmezard/go-difflib v1.0.0/go.mod 
h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=\ngithub.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=\ngithub.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=\ngithub.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=\ngithub.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=\ngithub.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=\ngithub.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY=\ngithub.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=\ngithub.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=\ngithub.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=\ngithub.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=\ngithub.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=\ngithub.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w=\ngithub.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ=\ngithub.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=\ngithub.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=\ngolang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=\ngolang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=\ngolang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=\ngolang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=\ngolang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4=\ngolang.org/x/mod v0.22.0/go.mod 
h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=\ngolang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=\ngolang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=\ngolang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=\ngolang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=\ngolang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU=\ngolang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E=\ngolang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o=\ngolang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q=\ngonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=\ngonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=\ngopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=\ngopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=\n"
  },
  {
    "path": "cmd/gguf-parser/main.go",
    "content": "package main\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"net\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"regexp\"\n\t\"strconv\"\n\t\"strings\"\n\t\"sync\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/anyx\"\n\t\"github.com/gpustack/gguf-parser-go/util/json\"\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n\t\"github.com/gpustack/gguf-parser-go/util/signalx\"\n\t\"github.com/jedib0t/go-pretty/v6/table\"\n\t\"github.com/jedib0t/go-pretty/v6/text\"\n\t\"github.com/urfave/cli/v2\"\n\n\t. \"github.com/gpustack/gguf-parser-go\" // nolint: stylecheck\n)\n\nvar Version = \"v0.0.0\"\n\nfunc init() {\n\tcli.VersionFlag = &cli.BoolFlag{\n\t\tName:               \"version\",\n\t\tAliases:            []string{\"v\"},\n\t\tUsage:              \"Print the version.\",\n\t\tDisableDefaultText: true,\n\t}\n\tcli.HelpFlag = &cli.BoolFlag{\n\t\tName:               \"help\",\n\t\tAliases:            []string{\"h\"},\n\t\tUsage:              \"Print the usage.\",\n\t\tDisableDefaultText: true,\n\t}\n}\n\nfunc main() {\n\tname := filepath.Base(os.Args[0])\n\tapp := &cli.App{\n\t\tName:            name,\n\t\tUsage:           \"Review/Check GGUF files and estimate the memory usage and provide optimization suggestions.\",\n\t\tUsageText:       name + \" [GLOBAL OPTIONS]\",\n\t\tVersion:         Version,\n\t\tReader:          os.Stdin,\n\t\tWriter:          os.Stdout,\n\t\tErrWriter:       os.Stderr,\n\t\tHideHelpCommand: true,\n\t\tOnUsageError: func(c *cli.Context, _ error, _ bool) error {\n\t\t\treturn cli.ShowAppHelp(c)\n\t\t},\n\t\tFlags: []cli.Flag{\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &debug,\n\t\t\t\tValue:       debug,\n\t\t\t\tName:        \"debug\",\n\t\t\t\tUsage:       \"Enable debugging, verbosity.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &path,\n\t\t\t\tValue:       path,\n\t\t\t\tCategory:    \"Model/Local\",\n\t\t\t\tName:        \"path\",\n\t\t\t\tAliases: []string{ // LLaMACpp 
compatibility\n\t\t\t\t\t\"model\",\n\t\t\t\t\t\"m\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the main model, e.g. \\\"~/.cache\" +\n\t\t\t\t\t\"/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF\" +\n\t\t\t\t\t\"/Qwen2-7B-Instruct.Q5_K_M.gguf\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &draftPath,\n\t\t\t\tValue:       draftPath,\n\t\t\t\tCategory:    \"Model/Local\",\n\t\t\t\tName:        \"draft-path\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"model-draft\",\n\t\t\t\t\t\"md\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the draft model, optional, e.g. \\\"~/.cache\" +\n\t\t\t\t\t\"/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF\" +\n\t\t\t\t\t\"/Qwen2-1.5B-Instruct.Q5_K_M.gguf\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &mmprojPath,\n\t\t\t\tValue:       mmprojPath,\n\t\t\t\tCategory:    \"Model/Local\",\n\t\t\t\tName:        \"mmproj-path\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"mmproj\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the multimodal projector, optional.\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &loraPaths,\n\t\t\t\tCategory:    \"Model/Local\",\n\t\t\t\tName:        \"lora-path\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"lora\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the LoRA adapter, optional.\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &controlVectorPaths,\n\t\t\t\tCategory:    \"Model/Local\",\n\t\t\t\tName:        \"control-vector-path\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"control-vector\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the Control Vector adapter, optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &upscalePath,\n\t\t\t\tValue:       upscalePath,\n\t\t\t\tCategory:    
\"Model/Local\",\n\t\t\t\tName:        \"upscale-path\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"upscale-model\",       // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-upscale-model\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the Upscale model, optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &controlNetPath,\n\t\t\t\tValue:       controlNetPath,\n\t\t\t\tCategory:    \"Model/Local\",\n\t\t\t\tName:        \"control-net-path\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"control-net\",             // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-control-net-model\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Path where the GGUF file to load for the Control Net model, optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &url,\n\t\t\t\tValue:       url,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"url\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"model-url\",\n\t\t\t\t\t\"mu\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Url where the GGUF file to load for the main model, e.g. \" +\n\t\t\t\t\t\"\\\"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF\" +\n\t\t\t\t\t\"/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf\\\". \" +\n\t\t\t\t\t\"Note that gguf-parser does not need to download the entire GGUF file.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &draftUrl,\n\t\t\t\tValue:       draftUrl,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"draft-url\",\n\t\t\t\tUsage: \"Url where the GGUF file to load for the draft model, optional, e.g. \" +\n\t\t\t\t\t\"\\\"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF\" +\n\t\t\t\t\t\"/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf\\\". 
\" +\n\t\t\t\t\t\"Note that gguf-parser does not need to download the entire GGUF file.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &mmprojUrl,\n\t\t\t\tValue:       mmprojUrl,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"mmproj-url\",\n\t\t\t\tUsage:       \"Url where the GGUF file to load for the multimodal projector, optional.\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &loraUrls,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"lora-url\",\n\t\t\t\tUsage:       \"Url where the GGUF file to load for the LoRA adapter, optional.\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &controlVectorUrls,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"control-vector-url\",\n\t\t\t\tUsage:       \"Url where the GGUF file to load for the Control Vector adapter, optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &upscaleUrl,\n\t\t\t\tValue:       upscaleUrl,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"upscale-url\",\n\t\t\t\tUsage:       \"Url where the GGUF file to load for the Upscale model, optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &controlNetUrl,\n\t\t\t\tValue:       controlNetUrl,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"control-net-url\",\n\t\t\t\tUsage:       \"Url where the GGUF file to load for the Control Net model, optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &token,\n\t\t\t\tValue:       token,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"token\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"TOKEN\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Bearer auth token to load GGUF file, optional, \" +\n\t\t\t\t\t\"works with \\\"--url/--draft-url\\\".\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &headers,\n\t\t\t\tCategory:    \"Model/Remote\",\n\t\t\t\tName:        \"header\",\n\t\t\t\tUsage: \"Custom HTTP header in \\\"Key: Value\\\" 
format, \" +\n\t\t\t\t\t\"works with \\\"--url/--draft-url\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfRepo,\n\t\t\t\tValue:       hfRepo,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-repo\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"hfr\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Repository of HuggingFace which the GGUF file store for the main model, e.g. \" +\n\t\t\t\t\t\"\\\"QuantFactory/Qwen2-7B-Instruct-GGUF\\\", works with \\\"--hf-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfFile,\n\t\t\t\tValue:       hfFile,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-file\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"hff\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Model file below the \\\"--hf-repo\\\", e.g. \" +\n\t\t\t\t\t\"\\\"Qwen2-7B-Instruct.Q5_K_M.gguf\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfDraftRepo,\n\t\t\t\tValue:       hfDraftRepo,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-draft-repo\",\n\t\t\t\tUsage: \"Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. \" +\n\t\t\t\t\t\"\\\"QuantFactory/Qwen2-1.5B-Instruct-GGUF\\\", works with \\\"--hf-draft-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfDraftFile,\n\t\t\t\tValue:       hfDraftFile,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-draft-file\",\n\t\t\t\tUsage: \"Model file below the \\\"--hf-draft-repo\\\", optional, e.g. 
\" +\n\t\t\t\t\t\"\\\"Qwen2-1.5B-Instruct.Q5_K_M.gguf\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfMMProjFile,\n\t\t\t\tValue:       hfMMProjFile,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-mmproj-file\",\n\t\t\t\tUsage:       \"Multimodal projector file below the \\\"--hf-repo\\\".\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &hfLoRAFiles,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-lora-file\",\n\t\t\t\tUsage:       \"LoRA adapter file below the \\\"--hf-repo\\\".\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &hfControlVectorFiles,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-control-vector-file\",\n\t\t\t\tUsage:       \"Control Vector adapter file below the \\\"--hf-repo\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfUpscaleRepo,\n\t\t\t\tValue:       hfUpscaleRepo,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-upscale-repo\",\n\t\t\t\tUsage: \"Repository of HuggingFace which the GGUF file store for the Upscale model, optional, \" +\n\t\t\t\t\t\"works with \\\"--hf-upscale-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfUpscaleFile,\n\t\t\t\tValue:       hfUpscaleFile,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-upscale-file\",\n\t\t\t\tUsage:       \"Model file below the \\\"--hf-upscale-repo\\\", optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfControlNetRepo,\n\t\t\t\tValue:       hfControlNetRepo,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-control-net-repo\",\n\t\t\t\tUsage: \"Repository of HuggingFace which the GGUF file store for the Control Net model, optional, \" +\n\t\t\t\t\t\"works with \\\"--hf-control-net-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfControlNetFile,\n\t\t\t\tValue:       
hfControlNetFile,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-control-net-file\",\n\t\t\t\tUsage:       \"Model file below the \\\"--hf-control-net-repo\\\", optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &hfToken,\n\t\t\t\tValue:       hfToken,\n\t\t\t\tCategory:    \"Model/Remote/HuggingFace\",\n\t\t\t\tName:        \"hf-token\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"hft\",\n\t\t\t\t},\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"HF_TOKEN\",\n\t\t\t\t},\n\t\t\t\tUsage: \"User access token of HuggingFace, optional, \" +\n\t\t\t\t\t\"works with \\\"--hf-repo/--hf-file pair\\\" or \\\"--hf-draft-repo/--hf-draft-file\\\" pair. \" +\n\t\t\t\t\t\"See https://huggingface.co/settings/tokens.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msRepo,\n\t\t\t\tValue:       msRepo,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-repo\",\n\t\t\t\tUsage: \"Repository of ModelScope which the GGUF file store for the main model, e.g. \" +\n\t\t\t\t\t\"\\\"qwen/Qwen1.5-7B-Chat-GGUF\\\", works with \\\"--ms-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msFile,\n\t\t\t\tValue:       msFile,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-file\",\n\t\t\t\tUsage: \"Model file below the \\\"--ms-repo\\\", e.g. \" +\n\t\t\t\t\t\"\\\"qwen1_5-7b-chat-q5_k_m.gguf\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msDraftRepo,\n\t\t\t\tValue:       msDraftRepo,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-draft-repo\",\n\t\t\t\tUsage: \"Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. 
\" +\n\t\t\t\t\t\"\\\"qwen/Qwen1.5-1.8B-Chat-GGUF\\\", works with \\\"--ms-draft-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msDraftFile,\n\t\t\t\tValue:       msDraftFile,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-draft-file\",\n\t\t\t\tUsage: \"Model file below the \\\"--ms-draft-repo\\\", optional, e.g. \" +\n\t\t\t\t\t\"\\\"qwen1_5-1_8b-chat-q5_k_m.gguf\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msMMProjFile,\n\t\t\t\tValue:       msMMProjFile,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-mmproj-file\",\n\t\t\t\tUsage:       \"Multimodal projector file below the \\\"--ms-repo\\\".\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &msLoRAFiles,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-lora-file\",\n\t\t\t\tUsage:       \"LoRA adapter file below the \\\"--ms-repo\\\".\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &msControlVectorFiles,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-control-vector-file\",\n\t\t\t\tUsage:       \"Control Vector adapter file below the \\\"--ms-repo\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msUpscaleRepo,\n\t\t\t\tValue:       msUpscaleRepo,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-upscale-repo\",\n\t\t\t\tUsage: \"Repository of ModelScope which the GGUF file store for the Upscale model, optional, \" +\n\t\t\t\t\t\"works with \\\"--ms-upscale-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msUpscaleFile,\n\t\t\t\tValue:       msUpscaleFile,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-upscale-file\",\n\t\t\t\tUsage:       \"Model file below the \\\"--ms-upscale-repo\\\", optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msControlNetRepo,\n\t\t\t\tValue:       
msControlNetRepo,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-control-net-repo\",\n\t\t\t\tUsage: \"Repository of ModelScope which the GGUF file store for the Control Net model, optional, \" +\n\t\t\t\t\t\"works with \\\"--ms-control-net-file\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msControlNetFile,\n\t\t\t\tValue:       msControlNetFile,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-control-net-file\",\n\t\t\t\tUsage:       \"Model file below the \\\"--ms-control-net-repo\\\", optional.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &msToken,\n\t\t\t\tValue:       msToken,\n\t\t\t\tCategory:    \"Model/Remote/ModelScope\",\n\t\t\t\tName:        \"ms-token\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"HF_TOKEN\", // Compatible with HuggingFace\n\t\t\t\t\t\"MS_TOKEN\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Git access token of ModelScope, optional, \" +\n\t\t\t\t\t\"works with \\\"--ms-repo/--ms-file\\\" pair or \\\"--ms-draft-repo/--ms-draft-file\\\" pair. \" +\n\t\t\t\t\t\"See https://modelscope.cn/my/myaccesstoken.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &olBaseURL,\n\t\t\t\tValue:       olBaseURL,\n\t\t\t\tCategory:    \"Model/Remote/Ollama\",\n\t\t\t\tName:        \"ol-base-url\",\n\t\t\t\tUsage: \"Model base URL of Ollama, e.g. \" +\n\t\t\t\t\t\"https://registry.ollama.ai.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &olModel,\n\t\t\t\tValue:       olModel,\n\t\t\t\tCategory:    \"Model/Remote/Ollama\",\n\t\t\t\tName:        \"ol-model\",\n\t\t\t\tUsage: \"Model name of Ollama, e.g. 
\" +\n\t\t\t\t\t\"\\\"gemma2\\\".\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &olUsage,\n\t\t\t\tValue:       olUsage,\n\t\t\t\tCategory:    \"Model/Remote/Ollama\",\n\t\t\t\tName:        \"ol-usage\",\n\t\t\t\tUsage: \"Specify respecting the extending layers introduced by Ollama, \" +\n\t\t\t\t\t\"works with \\\"--ol-model\\\", which affects the usage estimation.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipProxy,\n\t\t\t\tValue:       skipProxy,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"skip-proxy\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"SKIP_PROXY\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Skip proxy settings, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\", \" +\n\t\t\t\t\t\"default is respecting the environment variables \\\"HTTP_PROXY/HTTPS_PROXY/NO_PROXY\\\".\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipTLSVerify,\n\t\t\t\tValue:       skipTLSVerify,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"skip-tls-verify\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"SKIP_TLS_VERIFY\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Skip TLS verification, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\", \" +\n\t\t\t\t\t\"default is verifying the TLS certificate on HTTPs request.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipDNSCache,\n\t\t\t\tValue:       skipDNSCache,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"skip-dns-cache\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"SKIP_DNS_CACHE\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Skip DNS cache, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\", \" +\n\t\t\t\t\t\"default is caching the DNS lookup result.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipRangDownloadDetect,\n\t\t\t\tValue:       skipRangDownloadDetect,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"skip-range-download-detect\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"SKIP_RANGE_DOWNLOAD_DETECT\",\n\t\t\t\t},\n\t\t\t\tUsage: 
\"Skip range download detect, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\", \" +\n\t\t\t\t\t\"default is detecting the range download support.\",\n\t\t\t},\n\t\t\t&cli.DurationFlag{\n\t\t\t\tDestination: &cacheExpiration,\n\t\t\t\tValue:       cacheExpiration,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"cache-expiration\",\n\t\t\t\tUsage: \"Specify the expiration of cache, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &cachePath,\n\t\t\t\tValue:       cachePath,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"cache-path\",\n\t\t\t\tUsage: \"Cache the read result to the path, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\".\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipCache,\n\t\t\t\tValue:       skipCache,\n\t\t\t\tCategory:    \"Load\",\n\t\t\t\tName:        \"skip-cache\",\n\t\t\t\tEnvVars: []string{\n\t\t\t\t\t\"SKIP_CACHE\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Skip cache, \" +\n\t\t\t\t\t\"works with \\\"--url/--hf-*/--ms-*/--ol-*\\\", \" +\n\t\t\t\t\t\"default is caching the read result.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &parallelSize,\n\t\t\t\tValue:       parallelSize,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"parallel-size\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"parallel\",\n\t\t\t\t\t\"np\",\n\t\t\t\t\t\"threads-http\", // LLaMABox v0.0.140+ compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the number of parallel sequences to decode, \" +\n\t\t\t\t\t\"which is used to estimate the usage.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &flashAttention,\n\t\t\t\tValue:       flashAttention,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"flash-attention\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"flash-attn\",\n\t\t\t\t\t\"fa\",\n\t\t\t\t\t\"diffusion-fa\", // StableDiffusionCpp compatibility\n\t\t\t\t},\n\t\t\t\tUsage: 
\"Specify enabling Flash Attention, \" +\n\t\t\t\t\t\"which is used to estimate the usage. \" +\n\t\t\t\t\t\"Flash Attention can reduce the usage of RAM/VRAM.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{ // LLaMABox compatibility\n\t\t\t\tCategory: \"Estimate\",\n\t\t\t\tName:     \"no-flash-attention\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"no-flash-attn\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify disabling Flash Attention.\",\n\t\t\t\tAction: func(context *cli.Context, b bool) error {\n\t\t\t\t\tflashAttention = !b\n\t\t\t\t\treturn nil\n\t\t\t\t},\n\t\t\t},\n\t\t\t&cli.UintFlag{\n\t\t\t\tDestination: &mainGPU,\n\t\t\t\tValue:       mainGPU,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"main-gpu\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"mg\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the GPU to use for the model (with \\\"--split-mode=none\\\") \" +\n\t\t\t\t\t\"or for intermediate results and KV (with \\\"--split-mode=row\\\"), \" +\n\t\t\t\t\t\"which is used to estimate the usage. \" +\n\t\t\t\t\t\"Since gguf-parser cannot recognize the host GPU devices or RPC servers, \" +\n\t\t\t\t\t\"\\\"--main-gpu\\\" only works when \\\"--tensor-split\\\" is set.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &rpcServers,\n\t\t\t\tValue:       rpcServers,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"rpc\",\n\t\t\t\tUsage: \"Specify the RPC servers, \" +\n\t\t\t\t\t\"which is used to estimate the usage, \" +\n\t\t\t\t\t\"it is a comma-separated list of host:port. 
\" +\n\t\t\t\t\t\"Woks with \\\"--tensor-split\\\".\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &tensorSplit,\n\t\t\t\tValue:       tensorSplit,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"tensor-split\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ts\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the fraction of the model to offload to each device, \" +\n\t\t\t\t\t\"which is used to estimate the usage, \" +\n\t\t\t\t\t\"it is a comma-separated list of integer. \" +\n\t\t\t\t\t\"Since gguf-parser cannot recognize the host GPU devices or RPC servers, \" +\n\t\t\t\t\t\"must explicitly set \\\"--tensor-split\\\" to indicate how many devices are used. \" +\n\t\t\t\t\t\"To declare the devices belong to RPC servers, set \\\"--rpc\\\" please.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &offloadLayers,\n\t\t\t\tValue:       offloadLayers,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"gpu-layers\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ngl\",\n\t\t\t\t\t\"n-gpu-layers\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify how many layers of the main model to offload, \" +\n\t\t\t\t\t\"which is used to estimate the usage, \" +\n\t\t\t\t\t\"default is full offloaded.\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &overrideTensors,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"override-tensor\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ot\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Override tensor buffer type, \" +\n\t\t\t\t\t\"for example, use --override-tensor \\\"[2-9][0-9]\\\\.ffn_.*_exps\\\\.=CPU\\\" to keep experts of layers 20-99 in the CPU\",\n\t\t\t},\n\t\t\t&cli.StringSliceFlag{\n\t\t\t\tDestination: &deviceMetrics,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"device-metric\",\n\t\t\t\tUsage: \"Specify the device metrics, \" +\n\t\t\t\t\t\"which is used to estimate the throughput, in form of \\\"FLOPS;Up Bandwidth[;Down 
Bandwidth]\\\". \" +\n\t\t\t\t\t\"The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. \" +\n\t\t\t\t\t\"The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. \" +\n\t\t\t\t\t\"Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, \" +\n\t\t\t\t\t\"and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. \" +\n\t\t\t\t\t\"For example, \\\"--device-metric 10TFLOPS;400GBps\\\" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, \" +\n\t\t\t\t\t\"\\\"--device-metric 10TFLOPS;400GBps;5000MBps\\\" means the device has 5000MBps Down bandwidth. \" +\n\t\t\t\t\t\"If the quantity specified by \\\"--device-metric\\\" is less than the number of estimation devices(\" +\n\t\t\t\t\t\"determined by \\\"--tensor-split\\\" and \\\"--rpc\\\" to infer the device count), \" +\n\t\t\t\t\t\"then replicate the last \\\"--device-metric\\\" to meet the required number of evaluation devices.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &platformFootprint,\n\t\t\t\tValue:       platformFootprint,\n\t\t\t\tCategory:    \"Estimate\",\n\t\t\t\tName:        \"platform-footprint\",\n\t\t\t\tUsage: \"Specify the platform footprint(RAM,VRAM) of running host in MiB, \" +\n\t\t\t\t\t\"which is used to estimate the NonUMA usage, \" +\n\t\t\t\t\t\"default is \\\"150,250\\\". 
\" +\n\t\t\t\t\t\"Different platform always gets different RAM and VRAM footprints, \" +\n\t\t\t\t\t\"for example, within CUDA, \\\"cudaMemGetInfo\\\" or \\\"cudaSetDevice\\\" would occupy some RAM and VRAM, \" +\n\t\t\t\t\t\"see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &lmcCtxSize,\n\t\t\t\tValue:       lmcCtxSize,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"ctx-size\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"c\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the size of prompt context, \" +\n\t\t\t\t\t\"which is used to estimate the usage, \" +\n\t\t\t\t\t\"default is equal to the model's maximum context size.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &lmcRoPEScalingType,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"rope-scaling\",\n\t\t\t\tUsage: \"RoPE frequency scaling method, \" +\n\t\t\t\t\t\"defaults to linear unless specified by the model, select from [none, linear, yarn].\",\n\t\t\t},\n\t\t\t&cli.Float64Flag{\n\t\t\t\tCategory: \"Estimate/LLaMACpp\",\n\t\t\t\tName:     \"rope-scale\",\n\t\t\t\tUsage: \"RoPE context scaling factor, \" +\n\t\t\t\t\t\"expands context by a factor of N.\",\n\t\t\t\tAction: func(context *cli.Context, f float64) error {\n\t\t\t\t\tif f != 0 {\n\t\t\t\t\t\tlmcRoPEFreqScale = 1 / f\n\t\t\t\t\t}\n\t\t\t\t\treturn nil\n\t\t\t\t},\n\t\t\t},\n\t\t\t&cli.Float64Flag{\n\t\t\t\tDestination: &lmcRoPEFreqBase,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"rope-freq-base\",\n\t\t\t\tUsage: \"RoPE base frequency, \" +\n\t\t\t\t\t\"used by NTK-aware scaling.\",\n\t\t\t},\n\t\t\t&cli.Float64Flag{\n\t\t\t\tDestination: &lmcRoPEFreqScale,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"rope-freq-scale\",\n\t\t\t\tUsage: \"RoPE frequency scaling factor, \" +\n\t\t\t\t\t\"expands context by a factor of 
1/N.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &lmcRoPEScalingOrigCtxSize,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"yarn-orig-ctx\",\n\t\t\t\tUsage: \"YaRN original context size of model, \" +\n\t\t\t\t\t\"defaults to model training context size.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &lmcInMaxCtxSize,\n\t\t\t\tValue:       lmcInMaxCtxSize,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"in-max-ctx-size\",\n\t\t\t\tUsage: \"Limit the context size to the maximum context size of the model, \" +\n\t\t\t\t\t\"if the context size is larger than the maximum context size.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &lmcLogicalBatchSize,\n\t\t\t\tValue:       lmcLogicalBatchSize,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"batch-size\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"b\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the logical batch size, \" +\n\t\t\t\t\t\"which is used to estimate the usage.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &lmcPhysicalBatchSize,\n\t\t\t\tValue:       lmcPhysicalBatchSize,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"ubatch-size\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ub\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the physical maximum batch size, \" +\n\t\t\t\t\t\"which is used to estimate the usage.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &lmcCacheKeyType,\n\t\t\t\tValue:       lmcCacheKeyType,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"cache-type-k\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ctk\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the type of Key cache, \" +\n\t\t\t\t\t\"which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: 
&lmcCacheValueType,\n\t\t\t\tValue:       lmcCacheValueType,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"cache-type-v\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ctv\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the type of Value cache, \" +\n\t\t\t\t\t\"which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &lmcNoKVOffload,\n\t\t\t\tValue:       lmcNoKVOffload,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"no-kv-offload\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"nkvo\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify disabling Key-Value offloading, \" +\n\t\t\t\t\t\"which is used to estimate the usage. \" +\n\t\t\t\t\t\"Disable Key-Value offloading can reduce the usage of VRAM.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &lmcSplitMode,\n\t\t\t\tValue:       lmcSplitMode,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"split-mode\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"sm\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify how to split the model across multiple devices, \" +\n\t\t\t\t\t\"which is used to estimate the usage, select from [layer, row, none]. 
\" +\n\t\t\t\t\t\"Since gguf-parser always estimates the usage of VRAM, \" +\n\t\t\t\t\t\"\\\"none\\\" is meaningless here, keep for compatibility.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &lmcSWAFull,\n\t\t\t\tValue:       lmcSWAFull,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"swa-full\",\n\t\t\t\tUsage:       \"Specify using full-size SWA cache.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &lmcNoMMap,\n\t\t\t\tValue:       lmcNoMMap,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"no-mmap\",\n\t\t\t\tUsage: \"Specify disabling Memory-Mapped using, \" +\n\t\t\t\t\t\"which is used to estimate the usage. \" +\n\t\t\t\t\t\"Memory-Mapped can avoid loading the entire model weights into RAM.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{ // LLaMABox compatibility\n\t\t\t\tCategory: \"Estimate/LLaMACpp\",\n\t\t\t\tName:     \"mmap\",\n\t\t\t\tUsage: \"Specify enabling Memory-Mapped using, \" +\n\t\t\t\t\t\"which is used to estimate the usage. 
\" +\n\t\t\t\t\t\"Memory-Mapped can avoid loading the entire model weights into RAM.\",\n\t\t\t\tAction: func(context *cli.Context, b bool) error {\n\t\t\t\t\tlmcNoMMap = !b\n\t\t\t\t\treturn nil\n\t\t\t\t},\n\t\t\t},\n\t\t\t&cli.UintFlag{ // LLaMABox compatibility\n\t\t\t\tDestination: &lmcVisualMaxImageSize,\n\t\t\t\tValue:       lmcVisualMaxImageSize,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"visual-max-image-size\",\n\t\t\t\tUsage:       \"Specify maximum image size when completion with vision model.\",\n\t\t\t},\n\t\t\t&cli.UintFlag{ // LLaMABox compatibility\n\t\t\t\tDestination: &lmcMaxProjectedCache,\n\t\t\t\tValue:       lmcMaxProjectedCache,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"max-projected-cache\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"visual-max-image-cache\", // Deprecated argument name\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify how many projected embedding to be cached.\",\n\t\t\t},\n\t\t\t&cli.IntFlag{\n\t\t\t\tDestination: &lmcOffloadLayersDraft,\n\t\t\t\tValue:       lmcOffloadLayersDraft,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"gpu-layers-draft\",\n\t\t\t\tAliases: []string{ // LLaMACpp compatibility\n\t\t\t\t\t\"ngld\",\n\t\t\t\t\t\"n-gpu-layers-draft\",\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify how many layers of the draft model to offload, \" +\n\t\t\t\t\t\"which is used to estimate the usage, \" +\n\t\t\t\t\t\"default is full offloaded.\",\n\t\t\t},\n\t\t\t&cli.Uint64Flag{\n\t\t\t\tDestination: &lmcOffloadLayersStep,\n\t\t\t\tValue:       lmcOffloadLayersStep,\n\t\t\t\tCategory:    \"Estimate/LLaMACpp\",\n\t\t\t\tName:        \"gpu-layers-step\",\n\t\t\t\tUsage: \"Specify the step of layers to offload, \" +\n\t\t\t\t\t\"works with \\\"--gpu-layers\\\".\",\n\t\t\t},\n\t\t\t&cli.UintFlag{\n\t\t\t\tDestination: &sdcBatchCount,\n\t\t\t\tValue:       sdcBatchCount,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        
\"image-batch-count\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"batch-count\",     // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-max-batch\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the batch(generation) count of the image.\",\n\t\t\t},\n\t\t\t&cli.UintFlag{\n\t\t\t\tDestination: &sdcHeight,\n\t\t\t\tValue:       sdcHeight,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-height\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"height\",           // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-max-height\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the (maximum) height of the image.\",\n\t\t\t},\n\t\t\t&cli.UintFlag{\n\t\t\t\tDestination: &sdcWidth,\n\t\t\t\tValue:       sdcWidth,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-width\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"width\",           // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-max-width\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify the (maximum) width of the image.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &sdcNoConditionerOffload,\n\t\t\t\tValue:       sdcNoConditionerOffload,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-no-conditioner-offload\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"clip-on-cpu\",                         // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-no-text-encoder-model-offload\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify to offload the text encoder model to CPU.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &sdcNoAutoencoderOffload,\n\t\t\t\tValue:       sdcNoAutoencoderOffload,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-no-autoencoder-offload\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"vae-on-cpu\",                 // StableDiffusionCpp 
compatibility\n\t\t\t\t\t\"image-no-vae-model-offload\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify to offload the vae model to CPU.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &sdcNoControlNetOffload,\n\t\t\t\tValue:       sdcNoControlNetOffload,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-no-control-net-offload\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"control-net-cpu\",                    // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-no-control-net-model-offload\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify to offload the control net model to CPU.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &sdcAutoencoderTiling,\n\t\t\t\tValue:       sdcAutoencoderTiling,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-autoencoder-tiling\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"vae-tiling\",       // StableDiffusionCpp compatibility\n\t\t\t\t\t\"image-vae-tiling\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify to enable tiling for the vae model.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &sdcNoAutoencoderTiling,\n\t\t\t\tValue:       sdcNoAutoencoderTiling,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-no-autoencoder-tiling\",\n\t\t\t\tAliases: []string{\n\t\t\t\t\t\"image-no-vae-tiling\", // LLaMABox compatibility\n\t\t\t\t},\n\t\t\t\tUsage: \"Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &sdcFreeComputeMemoryImmediately,\n\t\t\t\tValue:       sdcFreeComputeMemoryImmediately,\n\t\t\t\tCategory:    \"Estimate/StableDiffusionCpp\",\n\t\t\t\tName:        \"image-free-compute-memory-immediately\", // LLaMABox compatibility\n\t\t\t\tUsage:       \"Specify to free the compute memory immediately after the generation, which burst using 
VRAM.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &raw,\n\t\t\t\tValue:       raw,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"raw\",\n\t\t\t\tUsage:       \"Output the GGUF file information as JSON only, skip anything.\",\n\t\t\t},\n\t\t\t&cli.StringFlag{\n\t\t\t\tDestination: &rawOutput,\n\t\t\t\tValue:       rawOutput,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"raw-output\",\n\t\t\t\tUsage:       \"Works with \\\"--raw\\\", to save the result to the file\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipMetadata,\n\t\t\t\tValue:       skipMetadata,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"skip-metadata\",\n\t\t\t\tUsage:       \"Skip to display metadata.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipArchitecture,\n\t\t\t\tValue:       skipArchitecture,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"skip-architecture\",\n\t\t\t\tUsage:       \"Skip to display architecture.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipTokenizer,\n\t\t\t\tValue:       skipTokenizer,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"skip-tokenizer\",\n\t\t\t\tUsage: \"Skip to display tokenizer. \" +\n\t\t\t\t\t\"By default, gguf-parser always displays the tokenizer of the file which types with \\\"model\\\".\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &skipEstimate,\n\t\t\t\tValue:       skipEstimate,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"skip-estimate\",\n\t\t\t\tUsage: \"Skip to estimate. 
\" +\n\t\t\t\t\t\"By default, gguf-parser always estimates the file which types with \\\"model\\\".\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tCategory: \"Output\",\n\t\t\t\tName:     \"estimate\",\n\t\t\t\tUsage:    \"Skip all the information except the estimate result.\",\n\t\t\t\tAction: func(_ *cli.Context, estimateOnly bool) error {\n\t\t\t\t\tif estimateOnly {\n\t\t\t\t\t\tskipMetadata = true\n\t\t\t\t\t\tskipArchitecture = true\n\t\t\t\t\t\tskipTokenizer = true\n\t\t\t\t\t}\n\t\t\t\t\treturn nil\n\t\t\t\t},\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &inShort,\n\t\t\t\tValue:       inShort,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"in-short\",\n\t\t\t\tUsage:       \"Display the estimated result in table in short form.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &inMib,\n\t\t\t\tValue:       inMib,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"in-mib\",\n\t\t\t\tUsage:       \"Display the estimated result in table with MiB.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &inJson,\n\t\t\t\tValue:       inJson,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"json\",\n\t\t\t\tUsage:       \"Output as JSON.\",\n\t\t\t},\n\t\t\t&cli.BoolFlag{\n\t\t\t\tDestination: &inPrettyJson,\n\t\t\t\tValue:       inPrettyJson,\n\t\t\t\tCategory:    \"Output\",\n\t\t\t\tName:        \"json-pretty\",\n\t\t\t\tUsage:       \"Works with \\\"--json\\\", to output pretty format JSON.\",\n\t\t\t},\n\t\t},\n\t\tAction: mainAction,\n\t}\n\n\tif err := app.RunContext(signalx.Handler(), os.Args); err != nil {\n\t\t_, _ = fmt.Fprintf(os.Stderr, \"%v\\n\", err)\n\t\tos.Exit(1)\n\t}\n}\n\nvar (\n\t// model options\n\tpath                 string\n\tdraftPath            string          // for estimate\n\tmmprojPath           string          // for estimate\n\tloraPaths            cli.StringSlice // for estimate\n\tcontrolVectorPaths   cli.StringSlice // for estimate\n\tupscalePath          string          // for 
estimate\n\tcontrolNetPath       string          // for estimate\n\turl                  string\n\tdraftUrl             string          // for estimate\n\tmmprojUrl            string          // for estimate\n\tloraUrls             cli.StringSlice // for estimate\n\tcontrolVectorUrls    cli.StringSlice // for estimate\n\tupscaleUrl           string          // for estimate\n\tcontrolNetUrl        string          // for estimate\n\ttoken                string\n\theaders              cli.StringSlice\n\thfRepo               string\n\thfFile               string\n\thfDraftRepo          string          // for estimate\n\thfDraftFile          string          // for estimate\n\thfMMProjFile         string          // for estimate\n\thfLoRAFiles          cli.StringSlice // for estimate\n\thfControlVectorFiles cli.StringSlice // for estimate\n\thfUpscaleRepo        string          // for estimate\n\thfUpscaleFile        string          // for estimate\n\thfControlNetRepo     string          // for estimate\n\thfControlNetFile     string          // for estimate\n\thfToken              string\n\tmsRepo               string\n\tmsFile               string\n\tmsDraftRepo          string          // for estimate\n\tmsDraftFile          string          // for estimate\n\tmsMMProjFile         string          // for estimate\n\tmsLoRAFiles          cli.StringSlice // for estimate\n\tmsControlVectorFiles cli.StringSlice // for estimate\n\tmsUpscaleRepo        string          // for estimate\n\tmsUpscaleFile        string          // for estimate\n\tmsControlNetRepo     string          // for estimate\n\tmsControlNetFile     string          // for estimate\n\tmsToken              string\n\tolBaseURL            = \"https://registry.ollama.ai\"\n\tolModel              string\n\tolUsage              bool\n\t// load options\n\tdebug                  bool\n\tskipProxy              bool\n\tskipTLSVerify          bool\n\tskipDNSCache           bool\n\tskipRangDownloadDetect 
bool\n\tcacheExpiration        = 24 * time.Hour\n\tcachePath              = DefaultCachePath()\n\tskipCache              bool\n\t// estimate options\n\tparallelSize      = 1\n\tflashAttention    bool\n\tmainGPU           uint\n\trpcServers        string\n\ttensorSplit       string\n\toffloadLayers     = -1\n\toverrideTensors   cli.StringSlice\n\tdeviceMetrics     cli.StringSlice\n\tplatformFootprint = \"150,250\"\n\t// estimate options for llama.cpp\n\tlmcCtxSize                = 0\n\tlmcRoPEFreqBase           float64\n\tlmcRoPEFreqScale          float64\n\tlmcRoPEScalingType        string\n\tlmcRoPEScalingOrigCtxSize int\n\tlmcInMaxCtxSize           bool\n\tlmcLogicalBatchSize       = 2048\n\tlmcPhysicalBatchSize      = 512\n\tlmcCacheKeyType           = \"f16\"\n\tlmcCacheValueType         = \"f16\"\n\tlmcNoKVOffload            bool\n\tlmcSplitMode              = \"layer\"\n\tlmcSWAFull                = false\n\tlmcNoMMap                 bool\n\tlmcVisualMaxImageSize     uint\n\tlmcMaxProjectedCache      uint\n\tlmcOffloadLayersDraft     = -1\n\tlmcOffloadLayersStep      uint64\n\t// estimate options for stable-diffusion.cpp\n\tsdcBatchCount                   uint = 1\n\tsdcHeight                       uint = 1024\n\tsdcWidth                        uint = 1024\n\tsdcNoConditionerOffload         bool\n\tsdcNoAutoencoderOffload         bool\n\tsdcNoControlNetOffload          bool\n\tsdcAutoencoderTiling            bool\n\tsdcNoAutoencoderTiling          bool\n\tsdcFreeComputeMemoryImmediately bool\n\t// output options\n\traw              bool\n\trawOutput        string\n\tinShort          bool\n\tskipMetadata     bool\n\tskipArchitecture bool\n\tskipTokenizer    bool\n\tskipEstimate     bool\n\tinMib            bool\n\tinJson           bool\n\tinPrettyJson     = true\n)\n\nfunc mainAction(c *cli.Context) error {\n\tctx := c.Context\n\n\t// Prepare options.\n\n\tropts := []GGUFReadOption{\n\t\tSkipLargeMetadata(),\n\t\tUseMMap(),\n\t\tUseCache(),\n\t}\n\tif hs := 
headers.Value(); len(hs) > 0 {\n\t\thm := make(map[string]string, len(hs))\n\t\tfor _, h := range hs {\n\t\t\tparts := strings.SplitN(h, \":\", 2)\n\t\t\tif len(parts) == 2 {\n\t\t\t\thm[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])\n\t\t\t}\n\t\t}\n\t\tif len(hm) > 0 {\n\t\t\tropts = append(ropts, UseHeaders(hm))\n\t\t}\n\t}\n\tif token != \"\" {\n\t\tropts = append(ropts, UseBearerAuth(token))\n\t}\n\tif debug {\n\t\tropts = append(ropts, UseDebug())\n\t}\n\tif skipProxy {\n\t\tropts = append(ropts, SkipProxy())\n\t}\n\tif skipTLSVerify {\n\t\tropts = append(ropts, SkipTLSVerification())\n\t}\n\tif skipDNSCache {\n\t\tropts = append(ropts, SkipDNSCache())\n\t}\n\tif skipRangDownloadDetect {\n\t\tropts = append(ropts, SkipRangeDownloadDetection())\n\t}\n\tif cacheExpiration >= 0 {\n\t\tropts = append(ropts, UseCacheExpiration(cacheExpiration))\n\t}\n\tif cachePath != \"\" {\n\t\tropts = append(ropts, UseCachePath(cachePath))\n\t}\n\tif skipCache {\n\t\tropts = append(ropts, SkipCache())\n\t}\n\n\teopts := []GGUFRunEstimateOption{\n\t\tWithLLaMACppCacheValueType(GGMLTypeF16),\n\t\tWithLLaMACppCacheKeyType(GGMLTypeF16),\n\t}\n\tif parallelSize > 0 {\n\t\teopts = append(eopts, WithParallelSize(int32(parallelSize)))\n\t}\n\tif flashAttention {\n\t\teopts = append(eopts, WithFlashAttention())\n\t}\n\tif tensorSplit != \"\" {\n\t\ttss := strings.Split(tensorSplit, \",\")\n\t\tif len(tss) > 128 {\n\t\t\treturn errors.New(\"--tensor-split exceeds the number of devices\")\n\t\t}\n\t\tvar vs float64\n\t\tvv := make([]float64, len(tss))\n\t\tvf := make([]float64, len(tss))\n\t\tfor i, s := range tss {\n\t\t\ts = strings.TrimSpace(s)\n\t\t\tv, err := strconv.ParseFloat(s, 64)\n\t\t\tif err != nil {\n\t\t\t\treturn errors.New(\"--tensor-split has invalid integer\")\n\t\t\t}\n\t\t\tvs += v\n\t\t\tvv[i] = vs\n\t\t}\n\t\tfor i, v := range vv {\n\t\t\tvf[i] = v / vs\n\t\t}\n\t\teopts = append(eopts, WithTensorSplitFraction(vf))\n\t\tif mainGPU < uint(len(vv)) 
{\n\t\t\teopts = append(eopts, WithMainGPUIndex(int(mainGPU)))\n\t\t} else {\n\t\t\treturn errors.New(\"--main-gpu must be less than item size of --tensor-split\")\n\t\t}\n\t\tif rpcServers != \"\" {\n\t\t\trss := strings.Split(rpcServers, \",\")\n\t\t\tif len(rss) > len(tss) {\n\t\t\t\treturn errors.New(\"--rpc has more items than --tensor-split\")\n\t\t\t}\n\t\t\trpc := make([]string, len(rss))\n\t\t\tfor i, s := range rss {\n\t\t\t\ts = strings.TrimSpace(s)\n\t\t\t\tif _, _, err := net.SplitHostPort(s); err != nil {\n\t\t\t\t\treturn errors.New(\"--rpc has invalid host:port\")\n\t\t\t\t}\n\t\t\t\trpc[i] = s\n\t\t\t}\n\t\t\teopts = append(eopts, WithRPCServers(rpc))\n\t\t}\n\t}\n\tif otss := overrideTensors.Value(); len(otss) > 0 {\n\t\tvar ots []GGUFRunOverriddenTensor\n\t\tfor i := range otss {\n\t\t\tpots := strings.Split(otss[i], \",\")\n\t\t\tfor j := range pots {\n\t\t\t\tss := strings.SplitN(strings.TrimSpace(pots[j]), \"=\", 2)\n\t\t\t\tif len(ss) != 2 {\n\t\t\t\t\treturn errors.New(\"--override-tensor has invalid format\")\n\t\t\t\t}\n\t\t\t\tpr, err := regexp.Compile(strings.TrimSpace(ss[0]))\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"--override-tensor has invalid pattern: %w\", err)\n\t\t\t\t}\n\t\t\t\tbt := strings.TrimSpace(ss[1])\n\t\t\t\tif bt == \"\" {\n\t\t\t\t\treturn errors.New(\"--override-tensor has empty buffer type\")\n\t\t\t\t}\n\t\t\t\tots = append(ots, GGUFRunOverriddenTensor{\n\t\t\t\t\tPatternRegex: pr,\n\t\t\t\t\tBufferType:   bt,\n\t\t\t\t})\n\t\t\t}\n\t\t}\n\t\teopts = append(eopts, WithOverriddenTensors(ots))\n\t}\n\tif dmss := deviceMetrics.Value(); len(dmss) > 0 {\n\t\tdms := make([]GGUFRunDeviceMetric, len(dmss))\n\t\tfor i := range dmss {\n\t\t\tss := strings.Split(dmss[i], \";\")\n\t\t\tif len(ss) < 2 {\n\t\t\t\treturn errors.New(\"--device-metric has invalid format\")\n\t\t\t}\n\t\t\tvar err error\n\t\t\tdms[i].FLOPS, err = ParseFLOPSScalar(strings.TrimSpace(ss[0]))\n\t\t\tif err != nil {\n\t\t\t\treturn 
fmt.Errorf(\"--device-metric has invalid FLOPS: %w\", err)\n\t\t\t}\n\t\t\tdms[i].UpBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[1]))\n\t\t\tif err != nil {\n\t\t\t\treturn fmt.Errorf(\"--device-metric has invalid Up Bandwidth: %w\", err)\n\t\t\t}\n\t\t\tif len(ss) > 2 {\n\t\t\t\tdms[i].DownBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[2]))\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"--device-metric has invalid Down Bandwidth: %w\", err)\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\tdms[i].DownBandwidth = dms[i].UpBandwidth\n\t\t\t}\n\t\t}\n\t\teopts = append(eopts, WithDeviceMetrics(dms))\n\t}\n\tif lmcCtxSize > 0 {\n\t\teopts = append(eopts, WithLLaMACppContextSize(int32(lmcCtxSize)))\n\t}\n\tif lmcRoPEFreqBase > 0 || lmcRoPEFreqScale > 0 || lmcRoPEScalingType != \"\" || lmcRoPEScalingOrigCtxSize > 0 {\n\t\teopts = append(eopts, WithLLaMACppRoPE(lmcRoPEFreqBase, lmcRoPEFreqScale, lmcRoPEScalingType, int32(lmcRoPEScalingOrigCtxSize)))\n\t}\n\tif lmcInMaxCtxSize {\n\t\teopts = append(eopts, WithinLLaMACppMaxContextSize())\n\t}\n\tif lmcLogicalBatchSize > 0 {\n\t\teopts = append(eopts, WithLLaMACppLogicalBatchSize(int32(max(32, lmcLogicalBatchSize))))\n\t}\n\tif lmcPhysicalBatchSize > 0 {\n\t\tif lmcPhysicalBatchSize > lmcLogicalBatchSize {\n\t\t\treturn errors.New(\"--ubatch-size must be less than or equal to --batch-size\")\n\t\t}\n\t\teopts = append(eopts, WithLLaMACppPhysicalBatchSize(int32(lmcPhysicalBatchSize)))\n\t}\n\tif lmcCacheKeyType != \"\" {\n\t\teopts = append(eopts, WithLLaMACppCacheKeyType(toGGMLType(lmcCacheKeyType)))\n\t}\n\tif lmcCacheValueType != \"\" {\n\t\teopts = append(eopts, WithLLaMACppCacheValueType(toGGMLType(lmcCacheValueType)))\n\t}\n\tif lmcNoKVOffload {\n\t\teopts = append(eopts, WithoutLLaMACppOffloadKVCache())\n\t}\n\tswitch lmcSplitMode {\n\tcase \"row\":\n\t\teopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeRow))\n\tcase \"none\":\n\t\teopts = append(eopts, 
WithLLaMACppSplitMode(LLaMACppSplitModeNone))\n\tdefault:\n\t\teopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeLayer))\n\t}\n\tif lmcSWAFull {\n\t\teopts = append(eopts, WithLLaMACppFullSizeSWACache())\n\t}\n\tif lmcVisualMaxImageSize > 0 {\n\t\teopts = append(eopts, WithLLaMACppVisualMaxImageSize(uint32(lmcVisualMaxImageSize)))\n\t}\n\tif lmcMaxProjectedCache > 0 {\n\t\teopts = append(eopts, WithLLaMACppMaxProjectedCache(uint32(lmcMaxProjectedCache)))\n\t}\n\tif sdcBatchCount > 1 {\n\t\teopts = append(eopts, WithStableDiffusionCppBatchCount(int32(sdcBatchCount)))\n\t}\n\tif sdcHeight > 0 {\n\t\teopts = append(eopts, WithStableDiffusionCppHeight(uint32(sdcHeight)))\n\t}\n\tif sdcWidth > 0 {\n\t\teopts = append(eopts, WithStableDiffusionCppWidth(uint32(sdcWidth)))\n\t}\n\tif sdcNoConditionerOffload {\n\t\teopts = append(eopts, WithoutStableDiffusionCppOffloadConditioner())\n\t}\n\tif sdcNoAutoencoderOffload {\n\t\teopts = append(eopts, WithoutStableDiffusionCppOffloadAutoencoder())\n\t}\n\tif sdcAutoencoderTiling && !sdcNoAutoencoderTiling {\n\t\teopts = append(eopts, WithStableDiffusionCppAutoencoderTiling())\n\t}\n\tif sdcFreeComputeMemoryImmediately {\n\t\teopts = append(eopts, WithStableDiffusionCppFreeComputeMemoryImmediately())\n\t}\n\tif offloadLayers >= 0 {\n\t\teopts = append(eopts, WithLLaMACppOffloadLayers(uint64(offloadLayers)), WithStableDiffusionCppOffloadLayers(uint64(offloadLayers)))\n\t}\n\n\t// Parse GGUF file.\n\n\tvar (\n\t\t// Common.\n\t\tgf         *GGUFFile\n\t\tadapterGfs []*GGUFFile\n\t\t// LLaMACpp specific.\n\t\tlmcProjectGf *GGUFFile\n\t\tlmcDrafterGf *GGUFFile\n\t\t// StableDiffusionCpp specific.\n\t\tsdcControlNetGf *GGUFFile\n\t\tsdcUpscaleGf    *GGUFFile\n\t)\n\t{\n\t\tvar err error\n\n\t\tropts := ropts[:len(ropts):len(ropts)]\n\n\t\t// Main model.\n\t\tswitch {\n\t\tdefault:\n\t\t\treturn errors.New(\"no model specified\")\n\t\tcase path != \"\":\n\t\t\tgf, err = ParseGGUFFile(path, ropts...)\n\t\tcase url != 
\"\":\n\t\t\tgf, err = ParseGGUFFileRemote(ctx, url, ropts...)\n\t\tcase hfRepo != \"\" && hfFile != \"\":\n\t\t\tif hfToken != \"\" {\n\t\t\t\tropts = append(ropts, UseBearerAuth(hfToken))\n\t\t\t}\n\t\t\tgf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...)\n\t\tcase msRepo != \"\" && msFile != \"\":\n\t\t\tif msToken != \"\" {\n\t\t\t\tropts = append(ropts, UseBearerAuth(msToken))\n\t\t\t}\n\t\t\tgf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msFile, ropts...)\n\t\tcase olModel != \"\":\n\t\t\tom := ParseOllamaModel(olModel, SetOllamaModelBaseURL(olBaseURL))\n\t\t\tgf, err = ParseGGUFFileFromOllamaModel(ctx, om, ropts...)\n\t\t\tif err == nil && om != nil && olUsage {\n\t\t\t\t// Parameters override.\n\t\t\t\t{\n\t\t\t\t\tps, _ := om.Params(ctx, nil)\n\t\t\t\t\tif v, ok := ps[\"num_ctx\"]; ok {\n\t\t\t\t\t\teopts = append(eopts, WithLLaMACppContextSize(anyx.Number[int32](v)))\n\t\t\t\t\t} else if lmcCtxSize <= 0 {\n\t\t\t\t\t\teopts = append(eopts, WithLLaMACppContextSize(2048))\n\t\t\t\t\t}\n\t\t\t\t\tif v, ok := ps[\"use_mmap\"]; ok && !anyx.Bool(v) {\n\t\t\t\t\t\tlmcNoMMap = true\n\t\t\t\t\t}\n\t\t\t\t\tif v, ok := ps[\"num_gpu\"]; ok {\n\t\t\t\t\t\toffloadLayers = anyx.Number[int](v)\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\t// Multimodal projector overlap.\n\t\t\t\t{\n\t\t\t\t\tmls := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.projector$`))\n\t\t\t\t\tif len(mls) > 0 {\n\t\t\t\t\t\tlmcProjectGf, err = ParseGGUFFileRemote(ctx, mls[len(mls)-1].BlobURL().String(), ropts...)\n\t\t\t\t\t\tif err != nil {\n\t\t\t\t\t\t\treturn fmt.Errorf(\"failed to parse GGUF file: %w\", err)\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\t// Adapter overlap.\n\t\t\t\t{\n\t\t\t\t\tals := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.adapter$`))\n\t\t\t\t\tif len(als) > 0 {\n\t\t\t\t\t\tvar adpgf *GGUFFile\n\t\t\t\t\t\tfor i := range als {\n\t\t\t\t\t\t\tadpgf, err = ParseGGUFFileRemote(ctx, 
als[i].BlobURL().String(), ropts...)\n\t\t\t\t\t\t\tif err != nil {\n\t\t\t\t\t\t\t\treturn fmt.Errorf(\"failed to parse GGUF file: %w\", err)\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\tif err != nil {\n\t\t\treturn fmt.Errorf(\"failed to parse GGUF file: %w\", err)\n\t\t}\n\n\t\t// Adapter.\n\t\t{\n\t\t\t// LoRA.\n\t\t\tfor _, loraPath := range loraPaths.Value() {\n\t\t\t\tadpgf, err := ParseGGUFFile(loraPath, ropts...)\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"failed to parse LoRA adapter GGUF file: %w\", err)\n\t\t\t\t}\n\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t}\n\t\t\tfor _, loraUrl := range loraUrls.Value() {\n\t\t\t\tadpgf, err := ParseGGUFFileRemote(ctx, loraUrl, ropts...)\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"failed to parse LoRA adapter GGUF file: %w\", err)\n\t\t\t\t}\n\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t}\n\t\t\tif hfRepo != \"\" {\n\t\t\t\tfor _, hfLoRAFile := range hfLoRAFiles.Value() {\n\t\t\t\t\tadpgf, err := ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfLoRAFile, ropts...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\treturn fmt.Errorf(\"failed to parse LoRA adapter GGUF file: %w\", err)\n\t\t\t\t\t}\n\t\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t\t}\n\t\t\t}\n\t\t\tif msRepo != \"\" {\n\t\t\t\tfor _, msLoRAFile := range msLoRAFiles.Value() {\n\t\t\t\t\tadpgf, err := ParseGGUFFileFromModelScope(ctx, msRepo, msLoRAFile, ropts...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\treturn fmt.Errorf(\"failed to parse LoRA adapter GGUF file: %w\", err)\n\t\t\t\t\t}\n\t\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// Control Vector.\n\t\t\tfor _, cvPath := range controlVectorPaths.Value() {\n\t\t\t\tadpgf, err := ParseGGUFFile(cvPath, ropts...)\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"failed to parse Control Vector adapter GGUF file: %w\", 
err)\n\t\t\t\t}\n\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t}\n\t\t\tfor _, cvUrl := range controlVectorUrls.Value() {\n\t\t\t\tadpgf, err := ParseGGUFFileRemote(ctx, cvUrl, ropts...)\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn fmt.Errorf(\"failed to parse Control Vector adapter GGUF file: %w\", err)\n\t\t\t\t}\n\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t}\n\t\t\tif hfRepo != \"\" {\n\t\t\t\tfor _, hfCvFile := range hfControlVectorFiles.Value() {\n\t\t\t\t\tadpgf, err := ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfCvFile, ropts...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\treturn fmt.Errorf(\"failed to parse Control Vector adapter GGUF file: %w\", err)\n\t\t\t\t\t}\n\t\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t\t}\n\t\t\t}\n\t\t\tif msRepo != \"\" {\n\t\t\t\tfor _, msCvFile := range msControlVectorFiles.Value() {\n\t\t\t\t\tadpgf, err := ParseGGUFFileFromModelScope(ctx, msRepo, msCvFile, ropts...)\n\t\t\t\t\tif err != nil {\n\t\t\t\t\t\treturn fmt.Errorf(\"failed to parse Control Vector adapter GGUF file: %w\", err)\n\t\t\t\t\t}\n\t\t\t\t\tadapterGfs = append(adapterGfs, adpgf)\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t// Drafter for LLaMACpp.\n\t\tswitch {\n\t\tcase draftPath != \"\":\n\t\t\tlmcDrafterGf, err = ParseGGUFFile(draftPath, ropts...)\n\t\tcase draftUrl != \"\":\n\t\t\tlmcDrafterGf, err = ParseGGUFFileRemote(ctx, draftUrl, ropts...)\n\t\tcase hfDraftRepo != \"\" && hfDraftFile != \"\":\n\t\t\tlmcDrafterGf, err = ParseGGUFFileFromHuggingFace(ctx, hfDraftRepo, hfDraftFile, ropts...)\n\t\tcase msDraftRepo != \"\" && msDraftFile != \"\":\n\t\t\tlmcDrafterGf, err = ParseGGUFFileFromModelScope(ctx, msDraftRepo, msDraftFile, ropts...)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn fmt.Errorf(\"failed to parse draft GGUF file: %w\", err)\n\t\t}\n\n\t\t// Projector for LLaMACpp.\n\t\tswitch {\n\t\tcase mmprojPath != \"\":\n\t\t\tlmcProjectGf, err = ParseGGUFFile(mmprojPath, ropts...)\n\t\tcase mmprojUrl != \"\":\n\t\t\tlmcProjectGf, err = 
ParseGGUFFileRemote(ctx, mmprojUrl, ropts...)\n\t\tcase hfRepo != \"\" && hfMMProjFile != \"\":\n\t\t\tlmcProjectGf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfMMProjFile, ropts...)\n\t\tcase msRepo != \"\" && msMMProjFile != \"\":\n\t\t\tlmcProjectGf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msMMProjFile, ropts...)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn fmt.Errorf(\"failed to parse multimodal projector GGUF file: %w\", err)\n\t\t}\n\n\t\t// ControlNet for StableDiffusionCpp.\n\t\tswitch {\n\t\tcase controlNetPath != \"\":\n\t\t\tsdcControlNetGf, err = ParseGGUFFile(controlNetPath, ropts...)\n\t\tcase controlNetUrl != \"\":\n\t\t\tsdcControlNetGf, err = ParseGGUFFileRemote(ctx, controlNetUrl, ropts...)\n\t\tcase hfControlNetRepo != \"\" && hfControlNetFile != \"\":\n\t\t\tsdcControlNetGf, err = ParseGGUFFileFromHuggingFace(ctx, hfControlNetRepo, hfControlNetFile, ropts...)\n\t\tcase msControlNetRepo != \"\" && msControlNetFile != \"\":\n\t\t\tsdcControlNetGf, err = ParseGGUFFileFromModelScope(ctx, msControlNetRepo, msControlNetFile, ropts...)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn fmt.Errorf(\"failed to parse control net GGUF file: %w\", err)\n\t\t}\n\n\t\t// Upscaler for StableDiffusionCpp.\n\t\tswitch {\n\t\tcase upscalePath != \"\":\n\t\t\tsdcUpscaleGf, err = ParseGGUFFile(upscalePath, ropts...)\n\t\tcase upscaleUrl != \"\":\n\t\t\tsdcUpscaleGf, err = ParseGGUFFileRemote(ctx, upscaleUrl, ropts...)\n\t\tcase hfUpscaleRepo != \"\" && hfUpscaleFile != \"\":\n\t\t\tsdcUpscaleGf, err = ParseGGUFFileFromHuggingFace(ctx, hfUpscaleRepo, hfUpscaleFile, ropts...)\n\t\tcase msUpscaleRepo != \"\" && msUpscaleFile != \"\":\n\t\t\tsdcUpscaleGf, err = ParseGGUFFileFromModelScope(ctx, msUpscaleRepo, msUpscaleFile, ropts...)\n\t\t}\n\t\tif err != nil {\n\t\t\treturn fmt.Errorf(\"failed to parse upscaler GGUF file: %w\", err)\n\t\t}\n\t}\n\n\t// Output raw.\n\n\tif raw {\n\t\tw := os.Stdout\n\t\tif rawOutput != \"\" {\n\t\t\tf, err := osx.CreateFile(rawOutput, 
0o666)\n\t\t\tif err != nil {\n\t\t\t\treturn fmt.Errorf(\"failed to create file: %w\", err)\n\t\t\t}\n\t\t\tdefer osx.Close(f)\n\t\t\tw = f\n\t\t}\n\t\tif err := json.NewEncoder(w).Encode(gf); err != nil {\n\t\t\treturn fmt.Errorf(\"failed to encode JSON: %w\", err)\n\t\t}\n\t\treturn nil\n\t}\n\n\t// Otherwise, display the metadata and estimate the usage.\n\n\tvar (\n\t\tm   = gf.Metadata()\n\t\ta   = gf.Architecture()\n\t\tt   = gf.Tokenizer()\n\t\tlme LLaMACppRunEstimate\n\t\tsde StableDiffusionCppRunEstimate\n\t)\n\n\tskipArchitecture = skipArchitecture || m.Type == \"imatrix\"\n\tskipTokenizer = skipTokenizer || t.Model == \"\"\n\tskipEstimate = skipEstimate || m.Type != \"model\"\n\n\tif !skipEstimate && m.Architecture != \"diffusion\" {\n\t\tif lmcDrafterGf != nil {\n\t\t\tdlmceopts := eopts[:len(eopts):len(eopts)]\n\t\t\tif lmcOffloadLayersDraft >= 0 {\n\t\t\t\tdlmceopts = append(dlmceopts, WithLLaMACppOffloadLayers(uint64(lmcOffloadLayersDraft)))\n\t\t\t}\n\t\t\tdlmceopts = append(dlmceopts, WithLLaMACppCacheKeyType(GGMLTypeF16), WithLLaMACppCacheValueType(GGMLTypeF16))\n\t\t\tde := lmcDrafterGf.EstimateLLaMACppRun(dlmceopts...)\n\t\t\teopts = append(eopts, WithLLaMACppDrafter(&de))\n\t\t}\n\n\t\tif lmcProjectGf != nil {\n\t\t\tplmceopts := eopts[:len(eopts):len(eopts)]\n\t\t\tme := lmcProjectGf.EstimateLLaMACppRun(plmceopts...)\n\t\t\teopts = append(eopts, WithLLaMACppProjector(&me))\n\t\t}\n\n\t\tif len(adapterGfs) > 0 {\n\t\t\tadps := make([]LLaMACppRunEstimate, len(adapterGfs))\n\t\t\talmceopts := eopts[:len(eopts):len(eopts)]\n\t\t\tfor i, adpgf := range adapterGfs {\n\t\t\t\tae := adpgf.EstimateLLaMACppRun(almceopts...)\n\t\t\t\tadps[i] = ae\n\t\t\t}\n\t\t\teopts = append(eopts, WithLLaMACppAdapters(adps))\n\t\t}\n\n\t\tlme = gf.EstimateLLaMACppRun(eopts...)\n\t}\n\n\tif !skipEstimate && m.Architecture == \"diffusion\" {\n\t\tif sdcUpscaleGf != nil {\n\t\t\tsdceopts := eopts[:len(eopts):len(eopts)]\n\t\t\tue := 
sdcUpscaleGf.EstimateStableDiffusionCppRun(sdceopts...)\n\t\t\teopts = append(eopts, WithStableDiffusionCppUpscaler(&ue))\n\t\t}\n\n\t\tif sdcControlNetGf != nil {\n\t\t\tsdceopts := eopts[:len(eopts):len(eopts)]\n\t\t\tif sdcNoControlNetOffload {\n\t\t\t\tsdceopts = append(sdceopts, WithStableDiffusionCppOffloadLayers(0))\n\t\t\t}\n\t\t\tce := sdcControlNetGf.EstimateStableDiffusionCppRun(sdceopts...)\n\t\t\teopts = append(eopts, WithStableDiffusionCppControlNet(&ce))\n\t\t}\n\n\t\tsde = gf.EstimateStableDiffusionCppRun(eopts...)\n\t}\n\n\t// Then, output as JSON or table.\n\n\tvar (\n\t\tmmap                      = !lmcNoMMap\n\t\tplatformRAM, platformVRAM uint64\n\t)\n\t{\n\t\tif platformFootprint != \"\" {\n\t\t\tparts := strings.Split(platformFootprint, \",\")\n\t\t\tif len(parts) == 2 {\n\t\t\t\tif v, err := strconv.ParseUint(parts[0], 10, 64); err == nil {\n\t\t\t\t\tplatformRAM = v * 1024 * 1024\n\t\t\t\t}\n\t\t\t\tif v, err := strconv.ParseUint(parts[1], 10, 64); err == nil {\n\t\t\t\t\tplatformVRAM = v * 1024 * 1024\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\tif inJson {\n\t\to := map[string]any{}\n\n\t\tif !skipMetadata {\n\t\t\to[\"metadata\"] = m\n\t\t}\n\n\t\tif !skipArchitecture {\n\t\t\to[\"architecture\"] = a\n\t\t}\n\n\t\tif !skipTokenizer {\n\t\t\to[\"tokenizer\"] = t\n\t\t}\n\n\t\tif !skipEstimate && m.Architecture != \"diffusion\" {\n\t\t\tlmes := lme.Summarize(mmap, platformRAM, platformVRAM)\n\t\t\tswitch {\n\t\t\tcase lmcOffloadLayersStep > lme.OffloadLayers:\n\t\t\t\tlmcOffloadLayersStep = lme.OffloadLayers\n\t\t\tcase lmcOffloadLayersStep <= 0:\n\t\t\t\tlmcOffloadLayersStep = lme.OffloadLayers\n\t\t\t}\n\t\t\tif lmcOffloadLayersStep < lme.OffloadLayers {\n\t\t\t\tcnt := lme.OffloadLayers/lmcOffloadLayersStep + 1\n\t\t\t\tif lme.OffloadLayers%lmcOffloadLayersStep != 0 || lme.FullOffloaded {\n\t\t\t\t\tcnt++\n\t\t\t\t}\n\t\t\t\tesis := make([]LLaMACppRunEstimateSummaryItem, cnt)\n\t\t\t\tvar wg sync.WaitGroup\n\t\t\t\tfor i := 0; i < cap(esis); i++ 
{\n\t\t\t\t\twg.Add(1)\n\t\t\t\t\tgo func(i int) {\n\t\t\t\t\t\tdefer wg.Done()\n\t\t\t\t\t\tlmeopts := eopts[:len(eopts):len(eopts)]\n\t\t\t\t\t\tlmeopts = append(lmeopts, WithLLaMACppOffloadLayers(uint64(i)*lmcOffloadLayersStep))\n\t\t\t\t\t\tesis[i] = gf.EstimateLLaMACppRun(lmeopts...).SummarizeItem(mmap, platformRAM, platformVRAM)\n\t\t\t\t\t}(i)\n\t\t\t\t}\n\t\t\t\twg.Wait()\n\t\t\t\tesis[cap(esis)-1] = lmes.Items[0]\n\t\t\t\tlmes.Items = esis\n\t\t\t}\n\t\t\to[\"estimate\"] = lmes\n\t\t}\n\n\t\tif !skipEstimate && m.Architecture == \"diffusion\" {\n\t\t\tsdes := sde.Summarize(mmap, platformRAM, platformVRAM)\n\t\t\to[\"estimate\"] = sdes\n\t\t}\n\n\t\tenc := json.NewEncoder(os.Stdout)\n\t\tif inPrettyJson {\n\t\t\tenc.SetIndent(\"\", \"  \")\n\t\t}\n\t\tif err := enc.Encode(o); err != nil {\n\t\t\treturn fmt.Errorf(\"failed to encode JSON: %w\", err)\n\t\t}\n\n\t\treturn nil\n\t}\n\n\tGGUFBytesScalarStringInMiBytes = inMib\n\n\tif !skipMetadata {\n\t\ttprint(\n\t\t\t\"Metadata\",\n\t\t\t[][]any{\n\t\t\t\t{\n\t\t\t\t\t\"Type\",\n\t\t\t\t\t\"Name\",\n\t\t\t\t\t\"Arch\",\n\t\t\t\t\t\"Quantization\",\n\t\t\t\t\t\"Little Endian\",\n\t\t\t\t\t\"Size\",\n\t\t\t\t\t\"Parameters\",\n\t\t\t\t\t\"BPW\",\n\t\t\t\t},\n\t\t\t},\n\t\t\t[][]any{\n\t\t\t\t{\n\t\t\t\t\tm.Type,\n\t\t\t\t\tsprintf(tenary(len(m.Name) == 0, \"N/A\", tenary(len([]rune(m.Name)) <= 20, m.Name, string([]rune(m.Name)[:20])+\"...\"))),\n\t\t\t\t\tm.Architecture,\n\t\t\t\t\tm.FileTypeDescriptor,\n\t\t\t\t\tsprintf(m.LittleEndian),\n\t\t\t\t\tsprintf(m.Size),\n\t\t\t\t\tsprintf(m.Parameters),\n\t\t\t\t\tsprintf(m.BitsPerWeight),\n\t\t\t\t},\n\t\t\t})\n\t}\n\n\tif !skipArchitecture {\n\t\tvar (\n\t\t\thds [][]any\n\t\t\tbds [][]any\n\t\t)\n\t\tswitch a.Type {\n\t\tcase \"projector\":\n\t\t\thds = [][]any{\n\t\t\t\t{\n\t\t\t\t\t\"Projector Type\",\n\t\t\t\t\t\"Embedding Len\",\n\t\t\t\t\t\"Layers\",\n\t\t\t\t\t\"Feed Forward Len\",\n\t\t\t\t\t\"Encoder\",\n\t\t\t\t},\n\t\t\t}\n\t\t\tswitch {\n\t\t\tcase 
a.ClipHasVisionEncoder && a.ClipHasAudioEncoder:\n\t\t\t\thds = [][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\t\"Projector Type\",\n\t\t\t\t\t\t\"Embedding Len\", \"Embedding Len\",\n\t\t\t\t\t\t\"Layers\", \"Layers\",\n\t\t\t\t\t\t\"Feed Forward Len\", \"Feed Forward Len\",\n\t\t\t\t\t\t\"Encoder\",\n\t\t\t\t\t},\n\t\t\t\t\t{\n\t\t\t\t\t\t\"Projector Type\",\n\t\t\t\t\t\t\"Vision\", \"Audio\",\n\t\t\t\t\t\t\"Vision\", \"Audio\",\n\t\t\t\t\t\t\"Vision\", \"Audio\",\n\t\t\t\t\t\t\"Encoder\",\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t\tbds = [][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\tsprintf(a.ClipProjectorType),\n\t\t\t\t\t\tsprintf(a.ClipVisionEmbeddingLength),\n\t\t\t\t\t\tsprintf(a.ClipAudioEmbeddingLength),\n\t\t\t\t\t\tsprintf(a.ClipVisionBlockCount),\n\t\t\t\t\t\tsprintf(a.ClipAudioBlockCount),\n\t\t\t\t\t\tsprintf(tenary(\n\t\t\t\t\t\t\ta.ClipVisionFeedForwardLength[0] == a.ClipVisionFeedForwardLength[1],\n\t\t\t\t\t\t\ta.ClipVisionFeedForwardLength[0],\n\t\t\t\t\t\t\tsprintf(\"[%d, %d, ...]\", a.ClipVisionFeedForwardLength[0], a.ClipVisionFeedForwardLength[1]))),\n\t\t\t\t\t\tsprintf(tenary(\n\t\t\t\t\t\t\ta.ClipAudioFeedForwardLength[0] == a.ClipAudioFeedForwardLength[1],\n\t\t\t\t\t\t\ta.ClipAudioFeedForwardLength[0],\n\t\t\t\t\t\t\tsprintf(\"[%d, %d, ...]\", a.ClipAudioFeedForwardLength[0], a.ClipAudioFeedForwardLength[1]))),\n\t\t\t\t\t\t\"Vision & Audio\",\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\tcase a.ClipHasVisionEncoder:\n\t\t\t\tbds = [][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\tsprintf(a.ClipProjectorType),\n\t\t\t\t\t\tsprintf(a.ClipVisionEmbeddingLength),\n\t\t\t\t\t\tsprintf(a.ClipVisionBlockCount),\n\t\t\t\t\t\tsprintf(tenary(\n\t\t\t\t\t\t\ta.ClipVisionFeedForwardLength[0] == a.ClipVisionFeedForwardLength[1],\n\t\t\t\t\t\t\ta.ClipVisionFeedForwardLength[0],\n\t\t\t\t\t\t\tsprintf(\"[%d, %d, ...]\", a.ClipVisionFeedForwardLength[0], a.ClipVisionFeedForwardLength[1]))),\n\t\t\t\t\t\t\"Vision\",\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\tdefault:\n\t\t\t\tbds = 
[][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\tsprintf(a.ClipProjectorType),\n\t\t\t\t\t\tsprintf(a.ClipAudioEmbeddingLength),\n\t\t\t\t\t\tsprintf(a.ClipAudioBlockCount),\n\t\t\t\t\t\tsprintf(tenary(\n\t\t\t\t\t\t\ta.ClipAudioFeedForwardLength[0] == a.ClipAudioFeedForwardLength[1],\n\t\t\t\t\t\t\ta.ClipAudioFeedForwardLength[0],\n\t\t\t\t\t\t\tsprintf(\"[%d, %d, ...]\", a.ClipAudioFeedForwardLength[0], a.ClipAudioFeedForwardLength[1]))),\n\t\t\t\t\t\t\"Audio\",\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t}\n\t\tcase \"adapter\":\n\t\t\thds = [][]any{\n\t\t\t\t{\n\t\t\t\t\t\"Adapter Type\",\n\t\t\t\t},\n\t\t\t}\n\t\t\tbds = [][]any{\n\t\t\t\t{\n\t\t\t\t\tsprintf(a.AdapterType),\n\t\t\t\t},\n\t\t\t}\n\t\t\tif a.AdapterType == \"lora\" {\n\t\t\t\thds[0] = append(hds[0], \"LoRA Alpha\")\n\t\t\t\tbds[0] = append(bds[0], sprintf(a.AdapterLoRAAlpha))\n\t\t\t} else {\n\t\t\t\thds[0] = append(hds[0], \"ControlVector Layers\")\n\t\t\t\tbds[0] = append(bds[0], sprintf(a.AdapterControlVectorLayerCount))\n\t\t\t}\n\t\tdefault:\n\t\t\tif a.Architecture == \"diffusion\" {\n\t\t\t\thds = [][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\t\"Diffusion Arch\",\n\t\t\t\t\t\t\"Conditioners\",\n\t\t\t\t\t\t\"Autoencoder\",\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t\tbds = [][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\tsprintf(tenary(a.DiffusionArchitecture != \"\", a.DiffusionArchitecture, \"N/A\")),\n\t\t\t\t\t\tsprintf(tenary(a.DiffusionHasConditioners(), a.DiffusionConditioners, \"N/A\")),\n\t\t\t\t\t\tsprintf(tenary(a.DiffusionHasAutoencoder(), a.DiffusionAutoencoder, \"N/A\")),\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\thds = [][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\t\"Max Context Len\",\n\t\t\t\t\t\t\"Embedding Len\",\n\t\t\t\t\t\t\"Attention Causal\",\n\t\t\t\t\t\t\"Attention Head Cnt\",\n\t\t\t\t\t\t\"Layers\",\n\t\t\t\t\t\ttenary(a.ExpertFeedForwardLength != 0, \"Expert Feed Forward Len\", \"Feed Forward Len\"),\n\t\t\t\t\t\t\"Expert Cnt\",\n\t\t\t\t\t\t\"Vocabulary Len\",\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t\tbds = 
[][]any{\n\t\t\t\t\t{\n\t\t\t\t\t\tsprintf(a.MaximumContextLength),\n\t\t\t\t\t\tsprintf(a.EmbeddingLength),\n\t\t\t\t\t\tsprintf(a.AttentionCausal),\n\t\t\t\t\t\tsprintf(tenary(\n\t\t\t\t\t\t\ta.AttentionHeadCountKV == 0 || a.AttentionHeadCountKV == a.AttentionHeadCount,\n\t\t\t\t\t\t\t\"N/A\",\n\t\t\t\t\t\t\ta.AttentionHeadCount)),\n\t\t\t\t\t\tsprintf(a.BlockCount),\n\t\t\t\t\t\tsprintf(tenaryFunc(\n\t\t\t\t\t\t\ta.ExpertFeedForwardLength != 0,\n\t\t\t\t\t\t\tfunc() any {\n\t\t\t\t\t\t\t\treturn a.ExpertFeedForwardLength\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t\tfunc() any {\n\t\t\t\t\t\t\t\tswitch {\n\t\t\t\t\t\t\t\tcase len(a.FeedForwardLength) == 0:\n\t\t\t\t\t\t\t\t\treturn \"N/A\"\n\t\t\t\t\t\t\t\tcase len(a.FeedForwardLength) == 1:\n\t\t\t\t\t\t\t\t\treturn a.FeedForwardLength[0]\n\t\t\t\t\t\t\t\tcase a.FeedForwardLength[0] == a.FeedForwardLength[1]:\n\t\t\t\t\t\t\t\t\treturn a.FeedForwardLength[0]\n\t\t\t\t\t\t\t\tdefault:\n\t\t\t\t\t\t\t\t\treturn sprintf(\"[%d, %d, ...]\", a.FeedForwardLength[0], a.FeedForwardLength[1])\n\t\t\t\t\t\t\t\t}\n\t\t\t\t\t\t\t},\n\t\t\t\t\t\t)),\n\t\t\t\t\t\tsprintf(a.ExpertCount),\n\t\t\t\t\t\tsprintf(a.VocabularyLength),\n\t\t\t\t\t},\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\ttprint(\n\t\t\t\"ARCHITECTURE\",\n\t\t\thds,\n\t\t\tbds)\n\t}\n\n\tif !skipTokenizer {\n\t\ttprint(\n\t\t\t\"TOKENIZER\",\n\t\t\t[][]any{\n\t\t\t\t{\n\t\t\t\t\t\"Model\",\n\t\t\t\t\t\"Tokens Size\",\n\t\t\t\t\t\"Tokens Len\",\n\t\t\t\t\t\"Added Tokens Len\",\n\t\t\t\t\t\"BOS Token\",\n\t\t\t\t\t\"EOS Token\",\n\t\t\t\t\t\"EOT Token\",\n\t\t\t\t\t\"EOM Token\",\n\t\t\t\t\t\"Unknown Token\",\n\t\t\t\t\t\"Separator Token\",\n\t\t\t\t\t\"Padding Token\",\n\t\t\t\t},\n\t\t\t},\n\t\t\t[][]any{\n\t\t\t\t{\n\t\t\t\t\tt.Model,\n\t\t\t\t\tsprintf(tenary(t.TokensSize <= 0, \"N/A\", GGUFBytesScalar(t.TokensSize))),\n\t\t\t\t\tsprintf(tenary(t.TokensLength <= 0, \"N/A\", t.TokensLength)),\n\t\t\t\t\tsprintf(tenary(t.AddedTokensLength <= 0, \"N/A\", 
t.AddedTokensLength)),\n\t\t\t\t\tsprintf(tenary(t.BOSTokenID < 0, \"N/A\", t.BOSTokenID)),\n\t\t\t\t\tsprintf(tenary(t.EOSTokenID < 0, \"N/A\", t.EOSTokenID)),\n\t\t\t\t\tsprintf(tenary(t.EOTTokenID < 0, \"N/A\", t.EOTTokenID)),\n\t\t\t\t\tsprintf(tenary(t.EOMTokenID < 0, \"N/A\", t.EOMTokenID)),\n\t\t\t\t\tsprintf(tenary(t.UnknownTokenID < 0, \"N/A\", t.UnknownTokenID)),\n\t\t\t\t\tsprintf(tenary(t.SeparatorTokenID < 0, \"N/A\", t.SeparatorTokenID)),\n\t\t\t\t\tsprintf(tenary(t.PaddingTokenID < 0, \"N/A\", t.PaddingTokenID)),\n\t\t\t\t},\n\t\t\t})\n\t}\n\n\tif !skipEstimate && m.Architecture != \"diffusion\" {\n\t\thds := make([][]any, 2)\n\t\tlmes := lme.Summarize(mmap, platformRAM, platformVRAM)\n\t\tif !inShort {\n\t\t\thds[0] = []any{\n\t\t\t\t\"Arch\",\n\t\t\t\t\"Context Size\",\n\t\t\t\t\"Batch Size (L / P)\",\n\t\t\t\t\"Flash Attention\",\n\t\t\t\t\"MMap Load\",\n\t\t\t\t\"Embedding Only\",\n\t\t\t\t\"Reranking\",\n\t\t\t\t\"Distributable\",\n\t\t\t\t\"Offload Layers\",\n\t\t\t\t\"Full Offloaded\",\n\t\t\t}\n\t\t\thds[1] = []any{\n\t\t\t\t\"Arch\",\n\t\t\t\t\"Context Size\",\n\t\t\t\t\"Batch Size (L / P)\",\n\t\t\t\t\"Flash Attention\",\n\t\t\t\t\"MMap Load\",\n\t\t\t\t\"Embedding Only\",\n\t\t\t\t\"Reranking\",\n\t\t\t\t\"Distributable\",\n\t\t\t\t\"Offload Layers\",\n\t\t\t\t\"Full Offloaded\",\n\t\t\t}\n\t\t}\n\t\tif lmes.Items[0].MaximumTokensPerSecond != nil {\n\t\t\thds[0] = append(hds[0], \"Max TPS\")\n\t\t\thds[1] = append(hds[1], \"Max TPS\")\n\t\t}\n\t\thds[0] = append(hds[0], \"RAM\", \"RAM\", \"RAM\")\n\t\thds[1] = append(hds[1], \"Layers (I + T + O)\", \"UMA\", \"NonUMA\")\n\t\tfor _, v := range lmes.Items[0].VRAMs {\n\t\t\tvar hd string\n\t\t\tif v.Remote {\n\t\t\t\thd = fmt.Sprintf(\"RPC %d (V)RAM\", v.Position)\n\t\t\t} else {\n\t\t\t\thd = fmt.Sprintf(\"VRAM %d\", v.Position)\n\t\t\t}\n\t\t\thds[0] = append(hds[0], hd, hd, hd)\n\t\t\thds[1] = append(hds[1], \"Layers (T + O)\", \"UMA\", \"NonUMA\")\n\t\t}\n\n\t\tswitch {\n\t\tcase 
lmcOffloadLayersStep > lme.OffloadLayers:\n\t\t\tlmcOffloadLayersStep = lme.OffloadLayers\n\t\tcase lmcOffloadLayersStep <= 0:\n\t\t\tlmcOffloadLayersStep = lme.OffloadLayers\n\t\t}\n\t\tif lmcOffloadLayersStep < lme.OffloadLayers {\n\t\t\tcnt := lme.OffloadLayers/lmcOffloadLayersStep + 1\n\t\t\tif lme.OffloadLayers%lmcOffloadLayersStep != 0 || lme.FullOffloaded {\n\t\t\t\tcnt++\n\t\t\t}\n\t\t\tesis := make([]LLaMACppRunEstimateSummaryItem, cnt)\n\t\t\tvar wg sync.WaitGroup\n\t\t\tfor i := 0; i < cap(esis); i++ {\n\t\t\t\twg.Add(1)\n\t\t\t\tgo func(i int) {\n\t\t\t\t\tdefer wg.Done()\n\t\t\t\t\tlmeopts := eopts[:len(eopts):len(eopts)]\n\t\t\t\t\tlmeopts = append(lmeopts, WithLLaMACppOffloadLayers(uint64(i)*lmcOffloadLayersStep))\n\t\t\t\t\tesis[i] = gf.EstimateLLaMACppRun(lmeopts...).SummarizeItem(mmap, platformRAM, platformVRAM)\n\t\t\t\t}(i)\n\t\t\t}\n\t\t\twg.Wait()\n\t\t\tesis[cap(esis)-1] = lmes.Items[0]\n\t\t\tlmes.Items = esis\n\t\t}\n\n\t\tbds := make([][]any, len(lmes.Items))\n\t\tfor i := range lmes.Items {\n\t\t\tif !inShort {\n\t\t\t\tbds[i] = []any{\n\t\t\t\t\tsprintf(tenary(lmes.Architecture != \"\", lmes.Architecture, \"N/A\")),\n\t\t\t\t\tsprintf(lmes.ContextSize),\n\t\t\t\t\tsprintf(\"%d / %d\", lmes.LogicalBatchSize, lmes.PhysicalBatchSize),\n\t\t\t\t\tsprintf(tenary(flashAttention, tenary(lmes.FlashAttention, \"Enabled\", \"Unsupported\"), \"Disabled\")),\n\t\t\t\t\tsprintf(tenary(mmap, tenary(!lmes.NoMMap, \"Enabled\", \"Unsupported\"), \"Disabled\")),\n\t\t\t\t\tsprintf(tenary(lmes.EmbeddingOnly, \"Yes\", \"No\")),\n\t\t\t\t\tsprintf(tenary(lmes.Reranking, \"Supported\", \"Unsupported\")),\n\t\t\t\t\tsprintf(tenary(lmes.Architecture != \"\" && lmes.Distributable, \"Supported\", \"Unsupported\")),\n\t\t\t\t\tsprintf(tenary(lmes.Items[i].FullOffloaded, sprintf(\"%d (%d + 1)\",\n\t\t\t\t\t\tlmes.Items[i].OffloadLayers, lmes.Items[i].OffloadLayers-1), lmes.Items[i].OffloadLayers)),\n\t\t\t\t\tsprintf(tenary(lmes.Items[i].FullOffloaded, \"Yes\", 
\"No\")),\n\t\t\t\t}\n\t\t\t}\n\t\t\tif lmes.Items[i].MaximumTokensPerSecond != nil {\n\t\t\t\tbds[i] = append(bds[i],\n\t\t\t\t\tsprintf(*lmes.Items[i].MaximumTokensPerSecond))\n\t\t\t}\n\t\t\tbds[i] = append(bds[i],\n\t\t\t\tsprintf(\"1 + %d + %d\", lmes.Items[i].RAM.HandleLayers, tenary(lmes.Items[i].RAM.HandleOutputLayer, 1, 0)),\n\t\t\t\tsprintf(lmes.Items[i].RAM.UMA),\n\t\t\t\tsprintf(lmes.Items[i].RAM.NonUMA))\n\t\t\tfor _, v := range lmes.Items[i].VRAMs {\n\t\t\t\tbds[i] = append(bds[i],\n\t\t\t\t\tsprintf(\"%d + %d\", v.HandleLayers, tenary(v.HandleOutputLayer, 1, 0)),\n\t\t\t\t\tsprintf(v.UMA),\n\t\t\t\t\tsprintf(v.NonUMA))\n\t\t\t}\n\t\t}\n\n\t\ttprint(\n\t\t\t\"ESTIMATE\",\n\t\t\thds,\n\t\t\tbds)\n\t}\n\n\tif !skipEstimate && m.Architecture == \"diffusion\" {\n\t\thds := make([][]any, 2)\n\t\tsdes := sde.Summarize(mmap, platformRAM, platformVRAM)\n\t\tif !inShort {\n\t\t\thds[0] = []any{\n\t\t\t\t\"Arch\",\n\t\t\t\t\"Flash Attention\",\n\t\t\t\t\"MMap Load\",\n\t\t\t\t\"Distributable\",\n\t\t\t\t\"Full Offloaded\",\n\t\t\t}\n\t\t\thds[1] = []any{\n\t\t\t\t\"Arch\",\n\t\t\t\t\"Flash Attention\",\n\t\t\t\t\"MMap Load\",\n\t\t\t\t\"Distributable\",\n\t\t\t\t\"Full Offloaded\",\n\t\t\t}\n\t\t}\n\t\thds[0] = append(hds[0], \"RAM\", \"RAM\")\n\t\thds[1] = append(hds[1], \"UMA\", \"NonUMA\")\n\t\tfor _, v := range sdes.Items[0].VRAMs {\n\t\t\tvar hd string\n\t\t\tif v.Remote {\n\t\t\t\thd = fmt.Sprintf(\"RPC %d (V)RAM\", v.Position)\n\t\t\t} else {\n\t\t\t\thd = fmt.Sprintf(\"VRAM %d\", v.Position)\n\t\t\t}\n\t\t\thds[0] = append(hds[0], hd, hd)\n\t\t\thds[1] = append(hds[1], \"UMA\", \"NonUMA\")\n\t\t}\n\n\t\tbds := make([][]any, len(sdes.Items))\n\t\tfor i := range sdes.Items {\n\t\t\tif !inShort {\n\t\t\t\tbds[i] = []any{\n\t\t\t\t\tsprintf(tenary(sdes.Architecture != \"\", sdes.Architecture, \"N/A\")),\n\t\t\t\t\tsprintf(tenary(flashAttention, tenary(sdes.FlashAttention, \"Enabled\", \"Unsupported\"), \"Disabled\")),\n\t\t\t\t\tsprintf(tenary(mmap, 
tenary(!sdes.NoMMap, \"Enabled\", \"Unsupported\"), \"Disabled\")),\n\t\t\t\t\tsprintf(tenary(sdes.Architecture != \"\" && sdes.Distributable, \"Supported\", \"Unsupported\")),\n\t\t\t\t\tsprintf(tenary(sdes.Items[i].FullOffloaded, \"Yes\", \"No\")),\n\t\t\t\t}\n\t\t\t}\n\t\t\tbds[i] = append(bds[i],\n\t\t\t\tsprintf(sdes.Items[i].RAM.UMA),\n\t\t\t\tsprintf(sdes.Items[i].RAM.NonUMA))\n\t\t\tfor _, v := range sdes.Items[i].VRAMs {\n\t\t\t\tbds[i] = append(bds[i],\n\t\t\t\t\tsprintf(v.UMA),\n\t\t\t\t\tsprintf(v.NonUMA))\n\t\t\t}\n\t\t}\n\n\t\ttprint(\n\t\t\t\"ESTIMATE\",\n\t\t\thds,\n\t\t\tbds)\n\t}\n\n\treturn nil\n}\n\nfunc sprintf(f any, a ...any) string {\n\tif v, ok := f.(string); ok {\n\t\tif len(a) != 0 {\n\t\t\treturn fmt.Sprintf(v, a...)\n\t\t}\n\t\treturn v\n\t}\n\treturn anyx.String(f)\n}\n\nfunc tprint(title string, headers, bodies [][]any) {\n\ttw := table.NewWriter()\n\ttw.SetOutputMirror(os.Stdout)\n\ttw.SetTitle(strings.ToUpper(title))\n\tfor i := range headers {\n\t\ttw.AppendHeader(headers[i], table.RowConfig{AutoMerge: true, AutoMergeAlign: text.AlignCenter})\n\t}\n\tfor i := range bodies {\n\t\ttw.AppendRow(bodies[i])\n\t}\n\ttw.SetColumnConfigs(func() (r []table.ColumnConfig) {\n\t\tr = make([]table.ColumnConfig, len(headers[0]))\n\t\tfor i := range r {\n\t\t\tr[i].Number = i + 1\n\t\t\tr[i].AutoMerge = true\n\t\t\tif len(headers) > 1 && (strings.HasPrefix(headers[1][i].(string), \"Layers\") || headers[1][i] == \"UMA\" || headers[1][i] == \"NonUMA\") {\n\t\t\t\tr[i].AutoMerge = false\n\t\t\t}\n\t\t\tr[i].Align = text.AlignCenter\n\t\t\tr[i].AlignHeader = text.AlignCenter\n\t\t}\n\t\treturn r\n\t}())\n\ttw.Style().Options.SeparateRows = true\n\ttw.Render()\n\tfmt.Println()\n}\n\nfunc tenary(c bool, t, f any) any {\n\tif c {\n\t\treturn t\n\t}\n\treturn f\n}\n\nfunc tenaryFunc(c bool, t, f func() any) any {\n\tif c {\n\t\treturn t()\n\t}\n\treturn f()\n}\n\nfunc toGGMLType(s string) GGMLType {\n\tt := GGMLTypeF16\n\tswitch s {\n\tcase 
\"f32\":\n\t\tt = GGMLTypeF32\n\tcase \"f16\":\n\t\tt = GGMLTypeF16\n\tcase \"q8_0\":\n\t\tt = GGMLTypeQ8_0\n\tcase \"q4_0\":\n\t\tt = GGMLTypeQ4_0\n\tcase \"q4_1\":\n\t\tt = GGMLTypeQ4_1\n\tcase \"iq4_nl\":\n\t\tt = GGMLTypeIQ4_NL\n\tcase \"q5_0\":\n\t\tt = GGMLTypeQ5_0\n\tcase \"q5_1\":\n\t\tt = GGMLTypeQ5_1\n\t}\n\treturn t\n}\n"
  },
  {
    "path": "file.go",
    "content": "package gguf_parser\n\nimport (\n\t\"bytes\"\n\t\"encoding/binary\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"regexp\"\n\t\"strings\"\n\n\t\"golang.org/x/exp/constraints\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/anyx\"\n\t\"github.com/gpustack/gguf-parser-go/util/bytex\"\n\t\"github.com/gpustack/gguf-parser-go/util/funcx\"\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n\t\"github.com/gpustack/gguf-parser-go/util/stringx\"\n)\n\n// GGUFFile represents a GGUF file,\n// see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure.\n//\n// Compared with the complete GGUF file,\n// this structure lacks the tensor data part.\ntype GGUFFile struct {\n\t/* Basic */\n\n\t// Header is the header of the GGUF file.\n\tHeader GGUFHeader `json:\"header\"`\n\t// TensorInfos are the tensor infos of the GGUF file,\n\t// the size of TensorInfos is equal to `Header.TensorCount`.\n\tTensorInfos GGUFTensorInfos `json:\"tensorInfos\"`\n\t// Padding is the padding size of the GGUF file,\n\t// which is used to split Header and TensorInfos from tensor data.\n\tPadding int64 `json:\"padding\"`\n\t// SplitPaddings holds the padding size slice of the GGUF file splits,\n\t// each item represents splitting Header and TensorInfos from tensor data.\n\t//\n\t// The length of SplitPaddings is the number of split files.\n\tSplitPaddings []int64 `json:\"splitPaddings,omitempty\"`\n\t// TensorDataStartOffset is the offset in bytes of the tensor data in this file.\n\t//\n\t// The offset is the start of the file.\n\tTensorDataStartOffset int64 `json:\"tensorDataStartOffset\"`\n\t// SplitTensorDataStartOffsets holds the offset slice in bytes of the tensor data of the GGUF file splits,\n\t// each item represents the offset of the tensor data in the split file.\n\t//\n\t// The length of SplitTensorDataStartOffsets is the number of split files.\n\tSplitTensorDataStartOffsets []int64 `json:\"splitTensorDataStartOffsets,omitempty\"`\n\n\t/* Appendix */\n\n\t// Size 
is the size of the GGUF file,\n\t// if the file is split, the size is the sum of all split files.\n\tSize GGUFBytesScalar `json:\"size\"`\n\t// SplitSizes holds the size slice of the GGUF file splits,\n\t// each item represents the size of the split file.\n\t//\n\t// The length of SplitSizes is the number of split files.\n\tSplitSizes []GGUFBytesScalar `json:\"splitSizes,omitempty\"`\n\t// ModelSize is the size of the model when loading.\n\tModelSize GGUFBytesScalar `json:\"modelSize\"`\n\t// SplitModelSizes holds the size slice of the model,\n\t// each item represents a size when loading of the split file.\n\t//\n\t// The length of SplitModelSizes is the number of split files.\n\tSplitModelSizes []GGUFBytesScalar `json:\"splitModelSizes,omitempty\"`\n\t// ModelParameters is the number of the model parameters.\n\tModelParameters GGUFParametersScalar `json:\"modelParameters\"`\n\t// ModelBitsPerWeight is the bits per weight of the model,\n\t// which describes how many bits are used to store a weight,\n\t// higher is better.\n\tModelBitsPerWeight GGUFBitsPerWeightScalar `json:\"modelBitsPerWeight\"`\n}\n\n// GGUFMagic is a magic number of GGUF file,\n// see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#historical-state-of-affairs.\ntype GGUFMagic uint32\n\n// GGUFMagic constants.\nconst (\n\tGGUFMagicGGML   GGUFMagic = 0x67676d6c\n\tGGUFMagicGGMF   GGUFMagic = 0x67676d66\n\tGGUFMagicGGJT   GGUFMagic = 0x67676a74\n\tGGUFMagicGGUFLe GGUFMagic = 0x46554747 // GGUF\n\tGGUFMagicGGUFBe GGUFMagic = 0x47475546 // GGUF\n)\n\n// GGUFVersion is a version of GGUF file format,\n// see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#version-history.\ntype GGUFVersion uint32\n\n// GGUFVersion constants.\nconst (\n\tGGUFVersionV1 GGUFVersion = iota + 1\n\tGGUFVersionV2\n\tGGUFVersionV3\n)\n\n// GGUFHeader represents the header of a GGUF file.\ntype GGUFHeader struct {\n\t// Magic is a magic number that announces that this is a GGUF file.\n\tMagic GGUFMagic 
`json:\"magic\"`\n\t// Version is a version of the GGUF file format.\n\tVersion GGUFVersion `json:\"version\"`\n\t// TensorCount is the number of tensors in the file.\n\tTensorCount uint64 `json:\"tensorCount\"`\n\t// MetadataKVCount is the number of key-value pairs in the metadata.\n\tMetadataKVCount uint64 `json:\"metadataKVCount\"`\n\t// MetadataKV are the key-value pairs in the metadata,\n\tMetadataKV GGUFMetadataKVs `json:\"metadataKV\"`\n}\n\n// GGUFMetadataValueType is a type of GGUF metadata value,\n// see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure.\ntype GGUFMetadataValueType uint32\n\n// GGUFMetadataValueType constants.\nconst (\n\tGGUFMetadataValueTypeUint8 GGUFMetadataValueType = iota\n\tGGUFMetadataValueTypeInt8\n\tGGUFMetadataValueTypeUint16\n\tGGUFMetadataValueTypeInt16\n\tGGUFMetadataValueTypeUint32\n\tGGUFMetadataValueTypeInt32\n\tGGUFMetadataValueTypeFloat32\n\tGGUFMetadataValueTypeBool\n\tGGUFMetadataValueTypeString\n\tGGUFMetadataValueTypeArray\n\tGGUFMetadataValueTypeUint64\n\tGGUFMetadataValueTypeInt64\n\tGGUFMetadataValueTypeFloat64\n\t_GGUFMetadataValueTypeCount // Unknown\n)\n\n// Types for GGUFMetadataKV.\ntype (\n\t// GGUFMetadataKV is a key-value pair in the metadata of a GGUF file.\n\tGGUFMetadataKV struct {\n\t\t// Key is the key of the metadata key-value pair,\n\t\t// which is no larger than 64 bytes long.\n\t\tKey string `json:\"key\"`\n\t\t// ValueType is the type of the metadata value.\n\t\tValueType GGUFMetadataValueType `json:\"valueType\"`\n\t\t// Value is the value of the metadata key-value pair.\n\t\tValue any `json:\"value\"`\n\t}\n\n\t// GGUFMetadataKVArrayValue is a value of a GGUFMetadataKV with type GGUFMetadataValueTypeArray.\n\tGGUFMetadataKVArrayValue struct {\n\t\t/* Basic */\n\n\t\t// Type is the type of the array item.\n\t\tType GGUFMetadataValueType `json:\"type\"`\n\t\t// Len is the length of the array.\n\t\tLen uint64 `json:\"len\"`\n\t\t// Array holds all array items.\n\t\tArray 
[]any `json:\"array,omitempty\"`\n\n\t\t/* Appendix */\n\n\t\t// StartOffset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file.\n\t\t//\n\t\t// The offset is the start of the file.\n\t\tStartOffset int64 `json:\"startOffset\"`\n\n\t\t// Size is the size of the array in bytes.\n\t\tSize int64 `json:\"size\"`\n\t}\n\n\t// GGUFMetadataKVs is a list of GGUFMetadataKV.\n\tGGUFMetadataKVs []GGUFMetadataKV\n)\n\n// Types for GGUFTensorInfo.\ntype (\n\t// GGUFTensorInfo represents a tensor info in a GGUF file.\n\tGGUFTensorInfo struct {\n\t\t/* Basic */\n\n\t\t// Name is the name of the tensor,\n\t\t// which is no larger than 64 bytes long.\n\t\tName string `json:\"name\"`\n\t\t// NDimensions is the number of dimensions of the tensor.\n\t\tNDimensions uint32 `json:\"nDimensions\"`\n\t\t// Dimensions is the dimensions of the tensor,\n\t\t// the length is NDimensions.\n\t\tDimensions []uint64 `json:\"dimensions\"`\n\t\t// Type is the type of the tensor.\n\t\tType GGMLType `json:\"type\"`\n\t\t// Offset is the offset in bytes of the tensor's data in this file.\n\t\t//\n\t\t// The offset is relative to tensor data, not to the start of the file.\n\t\tOffset uint64 `json:\"offset\"`\n\n\t\t/* Appendix */\n\n\t\t// StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file.\n\t\t//\n\t\t// The offset is the start of the file.\n\t\tStartOffset int64 `json:\"startOffset\"`\n\t}\n\n\t// GGUFTensorInfos is a list of GGUFTensorInfo.\n\tGGUFTensorInfos []GGUFTensorInfo\n)\n\nvar ErrGGUFFileInvalidFormat = errors.New(\"invalid GGUF format\")\n\n// ParseGGUFFile parses a GGUF file from the local given path,\n// and returns the GGUFFile, or an error if any.\nfunc ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, error) {\n\tvar o _GGUFReadOptions\n\tfor _, opt := range opts {\n\t\topt(&o)\n\t}\n\n\tvar paths []string\n\t{\n\t\trs := CompleteShardGGUFFilename(path)\n\t\tif rs != nil {\n\t\t\tpaths = rs\n\t\t} else {\n\t\t\tpaths = 
[]string{path}\n\t\t}\n\t}\n\n\tfs := make([]_GGUFFileReadSeeker, 0, len(paths))\n\tdefer func() {\n\t\tfor i := range fs {\n\t\t\tosx.Close(fs[i])\n\t\t}\n\t}()\n\n\tfor i := range paths {\n\t\tif o.MMap {\n\t\t\tmf, err := osx.OpenMmapFile(paths[i])\n\t\t\tif err != nil {\n\t\t\t\treturn nil, fmt.Errorf(\"open mmap file: %w\", err)\n\t\t\t}\n\n\t\t\tfs = append(fs, _GGUFFileReadSeeker{\n\t\t\t\tCloser:     mf,\n\t\t\t\tReadSeeker: io.NewSectionReader(mf, 0, mf.Len()),\n\t\t\t\tSize:       mf.Len(),\n\t\t\t})\n\n\t\t\tcontinue\n\t\t}\n\n\t\tff, err := osx.Open(paths[i])\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"open file: %w\", err)\n\t\t}\n\n\t\tfs = append(fs, _GGUFFileReadSeeker{\n\t\t\tCloser:     ff,\n\t\t\tReadSeeker: ff,\n\t\t\tSize:       funcx.MustNoError(ff.Stat()).Size(),\n\t\t})\n\t}\n\n\treturn parseGGUFFile(fs, o)\n}\n\ntype _GGUFFileReadSeeker struct {\n\tio.Closer\n\tio.ReadSeeker\n\tSize int64\n}\n\nfunc _validateCountWithRemaining(f _GGUFFileReadSeeker, count uint64, version GGUFVersion, what string) error {\n\tif count == 0 {\n\t\treturn nil\n\t}\n\tvar minItemSize int64\n\n\tswitch strings.ToLower(what) {\n\tcase \"metadatakvcount\":\n\t\tif version <= GGUFVersionV1 {\n\t\t\tminItemSize = 12 // key length (uint32) + value type (uint32) + min value (string length uint32)\n\t\t} else {\n\t\t\tminItemSize = 20 // key length (uint64) + value type (uint32) + min value (string length uint64)\n\t\t}\n\tcase \"tensor\":\n\t\tif version <= GGUFVersionV1 {\n\t\t\tminItemSize = 20 // name length (uint32) + n_dims (uint32) + type (uint32) + offset (uint64)\n\t\t} else {\n\t\t\tminItemSize = 24 // name length (uint64) + n_dims (uint32) + type (uint32) + offset (uint64)\n\t\t}\n\t}\n\n\tif minItemSize <= 0 {\n\t\treturn fmt.Errorf(\"invalid min item size for %s: %d\", what, minItemSize)\n\t}\n\tpos, err := f.Seek(0, io.SeekCurrent)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"seek %s count position: %w\", what, err)\n\t}\n\tremaining := f.Size - 
pos\n\tif remaining < 0 {\n\t\treturn fmt.Errorf(\"invalid file size: %d\", f.Size)\n\t}\n\tmaxCount := uint64(remaining / minItemSize)\n\tif maxCount < count {\n\t\treturn fmt.Errorf(\"%s count too large for remaining bytes: %d\", what, count)\n\t}\n\n\treturn nil\n}\n\nfunc parseGGUFFile(fs []_GGUFFileReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, err error) {\n\tvar gf GGUFFile\n\n\tfor _, f := range fs {\n\t\tvar bo binary.ByteOrder = binary.LittleEndian\n\n\t\t// magic\n\t\tvar magic GGUFMagic\n\t\tif err = binary.Read(f, bo, &magic); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"read magic: %w\", err)\n\t\t}\n\t\tswitch magic {\n\t\tdefault:\n\t\t\treturn nil, ErrGGUFFileInvalidFormat\n\t\tcase GGUFMagicGGML, GGUFMagicGGMF, GGUFMagicGGJT:\n\t\t\treturn nil, fmt.Errorf(\"unsupported format: %s\", magic)\n\t\tcase GGUFMagicGGUFLe:\n\t\tcase GGUFMagicGGUFBe:\n\t\t\tbo = binary.BigEndian\n\t\t}\n\t\tgf.Header.Magic = magic\n\n\t\t// version\n\t\tvar version GGUFVersion\n\t\tif err = binary.Read(f, bo, &version); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"read version: %w\", err)\n\t\t}\n\t\tif version > GGUFVersionV3 {\n\t\t\treturn nil, fmt.Errorf(\"unsupported GGUF version: %d (supported: %d-%d)\",\n\t\t\t\tversion, GGUFVersionV1, GGUFVersionV3)\n\t\t}\n\t\tgf.Header.Version = version\n\n\t\trd := _GGUFReader{v: version, o: o, f: f, bo: bo}\n\n\t\t// tensor count\n\t\tvar tensorCount uint64\n\t\tif version <= GGUFVersionV1 {\n\t\t\ttensorCount, err = rd.ReadUint64FromUint32()\n\t\t} else {\n\t\t\ttensorCount, err = rd.ReadUint64()\n\t\t}\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"read tensor count: %w\", err)\n\t\t}\n\t\tif err := _validateCountWithRemaining(f, tensorCount, version, \"tensor\"); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\tgf.Header.TensorCount += tensorCount\n\n\t\t// metadata kv count\n\t\tvar metadataKVCount uint64\n\t\tif version <= GGUFVersionV1 {\n\t\t\tmetadataKVCount, err = rd.ReadUint64FromUint32()\n\t\t} else 
{\n\t\t\tmetadataKVCount, err = rd.ReadUint64()\n\t\t}\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"read metadata kv count: %w\", err)\n\t\t}\n\t\tif err := _validateCountWithRemaining(f, metadataKVCount, version, \"metadatakvcount\"); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\tgf.Header.MetadataKVCount += metadataKVCount\n\n\t\t// metadata kv\n\t\t{\n\t\t\trd := _GGUFMetadataReader{_GGUFReader: rd}\n\t\t\tkvs := make(GGUFMetadataKVs, metadataKVCount)\n\t\t\tfor i := uint64(0); i < metadataKVCount; i++ {\n\t\t\t\tkvs[i], err = rd.Read()\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn nil, fmt.Errorf(\"read metadata kv %d: %w\", i, err)\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor i := range kvs {\n\t\t\t\tif kvs[i].Key == \"split.no\" {\n\t\t\t\t\tgf.Header.MetadataKVCount--\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t\tgf.Header.MetadataKV = append(gf.Header.MetadataKV, kvs[i])\n\t\t\t}\n\t\t}\n\n\t\t// tensor infos\n\t\tif gf.TensorInfos == nil {\n\t\t\ttc, ok := gf.Header.MetadataKV.Get(\"split.tensors.count\")\n\t\t\tif ok {\n\t\t\t\tgf.TensorInfos = make(GGUFTensorInfos, 0, anyx.Number[int](tc.Value))\n\t\t\t} else {\n\t\t\t\t// avoid preallocating with tensorCount (could be huge); start empty and append\n\t\t\t\tgf.TensorInfos = make(GGUFTensorInfos, 0)\n\t\t\t}\n\t\t}\n\t\t{\n\t\t\trd := _GGUFTensorInfoReader{_GGUFReader: rd}\n\t\t\ttis := make(GGUFTensorInfos, 0)\n\t\t\tfor i := uint64(0); i < tensorCount; i++ {\n\t\t\t\tti, err := rd.Read()\n\t\t\t\tif err != nil {\n\t\t\t\t\treturn nil, fmt.Errorf(\"read tensor info %d: %w\", i, err)\n\t\t\t\t}\n\t\t\t\ttis = append(tis, ti)\n\t\t\t}\n\t\t\tgf.TensorInfos = append(gf.TensorInfos, tis...)\n\t\t}\n\n\t\tpds, err := f.Seek(0, io.SeekCurrent)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"seek padding start: %w\", err)\n\t\t}\n\n\t\t// padding\n\t\tvar padding int64\n\t\t{\n\t\t\t// The global alignment to use, as described above.\n\t\t\t// This can vary to allow for different alignment schemes, but it must be a 
multiple of 8.\n\t\t\t// Some writers may not write the alignment.\n\t\t\t// If the alignment is not specified, assume it is 32.\n\t\t\tvar ag uint32 = 32\n\t\t\tif v, ok := gf.Header.MetadataKV.Get(\"general.alignment\"); ok {\n\t\t\t\tag = v.ValueUint32()\n\t\t\t}\n\t\t\tpadding = int64(ag) - (pds % int64(ag))\n\t\t}\n\t\tif len(fs) == 1 {\n\t\t\tgf.Padding = padding\n\t\t}\n\t\tgf.SplitPaddings = append(gf.SplitPaddings, padding)\n\n\t\t// tensor data offset\n\t\ttensorDataStartOffset := pds + padding\n\t\tif len(fs) == 1 {\n\t\t\tgf.TensorDataStartOffset = tensorDataStartOffset\n\t\t}\n\t\tgf.SplitTensorDataStartOffsets = append(gf.SplitTensorDataStartOffsets, tensorDataStartOffset)\n\n\t\t// size\n\t\tsize := GGUFBytesScalar(f.Size)\n\t\tgf.Size += size\n\t\tgf.SplitSizes = append(gf.SplitSizes, size)\n\n\t\t// model size\n\t\tmodelSize := GGUFBytesScalar(f.Size - tensorDataStartOffset)\n\t\tgf.ModelSize += modelSize\n\t\tgf.SplitModelSizes = append(gf.SplitModelSizes, modelSize)\n\t}\n\n\t// model parameters\n\tgf.ModelParameters = GGUFParametersScalar(gf.TensorInfos.Elements())\n\n\t// bpw\n\tif gf.ModelParameters != 0 {\n\t\tgf.ModelBitsPerWeight = GGUFBitsPerWeightScalar(float64(gf.ModelSize) * 8 / float64(gf.ModelParameters))\n\t}\n\n\treturn &gf, nil\n}\n\n// Types for GGUF hierarchical tensors.\ntype (\n\t// GGUFTensorInfoFilter is a filter to filter out if the given tensor name matches.\n\t// Return true if the name matches, and false otherwise.\n\tGGUFTensorInfoFilter func(name string) bool\n\n\t// IGGUFTensorInfos is an interface for GGUF tensor infos,\n\t// which includes basic operations.\n\tIGGUFTensorInfos interface {\n\t\t// Get returns the GGUFTensorInfo with the given name,\n\t\t// and true if found, and false otherwise.\n\t\tGet(name string) (info GGUFTensorInfo, found bool)\n\t\t// GetFileType returns the GGUFFileType.\n\t\tGetFileType() GGUFFileType\n\t\t// Match returns true if the name matches the given regex, and false 
otherwise.\n\t\tMatch(nameRegex *regexp.Regexp) bool\n\t\t// Search returns a list of GGUFTensorInfo with the names that match the given regex.\n\t\tSearch(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo)\n\t\t// Index returns a map value to the GGUFTensorInfo with the given names,\n\t\t// and the number of names found.\n\t\tIndex(names []string) (infos map[string]GGUFTensorInfo, found int)\n\t\t// Elements returns the number of elements(parameters).\n\t\tElements(filter ...GGUFTensorInfoFilter) uint64\n\t\t// Bytes returns the number of bytes.\n\t\tBytes(filter ...GGUFTensorInfoFilter) uint64\n\t\t// Count returns the number of tensors.\n\t\tCount() uint64\n\t}\n\n\t// GGUFLayerTensorInfos represents hierarchical tensor infos of a GGUF file,\n\t// it can save GGUFNamedTensorInfos, GGUFTensorInfos, and GGUFTensorInfo.\n\tGGUFLayerTensorInfos []IGGUFTensorInfos\n\n\t// GGUFNamedTensorInfos is the namespace for relevant tensors,\n\t// which must has a name.\n\tGGUFNamedTensorInfos struct {\n\t\t// Name is the name of the namespace.\n\t\tName string `json:\"name\"`\n\t\t// GGUFLayerTensorInfos can save GGUFNamedTensorInfos, GGUFTensorInfos, or GGUFTensorInfo.\n\t\t//\n\t\t// If the item is type of GGUFTensorInfo, it must be the leaf node.\n\t\t//\n\t\t// Any branch nodes are type of GGUFNamedTensorInfos or GGUFTensorInfos,\n\t\t// which can be nested.\n\t\t//\n\t\t// Branch nodes store in type pointer.\n\t\tGGUFLayerTensorInfos `json:\"items,omitempty\"`\n\t}\n)\n\n// Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos.\nfunc (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos {\n\treturn gf.TensorInfos.Layers(ignores...)\n}\n\nfunc (kv GGUFMetadataKV) ValueUint8() uint8 {\n\tif kv.ValueType != GGUFMetadataValueTypeUint8 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Uint8 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[uint8](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueInt8() int8 {\n\tif kv.ValueType != 
GGUFMetadataValueTypeInt8 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Int8 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[int8](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueUint16() uint16 {\n\tif kv.ValueType != GGUFMetadataValueTypeUint16 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Uint16 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[uint16](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueInt16() int16 {\n\tif kv.ValueType != GGUFMetadataValueTypeInt16 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Int16 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[int16](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueUint32() uint32 {\n\tif kv.ValueType != GGUFMetadataValueTypeUint32 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Uint32 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[uint32](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueInt32() int32 {\n\tif kv.ValueType != GGUFMetadataValueTypeInt32 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Int32 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[int32](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueFloat32() float32 {\n\tif kv.ValueType != GGUFMetadataValueTypeFloat32 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Float32 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[float32](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueBool() bool {\n\tif kv.ValueType != GGUFMetadataValueTypeBool {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Bool but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Bool(kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueString() string {\n\tif kv.ValueType != GGUFMetadataValueTypeString {\n\t\tpanic(fmt.Errorf(\"key %q try to get type String but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.String(kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueArray() GGUFMetadataKVArrayValue {\n\tif kv.ValueType != GGUFMetadataValueTypeArray {\n\t\tpanic(fmt.Errorf(\"key %q try to get type 
Array but type %v\", kv.Key, kv.ValueType))\n\t}\n\tswitch t := kv.Value.(type) {\n\tcase GGUFMetadataKVArrayValue:\n\t\treturn t\n\tcase map[string]any:\n\t\treturn GGUFMetadataKVArrayValue{\n\t\t\tType: anyx.Number[GGUFMetadataValueType](t[\"type\"]),\n\t\t\tLen:  anyx.Number[uint64](t[\"len\"]),\n\t\t\tArray: func() []any {\n\t\t\t\tif vv, ok := t[\"array\"].([]any); ok {\n\t\t\t\t\treturn vv\n\t\t\t\t}\n\t\t\t\treturn nil\n\t\t\t}(),\n\t\t\tStartOffset: anyx.Number[int64](t[\"startOffset\"]),\n\t\t\tSize:        anyx.Number[int64](t[\"size\"]),\n\t\t}\n\tdefault:\n\t\tpanic(fmt.Errorf(\"key %q try to get type Array but type %T\", kv.Key, kv.Value))\n\t}\n}\n\nfunc (kv GGUFMetadataKV) ValueUint64() uint64 {\n\tif kv.ValueType != GGUFMetadataValueTypeUint64 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Uint64 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[uint64](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueInt64() int64 {\n\tif kv.ValueType != GGUFMetadataValueTypeInt64 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Int64 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[int64](kv.Value)\n}\n\nfunc (kv GGUFMetadataKV) ValueFloat64() float64 {\n\tif kv.ValueType != GGUFMetadataValueTypeFloat64 {\n\t\tpanic(fmt.Errorf(\"key %q try to get type Float64 but type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[float64](kv.Value)\n}\n\n// ValueNumeric returns the numeric values of the GGUFMetadataKV,\n// and panics if the value type is not numeric.\n//\n// ValueNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float.\n//\n// Compare to the GGUFMetadataKV's Value* functions,\n// ValueNumeric will cast the original value to the target type.\nfunc ValueNumeric[T constraints.Integer | constraints.Float](kv GGUFMetadataKV) T {\n\tswitch kv.ValueType {\n\tcase GGUFMetadataValueTypeUint8:\n\tcase GGUFMetadataValueTypeInt8:\n\tcase GGUFMetadataValueTypeUint16:\n\tcase 
GGUFMetadataValueTypeInt16:\n\tcase GGUFMetadataValueTypeUint32:\n\tcase GGUFMetadataValueTypeInt32:\n\tcase GGUFMetadataValueTypeFloat32:\n\tcase GGUFMetadataValueTypeUint64:\n\tcase GGUFMetadataValueTypeInt64:\n\tcase GGUFMetadataValueTypeFloat64:\n\tdefault:\n\t\tpanic(fmt.Errorf(\"key %q try to get type Numeric but got type %v\", kv.Key, kv.ValueType))\n\t}\n\treturn anyx.Number[T](kv.Value)\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesUint8() []uint8 {\n\tif av.Type != GGUFMetadataValueTypeUint8 {\n\t\tpanic(fmt.Errorf(\"try to get type Uint8 but got type %v\", av.Type))\n\t}\n\tv := make([]uint8, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[uint8](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesInt8() []int8 {\n\tif av.Type != GGUFMetadataValueTypeInt8 {\n\t\tpanic(fmt.Errorf(\"try to get type Int8 but got type %v\", av.Type))\n\t}\n\tv := make([]int8, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[int8](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesUint16() []uint16 {\n\tif av.Type != GGUFMetadataValueTypeUint16 {\n\t\tpanic(fmt.Errorf(\"try to get type Uint16 but got type %v\", av.Type))\n\t}\n\tv := make([]uint16, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[uint16](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesInt16() []int16 {\n\tif av.Type != GGUFMetadataValueTypeInt16 {\n\t\tpanic(fmt.Errorf(\"try to get type Int16 but got type %v\", av.Type))\n\t}\n\tv := make([]int16, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[int16](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesUint32() []uint32 {\n\tif av.Type != GGUFMetadataValueTypeUint32 {\n\t\tpanic(fmt.Errorf(\"try to get type Uint8 but got type %v\", av.Type))\n\t}\n\tv := make([]uint32, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = 
anyx.Number[uint32](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesInt32() []int32 {\n\tif av.Type != GGUFMetadataValueTypeInt32 {\n\t\tpanic(fmt.Errorf(\"try to get type Int32 but got type %v\", av.Type))\n\t}\n\tv := make([]int32, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[int32](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesFloat32() []float32 {\n\tif av.Type != GGUFMetadataValueTypeFloat32 {\n\t\tpanic(fmt.Errorf(\"try to get type Float32 but got type %v\", av.Type))\n\t}\n\tv := make([]float32, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[float32](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesBool() []bool {\n\tif av.Type != GGUFMetadataValueTypeBool {\n\t\tpanic(fmt.Errorf(\"try to get type Bool but got type %v\", av.Type))\n\t}\n\tv := make([]bool, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Bool(av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesString() []string {\n\tif av.Type != GGUFMetadataValueTypeString {\n\t\tpanic(fmt.Errorf(\"try to get type String but got type %v\", av.Type))\n\t}\n\tv := make([]string, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.String(av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesArray() []GGUFMetadataKVArrayValue {\n\tif av.Type != GGUFMetadataValueTypeArray {\n\t\tpanic(fmt.Errorf(\"try to get type Array but got type %v\", av.Type))\n\t}\n\tv := make([]GGUFMetadataKVArrayValue, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tswitch t := av.Array[i].(type) {\n\t\tcase GGUFMetadataKVArrayValue:\n\t\t\tv[i] = t\n\t\tcase map[string]any:\n\t\t\tv[i] = GGUFMetadataKVArrayValue{\n\t\t\t\tType: anyx.Number[GGUFMetadataValueType](t[\"type\"]),\n\t\t\t\tLen:  anyx.Number[uint64](t[\"len\"]),\n\t\t\t\tArray: func() []any {\n\t\t\t\t\tif vv, ok := t[\"array\"].([]any); ok 
{\n\t\t\t\t\t\treturn vv\n\t\t\t\t\t}\n\t\t\t\t\treturn nil\n\t\t\t\t}(),\n\t\t\t\tStartOffset: anyx.Number[int64](t[\"startOffset\"]),\n\t\t\t\tSize:        anyx.Number[int64](t[\"size\"]),\n\t\t\t}\n\t\tdefault:\n\t\t\tpanic(fmt.Errorf(\"try to get type Array but got type %T\", av.Array[i]))\n\t\t}\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesUint64() []uint64 {\n\tif av.Type != GGUFMetadataValueTypeUint64 {\n\t\tpanic(fmt.Errorf(\"try to get type Uint64 but got type %v\", av.Type))\n\t}\n\tv := make([]uint64, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[uint64](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesInt64() []int64 {\n\tif av.Type != GGUFMetadataValueTypeInt64 {\n\t\tpanic(fmt.Errorf(\"try to get type Int64 but got type %v\", av.Type))\n\t}\n\tv := make([]int64, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[int64](av.Array[i])\n\t}\n\treturn v\n}\n\nfunc (av GGUFMetadataKVArrayValue) ValuesFloat64() []float64 {\n\tif av.Type != GGUFMetadataValueTypeFloat64 {\n\t\tpanic(fmt.Errorf(\"try to get type Float64 but got type %v\", av.Type))\n\t}\n\tv := make([]float64, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tv[i] = anyx.Number[float64](av.Array[i])\n\t}\n\treturn v\n}\n\n// ValuesNumeric returns the numeric values of the GGUFMetadataKVArrayValue,\n// and panics if the value type is not numeric.\n//\n// ValuesNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float.\n//\n// Compare to the GGUFMetadataKVArrayValue's Value* functions,\n// ValuesNumeric will cast the original value to the target type.\nfunc ValuesNumeric[T constraints.Integer | constraints.Float](av GGUFMetadataKVArrayValue) []T {\n\tv := make([]T, av.Len)\n\tfor i := uint64(0); i < av.Len; i++ {\n\t\tswitch av.Type {\n\t\tcase GGUFMetadataValueTypeUint8:\n\t\tcase GGUFMetadataValueTypeInt8:\n\t\tcase GGUFMetadataValueTypeUint16:\n\t\tcase 
GGUFMetadataValueTypeInt16:\n\t\tcase GGUFMetadataValueTypeUint32:\n\t\tcase GGUFMetadataValueTypeInt32:\n\t\tcase GGUFMetadataValueTypeFloat32:\n\t\tcase GGUFMetadataValueTypeUint64:\n\t\tcase GGUFMetadataValueTypeInt64:\n\t\tcase GGUFMetadataValueTypeFloat64:\n\t\tdefault:\n\t\t\tpanic(fmt.Errorf(\"try to get type Numeric but got type %v\", av.Type))\n\t\t}\n\t\tif av.Array != nil {\n\t\t\tv[i] = anyx.Number[T](av.Array[i])\n\t\t}\n\t}\n\treturn v\n}\n\n// Get returns the GGUFMetadataKV with the given key,\n// and true if found, and false otherwise.\nfunc (kvs GGUFMetadataKVs) Get(key string) (value GGUFMetadataKV, found bool) {\n\tfor i := range kvs {\n\t\tif kvs[i].Key == key {\n\t\t\treturn kvs[i], true\n\t\t}\n\t}\n\treturn GGUFMetadataKV{}, false\n}\n\n// Search returns a list of GGUFMetadataKV with the keys that match the given regex.\nfunc (kvs GGUFMetadataKVs) Search(keyRegex *regexp.Regexp) (values []GGUFMetadataKV) {\n\tfor i := range kvs {\n\t\tif keyRegex.MatchString(kvs[i].Key) {\n\t\t\tvalues = append(values, kvs[i])\n\t\t}\n\t}\n\treturn values\n}\n\n// Index returns a map value to the GGUFMetadataKVs with the given keys,\n// and the number of keys found.\nfunc (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGUFMetadataKV, found int) {\n\tks := make(map[string]struct{}, len(keys))\n\tfor i := range keys {\n\t\tks[keys[i]] = struct{}{}\n\t}\n\tvalues = make(map[string]GGUFMetadataKV)\n\tfor i := range kvs {\n\t\tif _, ok := ks[kvs[i].Key]; ok {\n\t\t\tvalues[kvs[i].Key] = kvs[i]\n\t\t\tfound++\n\t\t}\n\t\tif found == len(ks) {\n\t\t\tbreak\n\t\t}\n\t}\n\treturn values, found\n}\n\n// Get returns the GGUFTensorInfo with the given name,\n// and true if found, and false otherwise.\nfunc (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool) {\n\tif ti.Name == name {\n\t\treturn ti, true\n\t}\n\treturn GGUFTensorInfo{}, false\n}\n\n// GetFileType returns the GGUFFileType.\nfunc (ti GGUFTensorInfo) GetFileType() 
GGUFFileType {\n\treturn GetFileType(map[GGMLType]int{ti.Type: 1})\n}\n\n// Match returns true if the name of the GGUFTensorInfo matches the given regex.\nfunc (ti GGUFTensorInfo) Match(nameRegex *regexp.Regexp) bool {\n\treturn nameRegex.MatchString(ti.Name)\n}\n\n// Search returns a list of GGUFTensorInfo with the names that match the given regex.\nfunc (ti GGUFTensorInfo) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) {\n\tif nameRegex.MatchString(ti.Name) {\n\t\treturn []GGUFTensorInfo{ti}\n\t}\n\treturn nil\n}\n\n// Index returns a map value to the GGUFTensorInfo with the given names,\n// and the number of names found.\nfunc (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, found int) {\n\tif len(names) == 0 {\n\t\treturn nil, 0\n\t}\n\tif names[0] == ti.Name {\n\t\treturn map[string]GGUFTensorInfo{ti.Name: ti}, 1\n\t}\n\treturn nil, 0\n}\n\n// Elements returns the number of elements of the GGUFTensorInfo,\n// which is inspired by\n// https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2597-L2601.\nfunc (ti GGUFTensorInfo) Elements(filter ...GGUFTensorInfoFilter) uint64 {\n\tif ti.NDimensions == 0 {\n\t\treturn 0\n\t}\n\n\tfor i := range filter {\n\t\tif filter[i] != nil && !filter[i](ti.Name) {\n\t\t\treturn 0\n\t\t}\n\t}\n\n\tret := uint64(1)\n\tfor i := uint32(0); i < ti.NDimensions; i++ {\n\t\tret *= ti.Dimensions[i]\n\t}\n\treturn ret\n}\n\n// Bytes returns the number of bytes of the GGUFTensorInfo,\n// which is inspired by\n// https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2609-L2626.\nfunc (ti GGUFTensorInfo) Bytes(filter ...GGUFTensorInfoFilter) uint64 {\n\tif ti.NDimensions == 0 {\n\t\treturn 0\n\t}\n\n\ttt, ok := ti.Type.Trait()\n\tif !ok {\n\t\tpanic(fmt.Errorf(\"invalid type: %v\", ti.Type))\n\t}\n\n\tfor i := range filter {\n\t\tif filter[i] != nil && !filter[i](ti.Name) {\n\t\t\treturn 0\n\t\t}\n\t}\n\n\t// 
https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L3210-L3214\n\tnb := make([]uint64, 0, ti.NDimensions)\n\t{\n\t\tnb = append(nb, tt.TypeSize)\n\t\tnb = append(nb, nb[0]*(ti.Dimensions[0]/tt.BlockSize))\n\t\tfor i := uint32(2); i < ti.NDimensions; i++ {\n\t\t\tnb = append(nb, nb[i-1]*ti.Dimensions[i-1])\n\t\t}\n\t}\n\n\tvar ret uint64\n\tif tt.BlockSize == 1 {\n\t\tret = tt.TypeSize\n\t\tfor i := uint32(0); i < ti.NDimensions; i++ {\n\t\t\tret += (ti.Dimensions[i] - 1) * nb[i]\n\t\t}\n\t\treturn ret\n\t}\n\n\tret = ti.Dimensions[0] * nb[0] / tt.BlockSize\n\tfor i := uint32(1); i < ti.NDimensions; i++ {\n\t\tret += (ti.Dimensions[i] - 1) * nb[i]\n\t}\n\treturn ret\n}\n\n// Count returns the number of GGUF tensors of the GGUFTensorInfo,\n// which is always 1.\nfunc (ti GGUFTensorInfo) Count() uint64 {\n\treturn 1\n}\n\n// Get returns the GGUFTensorInfo with the given name,\n// and true if found, and false otherwise.\nfunc (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) {\n\tfor i := range tis {\n\t\tif tis[i].Name == name {\n\t\t\treturn tis[i], true\n\t\t}\n\t}\n\treturn GGUFTensorInfo{}, false\n}\n\n// GetFileType returns the GGUFFileType represented the mostly GGMLType of the GGUFTensorInfos.\nfunc (tis GGUFTensorInfos) GetFileType() GGUFFileType {\n\tif len(tis) == 0 {\n\t\treturn _GGUFFileTypeCount\n\t}\n\n\tcm := make(map[GGMLType]int)\n\tfor i := range tis {\n\t\tcm[tis[i].Type]++\n\t}\n\n\treturn GetFileType(cm)\n}\n\n// Match returns true if a tensor of GGUFTensorInfos matches the given regex.\nfunc (tis GGUFTensorInfos) Match(nameRegex *regexp.Regexp) bool {\n\tfor i := range tis {\n\t\tif nameRegex.MatchString(tis[i].Name) {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n\n// Search returns a list of GGUFTensorInfo with the names that match the given regex.\nfunc (tis GGUFTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) {\n\tfor i := range tis {\n\t\tif 
nameRegex.MatchString(tis[i].Name) {\n\t\t\tinfos = append(infos, tis[i])\n\t\t}\n\t}\n\treturn infos\n}\n\n// Index returns a map value to the GGUFTensorInfos with the given names,\n// and the number of names found.\nfunc (tis GGUFTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int) {\n\tns := make(map[string]struct{}, len(names))\n\tfor i := range names {\n\t\tns[names[i]] = struct{}{}\n\t}\n\tinfos = make(map[string]GGUFTensorInfo)\n\tfor i := range tis {\n\t\tif _, ok := ns[tis[i].Name]; ok {\n\t\t\tinfos[tis[i].Name] = tis[i]\n\t\t\tfound++\n\t\t}\n\t\tif found == len(ns) {\n\t\t\tbreak\n\t\t}\n\t}\n\treturn infos, found\n}\n\n// Elements returns the number of elements of the GGUFTensorInfos.\nfunc (tis GGUFTensorInfos) Elements() uint64 {\n\tvar ret uint64\n\tfor i := range tis {\n\t\tret += tis[i].Elements()\n\t}\n\treturn ret\n}\n\n// Bytes returns the number of bytes of the GGUFTensorInfos.\nfunc (tis GGUFTensorInfos) Bytes() uint64 {\n\tvar ret uint64\n\tfor i := range tis {\n\t\tret += tis[i].Bytes()\n\t}\n\treturn ret\n}\n\n// Count returns the number of GGUF tensors of the GGUFTensorInfos.\nfunc (tis GGUFTensorInfos) Count() uint64 {\n\treturn uint64(len(tis))\n}\n\n// Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos.\nfunc (tis GGUFTensorInfos) Layers(ignores ...string) GGUFLayerTensorInfos {\n\tif len(tis) == 0 {\n\t\treturn nil\n\t}\n\n\tls := tis.layers()\n\tif len(ignores) != 0 {\n\t\t_, ls, _ = ls.Cut(ignores)\n\t\treturn ls\n\t}\n\treturn ls\n}\n\nvar numberRegex = regexp.MustCompile(`^\\d+$`)\n\nfunc (tis GGUFTensorInfos) layers() GGUFLayerTensorInfos {\n\tvar ret GGUFLayerTensorInfos\n\n\tpm := make(map[string]any)\n\tfor i := range tis {\n\t\tps := strings.Split(tis[i].Name, \".\")\n\t\tif len(ps) < 2 {\n\t\t\tret = append(ret, tis[i])\n\t\t\tcontinue\n\t\t}\n\t\tswitch {\n\t\tdefault:\n\t\t\tret = append(ret, tis[i])\n\t\tcase ps[0] == \"blk\" || ps[0] == \"block\":\n\t\t\t// LLaMACpp.\n\t\t\tp := 
strings.Join([]string{ps[0], ps[1]}, \".\")\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\tl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = l\n\t\t\t\tret = append(ret, l)\n\t\t\t}\n\t\t\tl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i])\n\t\tcase (ps[0] == \"v\" || ps[0] == \"t\" || ps[0] == \"a\") && ps[1] == \"blk\":\n\t\t\t// LLaMACpp CLIP.\n\t\t\tp := ps[0]\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\tl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = l\n\t\t\t\tret = append(ret, l)\n\t\t\t}\n\t\t\tl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\tif len(ps) < 3 {\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i])\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tp = strings.Join([]string{ps[0], ps[1], ps[2]}, \".\")\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\txl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = xl\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl)\n\t\t\t}\n\t\t\txl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\txl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i])\n\t\tcase ((ps[0] == \"dec\" || ps[0] == \"enc\") && ps[1] == \"blk\") ||\n\t\t\t((ps[0] == \"decoder\" || ps[0] == \"encoder\") && ps[1] == \"block\"):\n\t\t\t// BERT.\n\t\t\tp := ps[0]\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\tl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = l\n\t\t\t\tret = append(ret, l)\n\t\t\t}\n\t\t\tl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\tif len(ps) < 3 {\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i])\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tp = strings.Join([]string{ps[0], ps[1], ps[2]}, \".\")\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\txl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = xl\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl)\n\t\t\t}\n\t\t\txl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\txl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i])\n\t\tcase ps[0] == \"first_stage_model\":\n\t\t\t// StableDiffusionCpp 
Autoencoder.\n\t\t\tp := strings.Join([]string{ps[0], ps[1]}, \".\")\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\tl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = l\n\t\t\t\tret = append(ret, l)\n\t\t\t}\n\t\t\tl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\tif len(ps) < 3 {\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i])\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tp = strings.Join([]string{ps[0], ps[1], ps[2]}, \".\")\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\txl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = xl\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl)\n\t\t\t}\n\t\t\txl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\txl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i])\n\t\tcase ps[0] == \"cond_stage_model\":\n\t\t\t// StableDiffusionCpp Conditioner.\n\t\t\tif len(ps) < 3 {\n\t\t\t\tret = append(ret, tis[i])\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tp := strings.Join([]string{ps[0], ps[1], ps[2]}, \".\")\n\t\t\tif !numberRegex.MatchString(ps[1]) {\n\t\t\t\tp = strings.Join([]string{ps[0], ps[1]}, \".\")\n\t\t\t}\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\tl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = l\n\t\t\t\tret = append(ret, l)\n\t\t\t}\n\t\t\tl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\tif len(ps) < 4 {\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i])\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tp = strings.Join([]string{ps[0], ps[1], ps[2], ps[3]}, \".\")\n\t\t\tif !numberRegex.MatchString(ps[1]) {\n\t\t\t\tp = strings.Join([]string{ps[0], ps[1], ps[2]}, \".\")\n\t\t\t}\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\txl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = xl\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl)\n\t\t\t}\n\t\t\txl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\txl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i])\n\t\tcase ps[0] == \"model\" && ps[1] == \"diffusion_model\": // nolint: goconst\n\t\t\t// StableDiffusionCpp.\n\t\t\tp := 
\"model.diffusion_model\"\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\tl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = l\n\t\t\t\tret = append(ret, l)\n\t\t\t}\n\t\t\tl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\tif len(ps) < 3 {\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i])\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tp = strings.Join([]string{\"model.diffusion_model\", ps[2]}, \".\")\n\t\t\tif _, ok := pm[p]; !ok {\n\t\t\t\txl := &GGUFNamedTensorInfos{Name: p}\n\t\t\t\tpm[p] = xl\n\t\t\t\tl.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl)\n\t\t\t}\n\t\t\txl := pm[p].(*GGUFNamedTensorInfos)\n\t\t\txl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i])\n\t\t}\n\t}\n\treturn ret\n}\n\n// Get returns the IGGUFTensorInfos with the given name,\n// and true if found, and false otherwise.\nfunc (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) {\n\tfor i := range ltis {\n\t\tswitch v := ltis[i].(type) {\n\t\tcase GGUFTensorInfo:\n\t\t\tif v.Name == name {\n\t\t\t\treturn v, true\n\t\t\t}\n\t\tcase *GGUFNamedTensorInfos:\n\t\t\tinfo, found = v.GGUFLayerTensorInfos.Get(name)\n\t\t\tif found {\n\t\t\t\treturn info, true\n\t\t\t}\n\t\t}\n\t}\n\treturn GGUFTensorInfo{}, false\n}\n\n// GetFileType returns the GGUFFileType represented the mostly GGMLType of the GGUFLayerTensorInfos.\nfunc (ltis GGUFLayerTensorInfos) GetFileType() GGUFFileType {\n\tif len(ltis) == 0 {\n\t\treturn _GGUFFileTypeCount\n\t}\n\n\tcm := make(map[GGMLType]int)\n\tfor i := range ltis {\n\t\tswitch v := ltis[i].(type) {\n\t\tcase GGUFTensorInfo:\n\t\t\tcm[v.Type]++\n\t\tcase *GGUFNamedTensorInfos:\n\t\t\tcm[v.GetFileType().GGMLType()]++\n\t\t}\n\t}\n\n\treturn GetFileType(cm)\n}\n\n// Match returns true if a tensor of GGUFLayerTensorInfos matches the given regex.\nfunc (ltis GGUFLayerTensorInfos) Match(nameRegex *regexp.Regexp) bool {\n\tfor i := range ltis {\n\t\tswitch v := ltis[i].(type) {\n\t\tcase GGUFTensorInfo:\n\t\t\tif 
nameRegex.MatchString(v.Name) {\n\t\t\t\treturn true\n\t\t\t}\n\t\tcase *GGUFNamedTensorInfos:\n\t\t\tif v.Match(nameRegex) {\n\t\t\t\treturn true\n\t\t\t}\n\t\t}\n\t}\n\treturn false\n}\n\n// Search returns a list of GGUFTensorInfo with the names that match the given regex.\nfunc (ltis GGUFLayerTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) {\n\tfor i := range ltis {\n\t\tswitch v := ltis[i].(type) {\n\t\tcase GGUFTensorInfo:\n\t\t\tif nameRegex.MatchString(v.Name) {\n\t\t\t\tinfos = append(infos, v)\n\t\t\t}\n\t\tcase *GGUFNamedTensorInfos:\n\t\t\tinfos = append(infos, v.Search(nameRegex)...)\n\t\t}\n\t}\n\treturn infos\n}\n\n// Index returns a map value to the GGUFTensorInfos with the given names,\n// and the number of names found.\nfunc (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int) {\n\tns := make(map[string]struct{}, len(names))\n\tfor i := range names {\n\t\tns[names[i]] = struct{}{}\n\t}\n\tinfos = make(map[string]GGUFTensorInfo)\n\tfor i := range ltis {\n\t\tswitch v := ltis[i].(type) {\n\t\tcase GGUFTensorInfo:\n\t\t\tif _, ok := ns[v.Name]; ok {\n\t\t\t\tinfos[v.Name] = v\n\t\t\t\tfound++\n\t\t\t}\n\t\tcase *GGUFNamedTensorInfos:\n\t\t\tinf, _ := v.Index(names)\n\t\t\tfor k := range inf {\n\t\t\t\tinfos[k] = inf[k]\n\t\t\t\tfound++\n\t\t\t}\n\t\t}\n\t\tif found == len(ns) {\n\t\t\tbreak\n\t\t}\n\t}\n\treturn infos, found\n}\n\n// Elements returns the number of elements of the GGUFLayerTensorInfos.\nfunc (ltis GGUFLayerTensorInfos) Elements(filter ...GGUFTensorInfoFilter) uint64 {\n\tvar ret uint64\n\tfor i := range ltis {\n\t\tret += ltis[i].Elements(filter...)\n\t}\n\treturn ret\n}\n\n// Bytes returns the number of bytes of the GGUFLayerTensorInfos.\nfunc (ltis GGUFLayerTensorInfos) Bytes(filter ...GGUFTensorInfoFilter) uint64 {\n\tvar ret uint64\n\tfor i := range ltis {\n\t\tret += ltis[i].Bytes(filter...)\n\t}\n\treturn ret\n}\n\n// Count returns the number of GGUF tensors of the 
GGUFLayerTensorInfos.\nfunc (ltis GGUFLayerTensorInfos) Count() uint64 {\n\tvar ret uint64\n\tfor i := range ltis {\n\t\tret += ltis[i].Count()\n\t}\n\treturn ret\n}\n\n// Cut splits the GGUFLayerTensorInfos into two parts,\n// and returns the GGUFLayerTensorInfos with the names that match the given names at first,\n// and the GGUFLayerTensorInfos without the names at second,\n// and true if the GGUFLayerTensorInfos with the names are found, and false otherwise.\n//\n// The given names support glob pattern, for example, \"a*\" matches \"a\", \"ab\", \"abc\", and so on.\nfunc (ltis GGUFLayerTensorInfos) Cut(names []string) (before, after GGUFLayerTensorInfos, found bool) {\n\tprefixes := make(map[string]struct{})\n\tmatches := make(map[string]struct{})\n\tfor i := range names {\n\t\tif strings.HasSuffix(names[i], \"*\") {\n\t\t\tprefixes[strings.TrimSuffix(names[i], \"*\")] = struct{}{}\n\t\t} else {\n\t\t\tmatches[names[i]] = struct{}{}\n\t\t}\n\t}\n\tbefore = make(GGUFLayerTensorInfos, 0, len(names))\n\tafter = make(GGUFLayerTensorInfos, 0, len(ltis))\n\n\tfor i := range ltis {\n\t\tswitch v := ltis[i].(type) {\n\t\tcase GGUFTensorInfo:\n\t\t\tif len(matches) != 0 {\n\t\t\t\tif _, ok := matches[v.Name]; ok {\n\t\t\t\t\tbefore = append(before, v)\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t}\n\t\t\tif len(prefixes) != 0 {\n\t\t\t\tvar check bool\n\t\t\t\tfor prefix := range prefixes {\n\t\t\t\t\tif strings.HasPrefix(v.Name, prefix) {\n\t\t\t\t\t\tbefore = append(before, v)\n\t\t\t\t\t\tcheck = true\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tif check {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t}\n\t\t\tafter = append(after, v)\n\t\tcase *GGUFNamedTensorInfos:\n\t\t\tif len(matches) != 0 {\n\t\t\t\tif _, ok := matches[v.Name]; ok {\n\t\t\t\t\tbefore = append(before, v)\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t}\n\t\t\tif len(prefixes) != 0 {\n\t\t\t\tvar check bool\n\t\t\t\tfor prefix := range prefixes {\n\t\t\t\t\tif strings.HasPrefix(v.Name, prefix) 
{\n\t\t\t\t\t\tbefore = append(before, v)\n\t\t\t\t\t\tcheck = true\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tif check {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t}\n\t\t\tafter = append(after, v)\n\t\t}\n\t}\n\treturn before, after, len(before) > 0\n}\n\ntype _GGUFReader struct {\n\tv  GGUFVersion\n\to  _GGUFReadOptions\n\tf  io.ReadSeeker\n\tbo binary.ByteOrder\n}\n\nfunc (rd _GGUFReader) ReadUint8() (v uint8, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read uint8: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadInt8() (v int8, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read int8: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadUint16() (v uint16, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read uint16: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadInt16() (v int16, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read int16: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadUint32() (v uint32, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read uint32: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadUint64FromUint32() (uint64, error) {\n\tv, err := rd.ReadUint32()\n\treturn uint64(v), err\n}\n\nfunc (rd _GGUFReader) ReadInt32() (v int32, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read int32: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadFloat32() (v float32, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read float32: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadBool() (v bool, err error) {\n\tb, err := rd.ReadUint8()\n\tif err != nil {\n\t\treturn false, fmt.Errorf(\"read 
bool: %w\", err)\n\t}\n\treturn b != 0, nil\n}\n\nfunc (rd _GGUFReader) ReadString() (v string, err error) {\n\tvar l uint64\n\tif rd.v <= GGUFVersionV1 {\n\t\tl, err = rd.ReadUint64FromUint32()\n\t} else {\n\t\tl, err = rd.ReadUint64()\n\t}\n\tif err != nil {\n\t\treturn \"\", fmt.Errorf(\"read string length: %w\", err)\n\t}\n\n\tb := bytex.GetBytes(l)\n\tdefer bytex.Put(b)\n\tif _, err = rd.f.Read(b); err != nil {\n\t\treturn \"\", fmt.Errorf(\"read string: %w\", err)\n\t}\n\n\treturn string(bytes.TrimSpace(b)), nil\n}\n\nfunc (rd _GGUFReader) SkipReadingString() (err error) {\n\tvar l uint64\n\tif rd.v <= GGUFVersionV1 {\n\t\tl, err = rd.ReadUint64FromUint32()\n\t} else {\n\t\tl, err = rd.ReadUint64()\n\t}\n\tif err != nil {\n\t\treturn fmt.Errorf(\"read string length: %w\", err)\n\t}\n\t_, err = rd.f.Seek(int64(l), io.SeekCurrent)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"seek string: %w\", err)\n\t}\n\treturn nil\n}\n\nfunc (rd _GGUFReader) ReadArray(key string) (v GGUFMetadataKVArrayValue, err error) {\n\tv.StartOffset, err = rd.f.Seek(0, io.SeekCurrent)\n\tif err != nil {\n\t\treturn v, fmt.Errorf(\"read array start: %w\", err)\n\t}\n\n\tif err = binary.Read(rd.f, rd.bo, &v.Type); err != nil {\n\t\treturn v, fmt.Errorf(\"read array item type: %w\", err)\n\t}\n\n\tif rd.v <= GGUFVersionV1 {\n\t\tv.Len, err = rd.ReadUint64FromUint32()\n\t} else {\n\t\tv.Len, err = rd.ReadUint64()\n\t}\n\tif err != nil {\n\t\treturn v, fmt.Errorf(\"read array length: %w\", err)\n\t}\n\n\titemStart, err := rd.f.Seek(0, io.SeekCurrent)\n\tif err != nil {\n\t\treturn v, fmt.Errorf(\"seek array item start: %w\", err)\n\t}\n\n\tif !rd.o.SkipLargeMetadata || stringx.HasSuffixes(key, \".feed_forward_length\", \".attention.head_count\") {\n\t\tv.Array = make([]any, v.Len)\n\t\tfor i := uint64(0); i < v.Len; i++ {\n\t\t\tv.Array[i], err = rd.ReadValue(key, v.Type)\n\t\t\tif err != nil {\n\t\t\t\treturn v, fmt.Errorf(\"read array item %d: %w\", i, err)\n\t\t\t}\n\t\t}\n\n\t\titemEnd, 
err := rd.f.Seek(0, io.SeekCurrent)\n\t\tif err != nil {\n\t\t\treturn v, fmt.Errorf(\"seek array item end: %w\", err)\n\t\t}\n\t\tv.Size = itemEnd - itemStart\n\n\t\treturn v, nil\n\t}\n\n\tswitch v.Type {\n\tcase GGUFMetadataValueTypeUint8, GGUFMetadataValueTypeInt8, GGUFMetadataValueTypeBool:\n\t\t_, err = rd.f.Seek(int64(v.Len), io.SeekCurrent)\n\tcase GGUFMetadataValueTypeUint16, GGUFMetadataValueTypeInt16:\n\t\t_, err = rd.f.Seek(int64(v.Len)*2, io.SeekCurrent)\n\tcase GGUFMetadataValueTypeUint32, GGUFMetadataValueTypeInt32, GGUFMetadataValueTypeFloat32:\n\t\t_, err = rd.f.Seek(int64(v.Len)*4, io.SeekCurrent)\n\tcase GGUFMetadataValueTypeUint64, GGUFMetadataValueTypeInt64, GGUFMetadataValueTypeFloat64:\n\t\t_, err = rd.f.Seek(int64(v.Len)*8, io.SeekCurrent)\n\tcase GGUFMetadataValueTypeString:\n\t\tfor i := uint64(0); i < v.Len; i++ {\n\t\t\tif err = rd.SkipReadingString(); err != nil {\n\t\t\t\treturn v, fmt.Errorf(\"seek array[string] %d: %w\", i, err)\n\t\t\t}\n\t\t}\n\tdefault:\n\t\t// Should not happen.\n\t\tpanic(fmt.Errorf(\"invalid type: %v\", v.Type))\n\t}\n\tif err != nil {\n\t\treturn v, fmt.Errorf(\"seek array end: %w\", err)\n\t}\n\n\titemEnd, err := rd.f.Seek(0, io.SeekCurrent)\n\tif err != nil {\n\t\treturn v, fmt.Errorf(\"seek array item end: %w\", err)\n\t}\n\tv.Size = itemEnd - itemStart\n\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadUint64() (v uint64, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read uint64: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadInt64() (v int64, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read int64: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadFloat64() (v float64, err error) {\n\terr = binary.Read(rd.f, rd.bo, &v)\n\tif err != nil {\n\t\treturn 0, fmt.Errorf(\"read float64: %w\", err)\n\t}\n\treturn v, nil\n}\n\nfunc (rd _GGUFReader) ReadValue(vk string, vt 
GGUFMetadataValueType) (v any, err error) {\n\tif vt >= _GGUFMetadataValueTypeCount {\n\t\treturn nil, fmt.Errorf(\"invalid type: %v\", vt)\n\t}\n\n\tswitch vt {\n\tcase GGUFMetadataValueTypeUint8:\n\t\tv, err = rd.ReadUint8()\n\tcase GGUFMetadataValueTypeInt8:\n\t\tv, err = rd.ReadInt8()\n\tcase GGUFMetadataValueTypeUint16:\n\t\tv, err = rd.ReadUint16()\n\tcase GGUFMetadataValueTypeInt16:\n\t\tv, err = rd.ReadInt16()\n\tcase GGUFMetadataValueTypeUint32:\n\t\tv, err = rd.ReadUint32()\n\tcase GGUFMetadataValueTypeInt32:\n\t\tv, err = rd.ReadInt32()\n\tcase GGUFMetadataValueTypeFloat32:\n\t\tv, err = rd.ReadFloat32()\n\tcase GGUFMetadataValueTypeBool:\n\t\tv, err = rd.ReadBool()\n\tcase GGUFMetadataValueTypeString:\n\t\tv, err = rd.ReadString()\n\tcase GGUFMetadataValueTypeArray:\n\t\tv, err = rd.ReadArray(vk)\n\tcase GGUFMetadataValueTypeUint64:\n\t\tv, err = rd.ReadUint64()\n\tcase GGUFMetadataValueTypeInt64:\n\t\tv, err = rd.ReadInt64()\n\tcase GGUFMetadataValueTypeFloat64:\n\t\tv, err = rd.ReadFloat64()\n\tdefault:\n\t\t// Should not happen.\n\t\tpanic(fmt.Errorf(\"invalid type: %v\", vt))\n\t}\n\tif err != nil {\n\t\treturn nil, err\n\t}\n\treturn v, nil\n}\n\ntype _GGUFMetadataReader struct {\n\t_GGUFReader\n}\n\nfunc (rd _GGUFMetadataReader) Read() (kv GGUFMetadataKV, err error) {\n\tkv.Key, err = rd.ReadString()\n\tif err != nil {\n\t\treturn kv, fmt.Errorf(\"read key: %w\", err)\n\t}\n\n\t{\n\t\tvt, err := rd.ReadUint32()\n\t\tif err != nil {\n\t\t\treturn kv, fmt.Errorf(\"read value type: %w\", err)\n\t\t}\n\t\tkv.ValueType = GGUFMetadataValueType(vt)\n\t\tif kv.ValueType >= _GGUFMetadataValueTypeCount {\n\t\t\treturn kv, fmt.Errorf(\"invalid value type: %v\", kv.ValueType)\n\t\t}\n\t}\n\n\tkv.Value, err = rd.ReadValue(kv.Key, kv.ValueType)\n\tif err != nil {\n\t\treturn kv, fmt.Errorf(\"read %s value: %w\", kv.Key, err)\n\t}\n\n\treturn kv, nil\n}\n\ntype _GGUFTensorInfoReader struct {\n\t_GGUFReader\n}\n\nfunc (rd _GGUFTensorInfoReader) Read() (ti 
GGUFTensorInfo, err error) {\n\tti.StartOffset, err = rd.f.Seek(0, io.SeekCurrent)\n\tif err != nil {\n\t\treturn ti, fmt.Errorf(\"seek tensor info start: %w\", err)\n\t}\n\n\tti.Name, err = rd.ReadString()\n\tif err != nil {\n\t\treturn ti, fmt.Errorf(\"read name: %w\", err)\n\t}\n\n\tti.NDimensions, err = rd.ReadUint32()\n\tif err != nil {\n\t\treturn ti, fmt.Errorf(\"read n dimensions: %w\", err)\n\t}\n\n\tti.Dimensions = make([]uint64, ti.NDimensions)\n\tfor i := uint32(0); i < ti.NDimensions; i++ {\n\t\tif rd.v <= GGUFVersionV1 {\n\t\t\tti.Dimensions[i], err = rd.ReadUint64FromUint32()\n\t\t} else {\n\t\t\tti.Dimensions[i], err = rd.ReadUint64()\n\t\t}\n\t\tif err != nil {\n\t\t\treturn ti, fmt.Errorf(\"read dimension %d: %w\", i, err)\n\t\t}\n\t}\n\n\t{\n\t\tv, err := rd.ReadUint32()\n\t\tif err != nil {\n\t\t\treturn ti, fmt.Errorf(\"read type: %w\", err)\n\t\t}\n\t\tti.Type = GGMLType(v)\n\t\tif ti.Type >= _GGMLTypeCount {\n\t\t\treturn ti, fmt.Errorf(\"%v: This quantized type is currently unsupported\", ti.Type)\n\t\t}\n\t}\n\n\tti.Offset, err = rd.ReadUint64()\n\tif err != nil {\n\t\treturn ti, fmt.Errorf(\"read offset: %w\", err)\n\t}\n\n\treturn ti, nil\n}\n"
  },
  {
    "path": "file_architecture.go",
    "content": "package gguf_parser\n\nimport (\n\t\"regexp\"\n\t\"slices\"\n\t\"strings\"\n)\n\n// Types for the architecture metadata of a GGUF file.\ntype (\n\t// GGUFArchitecture represents the architecture metadata of a GGUF file.\n\tGGUFArchitecture struct {\n\t\t/* Basic */\n\n\t\t// Type describes the type of the file,\n\t\t// default is \"model\".\n\t\tType string `json:\"type\"`\n\t\t// Architecture describes what architecture this model implements.\n\t\t//\n\t\t// All lowercase ASCII.\n\t\tArchitecture string `json:\"architecture\"`\n\t\t// MaximumContextLength(n_ctx_train) is the maximum context length of the model.\n\t\t//\n\t\t// For most architectures, this is the hard limit on the length of the input.\n\t\t// Architectures, like RWKV,\n\t\t// that are not reliant on transformer-style attention may be able to handle larger inputs,\n\t\t// but this is not guaranteed.\n\t\tMaximumContextLength uint64 `json:\"maximumContextLength,omitempty\"`\n\t\t// EmbeddingLength(n_embd) is the length of the embedding layer.\n\t\tEmbeddingLength uint64 `json:\"embeddingLength,omitempty\"`\n\t\t// BlockCount(n_layer) is the number of blocks of attention and feed-forward layers,\n\t\t// i.e. 
the bulk of the LLM.\n\t\t// This does not include the input or embedding layers.\n\t\tBlockCount uint64 `json:\"blockCount,omitempty\"`\n\t\t// FeedForwardLength(n_ff) stores the length of each feed-forward layer.\n\t\tFeedForwardLength []uint64 `json:\"feedForwardLength,omitempty\"`\n\t\t// ExpertFeedForwardLength(expert_feed_forward_length) is the length of the feed-forward layer in the expert model.\n\t\tExpertFeedForwardLength uint64 `json:\"expertFeedForwardLength,omitempty\"`\n\t\t// ExpertSharedFeedForwardLength(expert_shared_feed_forward_length) is the length of the shared feed-forward layer in the expert model.\n\t\tExpertSharedFeedForwardLength uint64 `json:\"expertSharedFeedForwardLength,omitempty\"`\n\t\t// ExpertCount(n_expert) is the number of experts in MoE models.\n\t\tExpertCount uint32 `json:\"expertCount,omitempty\"`\n\t\t// ExpertUsedCount(n_expert_used) is the number of experts used during each token evaluation in MoE models.\n\t\tExpertUsedCount uint32 `json:\"expertUsedCount,omitempty\"`\n\t\t// ExpertSharedCount(n_expert_shared) is the number of shared experts in MoE models.\n\t\tExpertSharedCount uint32 `json:\"expertSharedCount,omitempty\"`\n\t\t// AttentionHeadCount(n_head) is the number of attention heads.\n\t\tAttentionHeadCount uint64 `json:\"attentionHeadCount,omitempty\"`\n\t\t// AttentionHeadCountKV(n_head_kv) is the number of attention heads per group used in Grouped-Query-Attention.\n\t\t//\n\t\t// If not provided or equal to AttentionHeadCount,\n\t\t// the model does not use Grouped-Query-Attention.\n\t\tAttentionHeadCountKV uint64 `json:\"attentionHeadCountKV,omitempty\"`\n\t\t// AttentionSlidingWindowPattern is the pattern used in the sliding window attention.\n\t\t//\n\t\t// 0 means all layers are Sliding Window Attention.\n\t\t// 1 means all layers are none Sliding Window Attention.\n\t\t// N means every Nth layer is none Sliding Window Attention.\n\t\tAttentionSlidingWindowPattern uint32 
`json:\"attentionSlidingWindowPattern,omitempty\"`\n\t\t// AttentionSlidingWindow is the size of the sliding window used in the attention layer.\n\t\tAttentionSlidingWindow uint64 `json:\"attentionSlidingWindow,omitempty\"`\n\t\t// AttentionMaxALiBIBias is the maximum bias to use for ALiBI.\n\t\tAttentionMaxALiBIBias float32 `json:\"attentionMaxALiBIBias,omitempty\"`\n\t\t// AttentionClampKQV describes a value `C`,\n\t\t// which is used to clamp the values of the `Q`, `K` and `V` tensors between `[-C, C]`.\n\t\tAttentionClampKQV float32 `json:\"attentionClampKQV,omitempty\"`\n\t\t// AttentionLayerNormEpsilon is the epsilon value used in the LayerNorm(Layer Normalization).\n\t\tAttentionLayerNormEpsilon float32 `json:\"attentionLayerNormEpsilon,omitempty\"`\n\t\t// AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(root Mean Square Layer Normalization),\n\t\t// which is a simplification of the original LayerNorm.\n\t\tAttentionLayerNormRMSEpsilon float32 `json:\"attentionLayerNormRMSEpsilon,omitempty\"`\n\t\t// AttentionQueryLORARank is the LORA rank of the query matrix.\n\t\t//\n\t\t// Zero means no LORA.\n\t\tAttentionQueryLORARank uint32 `json:\"attentionQueryLORARank,omitempty\"`\n\t\t// AttentionKeyValueLORARank is the LORA rank of the key/value matrix.\n\t\t//\n\t\t// Zero means no LORA.\n\t\tAttentionKeyValueLORARank uint32 `json:\"attentionKeyValueLORARank,omitempty\"`\n\t\t// AttentionKeyLength(n_embd_head_k) is the size of a key head.\n\t\t//\n\t\t// Defaults to `EmbeddingLength / AttentionHeadCount`.\n\t\tAttentionKeyLength uint32 `json:\"attentionKeyLength,omitempty\"`\n\t\t// AttentionKeyLengthMLA(n_embd_head_k_mla) is the size of a key head in MLA(Multi-Layer Attention).\n\t\t//\n\t\t// Zero means no MLA.\n\t\tAttentionKeyLengthMLA uint32 `json:\"attentionKeyLengthMLA,omitempty\"`\n\t\t// AttentionValueLength(n_embd_head_v) is the size of a value head.\n\t\t//\n\t\t// Defaults to `EmbeddingLength / 
AttentionHeadCount`.\n\t\tAttentionValueLength uint32 `json:\"attentionValueLength,omitempty\"`\n\t\t// AttentionValueLengthMLA(n_embd_head_v_mla) is the size of a value head in MLA(Multi-Layer Attention).\n\t\t//\n\t\t// Zero means no MLA.\n\t\tAttentionValueLengthMLA uint32 `json:\"attentionValueLengthMLA,omitempty\"`\n\t\t// AttentionCausal is true if the attention is causal.\n\t\tAttentionCausal bool `json:\"attentionCausal,omitempty\"`\n\t\t// AttentionRecurrent is true if the attention is recurrent.\n\t\t//\n\t\t// Used in Mamba, RWKV, and similar architectures.\n\t\tAttentionRecurrent bool `json:\"attentionRecurrent,omitempty\"`\n\t\t// AttentionHybrid is true if the attention is hybrid (causal (self-attention) + recurrent).\n\t\t//\n\t\t// Used in Jamba, Falcon-H1, and similar architectures.\n\t\tAttentionHybrid bool `json:\"attentionHybrid,omitempty\"`\n\t\t// RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding).\n\t\tRoPEDimensionCount uint64 `json:\"ropeDimensionCount,omitempty\"`\n\t\t// RoPEFrequencyBase is the base frequency of the RoPE.\n\t\tRoPEFrequencyBase float32 `json:\"ropeFrequencyBase,omitempty\"`\n\t\t// RoPEFrequencyScale is the scale frequency of the RoPE.\n\t\tRoPEFrequencyScale float32 `json:\"ropeFrequencyScale,omitempty\"`\n\t\t// RoPEScalingType is the type of the RoPE scaling.\n\t\tRoPEScalingType string `json:\"ropeScalingType,omitempty\"`\n\t\t// RoPEScalingFactor is the scaling factor of the RoPE.\n\t\tRoPEScalingFactor float32 `json:\"ropeScalingFactor,omitempty\"`\n\t\t// RoPEScalingOriginalContextLength is the original context length of the RoPE scaling.\n\t\tRoPEScalingOriginalContextLength uint64 `json:\"ropeScalingOriginalContextLength,omitempty\"`\n\t\t// RoPEScalingFinetuned is true if the RoPE scaling is fine-tuned.\n\t\tRoPEScalingFinetuned bool `json:\"ropeScalingFinetuned,omitempty\"`\n\t\t// PoolingType is the type of pooling used in the model.\n\t\tPoolingType uint32 
`json:\"poolingType,omitempty\"`\n\t\t// SSMConvolutionKernel is the size of the convolution kernel used in the Selective State Space Model (SSM) and similar architectures.\n\t\tSSMConvolutionKernel uint32 `json:\"ssmConvolutionKernel,omitempty\"`\n\t\t// SSMInnerSize is the embedding size of the state in SSM and similar architectures.\n\t\tSSMInnerSize uint32 `json:\"ssmInnerSize,omitempty\"`\n\t\t// SSMStateSize is the size of the recurrent state in SSM and similar architectures.\n\t\tSSMStateSize uint32 `json:\"ssmStateSize,omitempty\"`\n\t\t// SSMTimeStepRank is the rank of the time steps in SSM and similar architectures.\n\t\tSSMTimeStepRank uint32 `json:\"ssmTimeStepRank,omitempty\"`\n\t\t// SSMGroupCount is the number of groups in the SSM and similar architectures.\n\t\tSSMGroupCount uint32 `json:\"ssmGroupCount,omitempty\"`\n\t\t// WKVHeadSize is the size of the head in RWKV and similar architectures.\n\t\tRWKVHeadSize uint32 `json:\"rwkvHeadSize,omitempty\"`\n\t\t// RWKVRescaleEveryNLayers is the number of layers after which the rescaling is applied in RWKV and similar architectures.\n\t\tRWKVRescaleEveryNLayers uint32 `json:\"rwkvRescaleEveryNLayers,omitempty\"`\n\t\t// RWKVTimeMixExtraDimension indicates whether the RWKV architecture has an extra dimension for time mixing.\n\t\tRWKVTimeMixExtraDimension uint32 `json:\"rwkvTimeMixExtraDimension,omitempty\"`\n\t\t// RWKVTimeDecayExtraDimension indicates whether the RWKV architecture has an extra dimension for time decay.\n\t\tRWKVTimeDecayExtraDimension uint32 `json:\"rwkvTimeDecayExtraDimension,omitempty\"`\n\t\t// TokenShiftCount is the number of token shifts used in RWKV and similar architectures.\n\t\tRWKVTokenShiftCount uint32 `json:\"rwkvTokenShiftCount,omitempty\"`\n\t\t// VocabularyLength is the size of the vocabulary.\n\t\t//\n\t\t// VocabularyLength is the same as the tokenizer's token size.\n\t\tVocabularyLength uint64 `json:\"vocabularyLength,omitempty\"`\n\n\t\t/* Appendix */\n\n\t\t// 
ClipProjectorType is the type of the projector used in the clip model.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipProjectorType string `json:\"clipProjectorType,omitempty\"`\n\t\t// ClipHasLLaVAProjector indicates whether the clip model has LLaVA projector or not.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\t//\n\t\t// Deprecated: use ClipProjectorType instead.\n\t\tClipHasLLaVAProjector bool `json:\"clipHasLLaVAProjector,omitempty\"`\n\t\t// ClipHasMiniCPMVProjector indicates whether the clip model has MiniCPMV projector or not.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\t//\n\t\t// Deprecated: use ClipProjectorType instead.\n\t\tClipHasMiniCPMVProjector bool `json:\"clipHasMiniCPMVProject,omitempty\"`\n\t\t// ClipMiniCPMVVersion is the version of the MiniCPMV projector.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipMiniCPMVVersion int32 `json:\"clipMiniCPMVVersion,omitempty\"`\n\t\t// ClipMiniCPMVQueryNum is the number of queries used in the MiniCPMV projector.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipMiniCPMVQueryNum int32 `json:\"clipMiniCPMVQueryNum,omitempty\"`\n\t\t// ClipHasGLMProjector indicates whether the clip model has GLM projector or not.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\t//\n\t\t// Deprecated: use ClipProjectorType instead.\n\t\tClipHasGLMProjector bool `json:\"clipHasGLMProjector,omitempty\"`\n\t\t// ClipHasQwen2VLMerger indicates whether the clip model has Qwen2VL merger or not.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\t//\n\t\t// Deprecated: use ClipProjectorType instead.\n\t\tClipHasQwen2VLMerger bool `json:\"clipHasQwen2VLMerger,omitempty\"`\n\t\t// ClipHasVisionEncoder indicates whether the clip model has vision encoder or not.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipHasVisionEncoder bool `json:\"clipHasVisionEncoder,omitempty\"`\n\t\t// ClipVisionEmbeddingLength 
indicates the embedding length of vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionEmbeddingLength uint64 `json:\"clipVisionEmbeddingLength,omitempty\"`\n\t\t// ClipVisionBlockCount indicates the number of blocks in the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionBlockCount uint64 `json:\"clipVisionBlockCount,omitempty\"`\n\t\t// ClipVisionFeedForwardLength indicates the feed-forward length of the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionFeedForwardLength []uint64 `json:\"clipVisionFeedForwardLength,omitempty\"`\n\t\t// ClipVisionAttentionHeadCount indicates the number of attention heads in the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionAttentionHeadCount uint64 `json:\"clipVisionAttentionHeadCount,omitempty\"`\n\t\t// ClipVisionAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionAttentionLayerNormRMSEpsilon float32 `json:\"clipVisionAttentionLayerNormRMSEpsilon,omitempty\"`\n\t\t// ClipVisionProjectionDim indicates the projection dimension of vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionProjectionDim uint32 `json:\"clipVisionProjectionDim,omitempty\"`\n\t\t// ClipVisionProjectorScaleFactor is the scale factor of the projector.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionProjectorScaleFactor uint32 `json:\"clipVisionProjectorScaleFactor,omitempty\"`\n\t\t// ClipVisionImageSize indicates the image size of vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and 
ClipHasVisionEncoder is true.\n\t\tClipVisionImageSize uint32 `json:\"clipVisionImageSize,omitempty\"`\n\t\t// ClipVisionPatchSize indicates the patch size of vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionPatchSize uint32 `json:\"clipVisionPatchSize,omitempty\"`\n\t\t// ClipVisionMMPatchMergeType indicates the merge type of the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionMMPatchMergeType string `json:\"clipVisionMMPatchMergeType,omitempty\"`\n\t\t// ClipVisionSpatialMergeSize is the spatial merge size of the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionSpatialMergeSize uint32 `json:\"clipVisionSpatialMergeSize,omitempty\"`\n\t\t// ClipVisionWindowAttentionPattern is the Window Attention pattern used in the vision encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasVisionEncoder is true.\n\t\tClipVisionWindowAttentionPattern uint32 `json:\"clipVisionWindowAttentionPattern,omitempty\"`\n\t\t// ClipHasAudioEncoder indicates whether the clip model has audio encoder or not.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipHasAudioEncoder bool `json:\"clipHasAudioEncoder,omitempty\"`\n\t\t// ClipAudioEmbeddingLength indicates the embedding length of audio encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioEmbeddingLength uint64 `json:\"clipAudioEmbeddingLength,omitempty\"`\n\t\t// ClipAudioBlockCount indicates the number of blocks in the audio encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioBlockCount uint64 `json:\"clipAudioBlockCount,omitempty\"`\n\t\t// ClipAudioFeedForwardLength indicates the feed-forward length of the audio encoder.\n\t\t//\n\t\t// Only used when Architecture is 
\"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioFeedForwardLength []uint64 `json:\"clipAudioFeedForwardLength,omitempty\"`\n\t\t// ClipAudioAttentionHeadCount indicates the number of attention heads in the audio encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioAttentionHeadCount uint64 `json:\"clipAudioAttentionHeadCount,omitempty\"`\n\t\t// ClipAudioAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the audio encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioAttentionLayerNormRMSEpsilon float32 `json:\"clipAudioAttentionLayerNormRMSEpsilon,omitempty\"`\n\t\t// ClipAudioProjectionDim indicates the projection dimension of audio encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioProjectionDim uint32 `json:\"clipAudioProjectionDim,omitempty\"`\n\t\t// ClipAudioProjectorStackFactor is the scale factor of the projector.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioProjectorStackFactor uint32 `json:\"clipAudioProjectorStackFactor,omitempty\"`\n\t\t// ClipAudioNumMelBins is the number of mel bins used in the audio encoder.\n\t\t//\n\t\t// Only used when Architecture is \"clip\" and ClipHasAudioEncoder is true.\n\t\tClipAudioNumMelBins uint32 `json:\"clipAudioNumMelBins,omitempty\"`\n\n\t\t// AdapterType is the type of the adapter.\n\t\t//\n\t\t// Only used when Architecture is \"adapter\".\n\t\tAdapterType string `json:\"adapterType,omitempty\"`\n\t\t// AdapterLoRAAlpha is the alpha value of the LoRA adapter.\n\t\t//\n\t\t// Only used when AdapterType is \"lora\".\n\t\tAdapterLoRAAlpha float32 `json:\"adapterLoRAAlpha,omitempty\"`\n\t\t// AdapterControlVectorLayerCount is the number of layers in the control vector.\n\t\t//\n\t\t// Only used when Architecture is 
\"control_vector\".\n\t\tAdapterControlVectorLayerCount uint32 `json:\"adapterControlVectorLayerCount,omitempty\"`\n\n\t\t// DiffusionArchitecture is the actual architecture of the diffusion model.\n\t\t//\n\t\t// Only used when Architecture is \"diffusion\".\n\t\tDiffusionArchitecture string `json:\"diffusionArchitecture,omitempty\"`\n\t\t// DiffusionTransformer indicates whether the diffusion model is a diffusion transformer or not.\n\t\t//\n\t\tDiffusionTransformer bool `json:\"diffusionTransformer,omitempty\"`\n\t\t// DiffusionConditioners is the list of diffusion conditioners.\n\t\t//\n\t\t// Only used when Architecture is \"diffusion\".\n\t\tDiffusionConditioners GGUFArchitectureDiffusionConditioners `json:\"diffusionConditioners,omitempty\"`\n\t\t// DiffusionAutoencoder represents the autoencoder of the diffusion model.\n\t\t//\n\t\t// Only used when Architecture is \"diffusion\".\n\t\tDiffusionAutoencoder *GGUFArchitectureDiffusionAutoencoder `json:\"diffusionAutoencoder,omitempty\"`\n\t}\n\n\t// GGUFArchitectureDiffusionConditioners is the list of GGUFArchitectureDiffusionConditioner.\n\tGGUFArchitectureDiffusionConditioners []GGUFArchitectureDiffusionConditioner\n\n\t// GGUFArchitectureDiffusionConditioner represents the conditioner metadata of the diffusion architecture.\n\tGGUFArchitectureDiffusionConditioner struct {\n\t\t// Architecture is the architecture of the diffusion conditioner.\n\t\tArchitecture string `json:\"architecture\"`\n\n\t\t// FileType describes the type of the majority of the tensors in the GGUF file.\n\t\tFileType GGUFFileType `json:\"fileType\"`\n\t}\n\n\t// GGUFArchitectureDiffusionAutoencoder represents the autoencoder metadata of the diffusion architecture.\n\tGGUFArchitectureDiffusionAutoencoder struct {\n\t\t// Architecture is the architecture of the diffusion autoencoder.\n\t\t//\n\t\t// Currently, only \"VAE\" is supported.\n\t\tArchitecture string `json:\"architecture\"`\n\n\t\t// FileType describes the type of the majority 
of the tensors in the GGUF file.\n\t\tFileType GGUFFileType `json:\"fileType\"`\n\t}\n)\n\n// DiffusionHasConditioners returns true if the diffusion model has conditioners.\nfunc (ga GGUFArchitecture) DiffusionHasConditioners() bool {\n\treturn len(ga.DiffusionConditioners) > 0\n}\n\n// DiffusionHasAutoencoder returns true if the diffusion model has an autoencoder.\nfunc (ga GGUFArchitecture) DiffusionHasAutoencoder() bool {\n\treturn ga.DiffusionAutoencoder != nil && ga.DiffusionAutoencoder.Architecture != \"\"\n}\n\nfunc (gacs GGUFArchitectureDiffusionConditioners) String() string {\n\tvar sb strings.Builder\n\tfor i, gac := range gacs {\n\t\tif i > 0 {\n\t\t\tsb.WriteString(\", \")\n\t\t}\n\t\tsb.WriteString(gac.String())\n\t}\n\treturn sb.String()\n}\n\nfunc (gac GGUFArchitectureDiffusionConditioner) String() string {\n\treturn gac.Architecture + \" (\" + gac.FileType.String() + \")\"\n}\n\nfunc (gaa GGUFArchitectureDiffusionAutoencoder) String() string {\n\treturn gaa.Architecture + \" (\" + gaa.FileType.String() + \")\"\n}\n\n// Architecture returns the architecture metadata of the GGUF file.\nfunc (gf *GGUFFile) Architecture() (ga GGUFArchitecture) {\n\tfor _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes {\n\t\tif gf.TensorInfos.Match(re) {\n\t\t\treturn gf.diffuserArchitecture()\n\t\t}\n\t}\n\tvar (\n\t\tgeneralTypeKey         = \"general.type\"\n\t\tgeneralArchitectureKey = \"general.architecture\"\n\n\t\tcontrolVectorModelHintKey = \"controlvector.model_hint\"\n\t)\n\tm, _ := gf.Header.MetadataKV.Index([]string{\n\t\tgeneralTypeKey,\n\t\tgeneralArchitectureKey,\n\t\tcontrolVectorModelHintKey,\n\t})\n\n\ttyp, arch := \"model\", \"llama\" // nolint: goconst\n\t{\n\t\tif v, ok := m[generalTypeKey]; ok {\n\t\t\ttyp = v.ValueString()\n\t\t}\n\t\tif v, ok := m[generalArchitectureKey]; ok {\n\t\t\tarch = v.ValueString()\n\t\t}\n\t}\n\n\tswitch {\n\tcase arch == \"clip\":\n\t\treturn gf.clipArchitecture()\n\tcase arch == 
\"controlvector\":\n\t\tarch = \"llama\"\n\t\tif v, ok := m[controlVectorModelHintKey]; ok {\n\t\t\tarch = v.ValueString()\n\t\t}\n\t\treturn gf.adapterArchitecture(arch)\n\tcase typ == \"adapter\":\n\t\treturn gf.adapterArchitecture(arch)\n\tcase typ == \"imatrix\":\n\t\treturn gf.imatrixArchitecture(arch)\n\t}\n\treturn gf.transformerArchitecture(arch)\n}\n\nfunc (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) {\n\tconst (\n\t\t// Diffusion\n\n\t\tsdKey                = \"model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight\" // SD 1.x/2.x\n\t\tsdKey2               = \"output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight\"\n\t\tsdXlKey              = \"model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight\" // SD XL\n\t\tsdXlKey2             = \"output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight\"\n\t\tsdXlRefinerKey       = \"model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight\" // SD XL Refiner\n\t\tsdXlRefinerKey2      = \"output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight\"\n\t\tsd3Key               = \"model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight\" // SD 3.x\n\t\tsd3Key2              = \"joint_blocks.23.x_block.attn.proj.weight\"\n\t\tsdInPaintFeatureKey  = \"model.diffusion_model.input_blocks.0.0.weight\" // SD in-paint feature\n\t\tsdInPaintFeatureKey2 = \"input_blocks.0.0.weight\"\n\n\t\tfluxKey             = \"model.diffusion_model.double_blocks.0.txt_attn.proj.weight\" // FLUX.1\n\t\tfluxKey2            = \"double_blocks.0.txt_attn.proj.weight\"\n\t\tfluxFillFeatureKey  = \"model.diffusion_model.img_in.weight\" // FLUX.1 Fill feature\n\t\tfluxFillFeatureKey2 = \"img_in.weight\"\n\n\t\t// Conditioner\n\n\t\topenAiClipVitL14Key  = \"cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight\" // OpenAI CLIP ViT-L/14\n\t\topenAiClipVitL14Key2 = 
\"text_model.encoder.layers.11.self_attn.k_proj.weight\"\n\t\topenClipVitH14Key    = \"cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight\" // OpenCLIP ViT-H/14\n\t\topenClipVitH14Key2   = \"text_model.encoder.layers.22.self_attn.k_proj.weight\"\n\t\topenClipVitG14Key    = \"cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight\" // OpenCLIP ViT-G/14\n\t\topenClipVitG14Key2   = \"text_model.encoder.layers.31.self_attn.k_proj.weight\"\n\t\tt5xxlKey             = \"cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight\" // Google T5-xxl\n\t\tt5xxlKey2            = \"cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight\"\n\t\tt5xxlKey3            = \"encoder.block.23.layer.0.SelfAttention.k.weight\"\n\t)\n\n\ttis, _ := gf.TensorInfos.Index([]string{\n\t\tsdKey,\n\t\tsdKey2,\n\t\tsdXlKey,\n\t\tsdXlKey2,\n\t\tsdXlRefinerKey,\n\t\tsdXlRefinerKey2,\n\t\tsd3Key,\n\t\tsd3Key2,\n\t\tsdInPaintFeatureKey,\n\t\tsdInPaintFeatureKey2,\n\n\t\tfluxKey,\n\t\tfluxKey2,\n\t\tfluxFillFeatureKey,\n\t\tfluxFillFeatureKey2,\n\n\t\topenAiClipVitL14Key,\n\t\topenAiClipVitL14Key2,\n\t\topenClipVitH14Key,\n\t\topenClipVitH14Key2,\n\t\topenClipVitG14Key,\n\t\topenClipVitG14Key2,\n\t\tt5xxlKey,\n\t\tt5xxlKey2,\n\t\tt5xxlKey3,\n\t})\n\n\tga.Type = \"model\"\n\tga.Architecture = \"diffusion\"\n\n\tif ti, ok := tis[sdKey]; ok {\n\t\tga.DiffusionArchitecture = \"Stable Diffusion 1.x\"\n\t\tif ti.Dimensions[0] == 1024 {\n\t\t\tga.DiffusionArchitecture = \"Stable Diffusion 2.x\"\n\t\t}\n\t\tif ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 {\n\t\t\tga.DiffusionArchitecture += \" InPaint\"\n\t\t}\n\t} else if ti, ok := tis[sdKey2]; ok {\n\t\tga.DiffusionArchitecture = \"Stable Diffusion 1.x\"\n\t\tif ti.Dimensions[0] == 1024 {\n\t\t\tga.DiffusionArchitecture = \"Stable Diffusion 2.x\"\n\t\t}\n\t\tif ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 
{\n\t\t\tga.DiffusionArchitecture += \" InPaint\"\n\t\t}\n\t} else if _, ok := tis[sdXlKey]; ok {\n\t\tga.DiffusionArchitecture = \"Stable Diffusion XL\"\n\t\tif _, ok = tis[sdXlRefinerKey]; ok {\n\t\t\tga.DiffusionArchitecture = \"Stable Diffusion XL Refiner\"\n\t\t}\n\t\tif ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 {\n\t\t\tga.DiffusionArchitecture += \" InPaint\"\n\t\t}\n\t} else if _, ok := tis[sdXlKey2]; ok {\n\t\tga.DiffusionArchitecture = \"Stable Diffusion XL\"\n\t\tif _, ok = tis[sdXlRefinerKey2]; ok {\n\t\t\tga.DiffusionArchitecture = \"Stable Diffusion XL Refiner\"\n\t\t}\n\t\tif ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 {\n\t\t\tga.DiffusionArchitecture += \" InPaint\"\n\t\t}\n\t} else if _, ok := tis[sd3Key]; ok {\n\t\tga.DiffusionArchitecture = \"Stable Diffusion 3.x\"\n\t\tga.DiffusionTransformer = true\n\t} else if _, ok := tis[sd3Key2]; ok {\n\t\tga.DiffusionArchitecture = \"Stable Diffusion 3.x\"\n\t\tga.DiffusionTransformer = true\n\t}\n\tif _, ok := tis[fluxKey]; ok {\n\t\tga.DiffusionArchitecture = \"FLUX.1\"\n\t\tga.DiffusionTransformer = true\n\t\tif ti, ok := tis[fluxFillFeatureKey]; ok && ti.Dimensions[0] == 384 {\n\t\t\tga.DiffusionArchitecture += \" Fill\"\n\t\t}\n\t} else if _, ok := tis[fluxKey2]; ok {\n\t\tga.DiffusionArchitecture = \"FLUX.1\"\n\t\tga.DiffusionTransformer = true\n\t\tif ti, ok := tis[fluxFillFeatureKey2]; ok && ti.Dimensions[0] == 384 {\n\t\t\tga.DiffusionArchitecture += \" Fill\"\n\t\t}\n\t}\n\n\tif ti, ok := tis[openAiClipVitL14Key]; ok {\n\t\tcond := GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"OpenAI CLIP ViT-L/14\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t}\n\t\tif ti, ok = tis[openClipVitH14Key]; ok {\n\t\t\tcond = GGUFArchitectureDiffusionConditioner{\n\t\t\t\tArchitecture: \"OpenCLIP ViT-H/14\",\n\t\t\t\tFileType:     ti.GetFileType(),\n\t\t\t}\n\t\t}\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, cond)\n\t} else if ti, ok := 
tis[openAiClipVitL14Key2]; ok {\n\t\tcond := GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"OpenAI CLIP ViT-L/14\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t}\n\t\tif ti, ok = tis[openClipVitH14Key2]; ok {\n\t\t\tcond = GGUFArchitectureDiffusionConditioner{\n\t\t\t\tArchitecture: \"OpenCLIP ViT-H/14\",\n\t\t\t\tFileType:     ti.GetFileType(),\n\t\t\t}\n\t\t}\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, cond)\n\t}\n\tif ti, ok := tis[openClipVitG14Key]; ok {\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"OpenCLIP ViT-G/14\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t})\n\t} else if ti, ok = tis[openClipVitG14Key2]; ok {\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"OpenCLIP ViT-G/14\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t})\n\t}\n\tif ti, ok := tis[t5xxlKey]; ok {\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"Google T5-xxl\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t})\n\t} else if ti, ok = tis[t5xxlKey2]; ok {\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"Google T5-xxl\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t})\n\t} else if ti, ok = tis[t5xxlKey3]; ok {\n\t\tga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{\n\t\t\tArchitecture: \"Google T5-xxl\",\n\t\t\tFileType:     ti.GetFileType(),\n\t\t})\n\t}\n\n\tfor _, re := range []*regexp.Regexp{\n\t\tregexp.MustCompile(`^first_stage_model\\..*`),\n\t\tregexp.MustCompile(`^decoder\\.conv_in\\..*`),\n\t} {\n\t\tif tis := gf.TensorInfos.Search(re); len(tis) != 0 {\n\t\t\tga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{\n\t\t\t\tArchitecture: ga.DiffusionArchitecture + \" 
VAE\",\n\t\t\t\tFileType:     GGUFTensorInfos(tis).GetFileType(),\n\t\t\t}\n\t\t\tbreak\n\t\t}\n\t}\n\n\treturn ga\n}\n\nfunc (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) {\n\tconst (\n\t\tprojectorTypeKey     = \"clip.projector_type\"\n\t\thasLLaVAProjectorKey = \"clip.has_llava_projector\"\n\t\thasMiniCPMVProjector = \"clip.has_minicpmv_projector\"\n\t\tminiCPMVVersionKey   = \"clip.minicpmv_version\"\n\t\tminiCPMVQueryNumKey  = \"clip.minicpmv_query_num\"\n\t\thasGLMProjectorKey   = \"clip.has_glm_projector\"\n\t\thasQwen2VLMergerKey  = \"clip.has_qwen2vl_merger\"\n\n\t\thasVisionEncoderKey                   = \"clip.has_vision_encoder\"\n\t\tvisionEmbeddingLengthKey              = \"clip.vision.embedding_length\"\n\t\tvisionBlockCountKey                   = \"clip.vision.block_count\"\n\t\tvisionFeedForwardLengthKey            = \"clip.vision.feed_forward_length\"\n\t\tvisionAttentionHeadCountKey           = \"clip.vision.attention.head_count\"\n\t\tvisionAttentionLayerNormRMSEpsilonKey = \"clip.vision.attention.layer_norm_epsilon\"\n\t\tvisionProjectionDimKey                = \"clip.vision.projection_dim\"\n\t\tvisionProjectorScaleFactorKey         = \"clip.vision.projector.scale_factor\"\n\t\tvisionImageSizeKey                    = \"clip.vision.image_size\"\n\t\tvisionPatchSizeKey                    = \"clip.vision.patch_size\"\n\t\tvisionMMPatchMergeTypeKey             = \"clip.vision.mm_patch_merge_type\"\n\t\tvisioSpatialMergeSizeKey              = \"clip.vision.spatial_merge_size\"\n\t\tvisionWindowAttentionPatternKey       = \"clip.vision.n_wa_pattern\"\n\n\t\thasAudioEncoderKey                   = \"clip.has_audio_encoder\"\n\t\taudioEmbeddingLengthKey              = \"clip.audio.embedding_length\"\n\t\taudioBlockCountKey                   = \"clip.audio.block_count\"\n\t\taudioFeedForwardLengthKey            = \"clip.audio.feed_forward_length\"\n\t\taudioAttentionHeadCountKey           = 
\"clip.audio.attention.head_count\"\n\t\taudioAttentionLayerNormRMSEpsilonKey = \"clip.audio.attention.layer_norm_epsilon\"\n\t\taudioProjectionDimKey                = \"clip.audio.projection_dim\"\n\t\taudioProjectorStackFactorKey         = \"clip.audio.projector.stack_factor\"\n\t\taudioNumMelBinsKey                   = \"clip.audio.num_mel_bins\"\n\t)\n\n\tga.Type = \"projector\"\n\tga.Architecture = \"clip\"\n\n\tm, _ := gf.Header.MetadataKV.Index([]string{\n\t\tprojectorTypeKey,\n\t\thasLLaVAProjectorKey,\n\t\thasMiniCPMVProjector,\n\t\tminiCPMVVersionKey,\n\t\tminiCPMVQueryNumKey,\n\t\thasGLMProjectorKey,\n\t\thasQwen2VLMergerKey,\n\t\t// Vision\n\t\thasVisionEncoderKey,\n\t\tvisionEmbeddingLengthKey,\n\t\tvisionBlockCountKey,\n\t\tvisionFeedForwardLengthKey,\n\t\tvisionAttentionHeadCountKey,\n\t\tvisionAttentionLayerNormRMSEpsilonKey,\n\t\tvisionProjectionDimKey,\n\t\tvisionProjectorScaleFactorKey,\n\t\tvisionImageSizeKey,\n\t\tvisionPatchSizeKey,\n\t\tvisionMMPatchMergeTypeKey,\n\t\tvisioSpatialMergeSizeKey,\n\t\tvisionWindowAttentionPatternKey,\n\t\t// Audio\n\t\thasAudioEncoderKey,\n\t\taudioEmbeddingLengthKey,\n\t\taudioBlockCountKey,\n\t\taudioFeedForwardLengthKey,\n\t\taudioAttentionHeadCountKey,\n\t\taudioAttentionLayerNormRMSEpsilonKey,\n\t\taudioProjectionDimKey,\n\t\taudioProjectorStackFactorKey,\n\t\taudioNumMelBinsKey,\n\t})\n\n\tif v, ok := m[projectorTypeKey]; ok {\n\t\tga.ClipProjectorType = v.ValueString()\n\t} else {\n\t\tga.ClipProjectorType = \"mlp\"\n\t}\n\tif v, ok := m[hasLLaVAProjectorKey]; ok {\n\t\tga.ClipHasLLaVAProjector = v.ValueBool()\n\t}\n\tif v, ok := m[hasMiniCPMVProjector]; ok {\n\t\tga.ClipHasMiniCPMVProjector = v.ValueBool()\n\t}\n\tif v, ok := m[miniCPMVVersionKey]; ok {\n\t\tga.ClipMiniCPMVVersion = ValueNumeric[int32](v)\n\t}\n\tif v, ok := m[miniCPMVQueryNumKey]; ok {\n\t\tga.ClipMiniCPMVQueryNum = ValueNumeric[int32](v)\n\t}\n\tif v, ok := m[hasGLMProjectorKey]; ok {\n\t\tga.ClipHasGLMProjector = 
v.ValueBool()\n\t}\n\tif v, ok := m[hasQwen2VLMergerKey]; ok {\n\t\tga.ClipHasQwen2VLMerger = v.ValueBool()\n\t}\n\t// Vision\n\tif v, ok := m[hasVisionEncoderKey]; ok {\n\t\tga.ClipHasVisionEncoder = v.ValueBool()\n\t}\n\tif v, ok := m[visionEmbeddingLengthKey]; ok {\n\t\tga.ClipVisionEmbeddingLength = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[visionBlockCountKey]; ok {\n\t\tga.ClipVisionBlockCount = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[visionFeedForwardLengthKey]; ok {\n\t\tif v.ValueType == GGUFMetadataValueTypeArray {\n\t\t\tga.ClipVisionFeedForwardLength = ValuesNumeric[uint64](v.ValueArray())\n\t\t} else {\n\t\t\tvx := ValueNumeric[uint64](v)\n\t\t\tga.ClipVisionFeedForwardLength = make([]uint64, ga.ClipVisionBlockCount)\n\t\t\tfor i := range ga.ClipVisionFeedForwardLength {\n\t\t\t\tga.ClipVisionFeedForwardLength[i] = vx\n\t\t\t}\n\t\t}\n\t}\n\tif v, ok := m[visionAttentionHeadCountKey]; ok {\n\t\tga.ClipVisionAttentionHeadCount = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok {\n\t\tga.ClipVisionAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[visionImageSizeKey]; ok {\n\t\tga.ClipVisionImageSize = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[visionProjectionDimKey]; ok {\n\t\tga.ClipVisionProjectionDim = ValueNumeric[uint32](v)\n\t}\n\tga.ClipVisionProjectorScaleFactor = 1\n\tif ga.ClipProjectorType == \"gemma3\" {\n\t\tga.ClipVisionProjectorScaleFactor = 4\n\t}\n\tif v, ok := m[visionProjectorScaleFactorKey]; ok {\n\t\tga.ClipVisionProjectorScaleFactor = ValueNumeric[uint32](v)\n\t}\n\tga.ClipVisionPatchSize = 1\n\tif v, ok := m[visionPatchSizeKey]; ok {\n\t\tga.ClipVisionPatchSize = ValueNumeric[uint32](v)\n\t}\n\tga.ClipVisionMMPatchMergeType = \"flat\"\n\tif v, ok := m[visionMMPatchMergeTypeKey]; ok {\n\t\tga.ClipVisionMMPatchMergeType = v.ValueString()\n\t}\n\tif v, ok := m[visioSpatialMergeSizeKey]; ok {\n\t\tga.ClipVisionSpatialMergeSize = 
ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[visionWindowAttentionPatternKey]; ok {\n\t\tga.ClipVisionWindowAttentionPattern = ValueNumeric[uint32](v)\n\t}\n\t// Audio\n\tif v, ok := m[hasAudioEncoderKey]; ok {\n\t\tga.ClipHasAudioEncoder = v.ValueBool()\n\t}\n\tif v, ok := m[audioEmbeddingLengthKey]; ok {\n\t\tga.ClipAudioEmbeddingLength = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[audioBlockCountKey]; ok {\n\t\tga.ClipAudioBlockCount = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[audioFeedForwardLengthKey]; ok {\n\t\tif v.ValueType == GGUFMetadataValueTypeArray {\n\t\t\tga.ClipAudioFeedForwardLength = ValuesNumeric[uint64](v.ValueArray())\n\t\t} else {\n\t\t\tvx := ValueNumeric[uint64](v)\n\t\t\tga.ClipAudioFeedForwardLength = make([]uint64, ga.ClipAudioBlockCount)\n\t\t\tfor i := range ga.ClipAudioFeedForwardLength {\n\t\t\t\tga.ClipAudioFeedForwardLength[i] = vx\n\t\t\t}\n\t\t}\n\t}\n\tif v, ok := m[audioAttentionHeadCountKey]; ok {\n\t\tga.ClipAudioAttentionHeadCount = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[audioAttentionLayerNormRMSEpsilonKey]; ok {\n\t\tga.ClipAudioAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[audioProjectionDimKey]; ok {\n\t\tga.ClipAudioProjectionDim = ValueNumeric[uint32](v)\n\t}\n\tga.ClipAudioProjectorStackFactor = 1\n\tif v, ok := m[audioProjectorStackFactorKey]; ok {\n\t\tga.ClipAudioProjectorStackFactor = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[audioNumMelBinsKey]; ok {\n\t\tga.ClipAudioNumMelBins = ValueNumeric[uint32](v)\n\t}\n\n\tga.AttentionHeadCountKV = ga.AttentionHeadCount\n\n\treturn ga\n}\n\nfunc (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitecture) {\n\tvar (\n\t\ttypeKey = \"adapter.type\"\n\n\t\tloraAlphaKey = \"adapter.lora.alpha\"\n\n\t\tcontrolVectorLayerCountKey  = \"adapter.control_vector.layer_count\"\n\t\tcontrolVectorLayerCountKey2 = \"control_vector.layer_count\"\n\t)\n\n\tga.Type = \"adapter\"\n\tga.Architecture = arch\n\n\tm, _ := 
gf.Header.MetadataKV.Index([]string{\n\t\ttypeKey,\n\t\tloraAlphaKey,\n\t\tcontrolVectorLayerCountKey,\n\t\tcontrolVectorLayerCountKey2,\n\t})\n\n\tif v, ok := m[typeKey]; ok {\n\t\tga.AdapterType = v.ValueString()\n\t}\n\tif v, ok := m[loraAlphaKey]; ok {\n\t\tga.AdapterLoRAAlpha = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[controlVectorLayerCountKey]; ok {\n\t\tga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v)\n\t} else if v, ok := m[controlVectorLayerCountKey2]; ok {\n\t\tga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v)\n\t}\n\n\treturn ga\n}\n\nfunc (gf *GGUFFile) imatrixArchitecture(_ string) (ga GGUFArchitecture) {\n\tga.Type = \"imatrix\"\n\tga.Architecture = \"imatrix\"\n\n\treturn ga\n}\n\nfunc (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) {\n\tvar (\n\t\tcontextLengthKey     = arch + \".context_length\"\n\t\tembeddingLengthKey   = arch + \".embedding_length\"\n\t\tblockCountKey        = arch + \".block_count\"\n\t\tfeedForwardLengthKey = arch + \".feed_forward_length\"\n\n\t\texpertFeedForwardLengthKey       = arch + \".expert_feed_forward_length\"\n\t\texpertSharedFeedForwardLengthKey = arch + \".expert_shared_feed_forward_length\"\n\t\texpertCountKey                   = arch + \".expert_count\"\n\t\texpertUsedCountKey               = arch + \".expert_used_count\"\n\t\texpertSharedCountKey             = arch + \".expert_shared_count\"\n\n\t\tattentionHeadCountKey           = arch + \".attention.head_count\"\n\t\tattentionHeadCountKVKey         = arch + \".attention.head_count_kv\"\n\t\tattentionSlidingWindowKey       = arch + \".attention.sliding_window\"\n\t\tattentionMaxALiBIBiasKey        = arch + \".attention.max_alibi_bias\"\n\t\tattentionMaxALiBIBiasKey2       = arch + \".attention.alibi_bias_max\"\n\t\tattentionClampKQVKey            = arch + \".attention.clamp_kqv\"\n\t\tattentionClampKQVKey2           = arch + \".attention.clip_kqv\"\n\t\tattentionLayerNormEpsilonKey    = arch + 
\".attention.layer_norm_epsilon\"\n\t\tattentionLayerNormRMSEpsilonKey = arch + \".attention.layer_norm_rms_epsilon\"\n\t\tattentionQueryLORARankKey       = arch + \".attention.q_lora_rank\"\n\t\tattentionKeyValueLORARankKey    = arch + \".attention.kv_lora_rank\"\n\t\tattentionKeyLengthKey           = arch + \".attention.key_length\"\n\t\tattentionKeyLengthMLAKey        = arch + \".attention.key_length_mla\"\n\t\tattentionValueLengthKey         = arch + \".attention.value_length\"\n\t\tattentionValueLengthMLAKey      = arch + \".attention.value_length_mla\"\n\t\tattentionCausalKey              = arch + \".attention.causal\"\n\n\t\tropeDimensionCountKey         = arch + \".rope.dimension_count\"\n\t\tropeFrequencyBaseKey          = arch + \".rope.freq_base\"\n\t\tropeFrequencyScaleKey         = arch + \".rope.freq_scale\"\n\t\tropeScaleLinearKey            = arch + \".rope.scale_linear\"\n\t\tropeScalingTypeKey            = arch + \".rope.scaling.type\"\n\t\tropeScalingFactorKey          = arch + \".rope.scaling.factor\"\n\t\tropeScalingOriginalContextKey = arch + \".rope.scaling.original_context_length\" // uint32 maybe\n\t\tropeScalingFinetunedKey       = arch + \".rope.scaling.finetuned\"\n\n\t\tpoolingTypeKey = arch + \".pooling_type\"\n\n\t\tssmConvolutionKernelKey = arch + \".ssm.conv_kernel\"\n\t\tssmInnerSizeKey         = arch + \".ssm.inner_size\"\n\t\tssmStateSizeKey         = arch + \".ssm.state_size\"\n\t\tssmTimeStepRankKey      = arch + \".ssm.time_step_rank\"\n\t\tssmGroupCountKey        = arch + \".ssm.group_count\"\n\n\t\trwkvHeadSizeKey                = arch + \".wkv.head_size\"\n\t\trwkvRescaleEveryNLayersKey     = arch + \".rescale_every_n_layers\"\n\t\trwkvTimeMixExtraDimensionKey   = arch + \".time_mix_extra_dim\"\n\t\trwkvTimeDecayExtraDimensionKey = arch + \".time_decay_extra_dim\"\n\t\trwkvTokenShiftCountKey         = arch + \".token_shift_count\"\n\n\t\tvocabularyLengthKey    = arch + \".vocab_size\"\n\t\ttokenizerGGMLTokensKey = 
\"tokenizer.ggml.tokens\"\n\t)\n\n\tga.Type = \"model\"\n\tga.Architecture = arch\n\n\tm, _ := gf.Header.MetadataKV.Index([]string{\n\t\tcontextLengthKey,\n\t\tembeddingLengthKey,\n\t\tblockCountKey,\n\t\tfeedForwardLengthKey,\n\t\texpertFeedForwardLengthKey,\n\t\texpertSharedFeedForwardLengthKey,\n\t\texpertCountKey,\n\t\texpertUsedCountKey,\n\t\texpertSharedCountKey,\n\t\tattentionHeadCountKey,\n\t\tattentionHeadCountKVKey,\n\t\tattentionSlidingWindowKey,\n\t\tattentionMaxALiBIBiasKey,\n\t\tattentionMaxALiBIBiasKey2,\n\t\tattentionClampKQVKey,\n\t\tattentionClampKQVKey2,\n\t\tattentionLayerNormEpsilonKey,\n\t\tattentionLayerNormRMSEpsilonKey,\n\t\tattentionQueryLORARankKey,\n\t\tattentionKeyValueLORARankKey,\n\t\tattentionKeyLengthKey,\n\t\tattentionKeyLengthMLAKey,\n\t\tattentionValueLengthKey,\n\t\tattentionValueLengthMLAKey,\n\t\tattentionCausalKey,\n\t\tropeDimensionCountKey,\n\t\tropeFrequencyBaseKey,\n\t\tropeFrequencyScaleKey,\n\t\tropeScaleLinearKey,\n\t\tropeScalingTypeKey,\n\t\tropeScalingFactorKey,\n\t\tropeScalingOriginalContextKey,\n\t\tropeScalingFinetunedKey,\n\t\tpoolingTypeKey,\n\t\tssmConvolutionKernelKey,\n\t\tssmInnerSizeKey,\n\t\tssmStateSizeKey,\n\t\tssmTimeStepRankKey,\n\t\tssmGroupCountKey,\n\t\trwkvHeadSizeKey,\n\t\trwkvRescaleEveryNLayersKey,\n\t\trwkvTimeMixExtraDimensionKey,\n\t\trwkvTimeDecayExtraDimensionKey,\n\t\trwkvTokenShiftCountKey,\n\t\tvocabularyLengthKey,\n\t\ttokenizerGGMLTokensKey,\n\t})\n\n\tif v, ok := m[contextLengthKey]; ok {\n\t\tga.MaximumContextLength = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[embeddingLengthKey]; ok {\n\t\tga.EmbeddingLength = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[blockCountKey]; ok {\n\t\tga.BlockCount = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[feedForwardLengthKey]; ok {\n\t\tif v.ValueType == GGUFMetadataValueTypeArray {\n\t\t\tga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray())\n\t\t} else {\n\t\t\tvx := ValueNumeric[uint64](v)\n\t\t\tga.FeedForwardLength = 
make([]uint64, ga.BlockCount)\n\t\t\tfor i := range ga.FeedForwardLength {\n\t\t\t\tga.FeedForwardLength[i] = vx\n\t\t\t}\n\t\t}\n\t}\n\n\tif v, ok := m[expertCountKey]; ok {\n\t\tga.ExpertCount = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[expertUsedCountKey]; ok {\n\t\tga.ExpertUsedCount = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[expertSharedCountKey]; ok {\n\t\tga.ExpertSharedCount = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[expertFeedForwardLengthKey]; ok {\n\t\tga.ExpertFeedForwardLength = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[expertSharedFeedForwardLengthKey]; ok {\n\t\tga.ExpertSharedFeedForwardLength = ValueNumeric[uint64](v)\n\t}\n\n\tif v, ok := m[attentionHeadCountKey]; ok {\n\t\tif v.ValueType == GGUFMetadataValueTypeArray {\n\t\t\tga.AttentionHeadCount = ValuesNumeric[uint64](v.ValueArray())[0]\n\t\t} else {\n\t\t\tga.AttentionHeadCount = ValueNumeric[uint64](v)\n\t\t}\n\t}\n\tif v, ok := m[attentionHeadCountKVKey]; ok {\n\t\tif v.ValueType == GGUFMetadataValueTypeArray {\n\t\t\tga.AttentionHeadCountKV = ValuesNumeric[uint64](v.ValueArray())[0]\n\t\t} else {\n\t\t\tga.AttentionHeadCountKV = ValueNumeric[uint64](v)\n\t\t}\n\t} else {\n\t\tga.AttentionHeadCountKV = ga.AttentionHeadCount\n\t}\n\tga.AttentionSlidingWindowPattern = 1\n\tif v, ok := m[attentionSlidingWindowKey]; ok {\n\t\tif v.ValueType == GGUFMetadataValueTypeArray {\n\t\t\tga.AttentionSlidingWindow = ValuesNumeric[uint64](v.ValueArray())[0]\n\t\t} else {\n\t\t\tga.AttentionSlidingWindow = ValueNumeric[uint64](v)\n\t\t}\n\t}\n\tswitch arch {\n\tcase \"llama4\":\n\t\tif ga.AttentionSlidingWindow == 0 {\n\t\t\tga.AttentionSlidingWindow = 8192\n\t\t}\n\t\tga.AttentionSlidingWindowPattern = 4\n\tcase \"phi3\":\n\t\t// See https://github.com/ggml-org/llama.cpp/pull/13676\n\t\tga.AttentionSlidingWindow = 0\n\tcase \"gemma2\":\n\t\tif ga.AttentionSlidingWindow == 0 {\n\t\t\tga.AttentionSlidingWindow = 4096\n\t\t}\n\t\tga.AttentionSlidingWindowPattern = 2\n\tcase 
\"gemma3\":\n\t\tga.AttentionSlidingWindowPattern = 6\n\tcase \"cohere2\":\n\t\tga.AttentionSlidingWindowPattern = 4\n\t}\n\tif v, ok := m[attentionMaxALiBIBiasKey]; ok {\n\t\tga.AttentionMaxALiBIBias = ValueNumeric[float32](v)\n\t} else if v, ok := m[attentionMaxALiBIBiasKey2]; ok {\n\t\tga.AttentionMaxALiBIBias = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[attentionClampKQVKey]; ok {\n\t\tga.AttentionClampKQV = ValueNumeric[float32](v)\n\t} else if v, ok := m[attentionClampKQVKey2]; ok {\n\t\tga.AttentionClampKQV = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[attentionLayerNormEpsilonKey]; ok {\n\t\tga.AttentionLayerNormEpsilon = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[attentionLayerNormRMSEpsilonKey]; ok {\n\t\tga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[attentionQueryLORARankKey]; ok {\n\t\tga.AttentionQueryLORARank = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[attentionKeyValueLORARankKey]; ok {\n\t\tga.AttentionKeyValueLORARank = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[attentionKeyLengthKey]; ok {\n\t\tga.AttentionKeyLength = ValueNumeric[uint32](v)\n\t} else if ga.AttentionHeadCount != 0 {\n\t\tga.AttentionKeyLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount)\n\t}\n\tif v, ok := m[attentionKeyLengthMLAKey]; ok {\n\t\tga.AttentionKeyLengthMLA = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[attentionValueLengthKey]; ok {\n\t\tga.AttentionValueLength = ValueNumeric[uint32](v)\n\t} else if ga.AttentionHeadCount != 0 {\n\t\tga.AttentionValueLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount)\n\t}\n\tif v, ok := m[attentionValueLengthMLAKey]; ok {\n\t\tga.AttentionValueLengthMLA = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[attentionCausalKey]; ok {\n\t\tga.AttentionCausal = v.ValueBool()\n\t} else {\n\t\tga.AttentionCausal = true\n\t}\n\t// See https://github.com/ggml-org/llama.cpp/blob/6491d6e4f1caf0ad2221865b4249ae6938a6308c/src/llama-arch.cpp#L1913-L1924.\n\tga.AttentionRecurrent = 
slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata.\n\t\t\"mamba\",\n\t\t\"mamba2\",\n\t\t\"rwkv6\",\n\t\t\"rwkv6qwen2\",\n\t\t\"rwkv7\",\n\t\t\"arwkv7\",\n\t}, ga.Architecture)\n\t// See https://github.com/ggml-org/llama.cpp/blob/a57d1bcb3c0165ac87b1f0dbb429839b0da69689/src/llama-arch.cpp#L2029-L2038.\n\tga.AttentionHybrid = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata.\n\t\t\"jamba\",\n\t\t\"falcon-h1\",\n\t\t\"granitehybrid\",\n\t}, ga.Architecture)\n\tga.AttentionRecurrent = ga.AttentionHybrid || ga.AttentionRecurrent\n\n\tif v, ok := m[ropeDimensionCountKey]; ok {\n\t\tga.RoPEDimensionCount = ValueNumeric[uint64](v)\n\t}\n\tga.RoPEFrequencyBase = 10000.0\n\tif v, ok := m[ropeFrequencyBaseKey]; ok {\n\t\tga.RoPEFrequencyBase = ValueNumeric[float32](v)\n\t}\n\tga.RoPEFrequencyScale = 1.0\n\tif v, ok := m[ropeFrequencyScaleKey]; ok {\n\t\tga.RoPEFrequencyScale = ValueNumeric[float32](v)\n\t}\n\tif v, ok := m[ropeScalingTypeKey]; ok {\n\t\tga.RoPEScalingType = v.ValueString()\n\t}\n\tif v, ok := m[ropeScaleLinearKey]; ok {\n\t\tga.RoPEScalingType = \"linear\"\n\t\tga.RoPEScalingFactor = ValueNumeric[float32](v)\n\t\tif ga.RoPEScalingFactor != 0 {\n\t\t\tga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor\n\t\t}\n\t}\n\tif v, ok := m[ropeScalingFactorKey]; ok {\n\t\tga.RoPEScalingFactor = ValueNumeric[float32](v)\n\t\tif ga.RoPEScalingFactor != 0 {\n\t\t\tga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor\n\t\t}\n\t}\n\tif v, ok := m[ropeScalingOriginalContextKey]; ok {\n\t\tga.RoPEScalingOriginalContextLength = ValueNumeric[uint64](v)\n\t}\n\tif v, ok := m[ropeScalingFinetunedKey]; ok {\n\t\tga.RoPEScalingFinetuned = v.ValueBool()\n\t}\n\n\tif v, ok := m[poolingTypeKey]; ok {\n\t\tga.PoolingType = v.ValueUint32()\n\t\tif ga.AttentionCausal && ga.PoolingType > 2 {\n\t\t\tga.AttentionCausal = false\n\t\t}\n\t}\n\n\tif v, ok := m[ssmConvolutionKernelKey]; ok {\n\t\tga.SSMConvolutionKernel = 
ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[ssmInnerSizeKey]; ok {\n\t\tga.SSMInnerSize = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[ssmStateSizeKey]; ok {\n\t\tga.SSMStateSize = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[ssmTimeStepRankKey]; ok {\n\t\tga.SSMTimeStepRank = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[ssmGroupCountKey]; ok {\n\t\tga.SSMGroupCount = ValueNumeric[uint32](v)\n\t}\n\n\tif v, ok := m[rwkvHeadSizeKey]; ok {\n\t\tga.RWKVHeadSize = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[rwkvRescaleEveryNLayersKey]; ok {\n\t\tga.RWKVRescaleEveryNLayers = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[rwkvTimeMixExtraDimensionKey]; ok {\n\t\tga.RWKVTimeMixExtraDimension = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[rwkvTimeDecayExtraDimensionKey]; ok {\n\t\tga.RWKVTimeDecayExtraDimension = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[rwkvTokenShiftCountKey]; ok {\n\t\tga.RWKVTokenShiftCount = ValueNumeric[uint32](v)\n\t} else if ga.AttentionRecurrent {\n\t\tga.RWKVTokenShiftCount = 2\n\t}\n\n\tif v, ok := m[vocabularyLengthKey]; ok {\n\t\tga.VocabularyLength = ValueNumeric[uint64](v)\n\t} else if v, ok := m[tokenizerGGMLTokensKey]; ok {\n\t\tga.VocabularyLength = v.ValueArray().Len\n\t}\n\n\treturn ga\n}\n"
  },
  {
    "path": "file_architecture_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_Architecture(t *testing.T) {\n\tctx := context.Background()\n\n\tf, err := ParseGGUFFileFromHuggingFace(\n\t\tctx,\n\t\t\"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\",\n\t\t\"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf\",\n\t\tSkipLargeMetadata())\n\tif err != nil {\n\t\tt.Fatal(err)\n\t\treturn\n\t}\n\n\tt.Log(\"\\n\", spew.Sdump(f.Architecture()), \"\\n\")\n}\n\nfunc BenchmarkGGUFFile_Architecture(b *testing.B) {\n\tmp, ok := os.LookupEnv(\"TEST_MODEL_PATH\")\n\tif !ok {\n\t\tb.Skip(\"TEST_MODEL_PATH is not set\")\n\t\treturn\n\t}\n\n\tf, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap())\n\tif err != nil {\n\t\tb.Fatal(err)\n\t\treturn\n\t}\n\n\tb.ReportAllocs()\n\n\tb.ResetTimer()\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f.Architecture()\n\t}\n}\n"
  },
  {
    "path": "file_estimate__llamacpp.go",
    "content": "package gguf_parser\n\nimport (\n\t\"math\"\n\t\"regexp\"\n\t\"slices\"\n\t\"strings\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/anyx\"\n\t\"github.com/gpustack/gguf-parser-go/util/ptr\"\n\t\"github.com/gpustack/gguf-parser-go/util/slicex\"\n)\n\n// Types for LLaMACpp estimation.\ntype (\n\t// LLaMACppRunEstimate represents the estimated result of loading the GGUF file in llama.cpp.\n\tLLaMACppRunEstimate struct {\n\t\t// Type describes what type this GGUF file is.\n\t\tType string `json:\"type\"`\n\t\t// Architecture describes what architecture this GGUF file implements.\n\t\t//\n\t\t// All lowercase ASCII.\n\t\tArchitecture string `json:\"architecture\"`\n\t\t// ClipProjectorType is the type of the projector used in the clip model.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipProjectorType string `json:\"clipProjectorType,omitempty\"`\n\t\t// AdapterType is the type of the adapter.\n\t\t//\n\t\t// Only used when Architecture is \"adapter\".\n\t\tAdapterType string `json:\"adapterType,omitempty\"`\n\t\t// FlashAttention is the flag to indicate whether enable the flash attention,\n\t\t// true for enable.\n\t\tFlashAttention bool `json:\"flashAttention\"`\n\t\t// ContextSize is the size of the context.\n\t\tContextSize uint64 `json:\"contextSize\"`\n\t\t// OffloadLayers is the number of offloaded layers.\n\t\tOffloadLayers uint64 `json:\"offloadLayers\"`\n\t\t// FullOffloaded is the flag to indicate whether the layers are fully offloaded,\n\t\t// false for partial offloaded or zero offloaded.\n\t\tFullOffloaded bool `json:\"fullOffloaded\"`\n\t\t// NoMMap is the flag to indicate whether support the mmap,\n\t\t// true for support.\n\t\tNoMMap bool `json:\"noMMap\"`\n\t\t// EmbeddingOnly is the flag to indicate whether the model is used for embedding only,\n\t\t// true for embedding only.\n\t\tEmbeddingOnly bool `json:\"embeddingOnly\"`\n\t\t// Reranking is the flag to indicate whether the model is used for reranking,\n\t\t// 
true for reranking.\n\t\t//\n\t\t// Only available when EmbeddingOnly is true.\n\t\tReranking bool `json:\"reranking\"`\n\t\t// Distributable is the flag to indicate whether the model is distributable,\n\t\t// true for distributable.\n\t\tDistributable bool `json:\"distributable\"`\n\t\t// LogicalBatchSize is the logical batch size.\n\t\tLogicalBatchSize int32 `json:\"logicalBatchSize\"`\n\t\t// PhysicalBatchSize is the physical batch size.\n\t\tPhysicalBatchSize int32 `json:\"physicalBatchSize\"`\n\t\t// Devices represents the usage for running the GGUF file,\n\t\t// the first device is the CPU, and the rest are GPUs.\n\t\tDevices []LLaMACppRunDeviceUsage `json:\"devices\"`\n\t\t// Drafter is the estimated result of drafter.\n\t\tDrafter *LLaMACppRunEstimate `json:\"drafter,omitempty\"`\n\t\t// Projector is the estimated result of multimodal projector.\n\t\tProjector *LLaMACppRunEstimate `json:\"projector,omitempty\"`\n\t\t// Adapters is the estimated result of adapters.\n\t\tAdapters []LLaMACppRunEstimate `json:\"adapters,omitempty\"`\n\t\t// MaximumTokensPerSecond represents the maximum tokens per second for running the GGUF file.\n\t\tMaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:\"maximumTokensPerSecond,omitempty\"`\n\t}\n\n\t// LLaMACppRunDeviceUsage represents the usage for running the GGUF file in llama.cpp.\n\tLLaMACppRunDeviceUsage struct {\n\t\t// HandleLayers is the number of layers that the device can handle.\n\t\tHandleLayers uint64 `json:\"handleLayers\"`\n\t\t// HandleSWALayers is the number of layers that the device can handle in sliding window attention (SWA),\n\t\t// the non SWA layers is `HandleLayers - HandleSWALayers`.\n\t\tHandleSWALayers uint64 `json:\"handleSWALayers\"`\n\t\t// HandleLastLayer is the index of the last layer the device can handle,\n\t\t// -1 means the device does not handle the last layer.\n\t\tHandleLastLayer int `json:\"handleLastLayer\"`\n\t\t// HandleOutputLayer is the flag to indicate whether the device can 
handle the output layer,\n\t\t// true for handle.\n\t\tHandleOutputLayer bool `json:\"handleOutputLayer\"`\n\t\t// Remote is the flag to indicate whether the device is remote,\n\t\t// true for remote.\n\t\tRemote bool `json:\"remote\"`\n\t\t// Position is the relative position of the device,\n\t\t// starts from 0.\n\t\t//\n\t\t// If Remote is true, Position is the position of the remote devices,\n\t\t// Otherwise, Position is the position of the device in the local devices.\n\t\tPosition int `json:\"position\"`\n\t\t// Endpoint is the endpoint of the remote device, empty for local devices.\n\t\tEndpoint string `json:\"endpoint,omitempty\"`\n\t\t// Footprint is the memory footprint for bootstrapping.\n\t\tFootprint GGUFBytesScalar `json:\"footprint\"`\n\t\t// Parameter is the running parameters that the device processes.\n\t\tParameter LLaMACppParameterUsage `json:\"parameter\"`\n\t\t// Weight is the memory usage of weights that the device loads.\n\t\tWeight LLaMACppWeightMemoryUsage `json:\"weight\"`\n\t\t// KVCache is the memory usage of kv that the device caches.\n\t\tKVCache LLaMACppKVCacheMemoryUsage `json:\"kvCache\"`\n\t\t// Computation is the memory usage of computation that the device processes.\n\t\tComputation LLaMACppComputationMemoryUsage `json:\"computation\"`\n\t}\n\n\t// LLaMACppParameterUsage represents the parameter usage for running the GGUF file in llama.cpp.\n\tLLaMACppParameterUsage struct {\n\t\t// KVCache is the parameter usage for caching previous KV.\n\t\tKVCache GGUFParametersScalar `json:\"kvCache\"`\n\t\t// Input is the parameter usage for input tensors.\n\t\tInput GGUFParametersScalar `json:\"input\"`\n\t\t// Compute is the parameter usage for compute tensors.\n\t\tCompute GGUFParametersScalar `json:\"compute\"`\n\t\t// ComputeOverridden is the parameter usage for overridden compute tensors.\n\t\tComputeOverridden GGUFParametersScalar `json:\"computeOverridden\"`\n\t\t// Output is the parameter usage for output tensors.\n\t\tOutput 
GGUFParametersScalar `json:\"output\"`\n\t}\n\n\t// LLaMACppWeightMemoryUsage represents the memory usage of loading weights in llama.cpp.\n\tLLaMACppWeightMemoryUsage struct {\n\t\t// Input is the memory usage for loading input tensors.\n\t\tInput GGUFBytesScalar `json:\"input\"`\n\t\t// Compute is the memory usage for loading compute tensors.\n\t\tCompute GGUFBytesScalar `json:\"compute\"`\n\t\t// ComputeOverridden is the memory usage for loading overridden compute tensors.\n\t\tComputeOverridden GGUFBytesScalar `json:\"computeOverridden\"`\n\t\t// Output is the memory usage for loading output tensors.\n\t\tOutput GGUFBytesScalar `json:\"output\"`\n\t}\n\n\t// LLaMACppKVCacheMemoryUsage represents the memory usage of caching previous KV in llama.cpp.\n\tLLaMACppKVCacheMemoryUsage struct {\n\t\t// Key is the memory usage for caching previous keys.\n\t\tKey GGUFBytesScalar `json:\"key\"`\n\t\t// Value is the memory usage for caching previous values.\n\t\tValue GGUFBytesScalar `json:\"value\"`\n\t}\n\n\t// LLaMACppComputationMemoryUsage represents the memory usage of computation in llama.cpp.\n\tLLaMACppComputationMemoryUsage struct {\n\t\t// Footprint is the memory footprint for computation.\n\t\tFootprint GGUFBytesScalar `json:\"footprint\"`\n\t\t// Input is the memory usage for input.\n\t\tInput GGUFBytesScalar `json:\"input\"`\n\t\t// Compute is the memory usage for computation.\n\t\tCompute GGUFBytesScalar `json:\"graph\"`\n\t\t// Output is the memory usage for output.\n\t\tOutput GGUFBytesScalar `json:\"output\"`\n\t}\n)\n\n// EstimateLLaMACppRun estimates the usages of the GGUF file in llama.cpp.\nfunc (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate) {\n\t// Options\n\tvar o _GGUFRunEstimateOptions\n\tfor _, opt := range opts {\n\t\topt(&o)\n\t}\n\tswitch {\n\tcase o.TensorSplitFraction == nil:\n\t\to.TensorSplitFraction = []float64{1}\n\t\to.MainGPUIndex = 0\n\tcase o.MainGPUIndex < 0 || o.MainGPUIndex >= 
len(o.TensorSplitFraction):\n\t\tpanic(\"main device index must be range of 0 to the length of tensor split fraction\")\n\t}\n\tif len(o.DeviceMetrics) > 0 {\n\t\tfor i, j := 0, len(o.DeviceMetrics)-1; i < len(o.TensorSplitFraction)-j; i++ {\n\t\t\to.DeviceMetrics = append(o.DeviceMetrics, o.DeviceMetrics[j])\n\t\t}\n\t\to.DeviceMetrics = o.DeviceMetrics[:len(o.TensorSplitFraction)+1]\n\t}\n\tif o.LMCCacheKeyType == nil {\n\t\to.LMCCacheKeyType = ptr.To(GGMLTypeF16)\n\t}\n\tif o.LMCCacheValueType == nil {\n\t\to.LMCCacheValueType = ptr.To(GGMLTypeF16)\n\t}\n\tif o.LMCOffloadKVCache == nil {\n\t\to.LMCOffloadKVCache = ptr.To(true)\n\t}\n\tif o.LMCLogicalBatchSize == nil {\n\t\to.LMCLogicalBatchSize = ptr.To(int32(2048))\n\t} else {\n\t\t// See https://github.com/ggerganov/llama.cpp/blob/0bf16de07b0692e7df26b9a633e232bbd66e0360/src/llama.cpp#L16519-L16525.\n\t\to.LMCLogicalBatchSize = ptr.To(max(32, *o.LMCLogicalBatchSize))\n\t}\n\tif o.LMCPhysicalBatchSize == nil {\n\t\to.LMCPhysicalBatchSize = ptr.To(int32(512))\n\t}\n\tif *o.LMCPhysicalBatchSize > *o.LMCLogicalBatchSize {\n\t\tpanic(\"physical batch size must be less than or equal to logical batch size\")\n\t}\n\tif o.LMCSplitMode >= _LLAMACppSplitModeMax {\n\t\tpanic(\"split mode must be less than max\")\n\t}\n\n\t// Devices.\n\te.Devices = make([]LLaMACppRunDeviceUsage, len(o.TensorSplitFraction)+1)\n\tfor i := range e.Devices {\n\t\te.Devices[i].HandleLastLayer = -1\n\t}\n\tfor j := range e.Devices[1:] {\n\t\te.Devices[j+1].Remote = j < len(o.RPCServers)\n\t\tif e.Devices[j+1].Remote {\n\t\t\te.Devices[j+1].Position = j\n\t\t\te.Devices[j+1].Endpoint = o.RPCServers[j]\n\t\t} else {\n\t\t\te.Devices[j+1].Position = j - len(o.RPCServers)\n\t\t}\n\t}\n\n\t// Metadata.\n\ta := gf.Architecture()\n\te.Type = a.Type\n\te.Architecture = a.Architecture\n\te.ClipProjectorType = a.ClipProjectorType\n\te.AdapterType = a.AdapterType\n\n\tswitch a.Type {\n\tcase \"model\":\n\t\tt := 
gf.Tokenizer()\n\t\tgf.estimateLLaMACppRunInModel(&o, &a, &t, &e)\n\tcase \"projector\":\n\t\t// For projector model,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008.\n\t\tif ptr.Deref(o.LMCOffloadLayers, math.MaxUint64) != 0 {\n\t\t\t// Full offload.\n\t\t\to.LMCOffloadLayers = ptr.To[uint64](math.MaxUint64)\n\t\t} else {\n\t\t\t// Zero offload.\n\t\t\to.LMCOffloadLayers = ptr.To[uint64](0)\n\t\t}\n\t\tgf.estimateLLaMACppRunInProjector(&o, &a, &e)\n\tcase \"adapter\":\n\t\tgf.estimateLLaMACppRunInAdapter(&o, &a, &e)\n\tcase \"imatrix\":\n\t\tgf.estimateLLaMACppRunInIMatrix(&o, &a, &e)\n\t}\n\n\treturn e\n}\n\n// estimateLLaMACppRunInModel estimates the usages of the GGUF file for model,\n// including the usages of footprint, weight, KV cache, and computation.\nfunc (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, t *GGUFTokenizer, e *LLaMACppRunEstimate) {\n\tls := gf.Layers()\n\tioLs, tfLs, _ := ls.Cut([]string{\n\t\t\"position_*\",\n\t\t\"token_*\",\n\t\t\"cls.*\",\n\t\t\"output.*\",\n\t\t\"output_*\",\n\t\t\"rope_factors_*\",\n\t})\n\tipLs, opLs, _ := ioLs.Cut([]string{\n\t\t\"position_*\",\n\t\t\"token_*\",\n\t})\n\n\tif a.BlockCount == 0 {\n\t\ta.BlockCount = uint64(len(tfLs))\n\t}\n\n\t// Using sliding window attention.\n\tusingSWA := a.AttentionSlidingWindowPattern != 1 && !o.LMCFullSizeSWACache\n\n\t// Full offload: nLoadLayers == 0 && isOffloadOutputLayer\n\t// Zero offload: nOffloadLayers == 0\n\t// Partial offload: !Full offload && !Zero offload\n\tvar (\n\t\tnOffloadLayers       uint64\n\t\tnActualOffloadLayers uint64\n\t\tnLoadLayers          = a.BlockCount\n\t\tidxOutputDevice      int\n\n\t\tfullOffload, zeroOffload          bool\n\t\tnSWALoadLayers, nSWAOffloadLayers uint64\n\t)\n\t{\n\t\tvar isOffloadOutputLayer bool\n\n\t\tswitch v := o.LMCOffloadLayers; {\n\t\tcase v == nil:\n\t\t\to.LMCOffloadLayers = 
ptr.To(a.BlockCount)\n\t\t\tnOffloadLayers = a.BlockCount\n\t\t\tisOffloadOutputLayer = true\n\t\tcase *v != 0:\n\t\t\tnOffloadLayers = *v\n\t\t\tif nOffloadLayers > a.BlockCount {\n\t\t\t\tisOffloadOutputLayer = true\n\t\t\t\tnOffloadLayers = a.BlockCount\n\t\t\t}\n\t\t}\n\t\tnActualOffloadLayers = nOffloadLayers\n\t\tif isOffloadOutputLayer {\n\t\t\tnActualOffloadLayers += 1\n\t\t}\n\t\tnLoadLayers -= nOffloadLayers\n\n\t\tfullOffload = nLoadLayers == 0 && isOffloadOutputLayer\n\t\tzeroOffload = nOffloadLayers == 0\n\n\t\te.FullOffloaded = fullOffload\n\t\te.OffloadLayers = nOffloadLayers\n\n\t\tfor i, j, offloadStart := uint64(0), 0, a.BlockCount-nOffloadLayers; i < a.BlockCount; i++ {\n\t\t\tswitch {\n\t\t\tcase i < nLoadLayers:\n\t\t\t\te.Devices[0].HandleLayers += 1\n\t\t\t\te.Devices[0].HandleLastLayer = int(i)\n\t\t\t\tif usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) {\n\t\t\t\t\te.Devices[0].HandleSWALayers += 1\n\t\t\t\t\tnSWALoadLayers += 1\n\t\t\t\t}\n\t\t\tcase i >= offloadStart:\n\t\t\t\tx := float64(i-offloadStart) / float64(nActualOffloadLayers)\n\t\t\t\tj = slicex.UpperBound(o.TensorSplitFraction, x)\n\t\t\t\te.Devices[j+1].HandleLayers += 1\n\t\t\t\te.Devices[j+1].HandleLastLayer = int(i)\n\t\t\t\tif usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) {\n\t\t\t\t\te.Devices[j+1].HandleSWALayers += 1\n\t\t\t\t\tnSWAOffloadLayers += 1\n\t\t\t\t}\n\t\t\t\tif fullOffload && i == a.BlockCount-1 {\n\t\t\t\t\tidxOutputDevice = j + 1\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\te.Devices[idxOutputDevice].HandleOutputLayer = true\n\t}\n\n\t// Flash attention.\n\t{\n\t\t// Grok is not compatible with flash attention,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9566-L9569.\n\t\tif a.Architecture == \"grok\" {\n\t\t\to.FlashAttention = false\n\t\t}\n\t\t// Fallback to FP16 if the value type is 
quantized when disabling flash attention,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9576-L9579.\n\t\tif o.LMCCacheValueType.IsQuantized() && !o.FlashAttention {\n\t\t\to.LMCCacheValueType = ptr.To(GGMLTypeF16)\n\t\t}\n\n\t\te.FlashAttention = o.FlashAttention\n\t}\n\n\t// Embedding.\n\tif !a.AttentionCausal {\n\t\tropeFrequencyBase := ptr.Deref(o.LMCRoPEFrequencyBase, a.RoPEFrequencyBase)\n\t\tropeFrequencyScale := ptr.Deref(o.LMCRoPEFrequencyScale, a.RoPEFrequencyScale)\n\t\tropeScalingType := ptr.Deref(o.LMCRoPEScalingType, a.RoPEScalingType)\n\t\tropeScalingOriginalContextSize := ptr.Deref(o.LMCRoPEScalingOriginalContextSize, int32(a.RoPEScalingOriginalContextLength))\n\t\tisRoPECustomized := ropeFrequencyBase != a.RoPEFrequencyBase ||\n\t\t\tropeFrequencyScale != a.RoPEFrequencyScale ||\n\t\t\tropeScalingType != a.RoPEScalingType ||\n\t\t\t(ropeScalingType == \"yarn\" && ropeScalingOriginalContextSize != int32(a.RoPEScalingOriginalContextLength))\n\n\t\te.EmbeddingOnly = true\n\t\to.LMCContextSize = ptr.To(ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength)))\n\t\t// Set context size/physical batch size/logical batch size to the training context size.\n\t\tif !isRoPECustomized {\n\t\t\to.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), *o.LMCContextSize))\n\t\t}\n\t\to.LMCLogicalBatchSize = o.LMCContextSize\n\t\to.LMCPhysicalBatchSize = o.LMCLogicalBatchSize\n\t\t// Reranking.\n\t\tif _, found := gf.TensorInfos.Index([]string{\"cls.bias\", \"cls.weight\"}); found > 0 {\n\t\t\te.Reranking = true\n\t\t}\n\t\tif !e.Reranking && a.PoolingType == 4 { // 0: None, 1: Mean, 2: Cls, 3: Last, 4: Rank\n\t\t\te.Reranking = true\n\t\t}\n\t}\n\n\t// Distributable,\n\t// fix by https://github.com/ggerganov/llama.cpp/pull/11047.\n\te.Distributable = true\n\n\t// Batch size.\n\te.LogicalBatchSize = *o.LMCLogicalBatchSize\n\te.PhysicalBatchSize = *o.LMCPhysicalBatchSize\n\n\t// Padding 
alignment.\n\tpaddingAlign := uint64(32)\n\tif o.FlashAttention {\n\t\tpaddingAlign = 256\n\t}\n\n\t// Init hyperparameters,\n\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6957-L7000.\n\tvar (\n\t\tnContext uint64\n\t\tnTokens  uint64\n\t\tnBatch   uint64\n\t\tnOutputs uint64\n\t\tnSeq     uint64\n\t\tnKV      uint64\n\t)\n\t{\n\t\tnContext = a.MaximumContextLength\n\t\tif o.LMCContextSize != nil {\n\t\t\tnContext = uint64(*o.LMCContextSize)\n\t\t}\n\t\tif o.LMCInMaxContextSize {\n\t\t\tnContext = min(nContext, a.MaximumContextLength)\n\t\t}\n\t\t// Padding context size,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002.\n\t\tnContext = GGMLPadding(nContext, paddingAlign)\n\n\t\t// Correct token size,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224.\n\t\tnTokens = min(nContext, uint64(*o.LMCPhysicalBatchSize))\n\t\tnBatch = nTokens\n\t\tnOutputs = nTokens\n\t\tnSeq = uint64(ptr.Deref(o.ParallelSize, 1))\n\t\tnKV = nContext\n\n\t\te.ContextSize = nContext\n\t}\n\n\t// Footprint.\n\t{\n\t\t// Bootstrap.\n\t\te.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */\n\n\t\t// Tokens,\n\t\t// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6380-L6384.\n\t\tfp := t.TokensLength * (4 /* token type */ + 4 /* token score*/)\n\t\tif t.Model == \"gpt2\" {\n\t\t\tfp += t.MergesLength * (48 /* key type */ + 56 /* value type */)\n\t\t}\n\t\tfp += t.TokensLength * (32 /* id to token vector */ + (24 + 32) /* token to id map*/)\n\t\te.Devices[0].Footprint += GGUFBytesScalar(fp)\n\n\t\t// Output buffer,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003.\n\t\tob := a.EmbeddingLength * nOutputs * 4 /* float32 
size */\n\t\tif a.AttentionCausal {\n\t\t\tob += a.VocabularyLength * nOutputs * 4 /* float32 size */\n\t\t}\n\t\tif fullOffload {\n\t\t\te.Devices[idxOutputDevice].Footprint += GGUFBytesScalar(ob)\n\t\t} else {\n\t\t\te.Devices[0].Footprint += GGUFBytesScalar(ob)\n\t\t}\n\t}\n\n\t// Weight & Parameter.\n\t{\n\t\tfilter := func(idx int) GGUFTensorInfoFilter {\n\t\t\tif len(o.OverriddenTensors) == 0 {\n\t\t\t\treturn nil\n\t\t\t}\n\t\t\treturn func(name string) bool {\n\t\t\t\tfor _, ot := range o.OverriddenTensors {\n\t\t\t\t\tbt, bi := ot.ParseBufferType()\n\t\t\t\t\tswitch {\n\t\t\t\t\tcase bt == GGUFRunOverriddenTensorBufferTypeUnknown:\n\t\t\t\t\t\tcontinue\n\t\t\t\t\tcase bt == GGUFRunOverriddenTensorBufferTypeCPU && idx == 0:\n\t\t\t\t\t\tcontinue\n\t\t\t\t\tcase bt == GGUFRunOverriddenTensorBufferTypeGPU &&\n\t\t\t\t\t\t(e.Devices[idx].Remote || anyx.Number[int](bi)+1 != idx):\n\t\t\t\t\t\tcontinue\n\t\t\t\t\tcase bt == GGUFRunOverriddenTensorBufferTypeRPC &&\n\t\t\t\t\t\t(!e.Devices[idx].Remote || e.Devices[idx].Endpoint != bi):\n\t\t\t\t\t\tcontinue\n\t\t\t\t\t}\n\t\t\t\t\tif ot.PatternRegex.MatchString(name) {\n\t\t\t\t\t\treturn false\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\treturn true\n\t\t\t}\n\t\t}\n\n\t\t// If overridden tensors are provided,\n\t\t// we need to search the tensors of the overridden pattern,\n\t\t// and place them in the correct device.\n\t\tif len(o.OverriddenTensors) != 0 {\n\t\t\tfor _, ot := range o.OverriddenTensors {\n\t\t\t\tbt, bi := ot.ParseBufferType()\n\t\t\t\tif bt == GGUFRunOverriddenTensorBufferTypeUnknown {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t\tvar sls GGUFTensorInfos = ls.Search(ot.PatternRegex)\n\t\t\t\tif len(sls) == 0 {\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t\tswitch bt {\n\t\t\t\tcase GGUFRunOverriddenTensorBufferTypeCPU:\n\t\t\t\t\te.Devices[0].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes())\n\t\t\t\t\te.Devices[0].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements())\n\t\t\t\tcase 
GGUFRunOverriddenTensorBufferTypeGPU:\n\t\t\t\t\tidx := anyx.Number[int](bi) + 1\n\t\t\t\t\te.Devices[idx].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes())\n\t\t\t\t\te.Devices[idx].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements())\n\t\t\t\tdefault:\n\t\t\t\t\tfor i, d := range e.Devices[1:] {\n\t\t\t\t\t\tif d.Endpoint == bi {\n\t\t\t\t\t\t\te.Devices[i+1].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes())\n\t\t\t\t\t\t\te.Devices[i+1].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements())\n\t\t\t\t\t\t\tbreak\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\t// Compute.\n\t\tfor i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ {\n\t\t\tidx := 0\n\t\t\tif i >= offloadStart {\n\t\t\t\tx := float64(i-offloadStart) / float64(nActualOffloadLayers)\n\t\t\t\tj = slicex.UpperBound(o.TensorSplitFraction, x)\n\t\t\t\tidx = j + 1\n\t\t\t}\n\t\t\tf := filter(idx)\n\t\t\te.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes(f))\n\t\t\te.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements(f))\n\t\t}\n\n\t\t// IO,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002.\n\t\te.Devices[0].Weight.Input = GGUFBytesScalar(ipLs.Bytes())\n\t\te.Devices[0].Parameter.Input = GGUFParametersScalar(ipLs.Elements())\n\t\tvar (\n\t\t\twg GGUFBytesScalar\n\t\t\tps GGUFParametersScalar\n\t\t)\n\t\tif _, ok := opLs.Get(\"output.weight\"); ok {\n\t\t\twg = GGUFBytesScalar(opLs.Bytes())\n\t\t\tps = GGUFParametersScalar(opLs.Elements())\n\t\t} else {\n\t\t\twg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */\n\t\t\tps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements())\n\t\t}\n\t\te.Devices[0].Weight.Output = wg\n\t\tif fullOffload {\n\t\t\te.Devices[idxOutputDevice].Weight.Output = wg\n\t\t\te.Devices[idxOutputDevice].Parameter.Output = ps\n\t\t} else 
{\n\t\t\te.Devices[0].Parameter.Output = ps\n\t\t}\n\t}\n\n\t// KV cache.\n\tif a.AttentionCausal {\n\t\tswitch {\n\t\t// Recurrent,\n\t\t// see https://github.com/ggml-org/llama.cpp/blob/704bb7a71c01dc07c1478b85f6322bf5dfde1eaf/src/llama-hparams.cpp#L68-L88.\n\t\tcase a.AttentionRecurrent:\n\t\t\tvar r, s uint64\n\t\t\tif a.RWKVHeadSize > 0 {\n\t\t\t\tr = uint64(a.RWKVTokenShiftCount) * a.EmbeddingLength\n\t\t\t\ts = uint64(a.RWKVHeadSize) * a.EmbeddingLength\n\t\t\t} else {\n\t\t\t\tr = uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize))\n\t\t\t\ts = uint64(a.SSMStateSize * a.SSMInnerSize)\n\t\t\t}\n\n\t\t\trps, sps := r*nSeq, s*nSeq\n\t\t\trrs, srs := GGMLTypeF32.RowSizeOf([]uint64{rps}), GGMLTypeF32.RowSizeOf([]uint64{sps})\n\n\t\t\te.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nLoadLayers)\n\t\t\te.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nLoadLayers)\n\t\t\te.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nLoadLayers)\n\t\t\tif !*o.LMCOffloadKVCache {\n\t\t\t\te.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nOffloadLayers)\n\t\t\t\te.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nOffloadLayers)\n\t\t\t\te.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nOffloadLayers)\n\t\t\t} else if !zeroOffload {\n\t\t\t\tfor i, d := range e.Devices[1:] {\n\t\t\t\t\te.Devices[i+1].KVCache.Key += GGUFBytesScalar(rrs * d.HandleLayers)\n\t\t\t\t\te.Devices[i+1].KVCache.Value += GGUFBytesScalar(srs * d.HandleLayers)\n\t\t\t\t\te.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * d.HandleLayers)\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tif !a.AttentionHybrid {\n\t\t\t\tbreak\n\t\t\t}\n\n\t\t\tfallthrough\n\t\t// Causal,\n\t\t// see https://github.com/ggml-org/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501.\n\t\tdefault:\n\t\t\takl, avl := uint64(a.AttentionKeyLength), uint64(a.AttentionValueLength)\n\t\t\tif a.AttentionKeyLengthMLA > 0 && 
a.AttentionValueLengthMLA > 0 {\n\t\t\t\takl, avl = uint64(a.AttentionKeyLengthMLA), uint64(a.AttentionValueLengthMLA)\n\t\t\t}\n\t\t\tkGQA := akl * a.AttentionHeadCountKV\n\t\t\tvGQA := avl * a.AttentionHeadCountKV\n\t\t\tkps, vps := kGQA*nKV, vGQA*nKV\n\t\t\tkrs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps})\n\n\t\t\tif !usingSWA {\n\t\t\t\te.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nLoadLayers)\n\t\t\t\te.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nLoadLayers)\n\t\t\t\te.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nLoadLayers)\n\t\t\t\tif !*o.LMCOffloadKVCache {\n\t\t\t\t\te.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers)\n\t\t\t\t\te.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers)\n\t\t\t\t\te.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers)\n\t\t\t\t} else if !zeroOffload {\n\t\t\t\t\tfor i, d := range e.Devices[1:] {\n\t\t\t\t\t\te.Devices[i+1].KVCache.Key += GGUFBytesScalar(krs * d.HandleLayers)\n\t\t\t\t\t\te.Devices[i+1].KVCache.Value += GGUFBytesScalar(vrs * d.HandleLayers)\n\t\t\t\t\t\te.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((kps + vps) * d.HandleLayers)\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\t// Sliding window attention size,\n\t\t\t\t// see https://github.com/ggml-org/llama.cpp/blob/3079e9ac8e04ef6eddeb0c164d72edb6b6fd2df5/src/llama-kv-cache.cpp#L1640-L1642.\n\t\t\t\tswas := min(nKV, GGMLPadding(a.AttentionSlidingWindow*nSeq+uint64(*o.LMCLogicalBatchSize), paddingAlign))\n\t\t\t\tswaKps, swaVps := kGQA*swas, vGQA*swas\n\t\t\t\tswaKrs, swaVrs := o.LMCCacheKeyType.RowSizeOf([]uint64{swaKps}), o.LMCCacheValueType.RowSizeOf([]uint64{swaVps})\n\n\t\t\t\tnNonSWALoadLayers, nNonSWAOffloadLayers := nLoadLayers-nSWALoadLayers, nOffloadLayers-nSWAOffloadLayers\n\n\t\t\t\te.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWALoadLayers + 
krs*nNonSWALoadLayers)\n\t\t\t\te.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWALoadLayers + vrs*nNonSWALoadLayers)\n\t\t\t\te.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWALoadLayers + (kps+vps)*nNonSWALoadLayers)\n\t\t\t\tif !*o.LMCOffloadKVCache {\n\t\t\t\t\te.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWAOffloadLayers + krs*nNonSWAOffloadLayers)\n\t\t\t\t\te.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWAOffloadLayers + vrs*nNonSWAOffloadLayers)\n\t\t\t\t\te.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWAOffloadLayers + (kps+vps)*nNonSWAOffloadLayers)\n\t\t\t\t} else if !zeroOffload {\n\t\t\t\t\tfor i, d := range e.Devices[1:] {\n\t\t\t\t\t\te.Devices[i+1].KVCache.Key += GGUFBytesScalar(swaKrs*d.HandleSWALayers + krs*(d.HandleLayers-d.HandleSWALayers))\n\t\t\t\t\t\te.Devices[i+1].KVCache.Value += GGUFBytesScalar(swaVrs*d.HandleSWALayers + vrs*(d.HandleLayers-d.HandleSWALayers))\n\t\t\t\t\t\te.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*d.HandleSWALayers + (kps+vps)*(d.HandleLayers-d.HandleSWALayers))\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t}\n\n\t// Computation.\n\t{\n\t\t// See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243.\n\t\tmaxNodes := max(1024, uint64(8*len(gf.TensorInfos)))\n\n\t\t// Bootstrap, compute metadata.\n\t\tcm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false)\n\t\te.Devices[0].Computation.Footprint = GGUFBytesScalar(cm)\n\n\t\t// Scheduler overhead,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149.\n\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024)\n\n\t\t// GGML context,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036.\n\t\tgc := 2 /* buffer count */ * GGMLTensorOverhead() * 
(uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3)\n\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(gc)\n\n\t\t// Tensor usage,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149.\n\t\t//\n\t\t// First, get the usage of input layer,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2279-L2290.\n\t\tvar (\n\t\t\tinpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch})                    // I32 [n_batch]\n\t\t\tinpEmbd   = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch]\n\t\t\tinpPos    = GGMLTypeI32.RowSizeOf([]uint64{nBatch})                    // I32 [n_batch]\n\t\t\tinpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nOutputs})                  // I32 [n_outputs],\n\t\t\tinpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nKV, nBatch})               // F32 [n_kv, n_batch]\n\t\t\tinpSMask  = GGMLTypeF32.RowSizeOf([]uint64{1, nSeq})                   // F32 [1, n_seq]\n\t\t\tinpSSeq   = GGMLTypeI32.RowSizeOf([]uint64{nSeq, nBatch})              // I32 [n_seq, n_batch]\n\t\t)\n\t\tif a.AttentionRecurrent {\n\t\t\te.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + 2*inpSMask + inpSSeq + inpOutIds)\n\t\t} else {\n\t\t\te.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds)\n\t\t}\n\t\t{\n\t\t\tvar v GGUFBytesScalar\n\t\t\tif a.AttentionRecurrent {\n\t\t\t\tv = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq)\n\t\t\t} else {\n\t\t\t\tv = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask)\n\t\t\t}\n\t\t\tif len(o.RPCServers) == 0 && len(o.TensorSplitFraction) > 1 {\n\t\t\t\tif a.ExpertCount > 0 {\n\t\t\t\t\tv *= 2\n\t\t\t\t} else {\n\t\t\t\t\tv *= 4\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor i := range e.Devices[1:] {\n\t\t\t\te.Devices[i+1].Computation.Input += v\n\t\t\t}\n\t\t}\n\t\t// Since the steps between transformer layers are serial,\n\t\t// the allocated memory can be 
reused for the next layer.\n\t\t// So, we only consider the usage of the largest layer,\n\t\t// which is the last layer by default.\n\t\tif a.AttentionRecurrent && !a.AttentionHybrid {\n\t\t\tif a.RWKVHeadSize > 0 {\n\t\t\t\tattnInc := uint64(0)\n\t\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.(attn_norm|attn_norm_2)\\.weight`)) {\n\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch})\n\t\t\t\t\tattnInc += rs\n\t\t\t\t}\n\t\t\t\tffnInc := uint64(0)\n\t\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.time_mix_(lerp_x|receptance|decay_w2|key|value|gate|w2|output)\\.weight`)) { // nolint: lll\n\t\t\t\t\tswitch {\n\t\t\t\t\tcase strings.HasSuffix(l.Name, \".time_mix_w2.weight\"):\n\t\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, 1, nTokens, l.Dimensions[l.NDimensions-1]})\n\t\t\t\t\t\tffnInc += rs\n\t\t\t\t\tcase strings.HasSuffix(l.Name, \".time_mix_output.weight\"):\n\t\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch + uint64(a.RWKVHeadSize)*nSeq})\n\t\t\t\t\t\tffnInc += rs\n\t\t\t\t\tdefault:\n\t\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch})\n\t\t\t\t\t\tffnInc += rs\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t\tcp := GGUFBytesScalar(attnInc + ffnInc)\n\t\t\t\tfor i := range e.Devices[1:] {\n\t\t\t\t\te.Devices[i+1].Computation.Compute = cp\n\t\t\t\t}\n\t\t\t} else {\n\t\t\t\tr := uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize))\n\t\t\t\tconvInc := GGMLTypeF32.RowSizeOf([]uint64{r, nSeq}) // F32 [n_embd_key_gqa, nSeq] reshape\n\t\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.(attn_norm|ssm_in|ssm_conv1d)\\.weight`)) {\n\t\t\t\t\tif !strings.HasSuffix(l.Name, \".ssm_conv1d.weight\") {\n\t\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\t\t\tconvInc += 
rs\n\t\t\t\t\t\tcontinue\n\t\t\t\t\t}\n\t\t\t\t\t// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379.\n\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nSeq})\n\t\t\t\t\tconvInc += rs\n\t\t\t\t}\n\t\t\t\tssmInc := uint64(0)\n\t\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.ssm_(dt\\.weight|a)`)) {\n\t\t\t\t\tif !strings.HasSuffix(l.Name, \".ssm_a\") {\n\t\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\t\t\tssmInc += rs\n\t\t\t\t\t\tcontinue\n\t\t\t\t\t}\n\t\t\t\t\t// https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413.\n\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nSeq})\n\t\t\t\t\tssmInc += rs\n\t\t\t\t}\n\t\t\t\tcp := GGUFBytesScalar(convInc + ssmInc)\n\t\t\t\tfor i := range e.Devices[1:] {\n\t\t\t\t\te.Devices[i+1].Computation.Compute = cp\n\t\t\t\t}\n\t\t\t}\n\t\t} else {\n\t\t\tloadAttnInc, offloadAttnInc := uint64(0), uint64(0)\n\t\t\t{\n\t\t\t\trs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV})\n\t\t\t\tloadAttnInc = rs // k-?\n\t\t\t\trs = o.LMCCacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV})\n\t\t\t\tloadAttnInc += rs // v-?\n\t\t\t}\n\t\t\tif o.FlashAttention {\n\t\t\t\t// https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387.\n\t\t\t\toffloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens})\n\t\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.attn_(norm|q|qkv|q_b)\\.weight`)) {\n\t\t\t\t\tif strings.HasSuffix(l.Name, \".attn_norm.weight\") {\n\t\t\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], 
nTokens})\n\t\t\t\t\t\toffloadAttnInc += rs\n\t\t\t\t\t\tcontinue\n\t\t\t\t\t}\n\t\t\t\t\trs := l.Bytes()\n\t\t\t\t\toffloadAttnInc += rs\n\t\t\t\t}\n\t\t\t\t// https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L6986-L6992.\n\t\t\t\trs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV})\n\t\t\t\toffloadAttnInc += rs\n\t\t\t\t// https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7000-L7007.\n\t\t\t\trs = o.LMCCacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV})\n\t\t\t\toffloadAttnInc += rs\n\t\t\t} else {\n\t\t\t\toffloadAttnInc = uint64(0)\n\t\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.attn_(norm|q|qkv|q_b)\\.weight`)) {\n\t\t\t\t\tvar rs uint64\n\t\t\t\t\tswitch {\n\t\t\t\t\tdefault: // norm.\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\t\t\toffloadAttnInc += rs\n\t\t\t\t\tcase strings.HasSuffix(l.Name, \".attn_q.weight\"):\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens})\n\t\t\t\t\t\toffloadAttnInc += rs * 2 // Qcur.\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount})\n\t\t\t\t\t\toffloadAttnInc += rs // kq.\n\t\t\t\t\t\tif !zeroOffload && !fullOffload {\n\t\t\t\t\t\t\toffloadAttnInc += loadAttnInc\n\t\t\t\t\t\t}\n\t\t\t\t\tcase strings.HasSuffix(l.Name, \".attn_qkv.weight\"):\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens})\n\t\t\t\t\t\toffloadAttnInc += rs * 2 // Qcur.\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount})\n\t\t\t\t\t\toffloadAttnInc += rs // kq.\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, a.EmbeddingLength * 3})\n\t\t\t\t\t\toffloadAttnInc += rs // wqkv.\n\t\t\t\t\t\tif !zeroOffload && !fullOffload {\n\t\t\t\t\t\t\toffloadAttnInc += 
loadAttnInc\n\t\t\t\t\t\t}\n\t\t\t\t\tcase strings.HasSuffix(l.Name, \".attn_q_b.weight\"):\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\t\t\toffloadAttnInc += rs * 2 // q-?\n\t\t\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount})\n\t\t\t\t\t\toffloadAttnInc += rs // kq.\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t\tffnInc := uint64(0)\n\t\t\tfor _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\\.\\d+\\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\\.weight`)) {\n\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\tffnInc += rs\n\t\t\t}\n\t\t\tif a.ExpertCount > 0 || a.ExpertUsedCount > 0 {\n\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.ExpertCount), a.EmbeddingLength})\n\t\t\t\tffnInc += rs // ffn_gate_input\n\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{uint64(a.ExpertCount), nTokens})\n\t\t\t\tffnInc += rs // ffn_moe_logits\n\t\t\t\trs = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, uint64(a.ExpertUsedCount), nTokens})\n\t\t\t\tffnInc += rs // ffn_moe_down\n\t\t\t}\n\t\t\tif !zeroOffload {\n\t\t\t\te.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc + ffnInc)\n\t\t\t} else {\n\t\t\t\te.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc)\n\t\t\t}\n\t\t\t{\n\t\t\t\tcp := GGUFBytesScalar(max(offloadAttnInc, ffnInc))\n\t\t\t\tfor i := range e.Devices[1:] {\n\t\t\t\t\te.Devices[i+1].Computation.Compute = cp\n\t\t\t\t}\n\t\t\t\tif nLoadLayers > 1 {\n\t\t\t\t\tfor i := range e.Devices[1:] {\n\t\t\t\t\t\tif e.Devices[i+1].Remote {\n\t\t\t\t\t\t\tcontinue\n\t\t\t\t\t\t}\n\t\t\t\t\t\te.Devices[i+1].Computation.Compute += GGUFBytesScalar(loadAttnInc)\n\t\t\t\t\t\tbreak\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\t\t// Finally, get the usage of output layer.\n\t\tif a.AttentionCausal {\n\t\t\tvar outInc uint64\n\t\t\tif a.AttentionRecurrent {\n\t\t\t\toutInc += inpSMask + inpSSeq\n\t\t\t}\n\t\t\tif l, ok := 
opLs.Get(\"output_norm.weight\"); ok {\n\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\toutInc += rs\n\t\t\t}\n\t\t\tif l, ok := opLs.Get(\"output.weight\"); ok {\n\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\toutInc += rs\n\t\t\t} else if l, ok := ipLs.Get(\"token_embd.weight\"); ok {\n\t\t\t\trs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens})\n\t\t\t\toutInc += rs\n\t\t\t}\n\t\t\te.Devices[idxOutputDevice].Computation.Output += GGUFBytesScalar(outInc)\n\t\t}\n\t}\n\n\t// Drafter.\n\te.Drafter = o.LMCDrafter\n\n\t// Projector.\n\te.Projector = o.LMCProjector\n\n\t// Adapters.\n\te.Adapters = o.LMCAdapters\n\n\t// Maximum tokens per second.\n\tif ds, dmss := e.Devices, o.DeviceMetrics; len(dmss) != 0 {\n\t\tltss := make([]float64, len(dmss))\n\t\tbs := anyx.Number[float64](*o.LMCLogicalBatchSize) / float64(nBatch)\n\t\tfor i, dm := range dmss {\n\t\t\tfl, upbw, dwbw := float64(max(dm.FLOPS, 1)), float64(max(dm.UpBandwidth, 1)), float64(max(dm.DownBandwidth, 1))\n\t\t\tcmpops := float64(ds[i].Parameter.Compute+ds[i].Parameter.ComputeOverridden)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) // nolint: lll\n\t\t\tcmps := float64(ds[i].Weight.Sum())\n\t\t\tcmplat := max(cmpops/fl, cmps/upbw)\n\t\t\tkvcops := float64(ds[i].Parameter.KVCache) * 2 /* FMA */ * bs\n\t\t\tkvcs := float64(ds[i].KVCache.Sum()) * bs\n\t\t\tkvclat := max(kvcops/fl, kvcs/upbw)\n\t\t\tffs := float64(GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}))\n\t\t\tffslat := ffs / dwbw\n\t\t\tlays := float64(ds[i].HandleLayers)\n\t\t\tif ds[i].HandleOutputLayer {\n\t\t\t\tlays += 1\n\t\t\t}\n\t\t\tltss[i] = (cmplat + kvclat + ffslat) * lays / float64(a.BlockCount+2)\n\t\t}\n\t\tlt := float64(0)\n\t\tltmax := slices.Max(ltss)\n\t\tfor i := range ltss {\n\t\t\tlt += ltss[i] / ltmax * ltss[i]\n\t\t}\n\t\te.MaximumTokensPerSecond = 
ptr.To(GGUFTokensPerSecondScalar(1 / lt))\n\t}\n}\n\n// estimateLLaMACppRunInProjector estimates the usages of the GGUF file for projector.\nfunc (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) {\n\tls := gf.Layers()\n\tioLs, tfLs, _ := ls.Cut([]string{\n\t\t\"mm.*\",\n\t\t// Vision specific IO layers.\n\t\t\"v.patch_embd.*\",\n\t\t\"v.class_embd\",\n\t\t\"v.position_embd.*\",\n\t\t\"v.pre_ln.*\",\n\t\t\"v.post_ln.*\",\n\t\t\"model.*\",\n\t\t\"resampler.*\",\n\t\t// Audio specific IO layers.\n\t\t\"a.position_embd.*\",\n\t\t\"a.conv1d.*\",\n\t\t\"a.post_ln.*\",\n\t})\n\tipLs, opLs, _ := ioLs.Cut([]string{\n\t\t// Vision specific Input layers.\n\t\t\"v.patch_embd.*\",\n\t\t\"v.class_embd\",\n\t\t\"v.position_embd.*\",\n\t\t\"v.pre_ln.*\",\n\t\t\"model.*\",\n\t\t// Audio specific Input layers.\n\t\t\"a.position_embd.*\",\n\t\t\"a.conv1d.*\",\n\t})\n\n\t// Block count.\n\tif a.ClipHasVisionEncoder && a.ClipVisionBlockCount == 0 {\n\t\tif len(tfLs) == 1 {\n\t\t\tif ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{\"v\"}, ntfLs.Name) {\n\t\t\t\ta.ClipVisionBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos))\n\t\t\t}\n\t\t}\n\t\tif a.ClipVisionBlockCount == 0 {\n\t\t\ta.ClipVisionBlockCount = uint64(len(tfLs))\n\t\t}\n\t}\n\tif a.ClipHasAudioEncoder && a.ClipAudioBlockCount == 0 {\n\t\tif len(tfLs) == 1 {\n\t\t\tif ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{\"a\"}, ntfLs.Name) {\n\t\t\t\ta.ClipAudioBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos))\n\t\t\t}\n\t\t}\n\t\tif a.ClipAudioBlockCount == 0 {\n\t\t\ta.ClipAudioBlockCount = uint64(len(tfLs))\n\t\t}\n\t}\n\n\t// Offload layers.\n\tif *o.LMCOffloadLayers == math.MaxUint64 {\n\t\te.FullOffloaded = true\n\t\te.OffloadLayers = a.ClipVisionBlockCount + a.ClipAudioBlockCount\n\t\to.LMCOffloadLayers = ptr.To(e.OffloadLayers)\n\t} else {\n\t\te.FullOffloaded = false\n\t\te.OffloadLayers = 
0\n\t}\n\n\t// Footprint.\n\t{\n\t\t// Bootstrap.\n\t\te.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */\n\t}\n\n\tidx := 0 // Default to the main host's RAM.\n\tif e.FullOffloaded {\n\t\tfor i := 1; i < len(e.Devices); i++ {\n\t\t\tif !e.Devices[i].Remote {\n\t\t\t\tidx = i\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n\n\t// Weight & Parameter.\n\t{\n\t\t// Compute.\n\t\te.Devices[idx].HandleLayers = *o.LMCOffloadLayers\n\t\te.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1)\n\t\te.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes())\n\t\te.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements())\n\n\t\t// IO.\n\t\te.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes())\n\t\te.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements())\n\t\te.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes())\n\t\te.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements())\n\t}\n\n\tif a.ClipHasVisionEncoder {\n\t\t// Init hyperparameters,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636.\n\t\tvar (\n\t\t\theightMaxSize uint64 // y\n\t\t\twidthMaxSize  uint64 // x\n\t\t\t// See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L3462.\n\t\t\tnPatches       uint64\n\t\t\tpatchesMaxSize uint64\n\t\t\t// See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L4016.\n\t\t\tprojectionDim uint64 // NB(thxCode): do not sure if there is the correct name.\n\t\t)\n\t\t// See https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L397-L411,\n\t\t//     https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2323-L2345,\n\t\t//     
https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2767-L2794.\n\t\theightMaxSize = uint64(a.ClipVisionImageSize)\n\t\twidthMaxSize = heightMaxSize\n\t\tif a.ClipHasQwen2VLMerger ||\n\t\t\ta.ClipProjectorType == \"qwen2vl_merger\" ||\n\t\t\ta.ClipProjectorType == \"qwen2.5vl_merger\" ||\n\t\t\ta.ClipProjectorType == \"qwen2.5o\" ||\n\t\t\ta.ClipProjectorType == \"pixtral\" {\n\t\t\t// See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L2217.\n\t\t\theightMaxSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 1024))\n\t\t\twidthMaxSize = heightMaxSize\n\t\t}\n\t\tnPatchSize := uint64(a.ClipVisionPatchSize)\n\t\tnPatchesHeight := heightMaxSize / nPatchSize\n\t\tnPatchesWidth := widthMaxSize / nPatchSize\n\t\tnPatches = nPatchesHeight * nPatchesWidth\n\t\tpatchesMaxSize = 1\n\t\tswitch {\n\t\tcase a.ClipHasLLaVAProjector ||\n\t\t\ta.ClipProjectorType == \"mlp\" ||\n\t\t\ta.ClipProjectorType == \"mlp_norm\" ||\n\t\t\ta.ClipProjectorType == \"ldp\" ||\n\t\t\ta.ClipProjectorType == \"ldpv2\":\n\t\t\t// LLaVA 1.6 uses up to 6 patches\n\t\t\tif a.ClipVisionMMPatchMergeType != \"flat\" {\n\t\t\t\tpatchesMaxSize = 6\n\t\t\t}\n\t\tcase a.ClipHasMiniCPMVProjector ||\n\t\t\ta.ClipProjectorType == \"resampler\":\n\t\t\t// MiniCPM-V uses up to 10 patches\n\t\t\tpatchesMaxSize = 10\n\t\tcase a.ClipProjectorType == \"adapter\":\n\t\t\t// Granite vision uses up to 10 patches + base patch\n\t\t\tpatchesMaxSize = 11\n\t\t}\n\n\t\tif o.LMCMaxProjectedCache != nil {\n\t\t\tpatchesMaxSize += uint64(*o.LMCMaxProjectedCache)\n\t\t}\n\n\t\tswitch a.ClipProjectorType {\n\t\tcase \"ldp\":\n\t\t\tnPatches /= 4\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.model.mb_block.1.block.2.1.bias\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"ldpv2\":\n\t\t\tnPatches /= 4\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.model.peg.0.bias\"); ok {\n\t\t\t\tprojectionDim = 
ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"mlp\":\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.2.bias\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"mlp_norm\":\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.3.bias\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"resampler\":\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"resampler.query\"); ok {\n\t\t\t\tnPatches = ti.Dimensions[1]\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"adapter\":\n\t\t\tnPatches /= 4\n\t\t\tnPatches += 2\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"adapter.linear.dense_4h_to_h.weight\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[1]\n\t\t\t}\n\t\tcase \"qwen2vl_merger\", \"qwen2.5vl_merger\", \"qwen2.5o\":\n\t\t\tnSizePatch := uint64(a.ClipVisionPatchSize * 2)\n\t\t\theightPatchSize := heightMaxSize / nSizePatch\n\t\t\tif heightMaxSize%nSizePatch > 0 {\n\t\t\t\theightPatchSize++\n\t\t\t}\n\t\t\twidthPatchSize := widthMaxSize / nSizePatch\n\t\t\tif widthMaxSize%nSizePatch > 0 {\n\t\t\t\twidthPatchSize++\n\t\t\t}\n\t\t\tnPatches = heightPatchSize * widthPatchSize\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.2.bias\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"gemma3\":\n\t\t\tnPerSide := uint64(a.ClipVisionImageSize) / uint64(a.ClipVisionPatchSize)\n\t\t\tnPerSide2DPool := nPerSide / uint64(a.ClipVisionProjectorScaleFactor)\n\t\t\tnPatches = nPerSide2DPool * nPerSide2DPool\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.input_projection.weight\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"idefics3\", \"llama4\":\n\t\t\tnPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor)\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.model.fc.weight\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[1]\n\t\t\t}\n\t\tcase \"pixtral\":\n\t\t\theightPatchSize := heightMaxSize / uint64(a.ClipVisionPatchSize)\n\t\t\tif a.ClipVisionSpatialMergeSize > 0 {\n\t\t\t\theightPatchSize /= 
uint64(a.ClipVisionSpatialMergeSize)\n\t\t\t}\n\t\t\twidthPatchSize := widthMaxSize / uint64(a.ClipVisionPatchSize)\n\t\t\tif a.ClipVisionSpatialMergeSize > 0 {\n\t\t\t\twidthPatchSize /= uint64(a.ClipVisionSpatialMergeSize)\n\t\t\t}\n\t\t\tnPatches = heightPatchSize*widthPatchSize + heightPatchSize - 1 /* [IMG_BREAK] per row */\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.2.bias\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[0]\n\t\t\t}\n\t\tcase \"internvl\":\n\t\t\tnPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor)\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"mm.model.mlp.3.weight\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[1]\n\t\t\t}\n\t\t}\n\n\t\t// Footprint\n\t\t{\n\t\t\t// Image Embed,\n\t\t\t// see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407.\n\t\t\te.Devices[0].Footprint += GGUFBytesScalar(patchesMaxSize * nPatches * projectionDim * 4 /* float32 size */)\n\t\t}\n\n\t\t// Computation.\n\t\t{\n\t\t\t// See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374.\n\t\t\tvar maxNodes uint64 = 8192\n\n\t\t\t// Bootstrap, compute metadata.\n\t\t\tcm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false)\n\t\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(cm)\n\n\t\t\t// Scheduler overhead,\n\t\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149.\n\t\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024)\n\n\t\t\t// GGML context,\n\t\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036.\n\t\t\tgc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipVisionBlockCount*3)\n\t\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(gc)\n\n\t\t\t// Tensor usage.\n\t\t\tvar 
(\n\t\t\t\thasClassEmbd bool\n\t\t\t\tnPositions   uint64\n\t\t\t\tnBatch       uint64\n\t\t\t\tnEmbd        uint64\n\t\t\t\tnHead        uint64\n\t\t\t)\n\t\t\t{\n\t\t\t\t_, hasClassEmbd = ipLs.Get(\"v.class_embd\")\n\t\t\t\tnPositions = nPatches\n\t\t\t\tif hasClassEmbd {\n\t\t\t\t\tnPositions += 1\n\t\t\t\t}\n\t\t\t\tif a.ClipHasQwen2VLMerger ||\n\t\t\t\t\ta.ClipProjectorType == \"qwen2vl_merger\" ||\n\t\t\t\t\ta.ClipProjectorType == \"qwen2.5vl_merger\" ||\n\t\t\t\t\ta.ClipProjectorType == \"qwen2.5o\" {\n\t\t\t\t\tnPositions *= 4\n\t\t\t\t}\n\t\t\t\tnBatch = 1\n\t\t\t\tnEmbd = a.ClipVisionEmbeddingLength\n\t\t\t\tnHead = a.ClipVisionAttentionHeadCount\n\t\t\t}\n\t\t\t// First, get the usage of input layer.\n\t\t\t{\n\t\t\t\tvar (\n\t\t\t\t\tinpRaw     = GGMLTypeF32.RowSizeOf([]uint64{widthMaxSize, heightMaxSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch]\n\t\t\t\t\tinpRawCnt  = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch})                // I32 [n_patches, n_embd, n_batch]\n\t\t\t\t\tinpEmbd    = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch})              // F32 [n_embd, n_positions, n_batch]\n\t\t\t\t\tinpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatches, nBatch})        // F32 [mmproj, n_patches, n_batch]\n\t\t\t\t\tinpPos     = GGMLTypeI32.RowSizeOf([]uint64{nPositions})                             // I32 [n_positions]\n\t\t\t\t\tinpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches})                               // I32 [n_patches]\n\t\t\t\t)\n\t\t\t\te.Devices[idx].Computation.Input += GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches)\n\t\t\t\tif a.ClipHasMiniCPMVProjector ||\n\t\t\t\t\ta.ClipProjectorType == \"resampler\" {\n\t\t\t\t\te.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd)\n\t\t\t\t}\n\t\t\t\tif hasClassEmbd {\n\t\t\t\t\te.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd)\n\t\t\t\t}\n\t\t\t\tif a.ClipVisionWindowAttentionPattern > 0 { // Qwen2.5 
VL\n\t\t\t\t\tinpWindowIndex := GGMLTypeI32.RowSizeOf([]uint64{nPatches})              // I32 [n_patches]\n\t\t\t\t\tinpWindowMask := GGMLTypeI32.RowSizeOf([]uint64{nPositions, nPositions}) // I32 [n_positions, n_positions]\n\t\t\t\t\te.Devices[idx].Computation.Input += GGUFBytesScalar(inpWindowIndex + inpWindowMask)\n\t\t\t\t}\n\t\t\t}\n\t\t\t// Since the steps between transformer layers are serial,\n\t\t\t// the allocated memory can be reused for the next layer.\n\t\t\t// So, we only consider the usage of a certain layer.\n\t\t\t{\n\t\t\t\tcompNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2\n\t\t\t\tcompVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions})\n\t\t\t\tcompKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions})\n\t\t\t\tcompKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead})\n\t\t\t\te.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur)\n\t\t\t}\n\t\t}\n\t}\n\n\tif a.ClipHasAudioEncoder {\n\t\t// See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/mtmd-audio.cpp#L311.\n\t\tvar projectionDim uint64 // NB(thxCode): do not sure if there is the correct name.\n\t\t{\n\t\t\tif ti, ok := gf.TensorInfos.Get(\"a.position_embd.weight\"); ok {\n\t\t\t\tprojectionDim = ti.Dimensions[1]\n\t\t\t}\n\t\t}\n\n\t\t// Computation.\n\t\t{\n\t\t\t// See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374.\n\t\t\tvar maxNodes uint64 = 8192\n\n\t\t\t// Bootstrap, compute metadata.\n\t\t\tcm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false)\n\t\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(cm)\n\n\t\t\t// Scheduler overhead,\n\t\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149.\n\t\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024)\n\n\t\t\t// GGML 
context,\n\t\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036.\n\t\t\tgc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipAudioBlockCount*3)\n\t\t\te.Devices[0].Computation.Footprint += GGUFBytesScalar(gc)\n\n\t\t\t// Tensor usage.\n\t\t\tvar (\n\t\t\t\tnPositions uint64\n\t\t\t\tnBatch     uint64\n\t\t\t\tnEmbd      uint64\n\t\t\t\tnHead      uint64\n\t\t\t)\n\t\t\t{\n\t\t\t\tnPositions = projectionDim\n\t\t\t\tnBatch = 1\n\t\t\t\tnEmbd = a.ClipAudioEmbeddingLength\n\t\t\t\tnHead = a.ClipAudioAttentionHeadCount\n\t\t\t}\n\t\t\t// First, get the usage of input layer.\n\t\t\t{\n\t\t\t\tinpEmbd := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embed, n_positions, n_batch]\n\t\t\t\te.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd)\n\t\t\t}\n\t\t\t// Since the steps between transformer layers are serial,\n\t\t\t// the allocated memory can be reused for the next layer.\n\t\t\t// So, we only consider the usage of a certain layer.\n\t\t\t{\n\t\t\t\tcompNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions})\n\t\t\t\tcompVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions})\n\t\t\t\tcompKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions})\n\t\t\t\tcompKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead})\n\t\t\t\te.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur)\n\t\t\t}\n\t\t}\n\t}\n}\n\n// estimateLLaMACppRunInAdapter estimates the usages of the GGUF file for adapter.\nfunc (gf *GGUFFile) estimateLLaMACppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) {\n\tls := gf.Layers()\n\tioLs, tfLs, _ := ls.Cut([]string{\n\t\t\"position_*\",\n\t\t\"token_*\",\n\t\t\"cls.*\",\n\t\t\"output.*\",\n\t\t\"output_*\",\n\t})\n\tipLs, opLs, _ := ioLs.Cut([]string{\n\t\t\"position_*\",\n\t\t\"token_*\",\n\t})\n\n\tif a.BlockCount == 0 
{\n\t\ta.BlockCount = uint64(len(tfLs))\n\t}\n\n\t// Full offload: nLoadLayers == 0 && isOffloadOutputLayer\n\t// Zero offload: nOffloadLayers == 0\n\t// Partial offload: !Full offload && !Zero offload\n\tvar (\n\t\tnOffloadLayers       uint64\n\t\tnActualOffloadLayers uint64\n\t\tnLoadLayers          = a.BlockCount\n\t\tidxOutputDevice      int\n\n\t\tfullOffload bool\n\t)\n\t{\n\t\tvar isOffloadOutputLayer bool\n\n\t\tswitch v := o.LMCOffloadLayers; {\n\t\tcase v == nil:\n\t\t\to.LMCOffloadLayers = ptr.To(a.BlockCount)\n\t\t\tnOffloadLayers = a.BlockCount\n\t\t\tisOffloadOutputLayer = true\n\t\tcase *v != 0:\n\t\t\tnOffloadLayers = *v\n\t\t\tif nOffloadLayers > a.BlockCount {\n\t\t\t\tisOffloadOutputLayer = true\n\t\t\t\tnOffloadLayers = a.BlockCount\n\t\t\t}\n\t\t}\n\t\tnActualOffloadLayers = nOffloadLayers\n\t\tif isOffloadOutputLayer {\n\t\t\tnActualOffloadLayers += 1\n\t\t}\n\t\tnLoadLayers -= nOffloadLayers\n\n\t\tfullOffload = nLoadLayers == 0 && isOffloadOutputLayer\n\n\t\te.FullOffloaded = fullOffload\n\t\te.OffloadLayers = nOffloadLayers\n\n\t\tfor i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ {\n\t\t\tswitch {\n\t\t\tcase i < int(nLoadLayers):\n\t\t\t\te.Devices[0].HandleLayers += 1\n\t\t\t\te.Devices[0].HandleLastLayer = i\n\t\t\tcase i >= offloadStart:\n\t\t\t\tx := float64(i-offloadStart) / float64(nActualOffloadLayers)\n\t\t\t\tj = slicex.UpperBound(o.TensorSplitFraction, x)\n\t\t\t\te.Devices[j+1].HandleLayers += 1\n\t\t\t\te.Devices[j+1].HandleLastLayer = i\n\t\t\t\tif fullOffload && i == len(tfLs)-1 {\n\t\t\t\t\tidxOutputDevice = j + 1\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\te.Devices[idxOutputDevice].HandleOutputLayer = true\n\t}\n\n\t// Distributable.\n\te.Distributable = false\n\n\t// Footprint.\n\t{\n\t\t// Bootstrap.\n\t\te.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */\n\t}\n\n\t// Weight & Parameter.\n\t{\n\t\t// Compute.\n\t\tfor i, j, offloadStart 
:= 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ {\n\t\t\tidx := 0\n\t\t\tif i >= offloadStart {\n\t\t\t\tx := float64(i-offloadStart) / float64(nActualOffloadLayers)\n\t\t\t\tj = slicex.UpperBound(o.TensorSplitFraction, x)\n\t\t\t\tidx = j + 1\n\t\t\t}\n\t\t\te.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes())\n\t\t\te.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements())\n\t\t}\n\n\t\t// IO,\n\t\t// see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002.\n\t\te.Devices[0].Weight.Input = GGUFBytesScalar(ipLs.Bytes())\n\t\te.Devices[0].Parameter.Input = GGUFParametersScalar(ipLs.Elements())\n\t\tvar (\n\t\t\twg GGUFBytesScalar\n\t\t\tps GGUFParametersScalar\n\t\t)\n\t\tif _, ok := opLs.Get(\"output.weight\"); ok {\n\t\t\twg = GGUFBytesScalar(opLs.Bytes())\n\t\t\tps = GGUFParametersScalar(opLs.Elements())\n\t\t} else {\n\t\t\twg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */\n\t\t\tps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements())\n\t\t}\n\t\te.Devices[0].Weight.Output = wg\n\t\tif fullOffload {\n\t\t\te.Devices[idxOutputDevice].Weight.Output = wg\n\t\t\te.Devices[idxOutputDevice].Parameter.Output = ps\n\t\t} else {\n\t\t\te.Devices[0].Parameter.Output = ps\n\t\t}\n\t}\n}\n\n// estimateLLaMACppRunInIMatrix estimates the usages of the GGUF file for imatrix.\nfunc (gf *GGUFFile) estimateLLaMACppRunInIMatrix(_ *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) {\n\tls := gf.Layers()\n\n\tif a.BlockCount == 0 {\n\t\ta.BlockCount = uint64(len(ls))\n\t}\n\n\t// Distributable.\n\te.Distributable = false\n\n\t// Footprint.\n\t{\n\t\t// Bootstrap.\n\t\te.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */\n\t}\n\n\t// Weight & Parameter.\n\t{\n\t\tvar (\n\t\t\twg GGUFBytesScalar\n\t\t\tps GGUFParametersScalar\n\t\t)\n\t\twg = 
GGUFBytesScalar(ls.Bytes())\n\t\tps = GGUFParametersScalar(ls.Elements())\n\t\te.Devices[0].Weight.Compute = wg\n\t\te.Devices[0].Parameter.Compute = ps\n\t}\n}\n\n// Types for LLaMACpp estimated summary.\ntype (\n\t// LLaMACppRunEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp.\n\tLLaMACppRunEstimateSummary struct {\n\t\t/* Basic */\n\n\t\t// Items\n\t\tItems []LLaMACppRunEstimateSummaryItem `json:\"items\"`\n\n\t\t/* Appendix */\n\n\t\t// Type describes what type this GGUF file is.\n\t\tType string `json:\"type\"`\n\t\t// Architecture describes what architecture this GGUF file implements.\n\t\t//\n\t\t// All lowercase ASCII.\n\t\tArchitecture string `json:\"architecture\"`\n\t\t// ClipProjectorType is the type of the projector used in the clip model.\n\t\t//\n\t\t// Only used when Architecture is \"clip\".\n\t\tClipProjectorType string `json:\"clipProjectorType,omitempty\"`\n\t\t// AdapterType is the type of the adapter.\n\t\t//\n\t\t// Only used when Architecture is \"adapter\".\n\t\tAdapterType string `json:\"adapterType,omitempty\"`\n\t\t// ContextSize is the size of the context.\n\t\tContextSize uint64 `json:\"contextSize\"`\n\t\t// FlashAttention is the flag to indicate whether enable the flash attention,\n\t\t// true for enable.\n\t\tFlashAttention bool `json:\"flashAttention\"`\n\t\t// NoMMap is the flag to indicate whether the file must be loaded without mmap,\n\t\t// true for total loaded.\n\t\tNoMMap bool `json:\"noMMap\"`\n\t\t// EmbeddingOnly is the flag to indicate whether the model is used for embedding only,\n\t\t// true for embedding only.\n\t\tEmbeddingOnly bool `json:\"embeddingOnly\"`\n\t\t// Reranking is the flag to indicate whether the model is used for reranking,\n\t\t// true for reranking.\n\t\t//\n\t\t// Only available when EmbeddingOnly is true.\n\t\tReranking bool `json:\"reranking\"`\n\t\t// Distributable is the flag to indicate whether the model is distributable,\n\t\t// true for 
distributable.\n\t\tDistributable bool `json:\"distributable\"`\n\t\t// LogicalBatchSize is the logical batch size.\n\t\tLogicalBatchSize int32 `json:\"logicalBatchSize\"`\n\t\t// PhysicalBatchSize is the physical batch size.\n\t\tPhysicalBatchSize int32 `json:\"physicalBatchSize\"`\n\t}\n\n\t// LLaMACppRunEstimateSummaryItem represents one summary item for loading the GGUF file in llama.cpp.\n\tLLaMACppRunEstimateSummaryItem struct {\n\t\t// OffloadLayers is the number of offloaded layers.\n\t\tOffloadLayers uint64 `json:\"offloadLayers\"`\n\t\t// FullOffloaded is the flag to indicate whether the layers are fully offloaded,\n\t\t// false for partial offloaded or zero offloaded.\n\t\tFullOffloaded bool `json:\"fullOffloaded\"`\n\t\t// MaximumTokensPerSecond is the maximum tokens per second for running the GGUF file.\n\t\tMaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:\"maximumTokensPerSecond,omitempty\"`\n\t\t// RAM is the memory usage for loading the GGUF file in RAM.\n\t\tRAM LLaMACppRunEstimateMemory `json:\"ram\"`\n\t\t// VRAMs is the memory usage for loading the GGUF file in VRAM per device.\n\t\tVRAMs []LLaMACppRunEstimateMemory `json:\"vrams\"`\n\t}\n\n\t// LLaMACppRunEstimateMemory represents the memory usage for loading the GGUF file in llama.cpp.\n\tLLaMACppRunEstimateMemory struct {\n\t\t// HandleLayers is the number of layers that the device can handle.\n\t\tHandleLayers uint64 `json:\"handleLayers\"`\n\t\t// HandleLastLayer is the index of the last layer the device can handle.\n\t\tHandleLastLayer int `json:\"handleLastLayer\"`\n\t\t// HandleOutputLayer is the flag to indicate whether the device can handle the output layer,\n\t\t// true for handle.\n\t\tHandleOutputLayer bool `json:\"handleOutputLayer\"`\n\t\t// Remote is the flag to indicate whether the device is remote,\n\t\t// true for remote.\n\t\tRemote bool `json:\"remote\"`\n\t\t// Position is the relative position of the device,\n\t\t// starts from 0.\n\t\t//\n\t\t// If Remote is true, 
Position is the position of the remote devices,\n\t\t// Otherwise, Position is the position of the device in the local devices.\n\t\tPosition int `json:\"position\"`\n\t\t// UMA represents the usage of Unified Memory Architecture.\n\t\tUMA GGUFBytesScalar `json:\"uma\"`\n\t\t// NonUMA represents the usage of Non-Unified Memory Architecture.\n\t\tNonUMA GGUFBytesScalar `json:\"nonuma\"`\n\t}\n)\n\n// SummarizeItem returns the corresponding LLaMACppRunEstimateSummaryItem with the given options.\nfunc (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (emi LLaMACppRunEstimateSummaryItem) {\n\temi.OffloadLayers, emi.FullOffloaded = e.OffloadLayers, e.FullOffloaded\n\tif emi.FullOffloaded {\n\t\temi.OffloadLayers++ // The output layer is offloaded.\n\t}\n\temi.MaximumTokensPerSecond = e.MaximumTokensPerSecond\n\n\t// RAM.\n\t{\n\t\tfp := e.Devices[0].Footprint\n\t\twg := e.Devices[0].Weight.Sum()\n\t\tkv := e.Devices[0].KVCache.Sum()\n\t\tcp := e.Devices[0].Computation.Sum()\n\n\t\temi.RAM.HandleLayers = e.Devices[0].HandleLayers\n\t\temi.RAM.HandleLastLayer = e.Devices[0].HandleLastLayer\n\t\temi.RAM.HandleOutputLayer = e.Devices[0].HandleOutputLayer\n\n\t\t// UMA.\n\t\temi.RAM.UMA = fp + wg + kv + cp\n\t\tif !e.NoMMap && (mmap || e.FullOffloaded) {\n\t\t\temi.RAM.UMA -= wg\n\t\t\tif !mmap {\n\t\t\t\temi.RAM.UMA += e.Devices[0].Weight.Output\n\t\t\t\temi.RAM.UMA += e.Devices[0].Weight.ComputeOverridden\n\t\t\t}\n\t\t}\n\n\t\t// NonUMA.\n\t\temi.RAM.NonUMA = GGUFBytesScalar(nonUMARamFootprint) + emi.RAM.UMA\n\t}\n\n\t// VRAMs.\n\temi.VRAMs = make([]LLaMACppRunEstimateMemory, len(e.Devices)-1)\n\t{\n\t\tfor i, d := range e.Devices[1:] {\n\t\t\tfp := d.Footprint\n\t\t\twg := d.Weight.Sum()\n\t\t\tkv := d.KVCache.Sum()\n\t\t\tcp := d.Computation.Sum()\n\n\t\t\temi.VRAMs[i].HandleLayers = d.HandleLayers\n\t\t\temi.VRAMs[i].HandleLastLayer = d.HandleLastLayer\n\t\t\temi.VRAMs[i].HandleOutputLayer = 
d.HandleOutputLayer\n\t\t\temi.VRAMs[i].Remote = d.Remote\n\t\t\temi.VRAMs[i].Position = d.Position\n\n\t\t\t// UMA.\n\t\t\temi.VRAMs[i].UMA = fp + wg + kv + /* cp */ 0\n\t\t\tif !e.NoMMap && mmap {\n\t\t\t\temi.VRAMs[i].UMA -= wg\n\t\t\t\tif d.Remote || d.Position > 0 && d.HandleLastLayer >= 0 || e.Type == \"projector\" {\n\t\t\t\t\temi.VRAMs[i].UMA += wg\n\t\t\t\t}\n\t\t\t}\n\n\t\t\t// NonUMA.\n\t\t\temi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + kv + cp\n\t\t\tif !d.Remote && d.Position > 0 && d.HandleLastLayer < 0 {\n\t\t\t\temi.VRAMs[i].NonUMA -= wg + cp\n\t\t\t}\n\t\t}\n\t}\n\n\t// Add drafter's usage.\n\tif e.Drafter != nil {\n\t\tdemi := e.Drafter.SummarizeItem(mmap, 0, 0)\n\t\temi.RAM.UMA += demi.RAM.UMA\n\t\temi.RAM.NonUMA += demi.RAM.NonUMA\n\t\tfor i, v := range demi.VRAMs {\n\t\t\temi.VRAMs[i].UMA += v.UMA\n\t\t\temi.VRAMs[i].NonUMA += v.NonUMA\n\t\t}\n\t}\n\n\t// Add projector's usage.\n\tif e.Projector != nil {\n\t\tpemi := e.Projector.SummarizeItem(mmap, 0, 0)\n\t\temi.RAM.UMA += pemi.RAM.UMA\n\t\temi.RAM.NonUMA += pemi.RAM.NonUMA\n\t\tfor i, v := range pemi.VRAMs {\n\t\t\temi.VRAMs[i].UMA += v.UMA\n\t\t\temi.VRAMs[i].NonUMA += v.NonUMA\n\t\t}\n\t}\n\n\t// Add adapters' usage.\n\tfor i := range e.Adapters {\n\t\taemi := e.Adapters[i].SummarizeItem(false, 0, 0)\n\t\temi.RAM.UMA += aemi.RAM.UMA\n\t\temi.RAM.NonUMA += aemi.RAM.NonUMA\n\t\tfor j, v := range aemi.VRAMs {\n\t\t\temi.VRAMs[j].UMA += v.UMA\n\t\t\temi.VRAMs[j].NonUMA += v.NonUMA\n\t\t}\n\t}\n\n\treturn emi\n}\n\n// Summarize returns the corresponding LLaMACppRunEstimateSummary with the given options.\nfunc (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (es LLaMACppRunEstimateSummary) {\n\t// Items.\n\tes.Items = []LLaMACppRunEstimateSummaryItem{\n\t\te.SummarizeItem(mmap, nonUMARamFootprint, nonUMAVramFootprint),\n\t}\n\n\t// Just copy from the original estimate.\n\tes.Type = e.Type\n\tes.Architecture = 
e.Architecture\n\tes.ClipProjectorType = e.ClipProjectorType\n\tes.AdapterType = e.AdapterType\n\tes.ContextSize = e.ContextSize\n\tes.FlashAttention = e.FlashAttention\n\tes.NoMMap = e.NoMMap\n\tes.EmbeddingOnly = e.EmbeddingOnly\n\tes.Reranking = e.Reranking\n\tes.LogicalBatchSize = e.LogicalBatchSize\n\tes.PhysicalBatchSize = e.PhysicalBatchSize\n\tes.Distributable = e.Distributable\n\n\treturn es\n}\n\nfunc (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar {\n\treturn u.Input + u.Compute + u.ComputeOverridden + u.Output\n}\n\nfunc (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar {\n\treturn u.Key + u.Value\n}\n\nfunc (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar {\n\treturn u.Footprint + u.Input + max(u.Compute, u.Output)\n}\n\n// ClipAligning returns the aligned value of x to the nearest multiple of n,\n// see https://github.com/ggml-org/llama.cpp/blob/cdf94a18023c92f41808ec874ba577d914674717/tools/mtmd/clip-impl.h#L114-L115.\nfunc ClipAligning(x, n uint64) uint64 {\n\treturn ((x + n - 1) / n) * n\n}\n"
  },
  {
    "path": "file_estimate__llamacpp_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_EstimateLLaMACppRun(t *testing.T) {\n\tctx := context.Background()\n\n\tcases := []struct {\n\t\tname  string\n\t\tgiven *GGUFFile\n\t}{\n\t\t{\n\t\t\tname: \"mixtral 7B\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\",\n\t\t\t\t\t\"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t\t{\n\t\t\tname: \"mixtral 8x7B\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF\",\n\t\t\t\t\t\"Nous-Hermes-2-Mixtral-8x7B-DPO.Q5_K_M.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t\t{\n\t\t\tname: \"wizardlm 8x22B\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"MaziyarPanahi/WizardLM-2-8x22B-GGUF\",\n\t\t\t\t\t\"WizardLM-2-8x22B.IQ1_M.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.name, func(t *testing.T) {\n\t\t\tf := tc.given\n\t\t\tt.Log(\"\\n\", spew.Sdump(f.EstimateLLaMACppRun()), \"\\n\")\n\t\t})\n\t}\n}\n\nfunc TestGGUFFile_EstimateLLaMACppRun_ContextSize(t *testing.T) {\n\tctx := context.Background()\n\n\tf, err := ParseGGUFFileFromHuggingFace(\n\t\tctx,\n\t\t\"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\",\n\t\t\"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf\",\n\t\tSkipLargeMetadata())\n\tif err != nil {\n\t\tt.Fatal(err)\n\t\treturn\n\t}\n\n\tcases := []struct {\n\t\tname string\n\t\topts 
[]GGUFRunEstimateOption\n\t}{\n\t\t{\"1024(fp16)\", []GGUFRunEstimateOption{WithLLaMACppContextSize(1024)}},\n\t\t{\"1024(fp32)\", []GGUFRunEstimateOption{WithLLaMACppContextSize(1024), WithLLaMACppCacheKeyType(GGMLTypeF32), WithLLaMACppCacheValueType(GGMLTypeF32)}},\n\t\t{\"4096(fp16)\", []GGUFRunEstimateOption{WithLLaMACppContextSize(4096)}},\n\t\t{\"4096(fp32)\", []GGUFRunEstimateOption{WithLLaMACppContextSize(4096), WithLLaMACppCacheKeyType(GGMLTypeF32), WithLLaMACppCacheValueType(GGMLTypeF32)}},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.name, func(t *testing.T) {\n\t\t\tt.Log(\"\\n\", spew.Sdump(f.EstimateLLaMACppRun(tc.opts...)), \"\\n\")\n\t\t})\n\t}\n}\n\nfunc TestGGUFFile_EstimateLLaMACppRun_OffloadLayers(t *testing.T) {\n\tctx := context.Background()\n\n\tf, err := ParseGGUFFileFromHuggingFace(\n\t\tctx,\n\t\t\"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\",\n\t\t\"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf\",\n\t\tSkipLargeMetadata())\n\tif err != nil {\n\t\tt.Fatal(err)\n\t\treturn\n\t}\n\n\tcases := []struct {\n\t\tname string\n\t\topts []GGUFRunEstimateOption\n\t}{\n\t\t{\"offload 0 layer\", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(0)}},\n\t\t{\"offload 1 layer\", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(1)}},\n\t\t{\"offload 10 layers\", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(10)}},\n\t\t{\"offload all layers\", []GGUFRunEstimateOption{}},\n\t\t{\"offload 33 layers\", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(33)}}, // exceeds the number of layers\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.name, func(t *testing.T) {\n\t\t\tt.Log(\"\\n\", spew.Sdump(f.EstimateLLaMACppRun(tc.opts...)), \"\\n\")\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "file_estimate__stablediffusioncpp.go",
    "content": "package gguf_parser\n\nimport (\n\t\"math\"\n\t\"strings\"\n\n\t\"golang.org/x/exp/maps\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/ptr\"\n\t\"github.com/gpustack/gguf-parser-go/util/stringx\"\n)\n\n// Types for StableDiffusionCpp estimation.\ntype (\n\t// StableDiffusionCppRunEstimate represents the estimated result of loading the GGUF file in stable-diffusion.cpp.\n\tStableDiffusionCppRunEstimate struct {\n\t\t// Type describes what type this GGUF file is.\n\t\tType string `json:\"type\"`\n\t\t// Architecture describes what architecture this GGUF file implements.\n\t\t//\n\t\t// All lowercase ASCII.\n\t\tArchitecture string `json:\"architecture\"`\n\t\t// FlashAttention is the flag to indicate whether enable the flash attention,\n\t\t// true for enable.\n\t\tFlashAttention bool `json:\"flashAttention\"`\n\t\t// FullOffloaded is the flag to indicate whether the layers are fully offloaded,\n\t\t// false for partial offloaded or zero offloaded.\n\t\tFullOffloaded bool `json:\"fullOffloaded\"`\n\t\t// NoMMap is the flag to indicate whether support the mmap,\n\t\t// true for support.\n\t\tNoMMap bool `json:\"noMMap\"`\n\t\t// ImageOnly is the flag to indicate whether the model is used for generating image,\n\t\t// true for generating image only.\n\t\tImageOnly bool `json:\"imageOnly\"`\n\t\t// Distributable is the flag to indicate whether the model is distributable,\n\t\t// true for distributable.\n\t\tDistributable bool `json:\"distributable\"`\n\t\t// Devices represents the usage for running the GGUF file,\n\t\t// the first device is the CPU, and the rest are GPUs.\n\t\tDevices []StableDiffusionCppRunDeviceUsage `json:\"devices\"`\n\t\t// Autoencoder is the estimated result of the autoencoder.\n\t\tAutoencoder *StableDiffusionCppRunEstimate `json:\"autoencoder,omitempty\"`\n\t\t// Conditioners is the estimated result of the conditioners.\n\t\tConditioners []StableDiffusionCppRunEstimate `json:\"conditioners,omitempty\"`\n\t\t// Upscaler is the 
estimated result of the upscaler.\n\t\tUpscaler *StableDiffusionCppRunEstimate `json:\"upscaler,omitempty\"`\n\t\t// ControlNet is the estimated result of the control net.\n\t\tControlNet *StableDiffusionCppRunEstimate `json:\"controlNet,omitempty\"`\n\t}\n\n\t// StableDiffusionCppRunDeviceUsage represents the usage for running the GGUF file in llama.cpp.\n\tStableDiffusionCppRunDeviceUsage struct {\n\t\t// Remote is the flag to indicate whether the device is remote,\n\t\t// true for remote.\n\t\tRemote bool `json:\"remote\"`\n\t\t// Position is the relative position of the device,\n\t\t// starts from 0.\n\t\t//\n\t\t// If Remote is true, Position is the position of the remote devices,\n\t\t// Otherwise, Position is the position of the device in the local devices.\n\t\tPosition int `json:\"position\"`\n\t\t// Footprint is the memory footprint for bootstrapping.\n\t\tFootprint GGUFBytesScalar `json:\"footprint\"`\n\t\t// Parameter is the running parameters that the device processes.\n\t\tParameter GGUFParametersScalar `json:\"parameter\"`\n\t\t// Weight is the memory usage of weights that the device loads.\n\t\tWeight GGUFBytesScalar `json:\"weight\"`\n\t\t// Computation is the memory usage of computation that the device processes.\n\t\tComputation GGUFBytesScalar `json:\"computation\"`\n\t}\n)\n\n// EstimateStableDiffusionCppRun estimates the usages of the GGUF file in stable-diffusion.cpp.\nfunc (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate) {\n\t// Options\n\tvar o _GGUFRunEstimateOptions\n\tfor _, opt := range opts {\n\t\topt(&o)\n\t}\n\tswitch {\n\tcase o.TensorSplitFraction == nil:\n\t\to.TensorSplitFraction = []float64{1}\n\t\to.MainGPUIndex = 0\n\tcase o.MainGPUIndex < 0 || o.MainGPUIndex >= len(o.TensorSplitFraction):\n\t\tpanic(\"main device index must be range of 0 to the length of tensor split fraction\")\n\t}\n\tif len(o.DeviceMetrics) > 0 {\n\t\tfor i, j := 0, len(o.DeviceMetrics)-1; i < 
len(o.TensorSplitFraction)-j; i++ {\n\t\t\to.DeviceMetrics = append(o.DeviceMetrics, o.DeviceMetrics[j])\n\t\t}\n\t\to.DeviceMetrics = o.DeviceMetrics[:len(o.TensorSplitFraction)+1]\n\t}\n\tif o.SDCOffloadLayers == nil {\n\t\to.SDCOffloadLayers = ptr.To[uint64](math.MaxUint64)\n\t}\n\tif o.SDCBatchCount == nil {\n\t\to.SDCBatchCount = ptr.To[int32](1)\n\t}\n\tif o.SDCHeight == nil {\n\t\to.SDCHeight = ptr.To[uint32](1024)\n\t}\n\tif o.SDCWidth == nil {\n\t\to.SDCWidth = ptr.To[uint32](1024)\n\t}\n\tif o.SDCOffloadConditioner == nil {\n\t\to.SDCOffloadConditioner = ptr.To(true)\n\t}\n\tif o.SDCOffloadAutoencoder == nil {\n\t\to.SDCOffloadAutoencoder = ptr.To(true)\n\t}\n\tif o.SDCAutoencoderTiling == nil {\n\t\to.SDCAutoencoderTiling = ptr.To(false)\n\t}\n\tif o.SDCFreeComputeMemoryImmediately == nil {\n\t\to.SDCFreeComputeMemoryImmediately = ptr.To(false)\n\t}\n\n\t// Devices.\n\tinitDevices := func(e *StableDiffusionCppRunEstimate) {\n\t\tfor j := range e.Devices[1:] {\n\t\t\te.Devices[j+1].Remote = j < len(o.RPCServers)\n\t\t\tif e.Devices[j+1].Remote {\n\t\t\t\te.Devices[j+1].Position = j\n\t\t\t} else {\n\t\t\t\te.Devices[j+1].Position = j - len(o.RPCServers)\n\t\t\t}\n\t\t}\n\t}\n\te.Devices = make([]StableDiffusionCppRunDeviceUsage, len(o.TensorSplitFraction)+1)\n\tinitDevices(&e)\n\n\t// Metadata.\n\ta := gf.Architecture()\n\te.Type = a.Type\n\te.Architecture = normalizeArchitecture(a.DiffusionArchitecture)\n\n\t// Flash attention.\n\tif o.FlashAttention && !strings.HasPrefix(a.DiffusionArchitecture, \"Stable Diffusion 3\") {\n\t\t// NB(thxCode): Stable Diffusion 3 doesn't support flash attention yet,\n\t\t// see https://github.com/leejet/stable-diffusion.cpp/pull/386.\n\t\te.FlashAttention = true\n\t}\n\n\t// Distributable.\n\te.Distributable = true\n\n\t// Offload.\n\te.FullOffloaded = *o.SDCOffloadLayers > 0\n\n\t// NoMMap.\n\te.NoMMap = true // TODO: Implement this.\n\n\t// ImageOnly.\n\te.ImageOnly = true // TODO: Implement this.\n\n\t// 
Autoencoder.\n\tif a.DiffusionAutoencoder != nil {\n\t\tae := &StableDiffusionCppRunEstimate{\n\t\t\tType:           \"model\",\n\t\t\tArchitecture:   e.Architecture + \"_vae\",\n\t\t\tFlashAttention: e.FlashAttention,\n\t\t\tDistributable:  e.Distributable,\n\t\t\tFullOffloaded:  e.FullOffloaded && *o.SDCOffloadAutoencoder,\n\t\t\tNoMMap:         e.NoMMap,\n\t\t\tDevices:        make([]StableDiffusionCppRunDeviceUsage, len(e.Devices)),\n\t\t}\n\t\tinitDevices(ae)\n\t\te.Autoencoder = ae\n\t}\n\n\t// Conditioners.\n\tif len(a.DiffusionConditioners) != 0 {\n\t\te.Conditioners = make([]StableDiffusionCppRunEstimate, 0, len(a.DiffusionConditioners))\n\t\tfor i := range a.DiffusionConditioners {\n\t\t\tcd := StableDiffusionCppRunEstimate{\n\t\t\t\tType:           \"model\",\n\t\t\t\tArchitecture:   normalizeArchitecture(a.DiffusionConditioners[i].Architecture),\n\t\t\t\tFlashAttention: e.FlashAttention,\n\t\t\t\tDistributable:  e.Distributable,\n\t\t\t\tFullOffloaded:  e.FullOffloaded && *o.SDCOffloadConditioner,\n\t\t\t\tNoMMap:         e.NoMMap,\n\t\t\t\tDevices:        make([]StableDiffusionCppRunDeviceUsage, len(e.Devices)),\n\t\t\t}\n\t\t\tinitDevices(&cd)\n\t\t\te.Conditioners = append(e.Conditioners, cd)\n\t\t}\n\t}\n\n\t// Footprint\n\t{\n\t\t// Bootstrap.\n\t\te.Devices[0].Footprint = GGUFBytesScalar(10*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */\n\t}\n\n\tvar cdLs, aeLs, dmLs GGUFLayerTensorInfos\n\t{\n\t\tls := gf.Layers()\n\t\tcdLs, aeLs, _ = ls.Cut([]string{\n\t\t\t\"cond_stage_model.*\",\n\t\t})\n\t\taeLs, dmLs, _ = aeLs.Cut([]string{\n\t\t\t\"first_stage_model.*\",\n\t\t})\n\t}\n\n\tvar cdDevIdx, aeDevIdx, dmDevIdx int\n\t{\n\t\tif *o.SDCOffloadConditioner && *o.SDCOffloadLayers > 0 {\n\t\t\tcdDevIdx = 1\n\t\t}\n\t\tif *o.SDCOffloadAutoencoder && *o.SDCOffloadLayers > 0 {\n\t\t\taeDevIdx = 1\n\t\t\tif len(e.Devices) > 3 {\n\t\t\t\taeDevIdx = 2\n\t\t\t}\n\t\t}\n\t\tif *o.SDCOffloadLayers > 0 {\n\t\t\tdmDevIdx = 1\n\t\t\tswitch 
{\n\t\t\tcase len(e.Devices) > 3:\n\t\t\t\tdmDevIdx = 3\n\t\t\tcase len(e.Devices) > 2:\n\t\t\t\tdmDevIdx = 2\n\t\t\t}\n\t\t}\n\t}\n\n\t// Weight & Parameter.\n\t{\n\t\t// Conditioners.\n\t\tfor i := range cdLs {\n\t\t\te.Conditioners[i].Devices[cdDevIdx].Weight = GGUFBytesScalar(cdLs[i].Bytes())\n\t\t\te.Conditioners[i].Devices[cdDevIdx].Parameter = GGUFParametersScalar(cdLs[i].Elements())\n\t\t}\n\n\t\t// Autoencoder.\n\t\tif len(aeLs) != 0 {\n\t\t\te.Autoencoder.Devices[aeDevIdx].Weight = GGUFBytesScalar(aeLs.Bytes())\n\t\t\te.Autoencoder.Devices[aeDevIdx].Parameter = GGUFParametersScalar(aeLs.Elements())\n\t\t}\n\n\t\t// Model.\n\t\te.Devices[dmDevIdx].Weight = GGUFBytesScalar(dmLs.Bytes())\n\t\te.Devices[dmDevIdx].Parameter = GGUFParametersScalar(dmLs.Elements())\n\t}\n\n\t// Computation.\n\t{\n\t\t// See https://github.com/leejet/stable-diffusion.cpp/blob/10c6501bd05a697e014f1bee3a84e5664290c489/ggml_extend.hpp#L1058C9-L1058C23.\n\t\tvar maxNodes uint64 = 32768\n\n\t\t// Bootstrap, compute metadata.\n\t\tcm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false)\n\t\te.Devices[0].Computation = GGUFBytesScalar(cm)\n\n\t\t// Work context,\n\t\t// see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1467-L1481,\n\t\t//     https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1572-L1586,\n\t\t//     https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1675-L1679.\n\t\t//\n\t\t{\n\t\t\tzChannels := uint64(4)\n\t\t\tif a.DiffusionTransformer {\n\t\t\t\tzChannels = 16\n\t\t\t}\n\t\t\t// See https://github.com/thxCode/stable-diffusion.cpp/blob/1ae97f8a8ca3615bdaf9c1fd32c13562e2471833/stable-diffusion.cpp#L2682-L2691.\n\t\t\tusage := uint64(128 * 1024 * 1024) /* 128MiB, LLaMA Box */\n\t\t\tusage += uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * 3 /* output 
channels */ * 4 /* sizeof(float) */ * zChannels\n\t\t\te.Devices[0].Computation += GGUFBytesScalar(usage * uint64(ptr.Deref(o.ParallelSize, 1)) /* max batch */)\n\t\t}\n\n\t\t// Encode usage,\n\t\t// see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L388-L391,\n\t\t//     https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L758-L766,\n\t\t//     https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L1083-L1085.\n\t\t{\n\t\t\tvar tes [][]uint64\n\t\t\tswitch {\n\t\t\tcase strings.HasPrefix(a.DiffusionArchitecture, \"FLUX\"): // FLUX.1\n\t\t\t\ttes = [][]uint64{\n\t\t\t\t\t{768, 77},\n\t\t\t\t\t{4096, 256},\n\t\t\t\t}\n\t\t\tcase strings.HasPrefix(a.DiffusionArchitecture, \"Stable Diffusion 3\"): // SD 3.x\n\t\t\t\ttes = [][]uint64{\n\t\t\t\t\t{768, 77},\n\t\t\t\t\t{1280, 77},\n\t\t\t\t\t{4096, 77},\n\t\t\t\t}\n\t\t\tcase strings.HasPrefix(a.DiffusionArchitecture, \"Stable Diffusion XL\"): // SD XL/XL Refiner\n\t\t\t\tif strings.HasSuffix(a.DiffusionArchitecture, \"Refiner\") {\n\t\t\t\t\ttes = [][]uint64{\n\t\t\t\t\t\t{1280, 77},\n\t\t\t\t\t}\n\t\t\t\t} else {\n\t\t\t\t\ttes = [][]uint64{\n\t\t\t\t\t\t{768, 77},\n\t\t\t\t\t\t{1280, 77},\n\t\t\t\t\t}\n\t\t\t\t}\n\t\t\tdefault: // SD 1.x/2.x\n\t\t\t\ttes = [][]uint64{\n\t\t\t\t\t{768, 77},\n\t\t\t\t}\n\t\t\t}\n\t\t\tfor i := range cdLs {\n\t\t\t\tusage := GGMLTypeF32.RowSizeOf(tes[i]) * 2 /* include conditioner */\n\t\t\t\te.Conditioners[i].Devices[cdDevIdx].Computation += GGUFBytesScalar(usage)\n\t\t\t}\n\n\t\t\t// TODO VAE Encode\n\t\t}\n\n\t\t// Diffusing usage.\n\t\tif !*o.SDCFreeComputeMemoryImmediately {\n\t\t\tvar usage uint64\n\t\t\tswitch {\n\t\t\tcase strings.HasPrefix(a.DiffusionArchitecture, \"FLUX\"): // FLUX.1\n\t\t\t\tusage = GuessFLUXDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\tcase 
strings.HasPrefix(a.DiffusionArchitecture, \"Stable Diffusion 3\"): // SD 3.x\n\t\t\t\tconst (\n\t\t\t\t\tsd3MediumKey  = \"model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight\" // SD 3 Medium\n\t\t\t\t\tsd35MediumKey = \"model.diffusion_model.joint_blocks.23.x_block.attn.ln_k.weight\" // SD 3.5 Medium\n\t\t\t\t\tsd35LargeKey  = \"model.diffusion_model.joint_blocks.37.x_block.attn.ln_k.weight\" // SD 3.5 Large\n\t\t\t\t)\n\t\t\t\tm, _ := dmLs.Index([]string{sd3MediumKey, sd35MediumKey, sd35LargeKey})\n\t\t\t\tswitch {\n\t\t\t\tcase m[sd35LargeKey].Name != \"\":\n\t\t\t\t\tusage = GuessSD35LargeDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\t\tcase m[sd35MediumKey].Name != \"\":\n\t\t\t\t\tusage = GuessSD35MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\t\tdefault:\n\t\t\t\t\tusage = GuessSD3MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\t\t}\n\t\t\tcase strings.HasPrefix(a.DiffusionArchitecture, \"Stable Diffusion XL\"): // SD XL/XL Refiner\n\t\t\t\tconst (\n\t\t\t\t\tsdXlKey        = \"model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight\" // SD XL\n\t\t\t\t\tsdXlRefinerKey = \"model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight\" // SD XL Refiner\n\t\t\t\t)\n\t\t\t\tm, _ := dmLs.Index([]string{sdXlKey, sdXlRefinerKey})\n\t\t\t\tif m[sdXlRefinerKey].Name != \"\" {\n\t\t\t\t\tusage = GuessSDXLRefinerDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\t\t} else {\n\t\t\t\t\tusage = GuessSDXLDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\t\t}\n\t\t\tcase strings.HasPrefix(a.DiffusionArchitecture, \"Stable Diffusion 2\"): // SD 2.x\n\t\t\t\tusage = GuessSD2DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention)\n\t\t\tdefault: // SD 1.x\n\t\t\t\tusage = GuessSD1DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, 
e.FlashAttention)\n\t\t\t}\n\t\t\te.Devices[dmDevIdx].Computation += GGUFBytesScalar(usage)\n\t\t}\n\n\t\t// Decode usage.\n\t\tif len(aeLs) != 0 && !*o.SDCFreeComputeMemoryImmediately {\n\t\t\t// Bootstrap.\n\t\t\te.Autoencoder.Devices[aeDevIdx].Footprint += GGUFBytesScalar(100 * 1024 * 1024) /*100 MiB.*/\n\n\t\t\tvar convDim uint64\n\t\t\t{\n\t\t\t\tm, _ := aeLs.Index([]string{\n\t\t\t\t\t\"first_stage_model.decoder.conv_in.weight\",\n\t\t\t\t\t\"decoder.conv_in.weight\",\n\t\t\t\t})\n\t\t\t\ttis := maps.Values(m)\n\t\t\t\tif len(tis) != 0 && tis[0].NDimensions > 3 {\n\t\t\t\t\tconvDim = max(tis[0].Dimensions[0], tis[0].Dimensions[3])\n\t\t\t\t}\n\t\t\t}\n\n\t\t\tvar usage uint64\n\t\t\tif !*o.SDCAutoencoderTiling {\n\t\t\t\tusage = uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * (3 /* output channels */ *4 /* sizeof(float) */ + 1) * convDim\n\t\t\t} else {\n\t\t\t\tusage = 512 * 512 * (3 /* output channels */ *4 /* sizeof(float) */ + 1) * convDim\n\t\t\t}\n\t\t\te.Autoencoder.Devices[aeDevIdx].Computation += GGUFBytesScalar(usage)\n\t\t}\n\t}\n\n\treturn e\n}\n\n// Types for StableDiffusionCpp estimated summary.\ntype (\n\t// StableDiffusionCppRunEstimateSummary represents the estimated summary of loading the GGUF file in stable-diffusion.cpp.\n\tStableDiffusionCppRunEstimateSummary struct {\n\t\t/* Basic */\n\n\t\t// Items\n\t\tItems []StableDiffusionCppRunEstimateSummaryItem `json:\"items\"`\n\n\t\t/* Appendix */\n\n\t\t// Type describes what type this GGUF file is.\n\t\tType string `json:\"type\"`\n\t\t// Architecture describes what architecture this GGUF file implements.\n\t\t//\n\t\t// All lowercase ASCII.\n\t\tArchitecture string `json:\"architecture\"`\n\t\t// FlashAttention is the flag to indicate whether enable the flash attention,\n\t\t// true for enable.\n\t\tFlashAttention bool `json:\"flashAttention\"`\n\t\t// NoMMap is the flag to indicate whether the file must be loaded without mmap,\n\t\t// true for total loaded.\n\t\tNoMMap bool 
`json:\"noMMap\"`\n\t\t// ImageOnly is the flag to indicate whether the model is used for generating image,\n\t\t// true for image only.\n\t\tImageOnly bool `json:\"imageOnly\"`\n\t\t// Distributable is the flag to indicate whether the model is distributable,\n\t\t// true for distributable.\n\t\tDistributable bool `json:\"distributable\"`\n\t}\n\n\t// StableDiffusionCppRunEstimateSummaryItem represents the estimated summary item of loading the GGUF file in stable-diffusion.cpp.\n\tStableDiffusionCppRunEstimateSummaryItem struct {\n\t\t// FullOffloaded is the flag to indicate whether the layers are fully offloaded,\n\t\t// false for partial offloaded or zero offloaded.\n\t\tFullOffloaded bool `json:\"fullOffloaded\"`\n\t\t// RAM is the memory usage for loading the GGUF file in RAM.\n\t\tRAM StableDiffusionCppRunEstimateMemory `json:\"ram\"`\n\t\t// VRAMs is the memory usage for loading the GGUF file in VRAM per device.\n\t\tVRAMs []StableDiffusionCppRunEstimateMemory `json:\"vrams\"`\n\t}\n\n\t// StableDiffusionCppRunEstimateMemory represents the memory usage for loading the GGUF file in stable-diffusion.cpp.\n\tStableDiffusionCppRunEstimateMemory struct {\n\t\t// Remote is the flag to indicate whether the device is remote,\n\t\t// true for remote.\n\t\tRemote bool `json:\"remote\"`\n\t\t// Position is the relative position of the device,\n\t\t// starts from 0.\n\t\t//\n\t\t// If Remote is true, Position is the position of the remote devices,\n\t\t// Otherwise, Position is the position of the device in the local devices.\n\t\tPosition int `json:\"position\"`\n\t\t// UMA represents the usage of Unified Memory Architecture.\n\t\tUMA GGUFBytesScalar `json:\"uma\"`\n\t\t// NonUMA represents the usage of Non-Unified Memory Architecture.\n\t\tNonUMA GGUFBytesScalar `json:\"nonuma\"`\n\t}\n)\n\n// SummarizeItem returns the corresponding StableDiffusionCppRunEstimateSummaryItem with the given options.\nfunc (e StableDiffusionCppRunEstimate) SummarizeItem(\n\tmmap 
bool,\n\tnonUMARamFootprint, nonUMAVramFootprint uint64,\n) (emi StableDiffusionCppRunEstimateSummaryItem) {\n\temi.FullOffloaded = e.FullOffloaded\n\n\t// RAM.\n\t{\n\t\tfp := e.Devices[0].Footprint\n\t\twg := e.Devices[0].Weight\n\t\tcp := e.Devices[0].Computation\n\n\t\t// UMA.\n\t\temi.RAM.UMA = fp + wg + cp\n\n\t\t// NonUMA.\n\t\temi.RAM.NonUMA = GGUFBytesScalar(nonUMARamFootprint) + emi.RAM.UMA\n\t}\n\n\t// VRAMs.\n\temi.VRAMs = make([]StableDiffusionCppRunEstimateMemory, len(e.Devices)-1)\n\t{\n\t\tfor i, d := range e.Devices[1:] {\n\t\t\tfp := d.Footprint\n\t\t\twg := d.Weight\n\t\t\tcp := d.Computation\n\n\t\t\temi.VRAMs[i].Remote = d.Remote\n\t\t\temi.VRAMs[i].Position = d.Position\n\n\t\t\t// UMA.\n\t\t\temi.VRAMs[i].UMA = fp + wg + /* cp */ 0\n\t\t\tif d.Remote {\n\t\t\t\temi.VRAMs[i].UMA += cp\n\t\t\t}\n\n\t\t\t// NonUMA.\n\t\t\temi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + cp\n\t\t}\n\t}\n\n\t// Add autoencoder's usage.\n\tif e.Autoencoder != nil {\n\t\taemi := e.Autoencoder.SummarizeItem(mmap, 0, 0)\n\t\temi.RAM.UMA += aemi.RAM.UMA\n\t\temi.RAM.NonUMA += aemi.RAM.NonUMA\n\t\tfor i, v := range aemi.VRAMs {\n\t\t\temi.VRAMs[i].UMA += v.UMA\n\t\t\temi.VRAMs[i].NonUMA += v.NonUMA\n\t\t}\n\t}\n\n\t// Add conditioners' usage.\n\tfor i := range e.Conditioners {\n\t\tcemi := e.Conditioners[i].SummarizeItem(mmap, 0, 0)\n\t\temi.RAM.UMA += cemi.RAM.UMA\n\t\temi.RAM.NonUMA += cemi.RAM.NonUMA\n\t\tfor i, v := range cemi.VRAMs {\n\t\t\temi.VRAMs[i].UMA += v.UMA\n\t\t\temi.VRAMs[i].NonUMA += v.NonUMA\n\t\t}\n\t}\n\n\t// Add upscaler's usage.\n\tif e.Upscaler != nil {\n\t\tuemi := e.Upscaler.SummarizeItem(mmap, 0, 0)\n\t\temi.RAM.UMA += uemi.RAM.UMA\n\t\temi.RAM.NonUMA += uemi.RAM.NonUMA\n\t\t// NB(thxCode): all VRAMs should offload to the first device at present.\n\t\tvar vramUMA, vramNonUMA GGUFBytesScalar\n\t\tfor _, v := range uemi.VRAMs {\n\t\t\tvramUMA += v.UMA\n\t\t\tvramNonUMA += v.NonUMA\n\t\t}\n\t\tif e.Upscaler.FullOffloaded 
{\n\t\t\temi.VRAMs[0].UMA += vramUMA\n\t\t\temi.VRAMs[0].NonUMA += vramNonUMA\n\t\t} else {\n\t\t\temi.RAM.UMA += vramUMA\n\t\t\temi.RAM.NonUMA += vramNonUMA\n\t\t}\n\t}\n\n\t// Add control net's usage.\n\tif e.ControlNet != nil {\n\t\tcnemi := e.ControlNet.SummarizeItem(mmap, 0, 0)\n\t\temi.RAM.UMA += cnemi.RAM.UMA\n\t\temi.RAM.NonUMA += cnemi.RAM.NonUMA\n\t\t// NB(thxCode): all VRAMs should offload to the first device at present.\n\t\tvar vramUMA, vramNonUMA GGUFBytesScalar\n\t\tfor _, v := range cnemi.VRAMs {\n\t\t\tvramUMA += v.UMA\n\t\t\tvramNonUMA += v.NonUMA\n\t\t}\n\t\tif e.ControlNet.FullOffloaded {\n\t\t\temi.VRAMs[0].UMA += vramUMA\n\t\t\temi.VRAMs[0].NonUMA += vramNonUMA\n\t\t} else {\n\t\t\temi.RAM.UMA += vramUMA\n\t\t\temi.RAM.NonUMA += vramNonUMA\n\t\t}\n\t}\n\n\treturn emi\n}\n\n// Summarize returns the corresponding StableDiffusionCppRunEstimate with the given options.\nfunc (e StableDiffusionCppRunEstimate) Summarize(\n\tmmap bool,\n\tnonUMARamFootprint, nonUMAVramFootprint uint64,\n) (es StableDiffusionCppRunEstimateSummary) {\n\t// Items.\n\tes.Items = []StableDiffusionCppRunEstimateSummaryItem{\n\t\te.SummarizeItem(mmap, nonUMARamFootprint, nonUMAVramFootprint),\n\t}\n\n\t// Just copy from the original estimate.\n\tes.Type = e.Type\n\tes.Architecture = e.Architecture\n\tes.FlashAttention = e.FlashAttention\n\tes.NoMMap = e.NoMMap\n\tes.ImageOnly = e.ImageOnly\n\tes.Distributable = e.Distributable\n\n\treturn es\n}\n\nfunc normalizeArchitecture(arch string) string {\n\treturn stringx.ReplaceAllFunc(arch, func(r rune) rune {\n\t\tswitch r {\n\t\tcase ' ', '.', '-', '/', ':':\n\t\t\treturn '_' // Replace with underscore.\n\t\t}\n\t\tif r >= 'A' && r <= 'Z' {\n\t\t\tr += 'a' - 'A' // Lowercase.\n\t\t}\n\t\treturn r\n\t})\n}\n"
  },
  {
    "path": "file_estimate__stablediffusioncpp_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_EstimateStableDiffusionRun(t *testing.T) {\n\tctx := context.Background()\n\n\tcases := []struct {\n\t\tname  string\n\t\tgiven *GGUFFile\n\t}{\n\t\t{\n\t\t\tname: \"sd 1.5\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"gpustack/stable-diffusion-v1-5-GGUF\",\n\t\t\t\t\t\"stable-diffusion-v1-5-FP16.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t\t{\n\t\t\tname: \"sd 2.1\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"gpustack/stable-diffusion-v2-1-GGUF\",\n\t\t\t\t\t\"stable-diffusion-v2-1-Q8_0.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t\t{\n\t\t\tname: \"sd xl\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"gpustack/stable-diffusion-xl-base-1.0-GGUF\",\n\t\t\t\t\t\"stable-diffusion-xl-base-1.0-FP16.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t\t{\n\t\t\tname: \"sd 3.5 large\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"gpustack/stable-diffusion-v3-5-large-GGUF\",\n\t\t\t\t\t\"stable-diffusion-v3-5-large-Q4_0.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t\t{\n\t\t\tname: \"flux .1 dev\",\n\t\t\tgiven: func() *GGUFFile {\n\t\t\t\tf, err := 
ParseGGUFFileFromHuggingFace(\n\t\t\t\t\tctx,\n\t\t\t\t\t\"gpustack/FLUX.1-dev-GGUF\",\n\t\t\t\t\t\"FLUX.1-dev-Q4_0.gguf\",\n\t\t\t\t\tSkipLargeMetadata())\n\t\t\t\tif err != nil {\n\t\t\t\t\tt.Fatal(err)\n\t\t\t\t}\n\t\t\t\treturn f\n\t\t\t}(),\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.name, func(t *testing.T) {\n\t\t\tf := tc.given\n\t\t\tt.Log(\"\\n\", spew.Sdump(f.EstimateStableDiffusionCppRun()), \"\\n\")\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "file_estimate_option.go",
    "content": "package gguf_parser\n\nimport (\n\t\"regexp\"\n\t\"slices\"\n\t\"strconv\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/ptr\"\n)\n\ntype (\n\t_GGUFRunEstimateOptions struct {\n\t\t// Common\n\t\tParallelSize        *int32\n\t\tFlashAttention      bool\n\t\tMainGPUIndex        int\n\t\tRPCServers          []string\n\t\tTensorSplitFraction []float64\n\t\tOverriddenTensors   []*GGUFRunOverriddenTensor\n\t\tDeviceMetrics       []GGUFRunDeviceMetric\n\n\t\t// LLaMACpp (LMC) specific\n\t\tLMCContextSize                    *int32\n\t\tLMCRoPEFrequencyBase              *float32\n\t\tLMCRoPEFrequencyScale             *float32\n\t\tLMCRoPEScalingType                *string\n\t\tLMCRoPEScalingOriginalContextSize *int32\n\t\tLMCInMaxContextSize               bool\n\t\tLMCLogicalBatchSize               *int32\n\t\tLMCPhysicalBatchSize              *int32\n\t\tLMCVisualMaxImageSize             *uint32\n\t\tLMCMaxProjectedCache              *uint32\n\t\tLMCCacheKeyType                   *GGMLType\n\t\tLMCCacheValueType                 *GGMLType\n\t\tLMCOffloadKVCache                 *bool\n\t\tLMCOffloadLayers                  *uint64\n\t\tLMCSplitMode                      LLaMACppSplitMode\n\t\tLMCFullSizeSWACache               bool\n\t\tLMCProjector                      *LLaMACppRunEstimate\n\t\tLMCDrafter                        *LLaMACppRunEstimate\n\t\tLMCAdapters                       []LLaMACppRunEstimate\n\n\t\t// StableDiffusionCpp (SDC) specific\n\t\tSDCOffloadLayers                *uint64\n\t\tSDCBatchCount                   *int32\n\t\tSDCHeight                       *uint32\n\t\tSDCWidth                        *uint32\n\t\tSDCOffloadConditioner           *bool\n\t\tSDCOffloadAutoencoder           *bool\n\t\tSDCAutoencoderTiling            *bool\n\t\tSDCFreeComputeMemoryImmediately *bool\n\t\tSDCUpscaler                     *StableDiffusionCppRunEstimate\n\t\tSDCControlNet                   *StableDiffusionCppRunEstimate\n\t}\n\n\t// 
GGUFRunOverriddenTensor holds the overridden tensor information for the estimate.\n\t//\n\t// When BufferType is CPU,\n\t// it indicates that the tensor should be loaded into the CPU memory,\n\t// even if it belongs to a GPU offload layer.\n\tGGUFRunOverriddenTensor struct {\n\t\t// PatternRegex is the regex pattern to match the tensor name.\n\t\tPatternRegex *regexp.Regexp\n\t\t// BufferType is the buffer type to override,\n\t\t// it can be \"CPU\", \"CUDA0\", \"Metal\" and others.\n\t\tBufferType string\n\n\t\t// _BufferType record parsed buffer type, used internally.\n\t\t_BufferType GGUFRunOverriddenTensorBufferType\n\t\t// _Index record parsed device index, used internally.\n\t\t_Index string\n\t}\n\n\t// GGUFRunDeviceMetric holds the device metric for the estimate.\n\t//\n\t// When the device represents a CPU,\n\t// FLOPS refers to the floating-point operations per second of that CPU,\n\t// while UpBandwidth indicates the bandwidth of the RAM (since SRAM is typically small and cannot hold all weights,\n\t// the RAM here refers to the bandwidth of DRAM,\n\t// unless the device's SRAM can accommodate the corresponding model weights).\n\t//\n\t// When the device represents a GPU,\n\t// FLOPS refers to the floating-point operations per second of that GPU,\n\t// while UpBandwidth indicates the bandwidth of the VRAM.\n\t//\n\t// When the device represents a specific node,\n\t// FLOPS depends on whether a CPU or GPU is being used,\n\t// while UpBandwidth refers to the network bandwidth between nodes.\n\tGGUFRunDeviceMetric struct {\n\t\t// FLOPS is the floating-point operations per second of the device.\n\t\tFLOPS FLOPSScalar\n\t\t// UpBandwidth is the bandwidth of the device to transmit data to calculate,\n\t\t// unit is Bps (bytes per second).\n\t\tUpBandwidth BytesPerSecondScalar\n\t\t// DownBandwidth is the bandwidth of the device to transmit calculated result to next layer,\n\t\t// unit is Bps (bytes per second).\n\t\tDownBandwidth 
BytesPerSecondScalar\n\t}\n\n\t// GGUFRunEstimateOption is the options for the estimate.\n\tGGUFRunEstimateOption func(*_GGUFRunEstimateOptions)\n)\n\n// GGUFRunOverriddenTensorBufferType is the type of the overridden tensor buffer.\ntype GGUFRunOverriddenTensorBufferType uint32\n\nconst (\n\t_ GGUFRunOverriddenTensorBufferType = iota\n\tGGUFRunOverriddenTensorBufferTypeCPU\n\tGGUFRunOverriddenTensorBufferTypeGPU\n\tGGUFRunOverriddenTensorBufferTypeRPC\n\tGGUFRunOverriddenTensorBufferTypeUnknown\n)\n\nvar (\n\t_GGUFRunOverriddenTensorBufferTypeCPURegex       = regexp.MustCompile(`^(CPU|AMX)`)\n\t_GGUFRunOverriddenTensorBufferTypeUMAGPURegex    = regexp.MustCompile(`^(Metal|OpenCL)`)\n\t_GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex = regexp.MustCompile(`^(CUDA|CANN|ROCm|MUSA|SYCL|Vulkan|Kompute)(\\d+)?`)\n\t_GGUFRunOverriddenTensorBufferTypeRPCRegex       = regexp.MustCompile(`^RPC\\[(.*)\\]`)\n)\n\n// ParseBufferType returns the device index of the overridden tensor.\n//\n// The device index is used to determine which device the tensor belongs to,\n// it is according to the buffer type description.\nfunc (odt *GGUFRunOverriddenTensor) ParseBufferType() (GGUFRunOverriddenTensorBufferType, string) {\n\tif odt == nil {\n\t\treturn GGUFRunOverriddenTensorBufferTypeUnknown, \"\"\n\t}\n\n\tif odt._BufferType == 0 {\n\t\todt._BufferType = GGUFRunOverriddenTensorBufferTypeUnknown\n\t\tif ms := _GGUFRunOverriddenTensorBufferTypeCPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 {\n\t\t\todt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeCPU, \"0\"\n\t\t}\n\t\tif ms := _GGUFRunOverriddenTensorBufferTypeUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 {\n\t\t\todt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, \"1\"\n\t\t}\n\t\tif ms := _GGUFRunOverriddenTensorBufferTypeRPCRegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 {\n\t\t\todt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeRPC, ms[1]\n\t\t}\n\t\tif ms := 
_GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 2 {\n\t\t\tif idx, err := strconv.ParseInt(ms[2], 10, 64); err == nil && idx >= 0 {\n\t\t\t\todt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, ms[2]\n\t\t\t}\n\t\t}\n\t}\n\treturn odt._BufferType, odt._Index\n}\n\n// WithParallelSize sets the (decoding sequences) parallel size for the estimate.\nfunc WithParallelSize(size int32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif size <= 0 {\n\t\t\treturn\n\t\t}\n\t\to.ParallelSize = &size\n\t}\n}\n\n// WithFlashAttention sets the flash attention flag.\nfunc WithFlashAttention() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.FlashAttention = true\n\t}\n}\n\n// WithMainGPUIndex sets the main device for the estimate.\n//\n// When split mode is LLaMACppSplitModeNone, the main device is the only device.\n// When split mode is LLaMACppSplitModeRow, the main device handles the intermediate results and KV.\n//\n// WithMainGPUIndex needs to combine with WithTensorSplitFraction.\nfunc WithMainGPUIndex(di int) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.MainGPUIndex = di\n\t}\n}\n\n// WithRPCServers sets the RPC servers for the estimate.\nfunc WithRPCServers(srvs []string) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif len(srvs) == 0 {\n\t\t\treturn\n\t\t}\n\t\to.RPCServers = srvs\n\t}\n}\n\n// WithTensorSplitFraction sets the tensor split cumulative fractions for the estimate.\n//\n// WithTensorSplitFraction accepts a variadic number of fractions,\n// all fraction values must be in the range of [0, 1],\n// and the last fraction must be 1.\n//\n// For example, WithTensorSplitFraction(0.2, 0.4, 0.6, 0.8, 1) will split the tensor into five parts with 20% each.\nfunc WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif len(fractions) 
== 0 {\n\t\t\treturn\n\t\t}\n\t\tfor _, f := range fractions {\n\t\t\tif f < 0 || f > 1 {\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t\tif fractions[len(fractions)-1] != 1 {\n\t\t\treturn\n\t\t}\n\t\to.TensorSplitFraction = fractions\n\t}\n}\n\n// WithOverriddenTensors sets the overridden tensors for the estimate.\nfunc WithOverriddenTensors(tensors []GGUFRunOverriddenTensor) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif len(tensors) == 0 {\n\t\t\treturn\n\t\t}\n\t\tfor _, t := range tensors {\n\t\t\tif t.PatternRegex == nil || t.BufferType == \"\" {\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t\to.OverriddenTensors = make([]*GGUFRunOverriddenTensor, len(tensors))\n\t\tfor i := range tensors {\n\t\t\to.OverriddenTensors[i] = &tensors[i]\n\t\t}\n\t}\n}\n\n// WithDeviceMetrics sets the device metrics for the estimate.\nfunc WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif len(metrics) == 0 {\n\t\t\treturn\n\t\t}\n\t\to.DeviceMetrics = metrics\n\t}\n}\n\n// WithLLaMACppContextSize sets the context size for the estimate.\nfunc WithLLaMACppContextSize(size int32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif size <= 0 {\n\t\t\treturn\n\t\t}\n\t\to.LMCContextSize = &size\n\t}\n}\n\n// WithLLaMACppRoPE sets the RoPE parameters for the estimate.\nfunc WithLLaMACppRoPE(\n\tfrequencyBase float64,\n\tfrequencyScale float64,\n\tscalingType string,\n\tscalingOriginalContextSize int32,\n) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif frequencyBase > 0 {\n\t\t\to.LMCRoPEFrequencyBase = ptr.Float32(float32(frequencyBase))\n\t\t}\n\t\tif frequencyScale > 0 {\n\t\t\to.LMCRoPEFrequencyScale = ptr.Float32(float32(frequencyScale))\n\t\t}\n\t\tif slices.Contains([]string{\"none\", \"linear\", \"yarn\"}, scalingType) {\n\t\t\to.LMCRoPEScalingType = &scalingType\n\t\t}\n\t\tif scalingOriginalContextSize > 0 
{\n\t\t\to.LMCRoPEScalingOriginalContextSize = ptr.To(scalingOriginalContextSize)\n\t\t}\n\t}\n}\n\n// WithinLLaMACppMaxContextSize limits the context size to the maximum,\n// if the context size is over the maximum.\nfunc WithinLLaMACppMaxContextSize() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.LMCInMaxContextSize = true\n\t}\n}\n\n// WithLLaMACppLogicalBatchSize sets the logical batch size for the estimate.\nfunc WithLLaMACppLogicalBatchSize(size int32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif size <= 0 {\n\t\t\treturn\n\t\t}\n\t\to.LMCLogicalBatchSize = &size\n\t}\n}\n\n// WithLLaMACppPhysicalBatchSize sets the physical batch size for the estimate.\nfunc WithLLaMACppPhysicalBatchSize(size int32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif size <= 0 {\n\t\t\treturn\n\t\t}\n\t\to.LMCPhysicalBatchSize = &size\n\t}\n}\n\n// _GGUFEstimateCacheTypeAllowList is the allow list of cache key and value types.\nvar _GGUFEstimateCacheTypeAllowList = []GGMLType{\n\tGGMLTypeF32,\n\tGGMLTypeF16,\n\tGGMLTypeBF16,\n\tGGMLTypeQ8_0,\n\tGGMLTypeQ4_0, GGMLTypeQ4_1,\n\tGGMLTypeIQ4_NL,\n\tGGMLTypeQ5_0, GGMLTypeQ5_1,\n}\n\n// WithLLaMACppCacheKeyType sets the cache key type for the estimate.\nfunc WithLLaMACppCacheKeyType(t GGMLType) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif slices.Contains(_GGUFEstimateCacheTypeAllowList, t) {\n\t\t\to.LMCCacheKeyType = &t\n\t\t}\n\t}\n}\n\n// WithLLaMACppCacheValueType sets the cache value type for the estimate.\nfunc WithLLaMACppCacheValueType(t GGMLType) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif slices.Contains(_GGUFEstimateCacheTypeAllowList, t) {\n\t\t\to.LMCCacheValueType = &t\n\t\t}\n\t}\n}\n\n// WithoutLLaMACppOffloadKVCache disables offloading the KV cache.\nfunc WithoutLLaMACppOffloadKVCache() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) 
{\n\t\to.LMCOffloadKVCache = ptr.To(false)\n\t}\n}\n\n// WithLLaMACppOffloadLayers sets the number of layers to offload.\nfunc WithLLaMACppOffloadLayers(layers uint64) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.LMCOffloadLayers = &layers\n\t}\n}\n\n// LLaMACppSplitMode is the split mode for LLaMACpp.\ntype LLaMACppSplitMode uint\n\nconst (\n\tLLaMACppSplitModeLayer LLaMACppSplitMode = iota\n\tLLaMACppSplitModeRow\n\tLLaMACppSplitModeNone\n\t_LLAMACppSplitModeMax\n)\n\n// WithLLaMACppSplitMode sets the split mode for the estimate.\nfunc WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif mode < _LLAMACppSplitModeMax {\n\t\t\to.LMCSplitMode = mode\n\t\t}\n\t}\n}\n\n// WithLLaMACppFullSizeSWACache enables full size sliding window attention cache.\nfunc WithLLaMACppFullSizeSWACache() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.LMCFullSizeSWACache = true\n\t}\n}\n\n// WithLLaMACppVisualMaxImageSize sets the visual maximum image size input for the estimate.\nfunc WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif size == 0 {\n\t\t\treturn\n\t\t}\n\t\to.LMCVisualMaxImageSize = &size\n\t}\n}\n\n// WithLLaMACppMaxProjectedCache sets the maximum projected embedding cache for the estimate.\nfunc WithLLaMACppMaxProjectedCache(cacheSize uint32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif cacheSize == 0 {\n\t\t\treturn\n\t\t}\n\t\to.LMCMaxProjectedCache = ptr.To(cacheSize)\n\t}\n}\n\n// WithLLaMACppDrafter sets the drafter estimate usage.\nfunc WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.LMCDrafter = dft\n\t}\n}\n\n// WithLLaMACppProjector sets the multimodal projector estimate usage.\nfunc WithLLaMACppProjector(prj *LLaMACppRunEstimate) GGUFRunEstimateOption 
{\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.LMCProjector = prj\n\t}\n}\n\n// WithLLaMACppAdapters sets the adapters estimate usage.\nfunc WithLLaMACppAdapters(adp []LLaMACppRunEstimate) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif len(adp) == 0 {\n\t\t\treturn\n\t\t}\n\t\to.LMCAdapters = adp\n\t}\n}\n\n// WithStableDiffusionCppOffloadLayers sets the number of layers to offload.\nfunc WithStableDiffusionCppOffloadLayers(layers uint64) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCOffloadLayers = &layers\n\t}\n}\n\n// WithStableDiffusionCppBatchCount sets the batch count for the estimate.\nfunc WithStableDiffusionCppBatchCount(count int32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif count == 0 {\n\t\t\treturn\n\t\t}\n\t\to.SDCBatchCount = ptr.To(count)\n\t}\n}\n\n// WithStableDiffusionCppHeight sets the image height for the estimate.\nfunc WithStableDiffusionCppHeight(height uint32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif height == 0 {\n\t\t\treturn\n\t\t}\n\t\to.SDCHeight = ptr.To(height)\n\t}\n}\n\n// WithStableDiffusionCppWidth sets the image width for the estimate.\nfunc WithStableDiffusionCppWidth(width uint32) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\tif width == 0 {\n\t\t\treturn\n\t\t}\n\t\to.SDCWidth = ptr.To(width)\n\t}\n}\n\n// WithoutStableDiffusionCppOffloadConditioner disables offloading the conditioner(text encoder).\nfunc WithoutStableDiffusionCppOffloadConditioner() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCOffloadConditioner = ptr.To(false)\n\t}\n}\n\n// WithoutStableDiffusionCppOffloadAutoencoder disables offloading the autoencoder.\nfunc WithoutStableDiffusionCppOffloadAutoencoder() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCOffloadAutoencoder = ptr.To(false)\n\t}\n}\n\n// 
WithStableDiffusionCppAutoencoderTiling enables tiling for the autoencoder.\nfunc WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCAutoencoderTiling = ptr.To(true)\n\t}\n}\n\n// WithStableDiffusionCppFreeComputeMemoryImmediately enables freeing compute memory immediately.\nfunc WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCFreeComputeMemoryImmediately = ptr.To(true)\n\t}\n}\n\n// WithStableDiffusionCppUpscaler sets the upscaler estimate usage.\nfunc WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCUpscaler = ups\n\t}\n}\n\n// WithStableDiffusionCppControlNet sets the control net estimate usage.\nfunc WithStableDiffusionCppControlNet(cn *StableDiffusionCppRunEstimate) GGUFRunEstimateOption {\n\treturn func(o *_GGUFRunEstimateOptions) {\n\t\to.SDCControlNet = cn\n\t}\n}\n"
  },
  {
    "path": "file_from_distro.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"path/filepath\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/httpx\"\n)\n\nvar (\n\tErrOllamaInvalidModel      = errors.New(\"ollama invalid model\")\n\tErrOllamaBaseLayerNotFound = errors.New(\"ollama base layer not found\")\n)\n\n// ParseGGUFFileFromOllama parses a GGUF file from Ollama model's base layer,\n// and returns a GGUFFile, or an error if any.\nfunc ParseGGUFFileFromOllama(ctx context.Context, model string, opts ...GGUFReadOption) (*GGUFFile, error) {\n\treturn ParseGGUFFileFromOllamaModel(ctx, ParseOllamaModel(model), opts...)\n}\n\n// ParseGGUFFileFromOllamaModel is similar to ParseGGUFFileFromOllama,\n// but inputs an OllamaModel instead of a string.\n//\n// The given OllamaModel will be completed(fetching MediaType, Config and Layers) after calling this function.\nfunc ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaModel, opts ...GGUFReadOption) (gf *GGUFFile, err error) {\n\tif model == nil {\n\t\treturn nil, ErrOllamaInvalidModel\n\t}\n\n\topts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection())\n\n\tvar o _GGUFReadOptions\n\tfor _, opt := range opts {\n\t\topt(&o)\n\t}\n\n\t// Cache.\n\t{\n\t\tif o.CachePath != \"\" {\n\t\t\to.CachePath = filepath.Join(o.CachePath, \"distro\", \"ollama\")\n\t\t}\n\t\tc := GGUFFileCache(o.CachePath)\n\n\t\t// Get from cache.\n\t\tif gf, err = c.Get(model.String(), o.CacheExpiration); err == nil {\n\t\t\treturn gf, nil\n\t\t}\n\n\t\t// Put to cache.\n\t\tdefer func() {\n\t\t\tif err == nil {\n\t\t\t\t_ = c.Put(model.String(), gf)\n\t\t\t}\n\t\t}()\n\t}\n\n\tvar cli *http.Client\n\tcli = httpx.Client(\n\t\thttpx.ClientOptions().\n\t\t\tWithUserAgent(OllamaUserAgent()).\n\t\t\tIf(o.Debug, func(x *httpx.ClientOption) *httpx.ClientOption {\n\t\t\t\treturn x.WithDebug()\n\t\t\t}).\n\t\t\tWithTimeout(0).\n\t\t\tWithRetryBackoff(1*time.Second, 5*time.Second, 
10).\n\t\t\tWithRetryIf(func(resp *http.Response, err error) bool {\n\t\t\t\treturn httpx.DefaultRetry(resp, err) || OllamaRegistryAuthorizeRetry(resp, cli)\n\t\t\t}).\n\t\t\tWithTransport(\n\t\t\t\thttpx.TransportOptions().\n\t\t\t\t\tWithoutKeepalive().\n\t\t\t\t\tTimeoutForDial(10*time.Second).\n\t\t\t\t\tTimeoutForTLSHandshake(5*time.Second).\n\t\t\t\t\tIf(o.SkipProxy, func(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\treturn x.WithoutProxy()\n\t\t\t\t\t}).\n\t\t\t\t\tIf(o.ProxyURL != nil, func(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\treturn x.WithProxy(http.ProxyURL(o.ProxyURL))\n\t\t\t\t\t}).\n\t\t\t\t\tIf(o.SkipTLSVerification, func(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\treturn x.WithoutInsecureVerify()\n\t\t\t\t\t}).\n\t\t\t\t\tIf(o.SkipDNSCache, func(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\treturn x.WithoutDNSCache()\n\t\t\t\t\t})))\n\n\tvar ml OllamaModelLayer\n\t{\n\t\terr := model.Complete(ctx, cli)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"complete ollama model: %w\", err)\n\t\t}\n\n\t\tvar ok bool\n\t\tml, ok = model.GetLayer(\"application/vnd.ollama.image.model\")\n\t\tif !ok {\n\t\t\treturn nil, ErrOllamaBaseLayerNotFound\n\t\t}\n\t}\n\n\treturn parseGGUFFileFromRemote(ctx, cli, ml.BlobURL().String(), o)\n}\n"
  },
  {
    "path": "file_from_remote.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"path/filepath\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/httpx\"\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n)\n\n// ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face(https://huggingface.co/),\n// and returns a GGUFFile, or an error if any.\nfunc ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) {\n\tep := osx.Getenv(\"HF_ENDPOINT\", \"https://huggingface.co\")\n\treturn ParseGGUFFileRemote(ctx, fmt.Sprintf(\"%s/%s/resolve/main/%s\", ep, repo, file), opts...)\n}\n\n// ParseGGUFFileFromModelScope parses a GGUF file from Model Scope(https://modelscope.cn/),\n// and returns a GGUFFile, or an error if any.\nfunc ParseGGUFFileFromModelScope(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) {\n\tep := osx.Getenv(\"MS_ENDPOINT\", \"https://modelscope.cn\")\n\topts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection())\n\treturn ParseGGUFFileRemote(ctx, fmt.Sprintf(\"%s/models/%s/resolve/master/%s\", ep, repo, file), opts...)\n}\n\n// ParseGGUFFileRemote parses a GGUF file from a remote BlobURL,\n// and returns a GGUFFile, or an error if any.\nfunc ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (gf *GGUFFile, err error) {\n\tvar o _GGUFReadOptions\n\tfor _, opt := range opts {\n\t\topt(&o)\n\t}\n\n\t// Cache.\n\t{\n\t\tif o.CachePath != \"\" {\n\t\t\to.CachePath = filepath.Join(o.CachePath, \"remote\")\n\t\t\tif o.SkipLargeMetadata {\n\t\t\t\to.CachePath = filepath.Join(o.CachePath, \"brief\")\n\t\t\t}\n\t\t}\n\t\tc := GGUFFileCache(o.CachePath)\n\n\t\t// Get from cache.\n\t\tif gf, err = c.Get(url, o.CacheExpiration); err == nil {\n\t\t\treturn gf, nil\n\t\t}\n\n\t\t// Put to cache.\n\t\tdefer func() {\n\t\t\tif err == nil {\n\t\t\t\t_ = c.Put(url, 
gf)\n\t\t\t}\n\t\t}()\n\t}\n\n\tcli := httpx.Client(\n\t\thttpx.ClientOptions().\n\t\t\tWithUserAgent(\"gguf-parser-go\").\n\t\t\tIf(o.Debug,\n\t\t\t\tfunc(x *httpx.ClientOption) *httpx.ClientOption {\n\t\t\t\t\treturn x.WithDebug()\n\t\t\t\t},\n\t\t\t).\n\t\t\tIf(o.BearerAuthToken != \"\",\n\t\t\t\tfunc(x *httpx.ClientOption) *httpx.ClientOption {\n\t\t\t\t\treturn x.WithBearerAuth(o.BearerAuthToken)\n\t\t\t\t},\n\t\t\t).\n\t\t\tIf(len(o.Headers) > 0,\n\t\t\t\tfunc(x *httpx.ClientOption) *httpx.ClientOption {\n\t\t\t\t\treturn x.WithHeaders(o.Headers)\n\t\t\t\t},\n\t\t\t).\n\t\t\tWithTimeout(0).\n\t\t\tWithTransport(\n\t\t\t\thttpx.TransportOptions().\n\t\t\t\t\tWithoutKeepalive().\n\t\t\t\t\tTimeoutForDial(5*time.Second).\n\t\t\t\t\tTimeoutForTLSHandshake(5*time.Second).\n\t\t\t\t\tTimeoutForResponseHeader(5*time.Second).\n\t\t\t\t\tIf(o.SkipProxy,\n\t\t\t\t\t\tfunc(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\t\treturn x.WithoutProxy()\n\t\t\t\t\t\t},\n\t\t\t\t\t).\n\t\t\t\t\tIf(o.ProxyURL != nil,\n\t\t\t\t\t\tfunc(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\t\treturn x.WithProxy(http.ProxyURL(o.ProxyURL))\n\t\t\t\t\t\t},\n\t\t\t\t\t).\n\t\t\t\t\tIf(o.SkipTLSVerification || !strings.HasPrefix(url, \"https://\"),\n\t\t\t\t\t\tfunc(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\t\treturn x.WithoutInsecureVerify()\n\t\t\t\t\t\t},\n\t\t\t\t\t).\n\t\t\t\t\tIf(o.SkipDNSCache,\n\t\t\t\t\t\tfunc(x *httpx.TransportOption) *httpx.TransportOption {\n\t\t\t\t\t\t\treturn x.WithoutDNSCache()\n\t\t\t\t\t\t},\n\t\t\t\t\t),\n\t\t\t),\n\t)\n\n\treturn parseGGUFFileFromRemote(ctx, cli, url, o)\n}\n\nfunc parseGGUFFileFromRemote(ctx context.Context, cli *http.Client, url string, o _GGUFReadOptions) (*GGUFFile, error) {\n\tvar urls []string\n\t{\n\t\trs := CompleteShardGGUFFilename(url)\n\t\tif rs != nil {\n\t\t\turls = rs\n\t\t} else {\n\t\t\turls = []string{url}\n\t\t}\n\t}\n\n\tfs := make([]_GGUFFileReadSeeker, 0, 
len(urls))\n\tdefer func() {\n\t\tfor i := range fs {\n\t\t\tosx.Close(fs[i])\n\t\t}\n\t}()\n\n\tfor i := range urls {\n\t\treq, err := httpx.NewGetRequestWithContext(ctx, urls[i])\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"new request: %w\", err)\n\t\t}\n\n\t\tsf, err := httpx.OpenSeekerFile(cli, req,\n\t\t\thttpx.SeekerFileOptions().\n\t\t\t\tWithBufferSize(o.BufferSize).\n\t\t\t\tIf(o.SkipRangeDownloadDetection,\n\t\t\t\t\tfunc(x *httpx.SeekerFileOption) *httpx.SeekerFileOption {\n\t\t\t\t\t\treturn x.WithoutRangeDownloadDetect()\n\t\t\t\t\t},\n\t\t\t\t),\n\t\t)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"open http file: %w\", err)\n\t\t}\n\n\t\tfs = append(fs, _GGUFFileReadSeeker{\n\t\t\tCloser:     sf,\n\t\t\tReadSeeker: io.NewSectionReader(sf, 0, sf.Len()),\n\t\t\tSize:       sf.Len(),\n\t\t})\n\t}\n\n\treturn parseGGUFFile(fs, o)\n}\n"
  },
  {
    "path": "file_metadata.go",
    "content": "package gguf_parser\n\nimport (\n\t\"regexp\"\n\t\"slices\"\n\t\"sort\"\n\t\"strings\"\n\n\t\"golang.org/x/exp/maps\"\n)\n\n// GGUFMetadata represents the model metadata of a GGUF file.\ntype GGUFMetadata struct {\n\t/* Basic */\n\n\t// Type describes what type this GGUF file is,\n\t// default is \"model\".\n\tType string `json:\"type\"`\n\t// Architecture describes what architecture this GGUF file implements.\n\t//\n\t// All lowercase ASCII.\n\tArchitecture string `json:\"architecture\"`\n\t// QuantizationVersion describes the version of the quantization format.\n\t//\n\t// Not required if the model is not quantized (i.e. no tensors are quantized).\n\t// If any tensors are quantized, this must be present.\n\t// This is separate to the quantization scheme of the tensors itself,\n\t// the quantization version may change without changing the scheme's name,\n\t// e.g. the quantization scheme is Q5_K, and the QuantizationVersion is 4.\n\tQuantizationVersion uint32 `json:\"quantizationVersion,omitempty\"`\n\t// Alignment describes the alignment of the GGUF file.\n\t//\n\t// This can vary to allow for different alignment schemes, but it must be a multiple of 8.\n\t// Some writers may not write the alignment.\n\t//\n\t// Default is 32.\n\tAlignment uint32 `json:\"alignment\"`\n\t// Name to the model.\n\t//\n\t// This should be a human-readable name that can be used to identify the GGUF file.\n\t// It should be unique within the community that the model is defined in.\n\tName string `json:\"name,omitempty\"`\n\t// Author to the model.\n\tAuthor string `json:\"author,omitempty\"`\n\t// URL to the model's homepage.\n\t//\n\t// This can be a GitHub repo, a paper, etc.\n\tURL string `json:\"url,omitempty\"`\n\t// Description to the model.\n\tDescription string `json:\"description,omitempty\"`\n\t// License to the model.\n\t//\n\t// This is expressed as a SPDX license expression, e.g. 
\"MIT OR Apache-2.0\".\n\tLicense string `json:\"license,omitempty\"`\n\t// FileType describes the type of the majority of the tensors in the GGUF file.\n\tFileType GGUFFileType `json:\"fileType\"`\n\t// FileTypeDescriptor describes the type of the GGUF file according to the FileType and trait layer.\n\t//\n\t// This supplies the FileType with more detail.\n\tFileTypeDescriptor string `json:\"fileTypeDetail\"`\n\n\t/* Appendix */\n\n\t// LittleEndian is true if the GGUF file is little-endian,\n\t// and false for big-endian.\n\tLittleEndian bool `json:\"littleEndian\"`\n\t// FileSize is the size of the GGUF file in bytes.\n\tFileSize GGUFBytesScalar `json:\"fileSize\"`\n\t// Size is the model size.\n\tSize GGUFBytesScalar `json:\"size\"`\n\t// Parameters is the parameters of the GGUF file.\n\tParameters GGUFParametersScalar `json:\"parameters\"`\n\t// BitsPerWeight is the bits per weight of the GGUF file.\n\tBitsPerWeight GGUFBitsPerWeightScalar `json:\"bitsPerWeight\"`\n}\n\n// GGUFFileType is a type of GGUF file,\n// see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L419-L445,\n// and https://github.com/huggingface/huggingface.js/blob/d67a464473ca07fee9811a129e5fac8cc7487098/packages/tasks/src/gguf.ts#L4-L52.\ntype GGUFFileType uint32\n\n// GGUFFileType constants.\n//\n// GGUFFileTypeMostlyQ4_2, GGUFFileTypeMostlyQ4_3 are deprecated.\n// GGUFFileTypeMostlyQ4_0_4_4, GGUFFileTypeMostlyQ4_0_4_8, GGUFFileTypeMostlyQ4_0_8_8 are deprecated.\n//\n// GGUFFileTypeMostlyQ4_1_SOME_F16 is a special case where the majority of the tensors are Q4_1,\n// but 'token_embd.weight' and 'output.weight' tensors are F16.\nconst (\n\tGGUFFileTypeMostlyF32           GGUFFileType = iota // MOSTLY_F32\n\tGGUFFileTypeMostlyF16                               // MOSTLY_F16\n\tGGUFFileTypeMostlyQ4_0                              // MOSTLY_Q4_0\n\tGGUFFileTypeMostlyQ4_1                              // 
MOSTLY_Q4_1\n\tGGUFFileTypeMostlyQ4_1_SOME_F16                     // MOSTLY_Q4_1_SOME_F16\n\tGGUFFileTypeMostlyQ4_2                              // MOSTLY_Q4_2\n\tGGUFFileTypeMostlyQ4_3                              // MOSTLY_Q4_3\n\tGGUFFileTypeMostlyQ8_0                              // MOSTLY_Q8_0\n\tGGUFFileTypeMostlyQ5_0                              // MOSTLY_Q5_0\n\tGGUFFileTypeMostlyQ5_1                              // MOSTLY_Q5_1\n\tGGUFFileTypeMostlyQ2_K                              // MOSTLY_Q2_K\n\tGGUFFileTypeMostlyQ3_K_S                            // MOSTLY_Q3_K_S\n\tGGUFFileTypeMostlyQ3_K_M                            // MOSTLY_Q3_K_M\n\tGGUFFileTypeMostlyQ3_K_L                            // MOSTLY_Q3_K_L\n\tGGUFFileTypeMostlyQ4_K_S                            // MOSTLY_Q4_K_S\n\tGGUFFileTypeMostlyQ4_K_M                            // MOSTLY_Q4_K_M\n\tGGUFFileTypeMostlyQ5_K_S                            // MOSTLY_Q5_K_S\n\tGGUFFileTypeMostlyQ5_K_M                            // MOSTLY_Q5_K_M\n\tGGUFFileTypeMostlyQ6_K                              // MOSTLY_Q6_K\n\tGGUFFileTypeMostlyIQ2_XXS                           // MOSTLY_IQ2_XXS\n\tGGUFFileTypeMostlyIQ2_XS                            // MOSTLY_IQ2_XS\n\tGGUFFileTypeMostlyQ2_K_S                            // MOSTLY_Q2_K_S\n\tGGUFFileTypeMostlyIQ3_XS                            // MOSTLY_IQ3_XS\n\tGGUFFileTypeMostlyIQ3_XXS                           // MOSTLY_IQ3_XXS\n\tGGUFFileTypeMostlyIQ1_S                             // MOSTLY_IQ1_S\n\tGGUFFileTypeMostlyIQ4_NL                            // MOSTLY_IQ4_NL\n\tGGUFFileTypeMostlyIQ3_S                             // MOSTLY_IQ3_S\n\tGGUFFileTypeMostlyIQ3_M                             // MOSTLY_IQ3_M\n\tGGUFFileTypeMostlyIQ2_S                             // MOSTLY_IQ2_S\n\tGGUFFileTypeMostlyIQ2_M                             // MOSTLY_IQ2_M\n\tGGUFFileTypeMostlyIQ4_XS                            // MOSTLY_IQ4_XS\n\tGGUFFileTypeMostlyIQ1_M                            
 // MOSTLY_IQ1_M\n\tGGUFFileTypeMostlyBF16                              // MOSTLY_BF16\n\tGGUFFileTypeMostlyQ4_0_4_4                          // MOSTLY_Q4_0_4_4\n\tGGUFFileTypeMostlyQ4_0_4_8                          // MOSTLY_Q4_0_4_8\n\tGGUFFileTypeMostlyQ4_0_8_8                          // MOSTLY_Q4_0_8_8\n\tGGUFFileTypeMostlyTQ1_0                             // MOSTLY_TQ1_0\n\tGGUFFileTypeMostlyTQ2_0                             // MOSTLY_TQ2_0\n\tGGUFFileTypeMostlyMXFP4                             // MOSTLY_MXFP4\n\t_GGUFFileTypeCount                                  // Unknown\n)\n\n// _GGUFPotentialDiffusionArchitectures holds a list representing the potential diffusion architectures.\n//\n// Since we will unify all diffusion architectures to \"diffusion\" during processing,\n// we can use this list to match the value in explicit `general.architecture`.\nvar _GGUFPotentialDiffusionArchitectures = []string{\n\t\"flux\",\n\t\"sd\",\n\t\"sd2.5\",\n\t\"sd3\",\n\t\"stable-diffusion\",\n}\n\n// _GGUFPotentialDiffusionArchitectureTensorsRegexes holds a list of regexes to match the potential diffusion architecture tensors.\n//\n// This is used to detect if the GGUF file is a diffusion model,\n// when the `general.architecture` is not set to a known diffusion architecture.\nvar _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{\n\tregexp.MustCompile(`^model\\.diffusion_model\\..*`),\n\tregexp.MustCompile(`^double_blocks\\..*`),\n\tregexp.MustCompile(`^joint_blocks\\..*`),\n\tregexp.MustCompile(`^decoder\\..*`),\n\tregexp.MustCompile(`^encoder\\..*`),\n\tregexp.MustCompile(`^text_model\\..*`),\n}\n\n// Metadata returns the metadata of the GGUF file.\nfunc (gf *GGUFFile) Metadata() (gm GGUFMetadata) {\n\tconst (\n\t\ttypeKey         = \"general.type\"\n\t\tarchitectureKey = \"general.architecture\"\n\t\tquantizationKey = \"general.quantization_version\"\n\t\talignmentKey    = \"general.alignment\"\n\t\tnameKey         = \"general.name\"\n\t\tauthorKey  
     = \"general.author\"\n\t\turlKey          = \"general.url\"\n\t\tdescriptionKey  = \"general.description\"\n\t\tlicenseKey      = \"general.license\"\n\n\t\tcontrolVectorModelHintKey = \"controlvector.model_hint\"\n\t)\n\n\tm, _ := gf.Header.MetadataKV.Index([]string{\n\t\ttypeKey,\n\t\tarchitectureKey,\n\t\tquantizationKey,\n\t\talignmentKey,\n\t\tnameKey,\n\t\tauthorKey,\n\t\turlKey,\n\t\tdescriptionKey,\n\t\tlicenseKey,\n\t\tcontrolVectorModelHintKey,\n\t})\n\n\tif v, ok := m[typeKey]; ok {\n\t\tgm.Type = v.ValueString()\n\t} else if _, ok = m[controlVectorModelHintKey]; ok {\n\t\tgm.Type = \"adapter\"\n\t} else {\n\t\tgm.Type = \"model\"\n\t}\n\tif v, ok := m[controlVectorModelHintKey]; ok {\n\t\tgm.Architecture = v.ValueString()\n\t} else if v, ok = m[architectureKey]; ok && !slices.Contains(_GGUFPotentialDiffusionArchitectures, v.ValueString()) {\n\t\tgm.Architecture = v.ValueString()\n\t\tif gm.Architecture == \"clip\" {\n\t\t\tgm.Type = \"projector\"\n\t\t}\n\t} else if gm.Type == \"imatrix\" {\n\t\tgm.Architecture = \"imatrix\" // Default to imatrix.\n\t} else {\n\t\tgm.Architecture = \"llama\" // Default to llama.\n\t\tfor _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes {\n\t\t\tif gf.TensorInfos.Match(re) {\n\t\t\t\tgm.Architecture = \"diffusion\"\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n\tif v, ok := m[quantizationKey]; ok {\n\t\tgm.QuantizationVersion = ValueNumeric[uint32](v)\n\t}\n\tif v, ok := m[alignmentKey]; ok {\n\t\tgm.Alignment = ValueNumeric[uint32](v)\n\t} else {\n\t\tgm.Alignment = 32\n\t}\n\tif v, ok := m[nameKey]; ok {\n\t\tgm.Name = v.ValueString()\n\t}\n\tif v, ok := m[authorKey]; ok {\n\t\tgm.Author = v.ValueString()\n\t}\n\tif v, ok := m[urlKey]; ok {\n\t\tgm.URL = v.ValueString()\n\t}\n\tif v, ok := m[descriptionKey]; ok {\n\t\tgm.Description = v.ValueString()\n\t}\n\tif v, ok := m[licenseKey]; ok {\n\t\tgm.License = v.ValueString()\n\t}\n\tgm.FileType, gm.FileTypeDescriptor = 
gf.extractFileType(gm.Architecture)\n\n\tgm.LittleEndian = gf.Header.Version < GGUFVersionV3 || gf.Header.Magic == GGUFMagicGGUFLe\n\tgm.FileSize = gf.Size\n\tgm.Size = gf.ModelSize\n\tgm.Parameters = gf.ModelParameters\n\tgm.BitsPerWeight = gf.ModelBitsPerWeight\n\n\treturn gm\n}\n\n// GGMLType returns the GGMLType of the GGUFFileType,\n// which is inspired by\n// https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2730-L2763.\nfunc (t GGUFFileType) GGMLType() GGMLType {\n\tswitch t {\n\tcase GGUFFileTypeMostlyF32:\n\t\treturn GGMLTypeF32\n\tcase GGUFFileTypeMostlyF16:\n\t\treturn GGMLTypeF16\n\tcase GGUFFileTypeMostlyQ4_0:\n\t\treturn GGMLTypeQ4_0\n\tcase GGUFFileTypeMostlyQ4_1:\n\t\treturn GGMLTypeQ4_1\n\tcase GGUFFileTypeMostlyQ4_1_SOME_F16:\n\t\treturn GGMLTypeQ4_1\n\tcase GGUFFileTypeMostlyQ4_2:\n\t\treturn GGMLTypeQ4_2\n\tcase GGUFFileTypeMostlyQ4_3:\n\t\treturn GGMLTypeQ4_3\n\tcase GGUFFileTypeMostlyQ8_0:\n\t\treturn GGMLTypeQ8_0\n\tcase GGUFFileTypeMostlyQ5_0:\n\t\treturn GGMLTypeQ5_0\n\tcase GGUFFileTypeMostlyQ5_1:\n\t\treturn GGMLTypeQ5_1\n\tcase GGUFFileTypeMostlyQ2_K:\n\t\treturn GGMLTypeQ2_K\n\tcase GGUFFileTypeMostlyQ3_K_S:\n\t\treturn GGMLTypeQ3_K\n\tcase GGUFFileTypeMostlyQ3_K_M:\n\t\treturn GGMLTypeQ4_K\n\tcase GGUFFileTypeMostlyQ3_K_L:\n\t\treturn GGMLTypeQ5_K\n\tcase GGUFFileTypeMostlyQ4_K_S:\n\t\treturn GGMLTypeQ6_K\n\tcase GGUFFileTypeMostlyQ4_K_M:\n\t\treturn GGMLTypeQ4_K\n\tcase GGUFFileTypeMostlyQ5_K_S:\n\t\treturn GGMLTypeQ5_K\n\tcase GGUFFileTypeMostlyQ5_K_M:\n\t\treturn GGMLTypeQ5_K\n\tcase GGUFFileTypeMostlyQ6_K:\n\t\treturn GGMLTypeQ6_K\n\tcase GGUFFileTypeMostlyIQ2_XXS:\n\t\treturn GGMLTypeIQ2_XXS\n\tcase GGUFFileTypeMostlyIQ2_XS:\n\t\treturn GGMLTypeIQ2_XS\n\tcase GGUFFileTypeMostlyQ2_K_S:\n\t\treturn GGMLTypeQ2_K\n\tcase GGUFFileTypeMostlyIQ3_XS:\n\t\treturn GGMLTypeIQ3_S\n\tcase GGUFFileTypeMostlyIQ3_XXS:\n\t\treturn GGMLTypeIQ3_XXS\n\tcase GGUFFileTypeMostlyIQ1_S:\n\t\treturn 
GGMLTypeIQ1_S\n\tcase GGUFFileTypeMostlyIQ4_NL:\n\t\treturn GGMLTypeIQ4_NL\n\tcase GGUFFileTypeMostlyIQ3_S:\n\t\treturn GGMLTypeIQ3_S\n\tcase GGUFFileTypeMostlyIQ3_M:\n\t\treturn GGMLTypeIQ3_S\n\tcase GGUFFileTypeMostlyIQ2_S:\n\t\treturn GGMLTypeIQ2_XS\n\tcase GGUFFileTypeMostlyIQ2_M:\n\t\treturn GGMLTypeIQ2_S\n\tcase GGUFFileTypeMostlyIQ4_XS:\n\t\treturn GGMLTypeIQ4_XS\n\tcase GGUFFileTypeMostlyIQ1_M:\n\t\treturn GGMLTypeIQ1_M\n\tcase GGUFFileTypeMostlyBF16:\n\t\treturn GGMLTypeBF16\n\tcase GGUFFileTypeMostlyQ4_0_4_4:\n\t\treturn GGMLTypeQ4_0_4_4\n\tcase GGUFFileTypeMostlyQ4_0_4_8:\n\t\treturn GGMLTypeQ4_0_4_8\n\tcase GGUFFileTypeMostlyQ4_0_8_8:\n\t\treturn GGMLTypeQ4_0_8_8\n\tcase GGUFFileTypeMostlyTQ1_0:\n\t\treturn GGMLTypeTQ1_0\n\tcase GGUFFileTypeMostlyTQ2_0:\n\t\treturn GGMLTypeTQ2_0\n\tcase GGUFFileTypeMostlyMXFP4:\n\t\treturn GGMLTypeMXFP4\n\tdefault:\n\t}\n\treturn _GGMLTypeCount\n}\n\n// extractFileType extracts the GGUF file type from the metadata,\n// it tries to return the descriptor of the file type.\nfunc (gf *GGUFFile) extractFileType(arch string) (fileType GGUFFileType, fileTypeDescriptor string) {\n\tfileType, fileTypeDescriptor = _GGUFFileTypeCount, \"Unknown\"\n\n\tconst fileTypeKey = \"general.file_type\"\n\tm, _ := gf.Header.MetadataKV.Index([]string{\n\t\tfileTypeKey,\n\t})\n\tif v, ok := m[fileTypeKey]; ok {\n\t\tfileType = GGUFFileType(ValueNumeric[uint32](v))\n\t}\n\n\tif fileType == _GGUFFileTypeCount {\n\t\t// Guess.\n\t\tif len(gf.TensorInfos) != 0 {\n\t\t\tcm := make(map[GGMLType]int)\n\t\t\tfor i := range gf.TensorInfos {\n\t\t\t\tswitch {\n\t\t\t\tcase arch != \"diffusion\" &&\n\t\t\t\t\t!strings.HasPrefix(gf.TensorInfos[i].Name, \"token_embd\") &&\n\t\t\t\t\t!strings.HasPrefix(gf.TensorInfos[i].Name, \"blk.\") &&\n\t\t\t\t\t!strings.Contains(gf.TensorInfos[i].Name, \"_norm\") &&\n\t\t\t\t\t!strings.HasSuffix(gf.TensorInfos[i].Name, \".weight\"):\n\t\t\t\t\tcontinue\n\t\t\t\tcase arch == \"diffusion\" 
&&\n\t\t\t\t\t!strings.HasSuffix(gf.TensorInfos[i].Name, \".weight\"):\n\t\t\t\t\tcontinue\n\t\t\t\t}\n\t\t\t\tcm[gf.TensorInfos[i].Type]++\n\t\t\t}\n\t\t\tfileType = GetFileType(cm)\n\t\t}\n\t}\n\tif fileType == _GGUFFileTypeCount {\n\t\treturn fileType, fileTypeDescriptor\n\t}\n\n\tfileTypeDescriptor = strings.TrimPrefix(fileType.String(), \"MOSTLY_\")\n\n\tconst tokenEmbedWeightTensorName = \"token_embd.weight\"\n\n\tswitch fileType {\n\tcase GGUFFileTypeMostlyQ4_0:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 {\n\t\t\t\tfileTypeDescriptor = \"Q4_0_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ4_1:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 {\n\t\t\t\tfileTypeDescriptor = \"Q4_1_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ5_0:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 {\n\t\t\t\tfileTypeDescriptor = \"Q5_0_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ5_1:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 {\n\t\t\t\tfileTypeDescriptor = \"Q5_1_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ2_K:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ4_K {\n\t\t\t\tfileTypeDescriptor = \"Q2_K_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ3_K_M:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok 
{\n\t\t\tif v.Type == GGMLTypeQ8_0 {\n\t\t\t\tfileTypeDescriptor = \"Q3_K_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ4_K_M:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 {\n\t\t\t\tfileTypeDescriptor = \"Q4_K_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ5_K_M:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 {\n\t\t\t\tfileTypeDescriptor = \"Q5_K_L\"\n\t\t\t}\n\t\t}\n\tcase GGUFFileTypeMostlyQ6_K:\n\t\ttis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName})\n\t\tif v, ok := tis[tokenEmbedWeightTensorName]; ok {\n\t\t\tif v.Type == GGMLTypeQ8_0 {\n\t\t\t\tfileTypeDescriptor = \"Q6_K_L\"\n\t\t\t}\n\t\t}\n\t}\n\n\treturn fileType, fileTypeDescriptor\n}\n\n// GetFileType returns the GGUFFileType represented the mostly GGMLType of the given tensors counter.\n//\n// The input `cm` is a map of GGMLType to the count of tensors of that type.\nfunc GetFileType(cm map[GGMLType]int) GGUFFileType {\n\tif len(cm) == 0 {\n\t\treturn _GGUFFileTypeCount\n\t}\n\n\t// Sort.\n\tts := maps.Keys(cm)\n\tsort.Slice(ts, func(i, j int) bool {\n\t\treturn cm[ts[i]] > cm[ts[j]]\n\t})\n\n\t// Guess.\n\tif ts[0] == GGMLTypeF32 {\n\t\tif len(ts) == 1 {\n\t\t\treturn GGUFFileTypeMostlyF32\n\t\t}\n\t\tts[0] = ts[1]\n\t}\n\tswitch ts[0] {\n\tcase GGMLTypeF16:\n\t\treturn GGUFFileTypeMostlyF16\n\tcase GGMLTypeQ4_0:\n\t\treturn GGUFFileTypeMostlyQ4_0\n\tcase GGMLTypeQ4_1:\n\t\treturn GGUFFileTypeMostlyQ4_1\n\tcase GGMLTypeQ4_2:\n\t\treturn GGUFFileTypeMostlyQ4_2\n\tcase GGMLTypeQ4_3:\n\t\treturn GGUFFileTypeMostlyQ4_3\n\tcase GGMLTypeQ5_0:\n\t\treturn GGUFFileTypeMostlyQ5_0\n\tcase GGMLTypeQ5_1:\n\t\treturn GGUFFileTypeMostlyQ5_1\n\tcase GGMLTypeQ8_0:\n\t\treturn GGUFFileTypeMostlyQ8_0\n\tcase GGMLTypeQ2_K:\n\t\tif ts[len(ts)-1] == GGMLTypeQ5_K 
{\n\t\t\treturn GGUFFileTypeMostlyQ2_K_S\n\t\t}\n\t\treturn GGUFFileTypeMostlyQ2_K\n\tcase GGMLTypeQ3_K:\n\t\tif cm[GGMLTypeQ8_0] > 0 ||\n\t\t\t(cm[GGMLTypeQ5_K] > 1 && cm[GGMLTypeQ4_K] == 0) {\n\t\t\treturn GGUFFileTypeMostlyQ3_K_L\n\t\t}\n\t\tif cm[GGMLTypeQ4_K] > 1 {\n\t\t\treturn GGUFFileTypeMostlyQ3_K_M\n\t\t}\n\t\treturn GGUFFileTypeMostlyQ3_K_S\n\tcase GGMLTypeQ4_K:\n\t\tif cm[GGMLTypeQ6_K] > 1 {\n\t\t\treturn GGUFFileTypeMostlyQ4_K_M\n\t\t}\n\t\tif cm[GGMLTypeQ3_K] > 1 {\n\t\t\treturn GGUFFileTypeMostlyQ3_K_M\n\t\t}\n\t\treturn GGUFFileTypeMostlyQ4_K_S\n\tcase GGMLTypeQ5_K:\n\t\tif cm[GGMLTypeQ6_K] > 1 {\n\t\t\treturn GGUFFileTypeMostlyQ5_K_M\n\t\t}\n\t\treturn GGUFFileTypeMostlyQ5_K_S\n\tcase GGMLTypeQ6_K:\n\t\treturn GGUFFileTypeMostlyQ6_K\n\tcase GGMLTypeIQ2_XXS:\n\t\treturn GGUFFileTypeMostlyIQ2_XXS\n\tcase GGMLTypeIQ2_XS:\n\t\tif cm[GGMLTypeIQ4_XS] > 1 {\n\t\t\treturn GGUFFileTypeMostlyIQ2_S\n\t\t}\n\t\treturn GGUFFileTypeMostlyIQ2_XS\n\tcase GGMLTypeIQ2_S:\n\t\treturn GGUFFileTypeMostlyIQ2_M\n\tcase GGMLTypeIQ3_XXS:\n\t\treturn GGUFFileTypeMostlyIQ3_XXS\n\tcase GGMLTypeIQ3_S:\n\t\tif cm[GGMLTypeIQ3_XXS] > 1 {\n\t\t\treturn GGUFFileTypeMostlyIQ3_XS\n\t\t}\n\t\treturn GGUFFileTypeMostlyIQ3_S\n\tcase GGMLTypeIQ1_S:\n\t\treturn GGUFFileTypeMostlyIQ1_S\n\tcase GGMLTypeIQ4_NL:\n\t\treturn GGUFFileTypeMostlyIQ4_NL\n\tcase GGMLTypeIQ4_XS:\n\t\treturn GGUFFileTypeMostlyIQ4_XS\n\tcase GGMLTypeIQ1_M:\n\t\treturn GGUFFileTypeMostlyIQ1_M\n\tcase GGMLTypeBF16:\n\t\treturn GGUFFileTypeMostlyBF16\n\tcase GGMLTypeQ4_0_4_4:\n\t\treturn GGUFFileTypeMostlyQ4_0_4_4\n\tcase GGMLTypeQ4_0_4_8:\n\t\treturn GGUFFileTypeMostlyQ4_0_4_8\n\tcase GGMLTypeQ4_0_8_8:\n\t\treturn GGUFFileTypeMostlyQ4_0_8_8\n\tcase GGMLTypeTQ1_0:\n\t\treturn GGUFFileTypeMostlyTQ1_0\n\tcase GGMLTypeTQ2_0:\n\t\treturn GGUFFileTypeMostlyTQ2_0\n\tcase GGMLTypeMXFP4:\n\t\treturn GGUFFileTypeMostlyMXFP4\n\tdefault:\n\t}\n\treturn _GGUFFileTypeCount\n}\n"
  },
  {
    "path": "file_metadata_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestGGUFFile_Metadata(t *testing.T) {\n\tctx := context.Background()\n\n\tf, err := ParseGGUFFileFromHuggingFace(\n\t\tctx,\n\t\t\"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\",\n\t\t\"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf\",\n\t\tSkipLargeMetadata())\n\tif err != nil {\n\t\tt.Fatal(err)\n\t\treturn\n\t}\n\n\tt.Log(\"\\n\", spew.Sdump(f.Metadata()), \"\\n\")\n}\n\nfunc BenchmarkGGUFFile_Metadata(b *testing.B) {\n\tmp, ok := os.LookupEnv(\"TEST_MODEL_PATH\")\n\tif !ok {\n\t\tb.Skip(\"TEST_MODEL_PATH is not set\")\n\t\treturn\n\t}\n\n\tf, err := ParseGGUFFile(mp, UseMMap(), SkipLargeMetadata())\n\tif err != nil {\n\t\tb.Fatal(err)\n\t\treturn\n\t}\n\n\tb.ReportAllocs()\n\n\tb.ResetTimer()\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f.Metadata()\n\t}\n}\n\nfunc TestGGUFFile_extractFileType(t *testing.T) {\n\tctx := context.Background()\n\n\trepo := \"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\"\n\tcases := []string{\n\t\t\"Q2_K\",\n\t\t\"Q3_K_L\",\n\t\t\"Q3_K_M\",\n\t\t\"Q3_K_S\",\n\t\t\"Q4_0\",\n\t\t\"Q4_K_M\",\n\t\t\"Q4_K_S\",\n\t\t\"Q5_0\",\n\t\t\"Q5_K_M\",\n\t\t\"Q5_K_S\",\n\t\t\"Q6_K\",\n\t\t\"Q8_0\",\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(repo+\"/\"+tc, func(t *testing.T) {\n\t\t\tgf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\tctx,\n\t\t\t\trepo,\n\t\t\t\tfmt.Sprintf(\"Hermes-2-Pro-Mistral-7B.%s.gguf\", tc))\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tmd := gf.Metadata()\n\t\t\tft, ftd := gf.extractFileType(md.Architecture)\n\t\t\tassert.Equal(t, md.FileType.String(), ft.String(), tc+\" file type should be equal\")\n\t\t\tassert.Equal(t, tc, ftd, tc+\" file type descriptor should be equal\")\n\t\t})\n\t}\n\n\t// Ignore unsupported cases for 
https://huggingface.co/Mungert/Qwen2.5-VL-3B-Instruct-GGUF/commit/42f8e463b233df7575f1e1e9a83cb5936db56d2a.\n\trepo = \"Mungert/Qwen2.5-VL-3B-Instruct-GGUF\"\n\tcases = []string{\n\t\t\"IQ2_M\",\n\t\t\"IQ2_S\",\n\t\t\"IQ2_XS\",\n\t\t\"IQ2_XXS\",\n\t\t\"IQ3_M\",\n\t\t\"IQ3_S\",\n\t\t\"IQ3_XS\",\n\t\t\"IQ3_XXS\",\n\t\t\"IQ4_NL\",\n\t\t\"IQ4_XS\",\n\t\t// \"Q2_K_L\",\n\t\t\"Q2_K_S\",\n\t\t// \"Q3_K_L\",\n\t\t\"Q3_K_M\",\n\t\t\"Q3_K_S\",\n\t\t\"Q4_0\",\n\t\t// \"Q4_0_L\",\n\t\t\"Q4_1\",\n\t\t// \"Q4_1_L\",\n\t\t// \"Q4_K_L\",\n\t\t\"Q4_K_M\",\n\t\t\"Q4_K_S\",\n\t\t\"Q5_0\",\n\t\t// \"Q5_0_L\",\n\t\t// \"Q5_K_L\",\n\t\t\"Q5_K_M\",\n\t\t\"Q5_K_S\",\n\t\t// \"Q6_K_L\",\n\t\t// \"Q6_K_M\", == \"Q6_K\"\n\t\t\"Q8_0\",\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(repo+\"/\"+tc, func(t *testing.T) {\n\t\t\tgf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\tctx,\n\t\t\t\trepo,\n\t\t\t\tfmt.Sprintf(\"Qwen2.5-VL-3B-Instruct-%s.gguf\", strings.ToLower(tc)))\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tmd := gf.Metadata()\n\t\t\tft, ftd := gf.extractFileType(md.Architecture)\n\t\t\tassert.Equal(t, md.FileType.String(), ft.String(), tc+\" file type should be equal\")\n\t\t\tassert.Equal(t, tc, ftd, tc+\" file type descriptor should be equal\")\n\t\t})\n\t}\n\n\trepo = \"unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF\"\n\tcases = []string{\n\t\t\"BF16\",\n\t\t\"Q2_K\",\n\t\t\"Q2_K_L\",\n\t\t\"Q3_K_M\",\n\t\t\"Q4_K_M\",\n\t\t\"Q5_K_M\",\n\t\t\"Q6_K\",\n\t\t\"Q8_0\",\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(repo+\"/\"+tc, func(t *testing.T) {\n\t\t\tgf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\tctx,\n\t\t\t\trepo,\n\t\t\t\tfmt.Sprintf(\"DeepSeek-R1-Distill-Qwen-1.5B-%s.gguf\", tc))\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tmd := gf.Metadata()\n\t\t\tft, ftd := gf.extractFileType(md.Architecture)\n\t\t\tassert.Equal(t, md.FileType.String(), ft.String(), tc+\" file type should be equal\")\n\t\t\tassert.Equal(t, tc, 
ftd, tc+\" file type descriptor should be equal\")\n\t\t})\n\t}\n\n\trepo = \"unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF\"\n\tcases = []string{\n\t\t\"IQ1_M\",\n\t\t\"IQ1_S\",\n\t\t\"IQ2_M\",\n\t\t\"IQ2_XXS\",\n\t\t\"IQ3_XXS\",\n\t\t\"IQ4_XS\",\n\t\t// \"Q2_K_XL\" == \"Q2_K_L\"\n\t\t// \"Q3_K_XL\" == \"Q3_K_M\"\n\t\t// \"Q4_K_XL\" == \"Q4_K_M\"\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(repo+\"/\"+tc, func(t *testing.T) {\n\t\t\tgf, err := ParseGGUFFileFromHuggingFace(\n\t\t\t\tctx,\n\t\t\t\trepo,\n\t\t\t\tfmt.Sprintf(\"DeepSeek-R1-Distill-Qwen-1.5B-UD-%s.gguf\", tc))\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tmd := gf.Metadata()\n\t\t\tft, ftd := gf.extractFileType(md.Architecture)\n\t\t\tassert.Equal(t, md.FileType.String(), ft.String(), tc+\" file type should be equal\")\n\t\t\tassert.Equal(t, tc, ftd, tc+\" file type descriptor should be equal\")\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "file_option.go",
    "content": "package gguf_parser\n\nimport (\n\t\"net/url\"\n\t\"path/filepath\"\n\t\"runtime\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n)\n\ntype (\n\t_GGUFReadOptions struct {\n\t\tDebug             bool\n\t\tSkipLargeMetadata bool\n\n\t\t// Local.\n\t\tMMap bool\n\n\t\t// Remote.\n\t\tBearerAuthToken            string\n\t\tHeaders                    map[string]string\n\t\tProxyURL                   *url.URL\n\t\tSkipProxy                  bool\n\t\tSkipTLSVerification        bool\n\t\tSkipDNSCache               bool\n\t\tBufferSize                 int\n\t\tSkipRangeDownloadDetection bool\n\t\tCachePath                  string\n\t\tCacheExpiration            time.Duration\n\t}\n\n\t// GGUFReadOption is the option for reading the file.\n\tGGUFReadOption func(o *_GGUFReadOptions)\n)\n\n// UseDebug uses debug mode to read the file.\nfunc UseDebug() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.Debug = true\n\t}\n}\n\n// SkipLargeMetadata skips reading large GGUFMetadataKV items,\n// which are not necessary for most cases.\nfunc SkipLargeMetadata() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.SkipLargeMetadata = true\n\t}\n}\n\n// UseMMap uses mmap to read the local file.\nfunc UseMMap() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.MMap = true\n\t}\n}\n\n// UseBearerAuth uses the given token as a bearer auth when reading from remote.\nfunc UseBearerAuth(token string) GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.BearerAuthToken = token\n\t}\n}\n\n// UseHeaders uses the given headers when reading from remote.\nfunc UseHeaders(headers map[string]string) GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.Headers = headers\n\t}\n}\n\n// UseProxy uses the given url as a proxy when reading from remote.\nfunc UseProxy(url *url.URL) GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.ProxyURL = url\n\t}\n}\n\n// SkipProxy skips the proxy when 
reading from remote.\nfunc SkipProxy() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.SkipProxy = true\n\t}\n}\n\n// SkipTLSVerification skips the TLS verification when reading from remote.\nfunc SkipTLSVerification() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.SkipTLSVerification = true\n\t}\n}\n\n// SkipDNSCache skips the DNS cache when reading from remote.\nfunc SkipDNSCache() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.SkipDNSCache = true\n\t}\n}\n\n// UseBufferSize sets the buffer size when reading from remote.\nfunc UseBufferSize(size int) GGUFReadOption {\n\tconst minSize = 32 * 1024\n\tif size < minSize {\n\t\tsize = minSize\n\t}\n\treturn func(o *_GGUFReadOptions) {\n\t\to.BufferSize = size\n\t}\n}\n\n// SkipRangeDownloadDetection skips the range download detection when reading from remote.\nfunc SkipRangeDownloadDetection() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.SkipRangeDownloadDetection = true\n\t}\n}\n\n// UseCache caches the remote reading result.\nfunc UseCache() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.CachePath = DefaultCachePath()\n\t\to.CacheExpiration = 24 * time.Hour\n\t}\n}\n\n// SkipCache skips the cache when reading from remote.\nfunc SkipCache() GGUFReadOption {\n\treturn func(o *_GGUFReadOptions) {\n\t\to.CachePath = \"\"\n\t\to.CacheExpiration = 0\n\t}\n}\n\n// DefaultCachePath returns the default cache path.\nfunc DefaultCachePath() string {\n\tcd := filepath.Join(osx.UserHomeDir(), \".cache\")\n\tif runtime.GOOS == \"windows\" {\n\t\tcd = osx.Getenv(\"APPDATA\", cd)\n\t}\n\treturn filepath.Join(cd, \"gguf-parser\")\n}\n\n// UseCachePath uses the given path to cache the remote reading result.\nfunc UseCachePath(path string) GGUFReadOption {\n\tpath = strings.TrimSpace(filepath.Clean(osx.InlineTilde(path)))\n\treturn func(o *_GGUFReadOptions) {\n\t\tif path == \"\" {\n\t\t\treturn\n\t\t}\n\t\to.CachePath = path\n\t}\n}\n\n// UseCacheExpiration 
uses the given expiration to cache the remote reading result.\n//\n// Disable cache expiration by setting it to 0.\nfunc UseCacheExpiration(expiration time.Duration) GGUFReadOption {\n\tif expiration < 0 {\n\t\texpiration = 0\n\t}\n\treturn func(o *_GGUFReadOptions) {\n\t\to.CacheExpiration = expiration\n\t}\n}\n"
  },
  {
    "path": "file_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"bytes\"\n\t\"context\"\n\t\"encoding/binary\"\n\t\"os\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestParseGGUFFile(t *testing.T) {\n\tmp, ok := os.LookupEnv(\"TEST_MODEL_PATH\")\n\tif !ok {\n\t\tt.Skip(\"TEST_MODEL_PATH is not set\")\n\t\treturn\n\t}\n\n\t// Slow read.\n\t{\n\t\tf, err := ParseGGUFFile(mp)\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t\treturn\n\t\t}\n\t\ts := spew.ConfigState{\n\t\t\tIndent:   \"  \",\n\t\t\tMaxDepth: 5, // Avoid console overflow.\n\t\t}\n\t\tt.Log(\"\\n\", s.Sdump(f), \"\\n\")\n\t}\n\n\t// Fast read.\n\t{\n\t\tf, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap())\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t\treturn\n\t\t}\n\t\tt.Log(\"\\n\", spew.Sdump(f), \"\\n\")\n\t}\n}\n\nfunc BenchmarkParseGGUFFileMMap(b *testing.B) {\n\tmp, ok := os.LookupEnv(\"TEST_MODEL_PATH\")\n\tif !ok {\n\t\tb.Skip(\"TEST_MODEL_PATH is not set\")\n\t\treturn\n\t}\n\n\tb.ReportAllocs()\n\n\tb.ResetTimer()\n\tb.Run(\"Normal\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := ParseGGUFFile(mp)\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n\n\tb.ResetTimer()\n\tb.Run(\"UseMMap\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := ParseGGUFFile(mp, UseMMap())\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n}\n\nfunc BenchmarkParseGGUFFileSkipLargeMetadata(b *testing.B) {\n\tmp, ok := os.LookupEnv(\"TEST_MODEL_PATH\")\n\tif !ok {\n\t\tb.Skip(\"TEST_MODEL_PATH is not set\")\n\t\treturn\n\t}\n\n\tb.ReportAllocs()\n\n\tb.ResetTimer()\n\tb.Run(\"Normal\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := ParseGGUFFile(mp, UseMMap())\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n\n\tb.ResetTimer()\n\tb.Run(\"SkipLargeMetadata\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := 
ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap())\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n}\n\nfunc TestParseGGUFFileRemote(t *testing.T) {\n\tconst u = \"https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF\" +\n\t\t\"/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf\"\n\n\tctx := context.Background()\n\n\t// Slow read.\n\t{\n\t\tf, err := ParseGGUFFileRemote(ctx, u, UseDebug())\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t\treturn\n\t\t}\n\t\ts := spew.ConfigState{\n\t\t\tIndent:   \"  \",\n\t\t\tMaxDepth: 5, // Avoid console overflow.\n\t\t}\n\t\tt.Log(\"\\n\", s.Sdump(f), \"\\n\")\n\t}\n\n\t// Fast read.\n\t{\n\t\tf, err := ParseGGUFFileRemote(ctx, u, UseDebug(), SkipLargeMetadata())\n\t\tif err != nil {\n\t\t\tt.Fatal(err)\n\t\t\treturn\n\t\t}\n\t\tt.Log(\"\\n\", spew.Sdump(f), \"\\n\")\n\t}\n}\n\nfunc BenchmarkParseGGUFFileRemoteWithBufferSize(b *testing.B) {\n\tconst u = \"https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF\" +\n\t\t\"/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf\"\n\n\tctx := context.Background()\n\n\tb.ReportAllocs()\n\n\tb.ResetTimer()\n\tb.Run(\"256KibBuffer\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(256*1024))\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n\n\tb.ResetTimer()\n\tb.Run(\"1MibBuffer\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(1024*1024))\n\t\t\tif err != nil {\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n\n\tb.ResetTimer()\n\tb.Run(\"4MibBuffer\", func(b *testing.B) {\n\t\tfor i := 0; i < b.N; i++ {\n\t\t\t_, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(4*1024*1024))\n\t\t\tif err != nil 
{\n\t\t\t\tb.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t}\n\t})\n}\n\nfunc TestParseGGUFFileFromHuggingFace(t *testing.T) {\n\tctx := context.Background()\n\n\tcases := [][2]string{\n\t\t{\n\t\t\t\"TheBloke/Llama-2-13B-chat-GGUF\",\n\t\t\t\"llama-2-13b-chat.Q8_0.gguf\",\n\t\t},\n\t\t{\n\t\t\t\"lmstudio-community/Yi-1.5-9B-Chat-GGUF\",\n\t\t\t\"Yi-1.5-9B-Chat-Q5_K_M.gguf\",\n\t\t},\n\t\t{\n\t\t\t\"bartowski/gemma-2-9b-it-GGUF\",\n\t\t\t\"gemma-2-9b-it-Q3_K_M.gguf\",\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc[0]+\"/\"+tc[1], func(t *testing.T) {\n\t\t\tf, err := ParseGGUFFileFromHuggingFace(ctx, tc[0], tc[1], SkipLargeMetadata())\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tt.Log(\"\\n\", spew.Sdump(f), \"\\n\")\n\t\t})\n\t}\n}\n\nfunc TestParseGGUFFileFromModelScope(t *testing.T) {\n\tctx := context.Background()\n\n\tcases := [][2]string{\n\t\t{\n\t\t\t\"qwen/Qwen1.5-0.5B-Chat-GGUF\",\n\t\t\t\"qwen1_5-0_5b-chat-q5_k_m.gguf\",\n\t\t},\n\t\t{\n\t\t\t\"HIT-SCIR/huozi3-gguf\",\n\t\t\t\"huozi3-q2_k.gguf\",\n\t\t},\n\t\t{\n\t\t\t\"shaowenchen/chinese-alpaca-2-13b-16k-gguf\",\n\t\t\t\"chinese-alpaca-2-13b-16k.Q5_K.gguf\",\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc[0]+\"/\"+tc[1], func(t *testing.T) {\n\t\t\tf, err := ParseGGUFFileFromModelScope(ctx, tc[0], tc[1], SkipLargeMetadata())\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tt.Log(\"\\n\", spew.Sdump(f), \"\\n\")\n\t\t})\n\t}\n}\n\nfunc TestParseGGUFFileFromOllama(t *testing.T) {\n\tctx := context.Background()\n\n\tcases := []string{\n\t\t\"gemma2\",\n\t\t\"llama3.1\",\n\t\t\"qwen2:72b-instruct-q3_K_M\",\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc, func(t *testing.T) {\n\t\t\tstart := time.Now()\n\t\t\tf, err := ParseGGUFFileFromOllama(ctx, tc, SkipLargeMetadata())\n\t\t\tif err != nil {\n\t\t\t\tt.Fatal(err)\n\t\t\t\treturn\n\t\t\t}\n\t\t\tt.Logf(\"cost: %v\\n\", time.Since(start))\n\t\t\tt.Log(\"\\n\", spew.Sdump(f), 
\"\\n\")\n\t\t})\n\t}\n}\n\n// FuzzParseGGUFFile writes the fuzz input to a temp file and calls ParseGGUFFile.\n// Any panic during parsing will be reported by the fuzzing harness.\nfunc FuzzParseGGUFFile(f *testing.F) {\n\tbuf := new(bytes.Buffer)\n\tbo := binary.LittleEndian\n\n\tfor _, v := range []GGUFMagic{GGUFMagicGGML, GGUFMagicGGMF, GGUFMagicGGJT, GGUFMagicGGUFLe, GGUFMagicGGUFBe} {\n\t\t_ = binary.Write(buf, bo, uint32(v))\n\t\tf.Add(buf.Bytes())\n\t\tbuf.Reset()\n\t}\n\n\tf.Fuzz(func(t *testing.T, data []byte) {\n\t\ttmp, err := os.CreateTemp(\"\", \"gguf_fuzz_*.gguf\")\n\t\tif err != nil {\n\t\t\tt.Fatalf(\"create tmp: %v\", err)\n\t\t}\n\t\tdefer os.Remove(tmp.Name())\n\n\t\tif _, err := tmp.Write(data); err != nil {\n\t\t\tt.Fatalf(\"write tmp: %v\", err)\n\t\t}\n\t\tif err := tmp.Close(); err != nil {\n\t\t\tt.Fatalf(\"close tmp: %v\", err)\n\t\t}\n\n\t\t// Call the public ParseGGUFFile which exercises parseGGUFFile.\n\t\t_, _ = ParseGGUFFile(tmp.Name())\n\t})\n}\n\nfunc TestParseGGUFFileWithFuzzInput(t *testing.T) {\n\t// Use the fuzz-generated data\n\t// data := []byte(\"GGUF\\x00\\x00\\x00\\x030000000000000000\")\n\tdata := []byte(\"FUGG\\x00\\x00\\x00\\x00GG>?\\x00\\x00\\x00\\x000000\")\n\n\t// Create temp file\n\ttmpFile, err := os.CreateTemp(\"\", \"fuzz_test_gguf_*.gguf\")\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\tdefer os.Remove(tmpFile.Name())\n\n\t_, err = tmpFile.Write(data)\n\tif err != nil {\n\t\tt.Fatal(err)\n\t}\n\ttmpFile.Close()\n\n\t// Parse should return error (since it's invalid or triggers the check)\n\t_, err = ParseGGUFFile(tmpFile.Name())\n\tif err == nil {\n\t\tt.Error(\"expected error for fuzz-generated data\")\n\t} else {\n\t\tt.Logf(\"got expected error: %v\", err)\n\t}\n}\n"
  },
  {
    "path": "file_tokenizer.go",
    "content": "package gguf_parser\n\n// GGUFTokenizer represents the tokenizer metadata of a GGUF file.\ntype GGUFTokenizer struct {\n\t/* Basic */\n\n\t// Model is the model of the tokenizer.\n\tModel string `json:\"model\"`\n\t// TokensLength is the size of tokens.\n\tTokensLength uint64 `json:\"tokensLength\"`\n\t// MergeLength is the size of merges.\n\tMergesLength uint64 `json:\"mergesLength\"`\n\t// AddedTokensLength is the size of added tokens after training.\n\tAddedTokensLength uint64 `json:\"addedTokenLength\"`\n\t// BOSTokenID is the ID of the beginning of sentence token.\n\t//\n\t// Use -1 if the token is not found.\n\tBOSTokenID int64 `json:\"bosTokenID\"`\n\t// EOSTokenID is the ID of the end of sentence token.\n\t//\n\t// Use -1 if the token is not found.\n\tEOSTokenID int64 `json:\"eosTokenID\"`\n\t// EOTTokenID is the ID of the end of text token.\n\t//\n\t// Use -1 if the token is not found.\n\tEOTTokenID int64 `json:\"eotTokenID\"`\n\t// EOMTokenID is the ID of the end of message token.\n\t//\n\t// Use -1 if the token is not found.\n\tEOMTokenID int64 `json:\"eomTokenID\"`\n\t// UnknownTokenID is the ID of the unknown token.\n\t//\n\t// Use -1 if the token is not found.\n\tUnknownTokenID int64 `json:\"unknownTokenID\"`\n\t// SeparatorTokenID is the ID of the separator token.\n\t//\n\t// Use -1 if the token is not found.\n\tSeparatorTokenID int64 `json:\"separatorTokenID\"`\n\t// PaddingTokenID is the ID of the padding token.\n\t//\n\t// Use -1 if the token is not found.\n\tPaddingTokenID int64 `json:\"paddingTokenID\"`\n\n\t/* Appendix */\n\n\t// TokenSize is the size of tokens in bytes.\n\tTokensSize int64 `json:\"tokensSize\"`\n\t// MergesSize is the size of merges in bytes.\n\tMergesSize int64 `json:\"mergesSize\"`\n}\n\n// Tokenizer returns the tokenizer metadata of a GGUF file.\nfunc (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer) {\n\tconst (\n\t\tmodelKey            = \"tokenizer.ggml.model\"\n\t\ttokensKey           = 
\"tokenizer.ggml.tokens\"\n\t\tmergesKey           = \"tokenizer.ggml.merges\"\n\t\taddedTokensKey      = \"tokenizer.ggml.added_tokens\"\n\t\tbosTokenIDKey       = \"tokenizer.ggml.bos_token_id\"\n\t\teosTokenIDKey       = \"tokenizer.ggml.eos_token_id\"\n\t\teotTokenIDKey       = \"tokenizer.ggml.eot_token_id\"\n\t\teomTokenIDKey       = \"tokenizer.ggml.eom_token_id\"\n\t\tunknownTokenIDKey   = \"tokenizer.ggml.unknown_token_id\"\n\t\tseparatorTokenIDKey = \"tokenizer.ggml.separator_token_id\"\n\t\tpaddingTokenIDKey   = \"tokenizer.ggml.padding_token_id\"\n\t)\n\n\tm, _ := gf.Header.MetadataKV.Index([]string{\n\t\tmodelKey,\n\t\ttokensKey,\n\t\tmergesKey,\n\t\taddedTokensKey,\n\t\tbosTokenIDKey,\n\t\teosTokenIDKey,\n\t\teotTokenIDKey,\n\t\teomTokenIDKey,\n\t\tunknownTokenIDKey,\n\t\tseparatorTokenIDKey,\n\t\tpaddingTokenIDKey,\n\t})\n\n\tgt.BOSTokenID = -1\n\tgt.EOSTokenID = -1\n\tgt.EOTTokenID = -1\n\tgt.EOMTokenID = -1\n\tgt.UnknownTokenID = -1\n\tgt.SeparatorTokenID = -1\n\tgt.PaddingTokenID = -1\n\n\tif v, ok := m[modelKey]; ok {\n\t\tgt.Model = v.ValueString()\n\t}\n\tif v, ok := m[tokensKey]; ok {\n\t\tarr := v.ValueArray()\n\t\tgt.TokensLength = arr.Len\n\t\tgt.TokensSize = arr.Size\n\t}\n\tif v, ok := m[mergesKey]; ok {\n\t\tarr := v.ValueArray()\n\t\tgt.MergesLength = arr.Len\n\t\tgt.MergesSize = arr.Size\n\t}\n\tif v, ok := m[addedTokensKey]; ok {\n\t\tgt.AddedTokensLength = v.ValueArray().Len\n\t}\n\tif v, ok := m[bosTokenIDKey]; ok {\n\t\tgt.BOSTokenID = ValueNumeric[int64](v)\n\t}\n\tif v, ok := m[eosTokenIDKey]; ok {\n\t\tgt.EOSTokenID = ValueNumeric[int64](v)\n\t}\n\tif v, ok := m[eotTokenIDKey]; ok {\n\t\tgt.EOTTokenID = ValueNumeric[int64](v)\n\t}\n\tif v, ok := m[eomTokenIDKey]; ok {\n\t\tgt.EOMTokenID = ValueNumeric[int64](v)\n\t}\n\tif v, ok := m[unknownTokenIDKey]; ok {\n\t\tgt.UnknownTokenID = ValueNumeric[int64](v)\n\t}\n\tif v, ok := m[separatorTokenIDKey]; ok {\n\t\tgt.SeparatorTokenID = ValueNumeric[int64](v)\n\t}\n\tif v, ok := 
m[paddingTokenIDKey]; ok {\n\t\tgt.PaddingTokenID = ValueNumeric[int64](v)\n\t}\n\n\treturn gt\n}\n"
  },
  {
    "path": "file_tokenizer_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_Tokenizer(t *testing.T) {\n\tctx := context.Background()\n\n\tf, err := ParseGGUFFileFromHuggingFace(\n\t\tctx,\n\t\t\"NousResearch/Hermes-2-Pro-Mistral-7B-GGUF\",\n\t\t\"Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf\",\n\t\tSkipLargeMetadata())\n\tif err != nil {\n\t\tt.Fatal(err)\n\t\treturn\n\t}\n\n\tt.Log(\"\\n\", spew.Sdump(f.Tokenizer()), \"\\n\")\n}\n\nfunc BenchmarkGGUFFile_Tokenizer(b *testing.B) {\n\tmp, ok := os.LookupEnv(\"TEST_MODEL_PATH\")\n\tif !ok {\n\t\tb.Skip(\"TEST_MODEL_PATH is not set\")\n\t\treturn\n\t}\n\n\tf, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap())\n\tif err != nil {\n\t\tb.Fatal(err)\n\t\treturn\n\t}\n\n\tb.ReportAllocs()\n\n\tb.ResetTimer()\n\tfor i := 0; i < b.N; i++ {\n\t\t_ = f.Tokenizer()\n\t}\n}\n"
  },
  {
    "path": "filename.go",
    "content": "package gguf_parser\n\nimport (\n\t\"fmt\"\n\t\"regexp\"\n\t\"strconv\"\n\t\"strings\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/funcx\"\n\t\"github.com/gpustack/gguf-parser-go/util/ptr\"\n)\n\n// GGUFFilename represents a GGUF filename,\n// see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention.\ntype GGUFFilename struct {\n\tBaseName   string `json:\"baseName\"`\n\tSizeLabel  string `json:\"sizeLabel\"`\n\tFineTune   string `json:\"fineTune\"`\n\tVersion    string `json:\"version\"`\n\tEncoding   string `json:\"encoding\"`\n\tType       string `json:\"type\"`\n\tShard      *int   `json:\"shard,omitempty\"`\n\tShardTotal *int   `json:\"shardTotal,omitempty\"`\n}\n\nvar GGUFFilenameRegex = regexp.MustCompile(`^(?P<BaseName>[A-Za-z\\s][A-Za-z0-9._\\s]*(?:(?:-(?:(?:[A-Za-z\\s][A-Za-z0-9._\\s]*)|(?:[0-9._\\s]*)))*))-(?:(?P<SizeLabel>(?:\\d+x)?(?:\\d+\\.)?\\d+[A-Za-z](?:-[A-Za-z]+(\\d+\\.)?\\d+[A-Za-z]+)?)(?:-(?P<FineTune>[A-Za-z][A-Za-z0-9\\s_-]+[A-Za-z](?i:[^BFKIQ])))?)?(?:-(?P<Version>[vV]\\d+(?:\\.\\d+)*))?(?i:-(?P<Encoding>(BF16|F32|F16|([KI]?Q[0-9][A-Z0-9_]*))))?(?:-(?P<Type>LoRA|vocab))?(?:-(?P<Shard>\\d{5})-of-(?P<ShardTotal>\\d{5}))?\\.gguf$`) // nolint:lll\n\n// ParseGGUFFilename parses the given GGUF filename string,\n// and returns the GGUFFilename, or nil if the filename is invalid.\nfunc ParseGGUFFilename(name string) *GGUFFilename {\n\tn := name\n\tif !strings.HasSuffix(n, \".gguf\") {\n\t\tn += \".gguf\"\n\t}\n\n\tm := make(map[string]string)\n\t{\n\t\tr := GGUFFilenameRegex.FindStringSubmatch(n)\n\t\tfor i, ne := range GGUFFilenameRegex.SubexpNames() {\n\t\t\tif i != 0 && i <= len(r) {\n\t\t\t\tm[ne] = r[i]\n\t\t\t}\n\t\t}\n\t}\n\tif m[\"BaseName\"] == \"\" {\n\t\treturn nil\n\t}\n\n\tvar gn GGUFFilename\n\tgn.BaseName = strings.ReplaceAll(m[\"BaseName\"], \"-\", \" \")\n\tgn.SizeLabel = m[\"SizeLabel\"]\n\tgn.FineTune = m[\"FineTune\"]\n\tgn.Version = m[\"Version\"]\n\tgn.Encoding = 
m[\"Encoding\"]\n\tgn.Type = m[\"Type\"]\n\tif v := m[\"Shard\"]; v != \"\" {\n\t\tgn.Shard = ptr.To(parseInt(v))\n\t}\n\tif v := m[\"ShardTotal\"]; v != \"\" {\n\t\tgn.ShardTotal = ptr.To(parseInt(v))\n\t}\n\treturn &gn\n}\n\nfunc (gn GGUFFilename) String() string {\n\tif gn.BaseName == \"\" {\n\t\treturn \"\"\n\t}\n\n\tvar sb strings.Builder\n\tsb.WriteString(strings.ReplaceAll(gn.BaseName, \" \", \"-\"))\n\tif gn.SizeLabel != \"\" {\n\t\tsb.WriteString(\"-\")\n\t\tsb.WriteString(gn.SizeLabel)\n\t}\n\tif gn.FineTune != \"\" {\n\t\tsb.WriteString(\"-\")\n\t\tsb.WriteString(gn.FineTune)\n\t}\n\tif gn.Version != \"\" {\n\t\tsb.WriteString(\"-\")\n\t\tsb.WriteString(gn.Version)\n\t}\n\tif gn.Encoding != \"\" {\n\t\tsb.WriteString(\"-\")\n\t\tsb.WriteString(gn.Encoding)\n\t}\n\tif gn.Type != \"\" {\n\t\tsb.WriteString(\"-\")\n\t\tsb.WriteString(gn.Type)\n\t}\n\tif m, n := ptr.Deref(gn.Shard, 0), ptr.Deref(gn.ShardTotal, 0); m > 0 && n > 0 {\n\t\tsb.WriteString(\"-\")\n\t\tsb.WriteString(fmt.Sprintf(\"%05d\", m))\n\t\tsb.WriteString(\"-of-\")\n\t\tsb.WriteString(fmt.Sprintf(\"%05d\", n))\n\t}\n\tsb.WriteString(\".gguf\")\n\treturn sb.String()\n}\n\n// IsShard returns true if the GGUF filename is a shard.\nfunc (gn GGUFFilename) IsShard() bool {\n\treturn ptr.Deref(gn.Shard, 0) > 0 && ptr.Deref(gn.ShardTotal, 0) > 0\n}\n\nvar ShardGGUFFilenameRegex = regexp.MustCompile(`^(?P<Prefix>.*)-(?:(?P<Shard>\\d{5})-of-(?P<ShardTotal>\\d{5}))\\.gguf$`)\n\n// IsShardGGUFFilename returns true if the given filename is a shard GGUF filename.\nfunc IsShardGGUFFilename(name string) bool {\n\tn := name\n\tif !strings.HasSuffix(n, \".gguf\") {\n\t\tn += \".gguf\"\n\t}\n\n\tm := make(map[string]string)\n\t{\n\t\tr := ShardGGUFFilenameRegex.FindStringSubmatch(n)\n\t\tfor i, ne := range ShardGGUFFilenameRegex.SubexpNames() {\n\t\t\tif i != 0 && i <= len(r) {\n\t\t\t\tm[ne] = r[i]\n\t\t\t}\n\t\t}\n\t}\n\n\tvar shard, shardTotal int\n\tif v := m[\"Shard\"]; v != \"\" {\n\t\tshard = 
parseInt(v)\n\t}\n\tif v := m[\"ShardTotal\"]; v != \"\" {\n\t\tshardTotal = parseInt(v)\n\t}\n\treturn shard > 0 && shardTotal > 0\n}\n\n// CompleteShardGGUFFilename returns the list of shard GGUF filenames that are related to the given shard GGUF filename.\n//\n// Only available if the given filename is a shard GGUF filename.\nfunc CompleteShardGGUFFilename(name string) []string {\n\tn := name\n\tif !strings.HasSuffix(n, \".gguf\") {\n\t\tn += \".gguf\"\n\t}\n\n\tm := make(map[string]string)\n\t{\n\t\tr := ShardGGUFFilenameRegex.FindStringSubmatch(n)\n\t\tfor i, ne := range ShardGGUFFilenameRegex.SubexpNames() {\n\t\t\tif i != 0 && i <= len(r) {\n\t\t\t\tm[ne] = r[i]\n\t\t\t}\n\t\t}\n\t}\n\n\tvar shard, shardTotal int\n\tif v := m[\"Shard\"]; v != \"\" {\n\t\tshard = parseInt(v)\n\t}\n\tif v := m[\"ShardTotal\"]; v != \"\" {\n\t\tshardTotal = parseInt(v)\n\t}\n\n\tif shard <= 0 || shardTotal <= 0 {\n\t\treturn nil\n\t}\n\n\tnames := make([]string, 0, shardTotal)\n\tfor i := 1; i <= shardTotal; i++ {\n\t\tnames = append(names, fmt.Sprintf(\"%s-%05d-of-%05d.gguf\", m[\"Prefix\"], i, shardTotal))\n\t}\n\treturn names\n}\n\nfunc parseInt(v string) int {\n\treturn int(funcx.MustNoError(strconv.ParseInt(v, 10, 64)))\n}\n"
  },
  {
    "path": "filename_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/ptr\"\n)\n\nfunc TestParseGGUFFilename(t *testing.T) {\n\tcases := []struct {\n\t\tgiven    string\n\t\texpected *GGUFFilename\n\t}{\n\t\t{\n\t\t\tgiven: \"Mixtral-8x7B-V0.1-KQ2.gguf\",\n\t\t\texpected: &GGUFFilename{\n\t\t\t\tBaseName:  \"Mixtral\",\n\t\t\t\tSizeLabel: \"8x7B\",\n\t\t\t\tVersion:   \"V0.1\",\n\t\t\t\tEncoding:  \"KQ2\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Grok-100B-v1.0-Q4_0-00003-of-00009.gguf\",\n\t\t\texpected: &GGUFFilename{\n\t\t\t\tBaseName:   \"Grok\",\n\t\t\t\tSizeLabel:  \"100B\",\n\t\t\t\tVersion:    \"v1.0\",\n\t\t\t\tEncoding:   \"Q4_0\",\n\t\t\t\tShard:      ptr.To(3),\n\t\t\t\tShardTotal: ptr.To(9),\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Hermes-2-Pro-Llama-3-8B-F16.gguf\",\n\t\t\texpected: &GGUFFilename{\n\t\t\t\tBaseName:  \"Hermes 2 Pro Llama 3\",\n\t\t\t\tSizeLabel: \"8B\",\n\t\t\t\tEncoding:  \"F16\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf\",\n\t\t\texpected: &GGUFFilename{\n\t\t\t\tBaseName:  \"Phi 3 mini\",\n\t\t\t\tSizeLabel: \"3.8B-ContextLength4k\",\n\t\t\t\tFineTune:  \"instruct\",\n\t\t\t\tVersion:   \"v1.0\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf\",\n\t\t\texpected: &GGUFFilename{\n\t\t\t\tBaseName:   \"Meta Llama 3.1\",\n\t\t\t\tSizeLabel:  \"405B\",\n\t\t\t\tFineTune:   \"Instruct-XelotX\",\n\t\t\t\tEncoding:   \"BF16\",\n\t\t\t\tShard:      ptr.To(1),\n\t\t\t\tShardTotal: ptr.To(18),\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"qwen2-72b-instruct-q6_k-00001-of-00002.gguf\",\n\t\t\texpected: &GGUFFilename{\n\t\t\t\tBaseName:   \"qwen2\",\n\t\t\t\tSizeLabel:  \"72b\",\n\t\t\t\tFineTune:   \"instruct\",\n\t\t\t\tEncoding:   \"q6_k\",\n\t\t\t\tShard:      ptr.To(1),\n\t\t\t\tShardTotal: ptr.To(2),\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven:    
\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf\",\n\t\t\texpected: nil,\n\t\t},\n\t\t{\n\t\t\tgiven:    \"not-a-known-arrangement.gguf\",\n\t\t\texpected: nil,\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual := ParseGGUFFilename(tc.given)\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n\nfunc TestGGUFFilenameString(t *testing.T) {\n\tcases := []struct {\n\t\tgiven    GGUFFilename\n\t\texpected string\n\t}{\n\t\t{\n\t\t\tgiven: GGUFFilename{\n\t\t\t\tBaseName:  \"Mixtral\",\n\t\t\t\tSizeLabel: \"8x7B\",\n\t\t\t\tVersion:   \"v0.1\",\n\t\t\t\tEncoding:  \"KQ2\",\n\t\t\t},\n\t\t\texpected: \"Mixtral-8x7B-v0.1-KQ2.gguf\",\n\t\t},\n\t\t{\n\t\t\tgiven: GGUFFilename{\n\t\t\t\tBaseName:   \"Grok\",\n\t\t\t\tSizeLabel:  \"100B\",\n\t\t\t\tVersion:    \"v1.0\",\n\t\t\t\tEncoding:   \"Q4_0\",\n\t\t\t\tShard:      ptr.To(3),\n\t\t\t\tShardTotal: ptr.To(9),\n\t\t\t},\n\t\t\texpected: \"Grok-100B-v1.0-Q4_0-00003-of-00009.gguf\",\n\t\t},\n\t\t{\n\t\t\tgiven: GGUFFilename{\n\t\t\t\tBaseName:  \"Hermes 2 Pro Llama 3\",\n\t\t\t\tSizeLabel: \"8B\",\n\t\t\t\tEncoding:  \"F16\",\n\t\t\t},\n\t\t\texpected: \"Hermes-2-Pro-Llama-3-8B-F16.gguf\",\n\t\t},\n\t\t{\n\t\t\tgiven: GGUFFilename{\n\t\t\t\tBaseName:  \"Phi 3 mini\",\n\t\t\t\tSizeLabel: \"3.8B-ContextLength4k\",\n\t\t\t\tFineTune:  \"instruct\",\n\t\t\t\tVersion:   \"v1.0\",\n\t\t\t},\n\t\t\texpected: \"Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf\",\n\t\t},\n\t\t{\n\t\t\tgiven:    GGUFFilename{},\n\t\t\texpected: \"\",\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.expected, func(t *testing.T) {\n\t\t\tactual := tc.given.String()\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n\nfunc TestIsShardGGUFFilename(t *testing.T) {\n\tcases := []struct {\n\t\tgiven    string\n\t\texpected bool\n\t}{\n\t\t{\n\t\t\tgiven:    \"qwen2-72b-instruct-q6_k-00001-of-00002.gguf\",\n\t\t\texpected: true,\n\t\t},\n\t\t{\n\t\t\tgiven:    
\"Grok-100B-v1.0-Q4_0-00003-of-00009.gguf\",\n\t\t\texpected: true,\n\t\t},\n\t\t{\n\t\t\tgiven:    \"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf\",\n\t\t\texpected: true,\n\t\t},\n\t\t{\n\t\t\tgiven:    \"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf\",\n\t\t\texpected: true,\n\t\t},\n\t\t{\n\t\t\tgiven:    \"not-a-known-arrangement.gguf\",\n\t\t\texpected: false,\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual := IsShardGGUFFilename(tc.given)\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n\nfunc TestCompleteShardGGUFFilename(t *testing.T) {\n\tcases := []struct {\n\t\tgiven    string\n\t\texpected []string\n\t}{\n\t\t{\n\t\t\tgiven: \"qwen2-72b-instruct-q6_k-00001-of-00002.gguf\",\n\t\t\texpected: []string{\n\t\t\t\t\"qwen2-72b-instruct-q6_k-00001-of-00002.gguf\",\n\t\t\t\t\"qwen2-72b-instruct-q6_k-00002-of-00002.gguf\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Grok-100B-v1.0-Q4_0-00003-of-00009.gguf\",\n\t\t\texpected: []string{\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00001-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00002-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00003-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00004-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00005-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00006-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00007-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00008-of-00009.gguf\",\n\t\t\t\t\"Grok-100B-v1.0-Q4_0-00009-of-00009.gguf\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf\",\n\t\t\texpected: 
[]string{\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00002-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00003-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00004-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00005-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00006-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00007-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00008-of-00009.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00009-of-00009.gguf\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf\",\n\t\t\texpected: []string{\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00002-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00003-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00004-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00005-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00006-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00007-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00008-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00009-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00010-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00011-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00012-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00013-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00014-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00015-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00016-of-00018.gguf\",\n
\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00017-of-00018.gguf\",\n\t\t\t\t\"Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00018-of-00018.gguf\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven:    \"not-a-known-arrangement.gguf\",\n\t\t\texpected: nil,\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual := CompleteShardGGUFFilename(tc.given)\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "gen.go",
    "content": "//go:generate go generate -tags stringer gen.stringer.go\n//go:generate go generate -tags regression gen.regression.go\npackage gguf_parser\n"
  },
  {
    "path": "gen.regression.go",
    "content": "//go:build regression\n\n//go:generate go run -tags regression gen.regression.go\npackage main\n\nimport (\n\t\"fmt\"\n\t\"strconv\"\n\t\"math\"\n\t\"os\"\n\t\"text/template\"\n\t\"bytes\"\n\t\"go/format\"\n\n\t\"gonum.org/v1/gonum/mat\"\n\t\"golang.org/x/exp/maps\"\n\t\"sort\"\n)\n\ntype LinearRegression struct {\n\tIntercept float64\n\tSlope     float64\n}\n\nfunc (lr *LinearRegression) Fit(xs, ys []float64) {\n\tif len(xs) != len(ys) {\n\t\tpanic(\"length of xs and ys must be the same\")\n\t}\n\n\tvar sX, sY, sXY, sXX float64\n\tfor i := 0; i < len(xs); i++ {\n\t\tsX += xs[i]\n\t\tsY += ys[i]\n\t\tsXY += xs[i] * ys[i]\n\t\tsXX += xs[i] * xs[i]\n\t}\n\n\tn := float64(len(xs))\n\td := n*sXX - sX*sX\n\tif d == 0 {\n\t\td = 1\n\t}\n\n\tlr.Slope = (n*sXY - sX*sY) / d\n\tlr.Intercept = (sY*sXX - sX*sXY) / d\n}\n\nfunc (lr *LinearRegression) Predict(x float64) (y float64) {\n\treturn lr.Intercept + lr.Slope*x\n}\n\ntype PolynomialRegression struct {\n\tDegree       int\n\tCoefficients []float64\n}\n\nfunc (pr *PolynomialRegression) Fit(xs, ys []float64) {\n\tsamples := len(xs)\n\tfeats := pr.Degree + 1\n\n\tfeat := mat.NewDense(samples, feats, nil)\n\t{\n\t\tfor i := 0; i < samples; i++ {\n\t\t\tfor j := 0; j < feats; j++ {\n\t\t\t\tfeat.Set(i, j, math.Pow(xs[i], float64(j)))\n\t\t\t}\n\t\t}\n\t\tvar qr mat.QR\n\t\tqr.Factorize(feat)\n\t}\n\tyVec := mat.NewVecDense(samples, ys)\n\n\tvar coef mat.VecDense\n\tif err := coef.SolveVec(feat, yVec); err != nil {\n\t\tpanic(\"failed to solve\")\n\t}\n\n\tpr.Coefficients = coef.RawVector().Data\n}\n\nfunc (pr *PolynomialRegression) Predict(x float64) (y float64) {\n\ty = 0\n\tfor i := 0; i < pr.Degree+1; i++ {\n\t\ty += pr.Coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn\n}\n\nfunc DiffusionModelMemoryUsageRegression(output string) {\n\ttype Regression struct {\n\t\tName                 string\n\t\tLinearRegression     *LinearRegression\n\t\tPolynomialRegression *PolynomialRegression\n\t}\n\n\tconst 
tmplStr = `\npackage gguf_parser\n\nimport \"math\"\n\n{{ range . -}}\n// {{ .Name }} returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc {{ .Name }}(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{ {{ range $i, $c := .PolynomialRegression.Coefficients }}{{ if eq $i 0 }}{{ printf \"%.4f\" $c }}{{ else }}{{ printf \"%.10f\" $c }}{{ end }}, {{ end }} }\n\tdegree := {{ .PolynomialRegression.Degree }}\n\tx := float64(width * height)\n\t\n\t{{ if .LinearRegression -}}\n    if flashAttention {\n\t\tcoefficients = []float64{ {{ printf \"%.5f\" .LinearRegression.Intercept }}, {{ printf \"%.10f\" .LinearRegression.Slope }} }\n\t\tdegree = 1\n    }\n    {{- end }}\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n{{ end }}\n\n`\n\tts := []struct {\n\t\tn     string\n\t\tx2y   map[float64]float64\n\t\tc     map[float64]float64\n\t\tfax2y map[float64]float64\n\t\tfac   map[float64]float64\n\t}{\n\t\t{\n\t\t\tn: \"GuessSD1DiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 49.57 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 559.90 MB(VRAM)    // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8360.93 MB(VRAM)   // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 18681.62 MB(VRAM)  // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 25377.96 MB(VRAM)  // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 41842.65 MB(VRAM)  // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 77333.77 MB(VRAM)  // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   49.57,\n\t\t\t\t512 * 512:   559.90,\n\t\t\t\t1024 * 1024: 8360.93,\n\t\t\t\t1024 * 1536: 
18681.62,\n\t\t\t\t1024 * 1792: 25377.96,\n\t\t\t\t1536 * 1536: 41842.65,\n\t\t\t\t1792 * 1792: 77333.77,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 56879.17 MB(VRAM)  // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 100924.37 MB(VRAM) // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 56879.17,\n\t\t\t\t1792 * 2048: 100924.37,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tn: \"GuessSD2DiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 37.65 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 367.98 MB(VRAM)    // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM)    // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 11769.69 MB(VRAM)  // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 15970.04 MB(VRAM)  // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 26290.73 MB(VRAM)  // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 48521.84 MB(VRAM)  // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   37.65,\n\t\t\t\t512 * 512:   367.98,\n\t\t\t\t1024 * 1024: 830.86,\n\t\t\t\t1024 * 1536: 11769.69,\n\t\t\t\t1024 * 1792: 15970.04,\n\t\t\t\t1536 * 1536: 26290.73,\n\t\t\t\t1792 * 1792: 48521.84,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 35711.24 MB(VRAM) // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 63292.44 MB(VRAM) // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 35711.24,\n\t\t\t\t1792 * 2048: 63292.44,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 34.52 MB(VRAM)   // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 130.48 MB(VRAM)  // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 519.01 
MB(VRAM)  // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 774.69 MB(VRAM)  // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 902.54 MB(VRAM)  // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1158.23 MB(VRAM) // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1573.72 MB(VRAM) // 1792*1792\n\t\t\tfax2y: map[float64]float64{\n\t\t\t\t256 * 256:   34.52,\n\t\t\t\t512 * 512:   130.48,\n\t\t\t\t1024 * 1024: 519.01,\n\t\t\t\t1024 * 1536: 774.69,\n\t\t\t\t1024 * 1792: 902.54,\n\t\t\t\t1536 * 1536: 1158.23,\n\t\t\t\t1792 * 1792: 1573.72,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1349.99 MB(VRAM) // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1797.44 MB(VRAM) // 1792*2048\n\t\t\tfac: map[float64]float64{\n\t\t\t\t1536 * 1792: 1349.99,\n\t\t\t\t1792 * 2048: 1797.44,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tn: \"GuessSDXLDiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.76 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM)    // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM)    // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1701.55 MB(VRAM)   // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2256.90 MB(VRAM)   // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 3607.58 MB(VRAM)   // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 6484.95 MB(VRAM)   // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   60.76,\n\t\t\t\t512 * 512:   132.05,\n\t\t\t\t1024 * 1024: 830.86,\n\t\t\t\t1024 * 1536: 1701.55,\n\t\t\t\t1024 * 1792: 2256.90,\n\t\t\t\t1536 * 1536: 3607.58,\n\t\t\t\t1792 * 1792: 6484.95,\n\t\t\t},\n\t\t\t// [DEBUG] 
ggml_extend.hpp:1031 - unet compute buffer size: 4830.60 MB(VRAM) // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8384.30 MB(VRAM) // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 4830.60,\n\t\t\t\t1792 * 2048: 8384.30,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.13 MB(VRAM)   // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM)  // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 440.86 MB(VRAM)  // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 726.55 MB(VRAM)  // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 874.40 MB(VRAM)  // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1110.08 MB(VRAM) // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1554.33 MB(VRAM) // 1792*1792\n\t\t\tfax2y: map[float64]float64{\n\t\t\t\t256 * 256:   60.13,\n\t\t\t\t512 * 512:   132.05,\n\t\t\t\t1024 * 1024: 440.86,\n\t\t\t\t1024 * 1536: 726.55,\n\t\t\t\t1024 * 1792: 874.40,\n\t\t\t\t1536 * 1536: 1110.08,\n\t\t\t\t1792 * 1792: 1554.33,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1339.35 MB(VRAM) // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1769.30 MB(VRAM) // 1792*2048\n\t\t\tfac: map[float64]float64{\n\t\t\t\t1536 * 1792: 1339.35,\n\t\t\t\t1792 * 2048: 1769.30,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM)    // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 968.43 MB(VRAM)    // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2013.12 MB(VRAM)   // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2679.46 MB(VRAM) 
  // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4300.15 MB(VRAM)   // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 7752.77 MB(VRAM)   // 1792*1792\n\t\t\tn: \"GuessSDXLRefinerDiffusionModelMemoryUsage\",\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   44.57,\n\t\t\t\t512 * 512:   154.40,\n\t\t\t\t1024 * 1024: 968.43,\n\t\t\t\t1024 * 1536: 2013.12,\n\t\t\t\t1024 * 1792: 2679.46,\n\t\t\t\t1536 * 1536: 4300.15,\n\t\t\t\t1792 * 1792: 7752.77,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 5767.67 MB(VRAM)   // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 10031.87 MB(VRAM)  // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 5767.67,\n\t\t\t\t1792 * 2048: 10031.87,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM)   // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM)  // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 596.43 MB(VRAM)  // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 915.12 MB(VRAM)  // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1062.46 MB(VRAM) // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1357.15 MB(VRAM) // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1836.02 MB(VRAM) // 1792*1792\n\t\t\tfax2y: map[float64]float64{\n\t\t\t\t256 * 256:   44.57,\n\t\t\t\t512 * 512:   154.40,\n\t\t\t\t1024 * 1024: 596.43,\n\t\t\t\t1024 * 1536: 915.12,\n\t\t\t\t1024 * 1792: 1062.46,\n\t\t\t\t1536 * 1536: 1357.15,\n\t\t\t\t1792 * 1792: 1836.02,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1578.17 MB(VRAM) // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2014.02 MB(VRAM) // 1792*2048\n\t\t\tfac: 
map[float64]float64{\n\t\t\t\t1536 * 1792: 1578.17,\n\t\t\t\t1792 * 2048: 2014.02,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tn: \"GuessSD3MediumDiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 37.09 MB(VRAM)    // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 169.64 MB(VRAM)   // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 1786.11 MB(VRAM)  // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 3824.36 MB(VRAM)  // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 5131.48 MB(VRAM)  // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 8319.03 MB(VRAM)  // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 15141.18 MB(VRAM) // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   37.09,\n\t\t\t\t512 * 512:   169.64,\n\t\t\t\t1024 * 1024: 1786.11,\n\t\t\t\t1024 * 1536: 3824.36,\n\t\t\t\t1024 * 1792: 5131.48,\n\t\t\t\t1536 * 1536: 8319.03,\n\t\t\t\t1792 * 1792: 15141.18,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 11215.71 MB(VRAM) // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19654.65 MB(VRAM) // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 11215.71,\n\t\t\t\t1792 * 2048: 19654.65,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tn: \"GuessSD35MediumDiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 41.48 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 181.64 MB(VRAM)    // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 1834.11 MB(VRAM)   // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 3896.36 MB(VRAM)   // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 5215.48 MB(VRAM)   // 1024*1792\n\t\t\t// [DEBUG] 
ggml_extend.hpp:1031 - mmdit compute buffer size: 8427.03 MB(VRAM)   // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 15288.18 MB(VRAM)  // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   41.48,\n\t\t\t\t512 * 512:   181.64,\n\t\t\t\t1024 * 1024: 1834.11,\n\t\t\t\t1024 * 1536: 3896.36,\n\t\t\t\t1024 * 1792: 5215.48,\n\t\t\t\t1536 * 1536: 8427.03,\n\t\t\t\t1792 * 1792: 15288.18,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 11341.71 MB(VRAM)  // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19822.65 MB(VRAM)  // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 11341.71,\n\t\t\t\t1792 * 2048: 19822.65,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tn: \"GuessSD35LargeDiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 57.27 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 276.54 MB(VRAM)    // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 2865.44 MB(VRAM)   // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 6109.95 MB(VRAM)   // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8188.92 MB(VRAM)   // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 13258.86 MB(VRAM)  // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 24091.01 MB(VRAM)  // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   57.27,\n\t\t\t\t512 * 512:   276.54,\n\t\t\t\t1024 * 1024: 2865.44,\n\t\t\t\t1024 * 1536: 6109.95,\n\t\t\t\t1024 * 1792: 8188.92,\n\t\t\t\t1536 * 1536: 13258.86,\n\t\t\t\t1792 * 1792: 24091.01,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 17859.31 MB(VRAM)  // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 31253.70 MB(VRAM)  // 1792*2048\n\t\t\tc: 
map[float64]float64{\n\t\t\t\t1536 * 1792: 17859.31,\n\t\t\t\t1792 * 2048: 31253.70,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tn: \"GuessFLUXDiffusionModelMemoryUsage\",\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 103.35 MB(VRAM)     // 256*256\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 398.05 MB(VRAM)     // 512*512\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 2576.18 MB(VRAM)    // 1024*1024\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 4978.31 MB(VRAM)    // 1024*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 6467.37 MB(VRAM)    // 1024*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 10021.49 MB(VRAM)   // 1536*1536\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 17434.95 MB(VRAM)   // 1792*1792\n\t\t\tx2y: map[float64]float64{\n\t\t\t\t256 * 256:   103.35,\n\t\t\t\t512 * 512:   398.05,\n\t\t\t\t1024 * 1024: 2576.18,\n\t\t\t\t1024 * 1536: 4978.31,\n\t\t\t\t1024 * 1792: 6467.37,\n\t\t\t\t1536 * 1536: 10021.49,\n\t\t\t\t1792 * 1792: 17434.95,\n\t\t\t},\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 13191.09 MB(VRAM)  // 1536*1792\n\t\t\t// [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 22266.81 MB(VRAM)  // 1792*2048\n\t\t\tc: map[float64]float64{\n\t\t\t\t1536 * 1792: 13191.09,\n\t\t\t\t1792 * 2048: 22266.81,\n\t\t\t},\n\t\t},\n\t}\n\n\trs := make([]Regression, len(ts))\n\tfor i, t := range ts {\n\t\trs[i].Name = t.n\n\t}\n\n\tfmt.Println(\"Polynomial Regression For None Flash Attention\")\n\tfor i, t := range ts {\n\t\tpr := PolynomialRegression{\n\t\t\tDegree: 2,\n\t\t}\n\n\t\txs := maps.Keys(t.x2y)\n\t\tsort.Float64s(xs)\n\t\tys := make([]float64, len(xs))\n\t\tfor j, x := range xs {\n\t\t\tys[j] = t.x2y[x] * 1024 * 1024 // MB to B\n\t\t}\n\t\tpr.Fit(xs, ys)\n\n\t\tfor x, y := range t.c {\n\t\t\ty_ := pr.Predict(x) / 1024 / 1024 // B to MB\n\t\t\td := (y_ - y) / y * 100\n\t\t\ts := 
\"+\"\n\t\t\tif d < 0 {\n\t\t\t\ts = \"\"\n\t\t\t}\n\t\t\tc := \"\"\n\t\t\tif d > 10 {\n\t\t\t\tc = \"?\"\n\t\t\t}\n\n\t\t\tfmt.Printf(\"%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\\n\", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c)\n\t\t}\n\n\t\trs[i].PolynomialRegression = &pr\n\t}\n\n\tfmt.Println(\"Linear Regression For Flash Attention\")\n\tfor i, t := range ts {\n\t\tif len(t.fax2y) == 0 {\n\t\t\tcontinue\n\t\t}\n\n\t\tlr := LinearRegression{}\n\n\t\txs := maps.Keys(t.fax2y)\n\t\tsort.Float64s(xs)\n\t\tys := make([]float64, len(xs))\n\t\tfor j, x := range xs {\n\t\t\tys[j] = t.fax2y[x] * 1024 * 1024 // MB to B\n\t\t}\n\t\tlr.Fit(xs, ys)\n\n\t\tfor x, y := range t.fac {\n\t\t\ty_ := lr.Predict(x) / 1024 / 1024 // B to MB\n\t\t\td := (y_ - y) / y * 100\n\t\t\ts := \"+\"\n\t\t\tif d < 0 {\n\t\t\t\ts = \"\"\n\t\t\t}\n\t\t\tc := \"\"\n\t\t\tif d > 10 {\n\t\t\t\tc = \"?\"\n\t\t\t}\n\n\t\t\tfmt.Printf(\"%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\\n\", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c)\n\t\t}\n\n\t\trs[i].LinearRegression = &lr\n\t}\n\n\tvar code []byte\n\t{\n\t\tvar (\n\t\t\tbuff bytes.Buffer\n\t\t\terr  error\n\t\t)\n\t\ttmpl := template.Must(template.New(\"tmpl\").Parse(tmplStr))\n\t\tif err = tmpl.Execute(&buff, rs); err != nil {\n\t\t\tpanic(fmt.Errorf(\"failed to execute template: %w\", err))\n\t\t}\n\t\tcode, err = format.Source(buff.Bytes())\n\t\tif err != nil {\n\t\t\tpanic(fmt.Errorf(\"failed to format source: %w\", err))\n\t\t}\n\t}\n\n\tif err := os.WriteFile(output, code, 0644); err != nil {\n\t\tpanic(fmt.Errorf(\"failed to write file: %w\", err))\n\t}\n}\n\nfunc main() {\n\tDiffusionModelMemoryUsageRegression(\"zz_generated.diffusion_model_memory_usage.regression.go\")\n}\n"
  },
  {
    "path": "gen.stringer.go",
    "content": "//go:build stringer\n\n//go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic\n//go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion\n//go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.stringer.go -trimprefix GGUFMetadataValueType\n//go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix GGUFFileType\n//go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLType\npackage gguf_parser\n\nimport _ \"golang.org/x/tools/cmd/stringer\"\n"
  },
  {
    "path": "ggml.go",
    "content": "package gguf_parser\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"slices\"\n)\n\n// Types for GGMLType.\ntype (\n\t// GGMLType is a type of GGML tensor,\n\t// see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L368-L410.\n\tGGMLType uint32\n\n\t// GGMLTypeTrait holds the trait of a GGMLType,\n\t// see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/src/ggml.c#L586-L876.\n\tGGMLTypeTrait struct {\n\t\tBlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64.\n\t\tTypeSize  uint64 // Original is uint32, in order to reduce conversion, here we use uint64.\n\t\tQuantized bool\n\t}\n)\n\n// GGMLType constants.\n//\n// GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated.\n// GGMLTypeQ4_0_4_4, GGMLTypeQ4_0_4_8, GGMLTypeQ4_0_8_8 are deprecated.\n// GGMLTypeIQ4_NL_4_4, GGMLTypeIQ4_NL_4_8, GGMLTypeIQ4_NL_8_8 are deprecated.\nconst (\n\tGGMLTypeF32 GGMLType = iota\n\tGGMLTypeF16\n\tGGMLTypeQ4_0\n\tGGMLTypeQ4_1\n\tGGMLTypeQ4_2\n\tGGMLTypeQ4_3\n\tGGMLTypeQ5_0\n\tGGMLTypeQ5_1\n\tGGMLTypeQ8_0\n\tGGMLTypeQ8_1\n\tGGMLTypeQ2_K\n\tGGMLTypeQ3_K\n\tGGMLTypeQ4_K\n\tGGMLTypeQ5_K\n\tGGMLTypeQ6_K\n\tGGMLTypeQ8_K\n\tGGMLTypeIQ2_XXS\n\tGGMLTypeIQ2_XS\n\tGGMLTypeIQ3_XXS\n\tGGMLTypeIQ1_S\n\tGGMLTypeIQ4_NL\n\tGGMLTypeIQ3_S\n\tGGMLTypeIQ2_S\n\tGGMLTypeIQ4_XS\n\tGGMLTypeI8\n\tGGMLTypeI16\n\tGGMLTypeI32\n\tGGMLTypeI64\n\tGGMLTypeF64\n\tGGMLTypeIQ1_M\n\tGGMLTypeBF16\n\tGGMLTypeQ4_0_4_4\n\tGGMLTypeQ4_0_4_8\n\tGGMLTypeQ4_0_8_8\n\tGGMLTypeTQ1_0\n\tGGMLTypeTQ2_0\n\tGGMLTypeIQ4_NL_4_4\n\tGGMLTypeIQ4_NL_4_8\n\tGGMLTypeIQ4_NL_8_8\n\tGGMLTypeMXFP4\n\t_GGMLTypeCount // Unknown\n)\n\n// _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType.\nvar _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{\n\tGGMLTypeF32:        {BlockSize: 1, TypeSize: 4},\n\tGGMLTypeF16:        {BlockSize: 1, TypeSize: 2},\n\tGGMLTypeQ4_0:       {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeQ4_1: 
      {BlockSize: 32, TypeSize: 20, Quantized: true},\n\tGGMLTypeQ4_2:       {BlockSize: 0, TypeSize: 0}, // Deprecated\n\tGGMLTypeQ4_3:       {BlockSize: 0, TypeSize: 0}, // Deprecated\n\tGGMLTypeQ5_0:       {BlockSize: 32, TypeSize: 22, Quantized: true},\n\tGGMLTypeQ5_1:       {BlockSize: 32, TypeSize: 24, Quantized: true},\n\tGGMLTypeQ8_0:       {BlockSize: 32, TypeSize: 34, Quantized: true},\n\tGGMLTypeQ8_1:       {BlockSize: 32, TypeSize: 36, Quantized: true},\n\tGGMLTypeQ2_K:       {BlockSize: 256, TypeSize: 84, Quantized: true},\n\tGGMLTypeQ3_K:       {BlockSize: 256, TypeSize: 110, Quantized: true},\n\tGGMLTypeQ4_K:       {BlockSize: 256, TypeSize: 144, Quantized: true},\n\tGGMLTypeQ5_K:       {BlockSize: 256, TypeSize: 176, Quantized: true},\n\tGGMLTypeQ6_K:       {BlockSize: 256, TypeSize: 210, Quantized: true},\n\tGGMLTypeQ8_K:       {BlockSize: 256, TypeSize: 292, Quantized: true},\n\tGGMLTypeIQ2_XXS:    {BlockSize: 256, TypeSize: 66, Quantized: true},\n\tGGMLTypeIQ2_XS:     {BlockSize: 256, TypeSize: 74, Quantized: true},\n\tGGMLTypeIQ3_XXS:    {BlockSize: 256, TypeSize: 98, Quantized: true},\n\tGGMLTypeIQ1_S:      {BlockSize: 256, TypeSize: 50, Quantized: true},\n\tGGMLTypeIQ4_NL:     {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeIQ3_S:      {BlockSize: 256, TypeSize: 110, Quantized: true},\n\tGGMLTypeIQ2_S:      {BlockSize: 256, TypeSize: 82, Quantized: true},\n\tGGMLTypeIQ4_XS:     {BlockSize: 256, TypeSize: 136, Quantized: true},\n\tGGMLTypeI8:         {BlockSize: 1, TypeSize: 1},\n\tGGMLTypeI16:        {BlockSize: 1, TypeSize: 2},\n\tGGMLTypeI32:        {BlockSize: 1, TypeSize: 4},\n\tGGMLTypeI64:        {BlockSize: 1, TypeSize: 8},\n\tGGMLTypeF64:        {BlockSize: 1, TypeSize: 8},\n\tGGMLTypeIQ1_M:      {BlockSize: 256, TypeSize: 56, Quantized: true},\n\tGGMLTypeBF16:       {BlockSize: 1, TypeSize: 2},\n\tGGMLTypeQ4_0_4_4:   {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeQ4_0_4_8:   {BlockSize: 32, TypeSize: 18, 
Quantized: true},\n\tGGMLTypeQ4_0_8_8:   {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeTQ1_0:      {BlockSize: 256, TypeSize: 54, Quantized: true},\n\tGGMLTypeTQ2_0:      {BlockSize: 256, TypeSize: 66, Quantized: true},\n\tGGMLTypeIQ4_NL_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeIQ4_NL_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeIQ4_NL_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true},\n\tGGMLTypeMXFP4:      {BlockSize: 32, TypeSize: 17, Quantized: true},\n}\n\n// Trait returns the GGMLTypeTrait of the GGMLType.\nfunc (t GGMLType) Trait() (GGMLTypeTrait, bool) {\n\ttt, ok := _GGMLTypeTraits[t]\n\treturn tt, ok\n}\n\n// IsQuantized returns whether the GGMLType is quantized.\nfunc (t GGMLType) IsQuantized() bool {\n\ttt, ok := t.Trait()\n\tif !ok {\n\t\treturn false\n\t}\n\treturn tt.Quantized\n}\n\n// RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait,\n// which is inspired by\n// https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145.\n//\n// The index of the given dimensions means the number of dimension,\n// i.e. 
0 is the first dimension, 1 is the second dimension, and so on.\n//\n// The value of the item is the number of elements in the corresponding dimension.\nfunc (t GGMLType) RowSizeOf(dimensions []uint64) uint64 {\n\tif len(dimensions) == 0 {\n\t\tpanic(errors.New(\"no dimensions\"))\n\t}\n\n\ttt, ok := t.Trait()\n\tif !ok {\n\t\tpanic(fmt.Errorf(\"invalid type: %v\", t))\n\t}\n\n\t// https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643\n\tds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size\n\tfor i := 1; i < len(dimensions); i++ {\n\t\tds *= dimensions[i]\n\t}\n\treturn ds\n}\n\n// GGMLMemoryPadding returns the padded size of the given size according to GGML memory padding,\n// see https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243.\nfunc GGMLMemoryPadding(size uint64) uint64 {\n\tconst align = 16\n\treturn GGMLPadding(size, align)\n}\n\n// GGMLPadding returns the padded size of the given size according to given align,\n// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255.\nfunc GGMLPadding(size, align uint64) uint64 {\n\treturn (size + align - 1) &^ (align - 1)\n}\n\n// GGML tensor constants.\nconst (\n\t// GGMLTensorSize is the size of GGML tensor in bytes,\n\t// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606.\n\tGGMLTensorSize = 368\n\n\t// GGMLObjectSize is the size of GGML object in bytes,\n\t// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563.\n\tGGMLObjectSize = 32\n)\n\n// GGMLTensorOverhead is the overhead of GGML tensor in bytes,\n// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767.\nfunc GGMLTensorOverhead() uint64 {\n\treturn GGMLObjectSize + GGMLTensorSize\n}\n\n// GGML computation graph constants.\nconst (\n\t// 
GGMLComputationGraphSize is the size of GGML computation graph in bytes.\n\tGGMLComputationGraphSize = 80\n\n\t// GGMLComputationBitsetSize is the size of GGML computation bitset in bytes,\n\t// see https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-impl.h#L165.\n\tGGMLComputationBitsetSize = 4\n)\n\n// GGMLComputationGraphOverhead is the overhead of GGML graph in bytes,\n// see https://github.com/ggml-org/ggml/blob/5592ffda9c417c3c12232c828247c23d17004c88/src/ggml.c#L5941-L5956.\nfunc GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 {\n\tconst ps = 8 // c++ pointer size\n\n\ths := GGMLHashSize(nodes * 2)\n\n\tvar g uint64 = GGMLComputationGraphSize // graph\n\tg += GGMLPadding(nodes*ps, ps)          // nodes\n\tg += GGMLPadding(nodes*ps, ps)          // leafs\n\tg += GGMLPadding(nodes*ps, ps)          // parents\n\tg += GGMLPadding(hs*ps, ps)             // hash keys\n\tif grads {\n\t\tg += GGMLPadding(hs*ps, ps) // grads\n\t\tg += GGMLPadding(hs*ps, ps) // grad_accs\n\t}\n\tg += GGMLPadding(GGMLBitsetSize(hs)*GGMLComputationBitsetSize, GGMLComputationBitsetSize) // bitset\n\n\treturn GGMLObjectSize + GGMLMemoryPadding(g)\n}\n\n// GGMLHashSize returns the size of the hash table for the given base,\n// see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722.\nfunc GGMLHashSize(base uint64) uint64 {\n\tprimes := []uint64{\n\t\t2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031,\n\t\t2053, 4099, 8209, 16411, 32771, 65537, 131101,\n\t\t262147, 524309, 1048583, 2097169, 4194319, 8388617,\n\t\t16777259, 33554467, 67108879, 134217757, 268435459,\n\t\t536870923, 1073741827, 2147483659,\n\t}\n\ti, ok := slices.BinarySearchFunc(primes, base, func(e, t uint64) int {\n\t\tif t >= e {\n\t\t\treturn 0\n\t\t}\n\t\treturn -1\n\t})\n\tif !ok {\n\t\treturn base | 1\n\t}\n\treturn primes[i]\n}\n\n// GGMLBitsetSize returns the size of the bitset for the given number of bits,\n// see 
https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/ggml/src/ggml-impl.h#L166-L171.\nfunc GGMLBitsetSize(n uint64) uint64 {\n\treturn (n + (GGMLComputationBitsetSize*8 - 1)) >> 5\n}\n"
  },
  {
    "path": "go.mod",
    "content": "module github.com/gpustack/gguf-parser-go\n\ngo 1.22.0\n\ntoolchain go1.22.9\n\nrequire (\n\tgithub.com/davecgh/go-spew v1.1.1\n\tgithub.com/henvic/httpretty v0.1.4\n\tgithub.com/json-iterator/go v1.1.12\n\tgithub.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d\n\tgithub.com/stretchr/testify v1.9.0\n\tgolang.org/x/crypto v0.29.0\n\tgolang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f\n\tgolang.org/x/sync v0.9.0\n\tgolang.org/x/sys v0.27.0\n\tgolang.org/x/tools v0.27.0\n\tgonum.org/v1/gonum v0.15.1\n)\n\nrequire (\n\tgithub.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect\n\tgithub.com/modern-go/reflect2 v1.0.2 // indirect\n\tgithub.com/pmezard/go-difflib v1.0.0 // indirect\n\tgolang.org/x/mod v0.22.0 // indirect\n\tgopkg.in/yaml.v3 v3.0.1 // indirect\n)\n"
  },
  {
    "path": "go.sum",
    "content": "github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=\ngithub.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=\ngithub.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU=\ngithub.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM=\ngithub.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=\ngithub.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=\ngithub.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=\ngithub.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=\ngithub.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=\ngithub.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=\ngithub.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=\ngithub.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=\ngithub.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=\ngithub.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY=\ngithub.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=\ngithub.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=\ngithub.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=\ngithub.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=\ngithub.com/stretchr/testify v1.9.0/go.mod 
h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=\ngolang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=\ngolang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=\ngolang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=\ngolang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=\ngolang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4=\ngolang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=\ngolang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=\ngolang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=\ngolang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=\ngolang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=\ngolang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU=\ngolang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E=\ngolang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o=\ngolang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q=\ngonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=\ngonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=\ngopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=\ngopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=\ngopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=\ngopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=\n"
  },
  {
    "path": "ollama_model.go",
    "content": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"regexp\"\n\t\"strings\"\n\n\t\"golang.org/x/sync/errgroup\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/httpx\"\n\t\"github.com/gpustack/gguf-parser-go/util/json\"\n\t\"github.com/gpustack/gguf-parser-go/util/stringx\"\n)\n\n// Inspired by https://github.com/ollama/ollama/blob/380e06e5bea06ae8ded37f47c37bd5d604194d3e/types/model/name.go,\n// and https://github.com/ollama/ollama/blob/380e06e5bea06ae8ded37f47c37bd5d604194d3e/server/modelpath.go.\n\nconst (\n\tOllamaDefaultScheme    = \"https\"\n\tOllamaDefaultRegistry  = \"registry.ollama.ai\"\n\tOllamaDefaultNamespace = \"library\"\n\tOllamaDefaultTag       = \"latest\"\n)\n\ntype (\n\t// OllamaModel represents an Ollama model,\n\t// its manifest(including MediaType, Config and Layers) can be completed further by calling the Complete method.\n\tOllamaModel struct {\n\t\tSchema        string             `json:\"schema\"`\n\t\tRegistry      string             `json:\"registry\"`\n\t\tNamespace     string             `json:\"namespace\"`\n\t\tRepository    string             `json:\"repository\"`\n\t\tTag           string             `json:\"tag\"`\n\t\tSchemaVersion uint32             `json:\"schemaVersion\"`\n\t\tMediaType     string             `json:\"mediaType\"`\n\t\tConfig        OllamaModelLayer   `json:\"config\"`\n\t\tLayers        []OllamaModelLayer `json:\"layers\"`\n\n\t\t// Client is the http client used to complete the OllamaModel's network operations.\n\t\t//\n\t\t// When this field is nil,\n\t\t// it will be set to the client used by OllamaModel.Complete.\n\t\t//\n\t\t// When this field is offered,\n\t\t// the network operations will be done with this client.\n\t\tClient *http.Client `json:\"-\"`\n\t}\n\n\t// OllamaModelLayer represents an Ollama model layer,\n\t// its digest can be used to download the artifact.\n\tOllamaModelLayer struct {\n\t\tMediaType string 
`json:\"mediaType\"`\n\t\tSize      uint64 `json:\"size\"`\n\t\tDigest    string `json:\"digest\"`\n\n\t\t// Root points to the root OllamaModel,\n\t\t// which is never serialized or deserialized.\n\t\t//\n\t\t// When called OllamaModel.Complete,\n\t\t// this field will be set to the OllamaModel itself.\n\t\t// If not, this field will be nil,\n\t\t// and must be set manually to the root OllamaModel before calling the method of OllamaModelLayer.\n\t\tRoot *OllamaModel `json:\"-\"`\n\t}\n)\n\n// ParseOllamaModel parses the given Ollama model string,\n// and returns the OllamaModel, or nil if the model is invalid.\nfunc ParseOllamaModel(model string, opts ...OllamaModelOption) *OllamaModel {\n\tif model == \"\" {\n\t\treturn nil\n\t}\n\n\tvar o _OllamaModelOptions\n\tfor _, opt := range opts {\n\t\topt(&o)\n\t}\n\n\tom := OllamaModel{\n\t\tSchema:    OllamaDefaultScheme,\n\t\tRegistry:  OllamaDefaultRegistry,\n\t\tNamespace: OllamaDefaultNamespace,\n\t\tTag:       OllamaDefaultTag,\n\t}\n\t{\n\t\tif o.DefaultScheme != \"\" {\n\t\t\tom.Schema = o.DefaultScheme\n\t\t}\n\t\tif o.DefaultRegistry != \"\" {\n\t\t\tom.Registry = o.DefaultRegistry\n\t\t}\n\t\tif o.DefaultNamespace != \"\" {\n\t\t\tom.Namespace = o.DefaultNamespace\n\t\t}\n\t\tif o.DefaultTag != \"\" {\n\t\t\tom.Tag = o.DefaultTag\n\t\t}\n\t}\n\n\tm := model\n\n\t// Drop digest.\n\tm, _, _ = stringx.CutFromRight(m, \"@\")\n\n\t// Get tag.\n\tm, s, ok := stringx.CutFromRight(m, \":\")\n\tif ok && s != \"\" {\n\t\tom.Tag = s\n\t}\n\n\t// Get repository.\n\tm, s, ok = stringx.CutFromRight(m, \"/\")\n\tif ok && s != \"\" {\n\t\tom.Repository = s\n\t} else if m != \"\" {\n\t\tom.Repository = m\n\t\tm = \"\"\n\t}\n\n\t// Get namespace.\n\tm, s, ok = stringx.CutFromRight(m, \"/\")\n\tif ok && s != \"\" {\n\t\tom.Namespace = s\n\t} else if m != \"\" {\n\t\tom.Namespace = m\n\t\tm = \"\"\n\t}\n\n\t// Get registry.\n\tm, s, ok = stringx.CutFromLeft(m, \"://\")\n\tif ok && s != \"\" {\n\t\tom.Schema = m\n\t\tom.Registry 
= s\n\t} else if m != \"\" {\n\t\tom.Registry = m\n\t}\n\n\tif om.Repository == \"\" {\n\t\treturn nil\n\t}\n\treturn &om\n}\n\nfunc (om *OllamaModel) String() string {\n\tvar b strings.Builder\n\tif om.Registry != \"\" {\n\t\tb.WriteString(om.Registry)\n\t\tb.WriteByte('/')\n\t}\n\tif om.Namespace != \"\" {\n\t\tb.WriteString(om.Namespace)\n\t\tb.WriteByte('/')\n\t}\n\tb.WriteString(om.Repository)\n\tif om.Tag != \"\" {\n\t\tb.WriteByte(':')\n\t\tb.WriteString(om.Tag)\n\t}\n\treturn b.String()\n}\n\n// GetLayer returns the OllamaModelLayer with the given media type,\n// and true if found, and false otherwise.\nfunc (om *OllamaModel) GetLayer(mediaType string) (OllamaModelLayer, bool) {\n\tfor i := range om.Layers {\n\t\tif om.Layers[i].MediaType == mediaType {\n\t\t\treturn om.Layers[i], true\n\t\t}\n\t}\n\treturn OllamaModelLayer{}, false\n}\n\n// SearchLayers returns a list of OllamaModelLayer with the media type that matches the given regex.\nfunc (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []OllamaModelLayer {\n\tvar ls []OllamaModelLayer\n\tfor i := range om.Layers {\n\t\tif mediaTypeRegex.MatchString(om.Layers[i].MediaType) {\n\t\t\tls = append(ls, om.Layers[i])\n\t\t}\n\t}\n\treturn ls\n}\n\n// WebPageURL returns the Ollama web page URL of the OllamaModel.\nfunc (om *OllamaModel) WebPageURL() *url.URL {\n\tu := &url.URL{\n\t\tScheme: om.Schema,\n\t\tHost:   om.Registry,\n\t}\n\treturn u.JoinPath(om.Namespace, om.Repository+\":\"+om.Tag)\n}\n\n// Complete completes the OllamaModel with the given context and http client.\nfunc (om *OllamaModel) Complete(ctx context.Context, cli *http.Client) error {\n\tif om.Client == nil {\n\t\tom.Client = cli\n\t}\n\n\tu := &url.URL{\n\t\tScheme: om.Schema,\n\t\tHost:   om.Registry,\n\t}\n\tu = u.JoinPath(\"v2\", om.Namespace, om.Repository, \"manifests\", om.Tag)\n\n\treq, err := httpx.NewGetRequestWithContext(ctx, u.String())\n\tif err != nil {\n\t\treturn fmt.Errorf(\"new request: %w\", 
err)\n\t}\n\treq.Header.Set(\"Accept\", \"application/vnd.docker.distribution.manifest.v2+json\")\n\n\terr = httpx.Do(om.Client, req, func(resp *http.Response) error {\n\t\tif resp.StatusCode != http.StatusOK {\n\t\t\treturn fmt.Errorf(\"status code %d\", resp.StatusCode)\n\t\t}\n\t\treturn json.NewDecoder(resp.Body).Decode(om)\n\t})\n\tif err != nil {\n\t\treturn fmt.Errorf(\"do request %s: %w\", u, err)\n\t}\n\n\t// Connect.\n\tom.Config.Root = om\n\tfor i := range om.Layers {\n\t\tom.Layers[i].Root = om\n\t}\n\n\treturn nil\n}\n\n// Params returns the parameters of the OllamaModel.\nfunc (om *OllamaModel) Params(ctx context.Context, cli *http.Client) (map[string]any, error) {\n\tif cli == nil {\n\t\tcli = om.Client\n\t}\n\tif cli == nil {\n\t\treturn nil, fmt.Errorf(\"no client\")\n\t}\n\n\tmls := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.params$`))\n\tif len(mls) == 0 {\n\t\treturn nil, nil\n\t}\n\n\trs := make([]map[string]any, len(mls))\n\teg, ctx := errgroup.WithContext(ctx)\n\tfor i := range mls {\n\t\tx := i\n\t\teg.Go(func() error {\n\t\t\tbs, err := mls[x].FetchBlob(ctx, cli)\n\t\t\tif err == nil {\n\t\t\t\tp := make(map[string]any)\n\t\t\t\tif err = json.Unmarshal(bs, &p); err == nil {\n\t\t\t\t\trs[x] = p\n\t\t\t\t}\n\t\t\t}\n\t\t\treturn err\n\t\t})\n\t}\n\tif err := eg.Wait(); err != nil {\n\t\treturn nil, fmt.Errorf(\"fetch blob: %w\", err)\n\t}\n\n\tr := make(map[string]any)\n\tfor i := range rs {\n\t\tfor k, v := range rs[i] {\n\t\t\tr[k] = v\n\t\t}\n\t}\n\treturn r, nil\n}\n\n// Template returns the template of the OllamaModel.\nfunc (om *OllamaModel) Template(ctx context.Context, cli *http.Client) (string, error) {\n\tif cli == nil {\n\t\tcli = om.Client\n\t}\n\tif cli == nil {\n\t\treturn \"\", fmt.Errorf(\"no client\")\n\t}\n\n\tmls := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.(prompt|template)$`))\n\tif len(mls) == 0 {\n\t\treturn \"\", nil\n\t}\n\n\tml := mls[len(mls)-1]\n\tbs, err 
:= ml.FetchBlob(ctx, cli)\n\tif err != nil {\n\t\treturn \"\", fmt.Errorf(\"fetch blob: %w\", err)\n\t}\n\treturn stringx.FromBytes(&bs), nil\n}\n\n// System returns the system message of the OllamaModel.\nfunc (om *OllamaModel) System(ctx context.Context, cli *http.Client) (string, error) {\n\tif cli == nil {\n\t\tcli = om.Client\n\t}\n\tif cli == nil {\n\t\treturn \"\", fmt.Errorf(\"no client\")\n\t}\n\n\tmls := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.system$`))\n\tif len(mls) == 0 {\n\t\treturn \"\", nil\n\t}\n\n\tml := mls[len(mls)-1]\n\tbs, err := ml.FetchBlob(ctx, cli)\n\tif err != nil {\n\t\treturn \"\", fmt.Errorf(\"fetch blob: %w\", err)\n\t}\n\treturn stringx.FromBytes(&bs), nil\n}\n\n// License returns the license of the OllamaModel.\nfunc (om *OllamaModel) License(ctx context.Context, cli *http.Client) ([]string, error) {\n\tif cli == nil {\n\t\tcli = om.Client\n\t}\n\tif cli == nil {\n\t\treturn nil, fmt.Errorf(\"no client\")\n\t}\n\n\tmls := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.license$`))\n\tif len(mls) == 0 {\n\t\treturn nil, nil\n\t}\n\n\trs := make([]string, len(mls))\n\teg, ctx := errgroup.WithContext(ctx)\n\tfor i := range mls {\n\t\tx := i\n\t\teg.Go(func() error {\n\t\t\tbs, err := mls[x].FetchBlob(ctx, cli)\n\t\t\tif err == nil {\n\t\t\t\trs[x] = stringx.FromBytes(&bs)\n\t\t\t}\n\t\t\treturn err\n\t\t})\n\t}\n\tif err := eg.Wait(); err != nil {\n\t\treturn nil, fmt.Errorf(\"fetch blob: %w\", err)\n\t}\n\treturn rs, nil\n}\n\n// Messages returns the messages of the OllamaModel.\nfunc (om *OllamaModel) Messages(ctx context.Context, cli *http.Client) ([]json.RawMessage, error) {\n\tif cli == nil {\n\t\tcli = om.Client\n\t}\n\tif cli == nil {\n\t\treturn nil, fmt.Errorf(\"no client\")\n\t}\n\n\tmls := om.SearchLayers(regexp.MustCompile(`^application/vnd\\.ollama\\.image\\.messages$`))\n\tif len(mls) == 0 {\n\t\treturn nil, nil\n\t}\n\n\trs := make([]json.RawMessage, 
len(mls))\n\teg, ctx := errgroup.WithContext(ctx)\n\tfor i := range mls {\n\t\tx := i\n\t\teg.Go(func() error {\n\t\t\tbs, err := mls[x].FetchBlob(ctx, cli)\n\t\t\tif err == nil {\n\t\t\t\trs[x] = bs\n\t\t\t}\n\t\t\treturn err\n\t\t})\n\t}\n\tif err := eg.Wait(); err != nil {\n\t\treturn nil, fmt.Errorf(\"fetch blob: %w\", err)\n\t}\n\treturn rs, nil\n}\n\n// BlobURL returns the blob URL of the OllamaModelLayer.\nfunc (ol *OllamaModelLayer) BlobURL() *url.URL {\n\tif ol.Root == nil {\n\t\treturn nil\n\t}\n\n\tu := &url.URL{\n\t\tScheme: ol.Root.Schema,\n\t\tHost:   ol.Root.Registry,\n\t}\n\treturn u.JoinPath(\"v2\", ol.Root.Namespace, ol.Root.Repository, \"blobs\", ol.Digest)\n}\n\n// FetchBlob fetches the blob of the OllamaModelLayer with the given context and http client,\n// and returns the response body as bytes.\nfunc (ol *OllamaModelLayer) FetchBlob(ctx context.Context, cli *http.Client) ([]byte, error) {\n\tvar b []byte\n\terr := ol.FetchBlobFunc(ctx, cli, func(resp *http.Response) error {\n\t\tb = httpx.BodyBytes(resp)\n\t\treturn nil\n\t})\n\treturn b, err\n}\n\n// FetchBlobFunc fetches the blob of the OllamaModelLayer with the given context and http client,\n// and processes the response with the given function.\nfunc (ol *OllamaModelLayer) FetchBlobFunc(ctx context.Context, cli *http.Client, process func(*http.Response) error) error {\n\tif cli == nil {\n\t\tcli = ol.Root.Client\n\t}\n\tif cli == nil {\n\t\treturn fmt.Errorf(\"no client\")\n\t}\n\n\tu := ol.BlobURL()\n\tif u == nil {\n\t\treturn fmt.Errorf(\"no blob URL\")\n\t}\n\n\treq, err := httpx.NewGetRequestWithContext(ctx, u.String())\n\tif err != nil {\n\t\treturn fmt.Errorf(\"new request: %w\", err)\n\t}\n\n\terr = httpx.Do(cli, req, process)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"do request %s: %w\", u, err)\n\t}\n\treturn nil\n}\n"
  },
  {
    "path": "ollama_model_option.go",
    "content": "package gguf_parser\n\nimport (\n\t\"net/url\"\n\t\"strings\"\n)\n\ntype (\n\t_OllamaModelOptions struct {\n\t\tDefaultScheme    string\n\t\tDefaultRegistry  string\n\t\tDefaultNamespace string\n\t\tDefaultTag       string\n\t}\n\tOllamaModelOption func(*_OllamaModelOptions)\n)\n\n// SetOllamaModelBaseURL parses the given base URL,\n// and sets default schema/registry for OllamaModel.\nfunc SetOllamaModelBaseURL(baseURL string) OllamaModelOption {\n\tbaseURL = strings.TrimSpace(baseURL)\n\treturn func(o *_OllamaModelOptions) {\n\t\tif baseURL == \"\" {\n\t\t\treturn\n\t\t}\n\n\t\tif !strings.Contains(baseURL, \"://\") {\n\t\t\tbaseURL = \"https://\" + baseURL\n\t\t}\n\n\t\tu, err := url.Parse(baseURL)\n\t\tif err != nil {\n\t\t\treturn\n\t\t}\n\n\t\to.DefaultScheme = u.Scheme\n\t\to.DefaultRegistry = u.Host\n\t}\n}\n\n// SetOllamaModelDefaultScheme sets the default scheme for OllamaModel.\nfunc SetOllamaModelDefaultScheme(scheme string) OllamaModelOption {\n\treturn func(o *_OllamaModelOptions) {\n\t\tif scheme == \"\" {\n\t\t\treturn\n\t\t}\n\t\to.DefaultScheme = scheme\n\t}\n}\n\n// SetOllamaModelDefaultRegistry sets the default registry for OllamaModel.\nfunc SetOllamaModelDefaultRegistry(registry string) OllamaModelOption {\n\treturn func(o *_OllamaModelOptions) {\n\t\tif registry == \"\" {\n\t\t\treturn\n\t\t}\n\t\to.DefaultRegistry = registry\n\t}\n}\n\n// SetOllamaModelDefaultNamespace sets the default namespace for OllamaModel.\nfunc SetOllamaModelDefaultNamespace(namespace string) OllamaModelOption {\n\treturn func(o *_OllamaModelOptions) {\n\t\tif namespace == \"\" {\n\t\t\treturn\n\t\t}\n\t\to.DefaultNamespace = namespace\n\t}\n}\n\n// SetOllamaModelDefaultTag sets the default tag for OllamaModel.\nfunc SetOllamaModelDefaultTag(tag string) OllamaModelOption {\n\treturn func(o *_OllamaModelOptions) {\n\t\tif tag == \"\" {\n\t\t\treturn\n\t\t}\n\t\to.DefaultTag = tag\n\t}\n}\n"
  },
  {
    "path": "ollama_model_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestParseOllamaModel(t *testing.T) {\n\tcases := []struct {\n\t\tgiven    string\n\t\texpected *OllamaModel\n\t}{\n\t\t{\n\t\t\tgiven: \"gemma2\",\n\t\t\texpected: &OllamaModel{\n\t\t\t\tSchema:     OllamaDefaultScheme,\n\t\t\t\tRegistry:   OllamaDefaultRegistry,\n\t\t\t\tNamespace:  OllamaDefaultNamespace,\n\t\t\t\tRepository: \"gemma2\",\n\t\t\t\tTag:        OllamaDefaultTag,\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"gemma2:awesome\",\n\t\t\texpected: &OllamaModel{\n\t\t\t\tSchema:     OllamaDefaultScheme,\n\t\t\t\tRegistry:   OllamaDefaultRegistry,\n\t\t\t\tNamespace:  OllamaDefaultNamespace,\n\t\t\t\tRepository: \"gemma2\",\n\t\t\t\tTag:        \"awesome\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"gemma2:awesome@sha256:1234567890abcdef\",\n\t\t\texpected: &OllamaModel{\n\t\t\t\tSchema:     OllamaDefaultScheme,\n\t\t\t\tRegistry:   OllamaDefaultRegistry,\n\t\t\t\tNamespace:  OllamaDefaultNamespace,\n\t\t\t\tRepository: \"gemma2\",\n\t\t\t\tTag:        \"awesome\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"awesome/gemma2:latest@sha256:1234567890abcdef\",\n\t\t\texpected: &OllamaModel{\n\t\t\t\tSchema:     OllamaDefaultScheme,\n\t\t\t\tRegistry:   OllamaDefaultRegistry,\n\t\t\t\tNamespace:  \"awesome\",\n\t\t\t\tRepository: \"gemma2\",\n\t\t\t\tTag:        \"latest\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"mysite.com/library/gemma2:latest@sha256:1234567890abcdef\",\n\t\t\texpected: &OllamaModel{\n\t\t\t\tSchema:     OllamaDefaultScheme,\n\t\t\t\tRegistry:   \"mysite.com\",\n\t\t\t\tNamespace:  \"library\",\n\t\t\t\tRepository: \"gemma2\",\n\t\t\t\tTag:        \"latest\",\n\t\t\t},\n\t\t},\n\t\t{\n\t\t\tgiven: \"http://mysite.com/library/gemma2:latest@sha256:1234567890abcdef\",\n\t\t\texpected: &OllamaModel{\n\t\t\t\tSchema:     \"http\",\n\t\t\t\tRegistry:   \"mysite.com\",\n\t\t\t\tNamespace:  \"library\",\n\t\t\t\tRepository: 
\"gemma2\",\n\t\t\t\tTag:        \"latest\",\n\t\t\t},\n\t\t},\n\t}\n\tfor _, tc := range cases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual := ParseOllamaModel(tc.given)\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "ollama_registry_authenticate.go",
    "content": "package gguf_parser\n\nimport (\n\t\"bytes\"\n\t\"context\"\n\t\"crypto/ed25519\"\n\t\"crypto/rand\"\n\t\"encoding/base64\"\n\t\"encoding/json\"\n\t\"encoding/pem\"\n\t\"errors\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"runtime\"\n\t\"strconv\"\n\t\"strings\"\n\t\"time\"\n\n\t\"golang.org/x/crypto/ssh\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/funcx\"\n\t\"github.com/gpustack/gguf-parser-go/util/httpx\"\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n\t\"github.com/gpustack/gguf-parser-go/util/stringx\"\n)\n\nconst (\n\thttpHeaderWWWAuthenticate = \"WWW-Authenticate\"\n\thttpHeaderAuthorization   = \"Authorization\"\n)\n\n// OllamaUserAgent returns the user agent string for Ollama,\n// since llama3.1, the user agent is required to be set,\n// otherwise the request will be rejected by 412.\nfunc OllamaUserAgent() string {\n\treturn fmt.Sprintf(\"ollama/9.9.9 (%s %s) Go/%s\", runtime.GOARCH, runtime.GOOS, runtime.Version())\n}\n\n// OllamaRegistryAuthorizeRetry returns true if the request should be retried with authorization.\n//\n// OllamaRegistryAuthorizeRetry leverages OllamaRegistryAuthorize to obtain an authorization token,\n// and configures the request with the token.\nfunc OllamaRegistryAuthorizeRetry(resp *http.Response, cli *http.Client) bool {\n\tif resp == nil || cli == nil {\n\t\treturn false\n\t}\n\n\tif resp.StatusCode != http.StatusUnauthorized && resp.Request == nil {\n\t\t// Not unauthorized, return.\n\t\treturn false\n\t}\n\n\treq := resp.Request\n\tif req.Header.Get(httpHeaderAuthorization) != \"\" {\n\t\t// Already authorized, return.\n\t\treturn false\n\t}\n\n\tconst tokenPrefix = \"Bearer \"\n\tauthnToken := strings.TrimPrefix(resp.Header.Get(httpHeaderWWWAuthenticate), tokenPrefix)\n\tif authnToken == \"\" {\n\t\t// No authentication token, return.\n\t\treturn false\n\t}\n\tauthzToken := funcx.MustNoError(OllamaRegistryAuthorize(req.Context(), cli, 
authnToken))\n\treq.Header.Set(httpHeaderAuthorization, tokenPrefix+authzToken)\n\treturn true\n}\n\n// OllamaRegistryAuthorize authorizes the request with the given authentication token,\n// and returns the authorization token.\nfunc OllamaRegistryAuthorize(ctx context.Context, cli *http.Client, authnToken string) (string, error) {\n\tpriKey, err := OllamaSingKeyLoad()\n\tif err != nil {\n\t\treturn \"\", fmt.Errorf(\"load sign key: %w\", err)\n\t}\n\n\tvar authzUrl string\n\t{\n\t\tss := strings.Split(authnToken, \",\")\n\t\tif len(ss) < 3 {\n\t\t\treturn \"\", errors.New(\"invalid authn token\")\n\t\t}\n\n\t\tvar realm, service, scope string\n\t\tfor _, s := range ss {\n\t\t\tsp := strings.SplitN(s, \"=\", 2)\n\t\t\tif len(sp) < 2 {\n\t\t\t\tcontinue\n\t\t\t}\n\t\t\tsp[1] = strings.TrimFunc(sp[1], func(r rune) bool {\n\t\t\t\treturn r == '\"' || r == '\\''\n\t\t\t})\n\t\t\tswitch sp[0] {\n\t\t\tcase \"realm\":\n\t\t\t\trealm = sp[1]\n\t\t\tcase \"service\":\n\t\t\t\tservice = sp[1]\n\t\t\tcase \"scope\":\n\t\t\t\tscope = sp[1]\n\t\t\t}\n\t\t}\n\n\t\tu, err := url.Parse(realm)\n\t\tif err != nil {\n\t\t\treturn \"\", fmt.Errorf(\"parse realm: %w\", err)\n\t\t}\n\n\t\tqs := u.Query()\n\t\tqs.Add(\"service\", service)\n\t\tfor _, s := range strings.Split(scope, \" \") {\n\t\t\tqs.Add(\"scope\", s)\n\t\t}\n\t\tqs.Add(\"ts\", strconv.FormatInt(time.Now().Unix(), 10))\n\t\tqs.Add(\"nonce\", stringx.RandomBase64(16))\n\t\tu.RawQuery = qs.Encode()\n\n\t\tauthzUrl = u.String()\n\t}\n\n\tvar authnData string\n\t{\n\t\tpubKey := ssh.MarshalAuthorizedKey(priKey.PublicKey())\n\t\tpubKeyp := bytes.Split(pubKey, []byte(\" \"))\n\t\tif len(pubKeyp) < 2 {\n\t\t\treturn \"\", errors.New(\"malformed public key\")\n\t\t}\n\n\t\tnc := base64.StdEncoding.EncodeToString([]byte(stringx.SumBytesBySHA256(nil)))\n\t\tpy := []byte(fmt.Sprintf(\"%s,%s,%s\", http.MethodGet, authzUrl, nc))\n\t\tsd, err := priKey.Sign(rand.Reader, py)\n\t\tif err != nil {\n\t\t\treturn \"\", 
fmt.Errorf(\"signing data: %w\", err)\n\t\t}\n\t\tauthnData = fmt.Sprintf(\"%s:%s\", bytes.TrimSpace(pubKeyp[1]), base64.StdEncoding.EncodeToString(sd.Blob))\n\t}\n\n\treq, err := httpx.NewGetRequestWithContext(ctx, authzUrl)\n\tif err != nil {\n\t\treturn \"\", fmt.Errorf(\"new request: %w\", err)\n\t}\n\treq.Header.Add(httpHeaderAuthorization, authnData)\n\n\tvar authzToken string\n\terr = httpx.Do(cli, req, func(resp *http.Response) error {\n\t\tif resp.StatusCode != http.StatusOK {\n\t\t\treturn fmt.Errorf(\"status code %d\", resp.StatusCode)\n\t\t}\n\t\tvar tok struct {\n\t\t\tToken string `json:\"token\"`\n\t\t}\n\t\tif err = json.NewDecoder(resp.Body).Decode(&tok); err != nil {\n\t\t\treturn err\n\t\t}\n\t\tif tok.Token == \"\" {\n\t\t\treturn errors.New(\"empty token\")\n\t\t}\n\t\tauthzToken = tok.Token\n\t\treturn nil\n\t})\n\tif err != nil {\n\t\treturn \"\", fmt.Errorf(\"do request %s: %w\", authzUrl, err)\n\t}\n\n\treturn authzToken, nil\n}\n\n// OllamaSingKeyLoad loads the signing key for Ollama,\n// and generates a new key if not exists.\nfunc OllamaSingKeyLoad() (ssh.Signer, error) {\n\thd := filepath.Join(osx.UserHomeDir(), \".ollama\")\n\n\tpriKeyPath := filepath.Join(hd, \"id_ed25519\")\n\tif !osx.ExistsFile(priKeyPath) {\n\t\t// Generate key if not exists.\n\t\tpubKey, priKey, err := ed25519.GenerateKey(rand.Reader)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"generate key: %w\", err)\n\t\t}\n\n\t\tpriKeyPem, err := ssh.MarshalPrivateKey(priKey, \"\")\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"marshal private key: %w\", err)\n\t\t}\n\t\tpriKeyBs := pem.EncodeToMemory(priKeyPem)\n\n\t\tsshPubKey, err := ssh.NewPublicKey(pubKey)\n\t\tif err != nil {\n\t\t\treturn nil, fmt.Errorf(\"new public key: %w\", err)\n\t\t}\n\t\tpubKeyBs := ssh.MarshalAuthorizedKey(sshPubKey)\n\n\t\tif err = osx.WriteFile(priKeyPath, priKeyBs, 0o600); err != nil {\n\t\t\treturn nil, fmt.Errorf(\"write private key: %w\", err)\n\t\t}\n\t\tif err = 
osx.WriteFile(priKeyPath+\".pub\", pubKeyBs, 0o644); err != nil {\n\t\t\t_ = os.Remove(priKeyPath)\n\t\t\treturn nil, fmt.Errorf(\"write public key: %w\", err)\n\t\t}\n\t}\n\n\tpriKeyBs, err := os.ReadFile(priKeyPath)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"read private key: %w\", err)\n\t}\n\tpriKey, err := ssh.ParsePrivateKey(priKeyBs)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"parse private key: %w\", err)\n\t}\n\treturn priKey, nil\n}\n"
  },
  {
    "path": "scalar.go",
    "content": "package gguf_parser\n\nimport (\n\t\"errors\"\n\t\"strconv\"\n\t\"strings\"\n)\n\nconst (\n\t_Ki = 1 << ((iota + 1) * 10)\n\t_Mi\n\t_Gi\n\t_Ti\n\t_Pi\n)\n\nconst (\n\t_K = 1e3\n\t_M = 1e6\n\t_G = 1e9\n\t_T = 1e12\n\t_P = 1e15\n)\n\nconst (\n\t_Thousand    = 1e3\n\t_Million     = 1e6\n\t_Billion     = 1e9\n\t_Trillion    = 1e12\n\t_Quadrillion = 1e15\n)\n\ntype (\n\t// SizeScalar is the scalar for size.\n\tSizeScalar uint64\n\n\t// FLOPSScalar is the scalar for FLOPS.\n\tFLOPSScalar uint64\n\n\t// BytesPerSecondScalar is the scalar for bytes per second (Bps).\n\tBytesPerSecondScalar uint64\n)\n\nvar (\n\t// _GeneralBaseUnitMatrix is the base unit matrix for bytes.\n\t_GeneralBaseUnitMatrix = []struct {\n\t\tBase float64\n\t\tUnit string\n\t}{\n\t\t{_Pi, \"Pi\"},\n\t\t{_P, \"P\"},\n\t\t{_Ti, \"Ti\"},\n\t\t{_T, \"T\"},\n\t\t{_Gi, \"Gi\"},\n\t\t{_G, \"G\"},\n\t\t{_Mi, \"Mi\"},\n\t\t{_M, \"M\"},\n\t\t{_Ki, \"Ki\"},\n\t\t{_K, \"K\"},\n\t}\n\n\t// _SizeBaseUnitMatrix is the base unit matrix for size.\n\t_SizeBaseUnitMatrix = []struct {\n\t\tBase float64\n\t\tUnit string\n\t}{\n\t\t{_Pi, \"P\"},\n\t\t{_Ti, \"T\"},\n\t\t{_Gi, \"G\"},\n\t\t{_Mi, \"M\"},\n\t\t{_Ki, \"K\"},\n\t}\n\n\t// _NumberBaseUnitMatrix is the base unit matrix for numbers.\n\t_NumberBaseUnitMatrix = []struct {\n\t\tBase float64\n\t\tUnit string\n\t}{\n\t\t{_Quadrillion, \"Q\"},\n\t\t{_Trillion, \"T\"},\n\t\t{_Billion, \"B\"},\n\t\t{_Million, \"M\"},\n\t\t{_Thousand, \"K\"},\n\t}\n)\n\n// ParseSizeScalar parses the SizeScalar from the string.\nfunc ParseSizeScalar(s string) (_ SizeScalar, err error) {\n\tif s == \"\" {\n\t\treturn 0, errors.New(\"invalid SizeScalar\")\n\t}\n\tb := float64(1)\n\tfor i := range _SizeBaseUnitMatrix {\n\t\tif strings.HasSuffix(s, _SizeBaseUnitMatrix[i].Unit) {\n\t\t\tb = _SizeBaseUnitMatrix[i].Base\n\t\t\ts = strings.TrimSuffix(s, _SizeBaseUnitMatrix[i].Unit)\n\t\t\tbreak\n\t\t}\n\t}\n\tf, err := strconv.ParseFloat(strings.TrimSpace(s), 64)\n\tif err != nil 
{\n\t\treturn 0, err\n\t}\n\treturn SizeScalar(f * b), nil\n}\n\nfunc (s SizeScalar) String() string {\n\tif s == 0 {\n\t\treturn \"0\"\n\t}\n\tb, u := float64(1), \"\"\n\tfor i := range _SizeBaseUnitMatrix {\n\t\tif float64(s) >= _SizeBaseUnitMatrix[i].Base {\n\t\t\tb = _SizeBaseUnitMatrix[i].Base\n\t\t\tu = _SizeBaseUnitMatrix[i].Unit\n\t\t\tbreak\n\t\t}\n\t}\n\tf := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)\n\treturn strings.TrimSuffix(f, \".00\") + \" \" + u\n}\n\n// ParseFLOPSScalar parses the FLOPSScalar from the string.\nfunc ParseFLOPSScalar(s string) (_ FLOPSScalar, err error) {\n\tif s == \"\" {\n\t\treturn 0, errors.New(\"invalid FLOPSScalar\")\n\t}\n\ts = strings.TrimSuffix(s, \"FLOPS\")\n\tb := float64(1)\n\tfor i := range _GeneralBaseUnitMatrix {\n\t\tif strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) {\n\t\t\tb = _GeneralBaseUnitMatrix[i].Base\n\t\t\ts = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit)\n\t\t\tbreak\n\t\t}\n\t}\n\tf, err := strconv.ParseFloat(strings.TrimSpace(s), 64)\n\tif err != nil {\n\t\treturn 0, err\n\t}\n\treturn FLOPSScalar(f * b), nil\n}\n\nfunc (s FLOPSScalar) String() string {\n\tif s == 0 {\n\t\treturn \"0 FLOPS\"\n\t}\n\tb, u := float64(1), \"\"\n\tfor i := range _GeneralBaseUnitMatrix {\n\t\tif float64(s) >= _GeneralBaseUnitMatrix[i].Base {\n\t\t\tb = _GeneralBaseUnitMatrix[i].Base\n\t\t\tu = _GeneralBaseUnitMatrix[i].Unit\n\t\t\tbreak\n\t\t}\n\t}\n\tf := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)\n\treturn strings.TrimSuffix(f, \".00\") + \" \" + u + \"FLOPS\"\n}\n\n// ParseBytesPerSecondScalar parses the BytesPerSecondScalar from the string.\nfunc ParseBytesPerSecondScalar(s string) (_ BytesPerSecondScalar, err error) {\n\tif s == \"\" {\n\t\treturn 0, errors.New(\"invalid BytesPerSecondScalar\")\n\t}\n\tb := float64(1)\n\to := float64(1)\n\tswitch {\n\tcase strings.HasSuffix(s, \"Bps\") || strings.HasSuffix(s, \"B/s\"):\n\t\ts = strings.TrimSuffix(strings.TrimSuffix(s, \"Bps\"), \"B/s\")\n\tcase 
strings.HasSuffix(s, \"bps\") || strings.HasSuffix(s, \"b/s\"):\n\t\ts = strings.TrimSuffix(strings.TrimSuffix(s, \"bps\"), \"b/s\")\n\t\to = 8\n\t}\n\tfor i := range _GeneralBaseUnitMatrix {\n\t\tif strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) {\n\t\t\tb = _GeneralBaseUnitMatrix[i].Base\n\t\t\ts = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit)\n\t\t\tbreak\n\t\t}\n\t}\n\tf, err := strconv.ParseFloat(strings.TrimSpace(s), 64)\n\tif err != nil {\n\t\treturn 0, err\n\t}\n\treturn BytesPerSecondScalar(f * b / o), nil\n}\n\nfunc (s BytesPerSecondScalar) String() string {\n\tif s == 0 {\n\t\treturn \"0 Bps\"\n\t}\n\tb, u := float64(1), \"\"\n\tfor i := range _GeneralBaseUnitMatrix {\n\t\tif float64(s) >= _GeneralBaseUnitMatrix[i].Base {\n\t\t\tb = _GeneralBaseUnitMatrix[i].Base\n\t\t\tu = _GeneralBaseUnitMatrix[i].Unit\n\t\t\tbreak\n\t\t}\n\t}\n\tf := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)\n\treturn strings.TrimSuffix(f, \".00\") + \" \" + u + \"Bps\"\n}\n\ntype (\n\t// GGUFBytesScalar is the scalar for bytes.\n\tGGUFBytesScalar uint64\n\n\t// GGUFParametersScalar is the scalar for parameters.\n\tGGUFParametersScalar uint64\n\n\t// GGUFBitsPerWeightScalar is the scalar for bits per weight.\n\tGGUFBitsPerWeightScalar float64\n\n\t// GGUFTokensPerSecondScalar is the scalar for tokens per second.\n\tGGUFTokensPerSecondScalar float64\n)\n\n// ParseGGUFBytesScalar parses the GGUFBytesScalar from the string.\nfunc ParseGGUFBytesScalar(s string) (_ GGUFBytesScalar, err error) {\n\tif s == \"\" {\n\t\treturn 0, errors.New(\"invalid GGUFBytesScalar\")\n\t}\n\ts = strings.TrimSuffix(s, \"B\")\n\tb := float64(1)\n\tfor i := range _GeneralBaseUnitMatrix {\n\t\tif strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) {\n\t\t\tb = _GeneralBaseUnitMatrix[i].Base\n\t\t\ts = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit)\n\t\t\tbreak\n\t\t}\n\t}\n\tf, err := strconv.ParseFloat(strings.TrimSpace(s), 64)\n\tif err != nil {\n\t\treturn 0, err\n\t}\n\treturn 
GGUFBytesScalar(f * b), nil\n}\n\n// GGUFBytesScalarStringInMiBytes is the flag to show the GGUFBytesScalar string in MiB.\nvar GGUFBytesScalarStringInMiBytes bool\n\nfunc (s GGUFBytesScalar) String() string {\n\tif s == 0 {\n\t\treturn \"0 B\"\n\t}\n\tb, u := float64(1), \"\"\n\tif GGUFBytesScalarStringInMiBytes {\n\t\tb = _Mi\n\t\tu = \"Mi\"\n\t} else {\n\t\tfor i := range _GeneralBaseUnitMatrix {\n\t\t\tif float64(s) >= _GeneralBaseUnitMatrix[i].Base {\n\t\t\t\tb = _GeneralBaseUnitMatrix[i].Base\n\t\t\t\tu = _GeneralBaseUnitMatrix[i].Unit\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t}\n\tf := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)\n\treturn strings.TrimSuffix(f, \".00\") + \" \" + u + \"B\"\n}\n\nfunc (s GGUFParametersScalar) String() string {\n\tif s == 0 {\n\t\treturn \"0\"\n\t}\n\tb, u := float64(1), \"\"\n\tfor i := range _NumberBaseUnitMatrix {\n\t\tif float64(s) >= _NumberBaseUnitMatrix[i].Base {\n\t\t\tb = _NumberBaseUnitMatrix[i].Base\n\t\t\tu = _NumberBaseUnitMatrix[i].Unit\n\t\t\tbreak\n\t\t}\n\t}\n\tf := strconv.FormatFloat(float64(s)/b, 'f', 2, 64)\n\treturn strings.TrimSuffix(f, \".00\") + \" \" + u\n}\n\nfunc (s GGUFBitsPerWeightScalar) String() string {\n\tif s <= 0 {\n\t\treturn \"0 bpw\"\n\t}\n\treturn strconv.FormatFloat(float64(s), 'f', 2, 64) + \" bpw\"\n}\n\nfunc (s GGUFTokensPerSecondScalar) String() string {\n\tif s <= 0 {\n\t\treturn \"0 tps\"\n\t}\n\treturn strconv.FormatFloat(float64(s), 'f', 2, 64) + \" tps\"\n}\n"
  },
  {
    "path": "scalar_test.go",
    "content": "package gguf_parser\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestParseSizeScalar(t *testing.T) {\n\ttestCases := []struct {\n\t\tgiven    string\n\t\texpected SizeScalar\n\t}{\n\t\t{\"1\", 1},\n\t\t{\"1K\", 1 * _Ki},\n\t\t{\"1M\", 1 * _Mi},\n\t\t{\"1G\", 1 * _Gi},\n\t\t{\"1T\", 1 * _Ti},\n\t\t{\"1P\", 1 * _Pi},\n\t}\n\tfor _, tc := range testCases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual, err := ParseSizeScalar(tc.given)\n\t\t\tif !assert.NoError(t, err) {\n\t\t\t\treturn\n\t\t\t}\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n\nfunc TestParseFLOPSScalar(t *testing.T) {\n\ttestCases := []struct {\n\t\tgiven    string\n\t\texpected FLOPSScalar\n\t}{\n\t\t{\"1FLOPS\", 1},\n\t\t{\"1KFLOPS\", 1 * _K},\n\t\t{\"1MFLOPS\", 1 * _M},\n\t\t{\"1GFLOPS\", 1 * _G},\n\t\t{\"1TFLOPS\", 1 * _T},\n\t\t{\"1PFLOPS\", 1 * _P},\n\t}\n\tfor _, tc := range testCases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual, err := ParseFLOPSScalar(tc.given)\n\t\t\tif !assert.NoError(t, err) {\n\t\t\t\treturn\n\t\t\t}\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n\nfunc TestParseBytesPerSecondScalar(t *testing.T) {\n\ttestCases := []struct {\n\t\tgiven    string\n\t\texpected BytesPerSecondScalar\n\t}{\n\t\t{\"1B/s\", 1},\n\t\t{\"1KB/s\", 1 * _K},\n\t\t{\"1MB/s\", 1 * _M},\n\t\t{\"1GB/s\", 1 * _G},\n\t\t{\"1TB/s\", 1 * _T},\n\t\t{\"1PB/s\", 1 * _P},\n\t\t{\"1KiBps\", 1 * _Ki},\n\t\t{\"1MiBps\", 1 * _Mi},\n\t\t{\"1GiBps\", 1 * _Gi},\n\t\t{\"1TiBps\", 1 * _Ti},\n\t\t{\"1PiBps\", 1 * _Pi},\n\t\t{\"8b/s\", 1},\n\t\t{\"1Kbps\", 1 * _K >> 3},\n\t\t{\"1Mbps\", 1 * _M >> 3},\n\t\t{\"1Gbps\", 1 * _G >> 3},\n\t\t{\"1Tbps\", 1 * _T >> 3},\n\t\t{\"1Pbps\", 1 * _P >> 3},\n\t\t{\"1Kibps\", 1 * _Ki >> 3},\n\t\t{\"1Mibps\", 1 * _Mi >> 3},\n\t\t{\"1Gibps\", 1 * _Gi >> 3},\n\t\t{\"1Tibps\", 1 * _Ti >> 3},\n\t\t{\"1Pibps\", 1 * _Pi >> 3},\n\t}\n\tfor _, tc := range testCases {\n\t\tt.Run(tc.given, func(t 
*testing.T) {\n\t\t\tactual, err := ParseBytesPerSecondScalar(tc.given)\n\t\t\tif !assert.NoError(t, err) {\n\t\t\t\treturn\n\t\t\t}\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n\nfunc TestParseGGUFBytesScalar(t *testing.T) {\n\ttestCases := []struct {\n\t\tgiven    string\n\t\texpected GGUFBytesScalar\n\t}{\n\t\t{\"1B\", 1},\n\t\t{\"1KB\", 1 * _K},\n\t\t{\"1MB\", 1 * _M},\n\t\t{\"1GB\", 1 * _G},\n\t\t{\"1TB\", 1 * _T},\n\t\t{\"1PB\", 1 * _P},\n\t\t{\"1KiB\", 1 * _Ki},\n\t\t{\"1MiB\", 1 * _Mi},\n\t\t{\"1GiB\", 1 * _Gi},\n\t\t{\"1TiB\", 1 * _Ti},\n\t\t{\"1PiB\", 1 * _Pi},\n\t}\n\tfor _, tc := range testCases {\n\t\tt.Run(tc.given, func(t *testing.T) {\n\t\t\tactual, err := ParseGGUFBytesScalar(tc.given)\n\t\t\tif !assert.NoError(t, err) {\n\t\t\t\treturn\n\t\t\t}\n\t\t\tassert.Equal(t, tc.expected, actual)\n\t\t})\n\t}\n}\n"
  },
  {
    "path": "util/anyx/any.go",
    "content": "package anyx\n\nimport (\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"strconv\"\n\n\t\"golang.org/x/exp/constraints\"\n)\n\n// Number converts any type to the specified number type.\nfunc Number[T constraints.Integer | constraints.Float](v any) T {\n\tswitch vv := v.(type) {\n\tcase int:\n\t\treturn T(vv)\n\tcase int8:\n\t\treturn T(vv)\n\tcase int16:\n\t\treturn T(vv)\n\tcase int32:\n\t\treturn T(vv)\n\tcase int64:\n\t\treturn T(vv)\n\tcase uint:\n\t\treturn T(vv)\n\tcase uint8:\n\t\treturn T(vv)\n\tcase uint16:\n\t\treturn T(vv)\n\tcase uint32:\n\t\treturn T(vv)\n\tcase uint64:\n\t\treturn T(vv)\n\tcase float32:\n\t\treturn T(vv)\n\tcase float64:\n\t\treturn T(vv)\n\tcase bool:\n\t\tif vv {\n\t\t\treturn T(1)\n\t\t}\n\t\treturn T(0)\n\tcase string:\n\t\tx, err := strconv.ParseInt(vv, 10, 64)\n\t\tif err != nil {\n\t\t\ty, err := strconv.ParseFloat(vv, 64)\n\t\t\tif err != nil {\n\t\t\t\treturn T(0)\n\t\t\t} else {\n\t\t\t\treturn T(y)\n\t\t\t}\n\t\t}\n\t\treturn T(x)\n\tcase json.Number:\n\t\tx, err := vv.Int64()\n\t\tif err != nil {\n\t\t\ty, err := vv.Float64()\n\t\t\tif err != nil {\n\t\t\t\treturn T(0)\n\t\t\t} else {\n\t\t\t\treturn T(y)\n\t\t\t}\n\t\t}\n\t\treturn T(x)\n\tdefault:\n\t\treturn T(0)\n\t}\n}\n\n// Bool converts any type to a bool.\nfunc Bool(v any) bool {\n\tswitch vv := v.(type) {\n\tcase bool:\n\t\treturn vv\n\tcase int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64, uintptr:\n\t\treturn vv != 0\n\tcase float32, float64:\n\t\treturn vv != 0\n\tcase string:\n\t\treturn vv != \"0\"\n\tcase fmt.Stringer:\n\t\treturn vv.String() != \"0\"\n\tdefault:\n\t\treturn false\n\t}\n}\n\n// String converts any type to a string.\nfunc String(v any) string {\n\tswitch vv := v.(type) {\n\tcase string:\n\t\treturn vv\n\tcase []byte:\n\t\treturn string(vv)\n\tcase int:\n\t\treturn strconv.FormatInt(int64(vv), 10)\n\tcase int8:\n\t\treturn strconv.FormatInt(int64(vv), 10)\n\tcase int16:\n\t\treturn strconv.FormatInt(int64(vv), 
10)\n\tcase int32:\n\t\treturn strconv.FormatInt(int64(vv), 10)\n\tcase int64:\n\t\treturn strconv.FormatInt(vv, 10)\n\tcase uint:\n\t\treturn strconv.FormatUint(uint64(vv), 10)\n\tcase uint8:\n\t\treturn strconv.FormatUint(uint64(vv), 10)\n\tcase uint16:\n\t\treturn strconv.FormatUint(uint64(vv), 10)\n\tcase uint32:\n\t\treturn strconv.FormatUint(uint64(vv), 10)\n\tcase uint64:\n\t\treturn strconv.FormatUint(vv, 10)\n\tcase float32:\n\t\treturn strconv.FormatFloat(float64(vv), 'f', -1, 32)\n\tcase float64:\n\t\treturn strconv.FormatFloat(vv, 'f', -1, 64)\n\tcase bool:\n\t\treturn strconv.FormatBool(vv)\n\tcase fmt.Stringer:\n\t\treturn vv.String()\n\tcase json.RawMessage:\n\t\treturn string(vv)\n\tdefault:\n\t\treturn fmt.Sprintf(\"%v\", v)\n\t}\n}\n"
  },
  {
    "path": "util/bytex/pool.go",
    "content": "package bytex\n\nimport (\n\t\"bytes\"\n\t\"sync\"\n)\n\nconst defaultSize = 32 * 1024\n\ntype (\n\tBytes       = []byte\n\tBytesBuffer = *bytes.Buffer\n)\n\nvar gp = sync.Pool{\n\tNew: func() any {\n\t\tbuf := make(Bytes, defaultSize)\n\t\treturn &buf\n\t},\n}\n\n// GetBytes gets a bytes buffer from the pool,\n// which can specify with a size,\n// default is 32k.\nfunc GetBytes(size ...uint64) Bytes {\n\tbuf := *(gp.Get().(*Bytes))\n\n\ts := defaultSize\n\tif len(size) != 0 {\n\t\ts = int(size[0])\n\t\tif s == 0 {\n\t\t\ts = defaultSize\n\t\t}\n\t}\n\tif cap(buf) >= s {\n\t\treturn buf[:s]\n\t}\n\n\tgp.Put(&buf)\n\n\tns := s\n\tif ns < defaultSize {\n\t\tns = defaultSize\n\t}\n\tbuf = make(Bytes, ns)\n\treturn buf[:s]\n}\n\n// WithBytes relies on GetBytes to get a buffer,\n// calls the function with the buffer,\n// finally, puts it back to the pool after the function returns.\nfunc WithBytes(fn func(Bytes) error, size ...uint64) error {\n\tif fn == nil {\n\t\treturn nil\n\t}\n\n\tbuf := GetBytes(size...)\n\tdefer Put(buf)\n\treturn fn(buf)\n}\n\n// GetBuffer is similar to GetBytes,\n// but it returns the bytes buffer wrapped by bytes.Buffer.\nfunc GetBuffer(size ...uint64) BytesBuffer {\n\treturn bytes.NewBuffer(GetBytes(size...)[:0])\n}\n\n// WithBuffer relies on GetBuffer to get a buffer,\n// calls the function with the buffer,\n// finally, puts it back to the pool after the function returns.\nfunc WithBuffer(fn func(BytesBuffer) error, size ...uint64) error {\n\tif fn == nil {\n\t\treturn nil\n\t}\n\n\tbuf := GetBuffer(size...)\n\tdefer Put(buf)\n\treturn fn(buf)\n}\n\n// Put puts the buffer(either Bytes or BytesBuffer) back to the pool.\nfunc Put[T Bytes | BytesBuffer](buf T) {\n\tswitch v := any(buf).(type) {\n\tcase Bytes:\n\t\tgp.Put(&v)\n\tcase BytesBuffer:\n\t\tbs := v.Bytes()\n\t\tgp.Put(&bs)\n\t\tv.Reset()\n\t}\n}\n"
  },
  {
    "path": "util/funcx/error.go",
    "content": "package funcx\n\n// NoError ignores the given error,\n// it is usually a nice helper for chain function calling.\nfunc NoError[T any](t T, _ error) T {\n\treturn t\n}\n\n// NoError2 ignores the given error,\n// it is usually a nice helper for chain function calling.\nfunc NoError2[T, U any](t T, u U, _ error) (T, U) {\n\treturn t, u\n}\n\n// NoError3 ignores the given error,\n// it is usually a nice helper for chain function calling.\nfunc NoError3[T, U, V any](t T, u U, v V, _ error) (T, U, V) {\n\treturn t, u, v\n}\n\n// NoError4 ignores the given error,\n// it is usually a nice helper for chain function calling.\nfunc NoError4[T, U, V, W any](t T, u U, v V, w W, _ error) (T, U, V, W) {\n\treturn t, u, v, w\n}\n\n// MustNoError is similar to NoError,\n// but it panics if the given error is not nil,\n// it is usually a nice helper for chain function calling.\nfunc MustNoError[T any](t T, e error) T {\n\tif e != nil {\n\t\tpanic(e)\n\t}\n\treturn t\n}\n\n// MustNoError2 is similar to NoError2,\n// but it panics if the given error is not nil,\n// it is usually a nice helper for chain function calling.\nfunc MustNoError2[T, U any](t T, u U, e error) (T, U) {\n\tif e != nil {\n\t\tpanic(e)\n\t}\n\treturn t, u\n}\n\n// MustNoError3 is similar to NoError3,\n// but it panics if the given error is not nil,\n// it is usually a nice helper for chain function calling.\nfunc MustNoError3[T, U, V any](t T, u U, v V, e error) (T, U, V) {\n\tif e != nil {\n\t\tpanic(e)\n\t}\n\treturn t, u, v\n}\n\n// MustNoError4 is similar to NoError4,\n// but it panics if the given error is not nil,\n// it is usually a nice helper for chain function calling.\nfunc MustNoError4[T, U, V, W any](t T, u U, v V, w W, e error) (T, U, V, W) {\n\tif e != nil {\n\t\tpanic(e)\n\t}\n\treturn t, u, v, w\n}\n"
  },
  {
    "path": "util/httpx/client.go",
    "content": "package httpx\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"time\"\n\n\t\"github.com/henvic/httpretty\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/bytex\"\n)\n\n// DefaultClient is similar to the default http.Client used by the package.\n//\n// It is used for requests pooling.\nvar DefaultClient = &http.Client{\n\tTransport: DefaultTransport,\n}\n\n// DefaultInsecureClient is the default http.Client used by the package,\n// with TLS insecure skip verify.\n//\n// It is used for requests pooling.\nvar DefaultInsecureClient = &http.Client{\n\tTransport: DefaultInsecureTransport,\n}\n\n// Client returns a new http.Client with the given options,\n// the result http.Client is used for fast-consuming requests.\n//\n// If you want a requests pool management, use DefaultClient instead.\nfunc Client(opts ...*ClientOption) *http.Client {\n\tvar o *ClientOption\n\tif len(opts) > 0 {\n\t\to = opts[0]\n\t} else {\n\t\to = ClientOptions()\n\t}\n\n\troot := DefaultTransport\n\tif o.transport != nil {\n\t\troot = o.transport\n\t}\n\n\tif o.debug {\n\t\tpretty := &httpretty.Logger{\n\t\t\tTime:            true,\n\t\t\tTLS:             true,\n\t\t\tRequestHeader:   true,\n\t\t\tRequestBody:     true,\n\t\t\tMaxRequestBody:  1024,\n\t\t\tResponseHeader:  true,\n\t\t\tResponseBody:    true,\n\t\t\tMaxResponseBody: 1024,\n\t\t\tFormatters:      []httpretty.Formatter{&JSONFormatter{}},\n\t\t}\n\t\troot = pretty.RoundTripper(root)\n\t}\n\n\trtc := RoundTripperChain{\n\t\tNext: root,\n\t}\n\tfor i := range o.roundTrippers {\n\t\trtc = RoundTripperChain{\n\t\t\tDo:   o.roundTrippers[i],\n\t\t\tNext: rtc,\n\t\t}\n\t}\n\n\tvar rt http.RoundTripper = rtc\n\tif o.retryIf != nil {\n\t\trt = RoundTripperFunc(func(req *http.Request) (*http.Response, error) {\n\t\t\tfor i := 0; ; i++ {\n\t\t\t\tresp, err := rtc.RoundTrip(req)\n\t\t\t\tif !o.retryIf(resp, err) {\n\t\t\t\t\treturn resp, err\n\t\t\t\t}\n\t\t\t\tw, ok := o.retryBackoff(i+1, resp)\n\t\t\t\tif 
!ok {\n\t\t\t\t\treturn resp, err\n\t\t\t\t}\n\t\t\t\twt := time.NewTimer(w)\n\t\t\t\tselect {\n\t\t\t\tcase <-req.Context().Done():\n\t\t\t\t\twt.Stop()\n\t\t\t\t\treturn resp, req.Context().Err()\n\t\t\t\tcase <-wt.C:\n\t\t\t\t}\n\t\t\t}\n\t\t})\n\t}\n\n\treturn &http.Client{\n\t\tTransport: rt,\n\t\tTimeout:   o.timeout,\n\t}\n}\n\n// NewGetRequestWithContext returns a new http.MethodGet request,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewGetRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodGet, uri, nil)\n}\n\n// NewGetRequest returns a new http.MethodGet request,\n// which is saving your life from http.NewRequest.\nfunc NewGetRequest(uri string) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodGet, uri, nil)\n}\n\n// NewHeadRequestWithContext returns a new http.MethodHead request,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewHeadRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodHead, uri, nil)\n}\n\n// NewHeadRequest returns a new http.MethodHead request,\n// which is saving your life from http.NewRequest.\nfunc NewHeadRequest(uri string) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodHead, uri, nil)\n}\n\n// NewPostRequestWithContext returns a new http.MethodPost request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewPostRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodPost, uri, body)\n}\n\n// NewPostRequest returns a new http.MethodPost request,\n// which is saving your life from http.NewRequest.\nfunc NewPostRequest(uri string, body io.Reader) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodPost, uri, body)\n}\n\n// NewPutRequestWithContext returns a new 
http.MethodPut request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewPutRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodPut, uri, body)\n}\n\n// NewPutRequest returns a new http.MethodPut request,\n// which is saving your life from http.NewRequest.\nfunc NewPutRequest(uri string, body io.Reader) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodPut, uri, body)\n}\n\n// NewPatchRequestWithContext returns a new http.MethodPatch request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewPatchRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodPatch, uri, body)\n}\n\n// NewPatchRequest returns a new http.MethodPatch request,\n// which is saving your life from http.NewRequest.\nfunc NewPatchRequest(uri string, body io.Reader) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodPatch, uri, body)\n}\n\n// NewDeleteRequestWithContext returns a new http.MethodDelete request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewDeleteRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodDelete, uri, nil)\n}\n\n// NewDeleteRequest returns a new http.MethodDelete request,\n// which is saving your life from http.NewRequest.\nfunc NewDeleteRequest(uri string) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodDelete, uri, nil)\n}\n\n// NewConnectRequestWithContext returns a new http.MethodConnect request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewConnectRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodConnect, uri, 
nil)\n}\n\n// NewConnectRequest returns a new http.MethodConnect request,\n// which is saving your life from http.NewRequest.\nfunc NewConnectRequest(uri string) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodConnect, uri, nil)\n}\n\n// NewOptionsRequestWithContext returns a new http.MethodOptions request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewOptionsRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodOptions, uri, nil)\n}\n\n// NewOptionsRequest returns a new http.MethodOptions request,\n// which is saving your life from http.NewRequest.\nfunc NewOptionsRequest(uri string) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodOptions, uri, nil)\n}\n\n// NewTraceRequestWithContext returns a new http.MethodTrace request with the given context,\n// which is saving your life from http.NewRequestWithContext.\nfunc NewTraceRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {\n\treturn http.NewRequestWithContext(ctx, http.MethodTrace, uri, nil)\n}\n\n// NewTraceRequest returns a new http.MethodTrace request,\n// which is saving your life from http.NewRequest.\nfunc NewTraceRequest(uri string) (*http.Request, error) {\n\treturn http.NewRequest(http.MethodTrace, uri, nil)\n}\n\n// Error is similar to http.Error,\n// but it can get the error message by the given code.\nfunc Error(rw http.ResponseWriter, code int) {\n\thttp.Error(rw, http.StatusText(code), code)\n}\n\n// Close closes the http response body without error.\nfunc Close(resp *http.Response) {\n\tif resp != nil && resp.Body != nil {\n\t\t_ = resp.Body.Close()\n\t}\n}\n\n// BodyBytes returns the body of the http response as a byte slice.\nfunc BodyBytes(resp *http.Response) []byte {\n\tbuf := bytex.GetBytes()\n\tdefer bytex.Put(buf)\n\n\tw := bytex.GetBuffer()\n\t_, _ = io.CopyBuffer(w, resp.Body, buf)\n\treturn w.Bytes()\n}\n\n// 
BodyString returns the body of the http response as a string.\nfunc BodyString(resp *http.Response) string {\n\treturn string(BodyBytes(resp))\n}\n\n// Do is a helper function to execute the given http request with the given http client,\n// and execute the given function with the http response.\n//\n// It is useful to avoid forgetting to close the http response body.\n//\n// Do will return the error if failed to execute the http request or the given function.\nfunc Do(cli *http.Client, req *http.Request, respFunc func(*http.Response) error) error {\n\tresp, err := cli.Do(req)\n\tif err != nil {\n\t\treturn fmt.Errorf(\"do request: %w\", err)\n\t}\n\tdefer Close(resp)\n\tif respFunc == nil {\n\t\treturn nil\n\t}\n\treturn respFunc(resp)\n}\n"
  },
  {
    "path": "util/httpx/client_helper.go",
    "content": "package httpx\n\nimport (\n\t\"bytes\"\n\t\"errors\"\n\t\"io\"\n\t\"net/http\"\n\t\"regexp\"\n\n\t\"github.com/henvic/httpretty\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/json\"\n)\n\nvar _ httpretty.Formatter = (*JSONFormatter)(nil)\n\n// JSONFormatter is copied from httpretty.JSONFormatter,\n// but use our own json package.\ntype JSONFormatter struct{}\n\nvar jsonTypeRE = regexp.MustCompile(`[/+]json($|;)`)\n\n// Match JSON media type.\nfunc (j *JSONFormatter) Match(mediatype string) bool {\n\treturn jsonTypeRE.MatchString(mediatype)\n}\n\n// Format JSON content.\nfunc (j *JSONFormatter) Format(w io.Writer, src []byte) error {\n\tif !json.Valid(src) {\n\t\t// We want to get the error of json.checkValid, not unmarshal it.\n\t\t// The happy path has been optimized, maybe prematurely.\n\t\tif err := json.Unmarshal(src, &json.RawMessage{}); err != nil {\n\t\t\treturn err\n\t\t}\n\t}\n\t// Avoiding allocation as we use *bytes.Buffer to store the formatted body before printing\n\tdst, ok := w.(*bytes.Buffer)\n\tif !ok {\n\t\t// Mitigating panic to avoid upsetting anyone who uses this directly\n\t\treturn errors.New(\"underlying writer for JSONFormatter must be *bytes.Buffer\")\n\t}\n\treturn json.Indent(dst, src, \"\", \"    \")\n}\n\ntype RoundTripperChain struct {\n\tDo   func(req *http.Request) error\n\tNext http.RoundTripper\n}\n\nfunc (c RoundTripperChain) RoundTrip(req *http.Request) (*http.Response, error) {\n\tif c.Do != nil {\n\t\tif err := c.Do(req); err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t}\n\tif c.Next != nil {\n\t\treturn c.Next.RoundTrip(req)\n\t}\n\treturn nil, nil\n}\n\ntype RoundTripperFunc func(*http.Request) (*http.Response, error)\n\nfunc (fn RoundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) {\n\treturn fn(req)\n}\n"
  },
  {
    "path": "util/httpx/client_options.go",
    "content": "package httpx\n\nimport (\n\t\"math\"\n\t\"net/http\"\n\t\"strconv\"\n\t\"strings\"\n\t\"time\"\n)\n\ntype ClientOption struct {\n\t*TransportOption\n\n\ttimeout       time.Duration\n\tdebug         bool\n\tretryIf       RetryFunc\n\tretryBackoff  func(attemptNum int, resp *http.Response) (wait time.Duration, ok bool)\n\troundTrippers []func(req *http.Request) error\n}\n\nfunc ClientOptions() *ClientOption {\n\treturn &ClientOption{\n\t\tTransportOption: TransportOptions().WithoutKeepalive(),\n\t\ttimeout:         30 * time.Second,\n\t\tretryIf:         DefaultRetry,\n\t\tretryBackoff:    createRetryBackoff(100*time.Millisecond, 5*time.Second, 5),\n\t}\n}\n\n// WithTransport sets the TransportOption.\nfunc (o *ClientOption) WithTransport(opt *TransportOption) *ClientOption {\n\tif o == nil || opt == nil {\n\t\treturn o\n\t}\n\to.TransportOption = opt\n\treturn o\n}\n\n// WithTimeout sets the request timeout.\n//\n// This timeout controls the sum of [network dial], [tls handshake], [request], [response header reading] and [response body reading].\n//\n// Use 0 to disable timeout.\nfunc (o *ClientOption) WithTimeout(timeout time.Duration) *ClientOption {\n\tif o == nil || timeout < 0 {\n\t\treturn o\n\t}\n\to.timeout = timeout\n\treturn o\n}\n\n// WithDebug sets the debug mode.\nfunc (o *ClientOption) WithDebug() *ClientOption {\n\tif o == nil {\n\t\treturn o\n\t}\n\to.debug = true\n\treturn o\n}\n\ntype RetryFunc func(resp *http.Response, err error) (retry bool)\n\n// WithRetryIf specifies the if-condition of retry operation for request,\n// or stops retrying if setting with `nil`.\nfunc (o *ClientOption) WithRetryIf(retryIf RetryFunc) *ClientOption {\n\tif o == nil {\n\t\treturn o\n\t}\n\to.retryIf = retryIf\n\treturn o\n}\n\n// WithRetryBackoff specifies the retry-backoff mechanism for request.\nfunc (o *ClientOption) WithRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) *ClientOption {\n\tif o == nil || waitMin < 0 || waitMax < 0 || 
waitMax < waitMin || attemptMax <= 0 {\n\t\treturn o\n\t}\n\to.retryBackoff = createRetryBackoff(waitMin, waitMax, attemptMax)\n\treturn o\n}\n\n// WithUserAgent sets the user agent.\nfunc (o *ClientOption) WithUserAgent(ua string) *ClientOption {\n\treturn o.WithRoundTripper(func(req *http.Request) error {\n\t\treq.Header.Set(\"User-Agent\", ua)\n\t\treturn nil\n\t})\n}\n\n// WithBearerAuth sets the bearer token.\nfunc (o *ClientOption) WithBearerAuth(token string) *ClientOption {\n\treturn o.WithRoundTripper(func(req *http.Request) error {\n\t\treq.Header.Set(\"Authorization\", \"Bearer \"+token)\n\t\treturn nil\n\t})\n}\n\n// WithBasicAuth sets the basic authentication.\nfunc (o *ClientOption) WithBasicAuth(username, password string) *ClientOption {\n\treturn o.WithRoundTripper(func(req *http.Request) error {\n\t\treq.SetBasicAuth(username, password)\n\t\treturn nil\n\t})\n}\n\n// WithHeader sets the header.\nfunc (o *ClientOption) WithHeader(key, value string) *ClientOption {\n\treturn o.WithRoundTripper(func(req *http.Request) error {\n\t\treq.Header.Set(key, value)\n\t\treturn nil\n\t})\n}\n\n// WithHeaders sets the headers.\nfunc (o *ClientOption) WithHeaders(headers map[string]string) *ClientOption {\n\treturn o.WithRoundTripper(func(req *http.Request) error {\n\t\tfor k, v := range headers {\n\t\t\treq.Header.Set(k, v)\n\t\t}\n\t\treturn nil\n\t})\n}\n\n// WithRoundTripper sets the round tripper.\nfunc (o *ClientOption) WithRoundTripper(rt func(req *http.Request) error) *ClientOption {\n\tif o == nil || rt == nil {\n\t\treturn o\n\t}\n\to.roundTrippers = append(o.roundTrippers, rt)\n\treturn o\n}\n\n// If is a conditional option,\n// which receives a boolean condition to trigger the given function or not.\nfunc (o *ClientOption) If(condition bool, then func(*ClientOption) *ClientOption) *ClientOption {\n\tif condition {\n\t\treturn then(o)\n\t}\n\treturn o\n}\n\n// DefaultRetry is the default retry condition,\n// inspired by 
https://github.com/hashicorp/go-retryablehttp/blob/40b0cad1633fd521cee5884724fcf03d039aaf3f/client.go#L68-L86.\nfunc DefaultRetry(resp *http.Response, respErr error) bool {\n\tif respErr != nil {\n\t\tswitch errMsg := respErr.Error(); {\n\t\tcase strings.Contains(errMsg, `redirects`):\n\t\t\treturn false\n\t\tcase strings.Contains(errMsg, `unsupported protocol scheme`):\n\t\t\treturn false\n\t\tcase strings.Contains(errMsg, `certificate is not trusted`):\n\t\t\treturn false\n\t\tcase strings.Contains(errMsg, `invalid header`):\n\t\t\treturn false\n\t\tcase strings.Contains(errMsg, `failed to verify certificate`):\n\t\t\treturn false\n\t\t}\n\n\t\t// Retry if receiving connection closed.\n\t\treturn true\n\t}\n\n\t// Retry if receiving rate-limited of server.\n\tif resp.StatusCode == http.StatusTooManyRequests {\n\t\treturn true\n\t}\n\n\t// Retry if receiving unexpected responses.\n\tif resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) {\n\t\treturn true\n\t}\n\n\treturn false\n}\n\n// createRetryBackoff creates a backoff function for retry operation.\nfunc createRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) func(int, *http.Response) (time.Duration, bool) {\n\treturn func(attemptNum int, resp *http.Response) (wait time.Duration, ok bool) {\n\t\tif attemptNum > attemptMax {\n\t\t\treturn 0, false\n\t\t}\n\n\t\tif resp != nil && (resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode == http.StatusServiceUnavailable) {\n\t\t\tif retryAfter := resp.Header.Get(\"Retry-After\"); retryAfter != \"\" {\n\t\t\t\tif seconds, err := strconv.Atoi(retryAfter); err == nil {\n\t\t\t\t\treturn time.Duration(seconds) * time.Second, true\n\t\t\t\t}\n\t\t\t}\n\t\t}\n\n\t\twait = time.Duration(math.Pow(2, float64(attemptNum)) * float64(waitMin))\n\t\treturn min(wait, waitMax), true\n\t}\n}\n"
  },
  {
    "path": "util/httpx/file.go",
    "content": "package httpx\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"strings\"\n\t\"syscall\"\n\n\t\"github.com/smallnest/ringbuffer\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/bytex\"\n)\n\ntype SeekerFile struct {\n\tcli *http.Client\n\treq *http.Request\n\tb   *ringbuffer.RingBuffer\n\tc   int64\n\tl   int64\n}\n\n// OpenSeekerFile tries the GET http.Request as a SeekerFile,\n// and returns a SeekerFile, or an error if any.\nfunc OpenSeekerFile(cli *http.Client, req *http.Request, opts ...*SeekerFileOption) (*SeekerFile, error) {\n\tif cli == nil {\n\t\treturn nil, errors.New(\"client is nil\")\n\t}\n\tif req == nil {\n\t\treturn nil, errors.New(\"request is nil\")\n\t}\n\tif req.Method != http.MethodGet {\n\t\treturn nil, errors.New(\"request method is not GET\")\n\t}\n\n\tvar o *SeekerFileOption\n\tif len(opts) > 0 {\n\t\to = opts[0]\n\t} else {\n\t\to = SeekerFileOptions()\n\t}\n\tif o.bufSize <= 0 {\n\t\to.bufSize = 4 * 1024 * 1024 // 4mb\n\t}\n\n\tvar l int64\n\t{\n\t\tif !o.skipRangeDownloadDetect {\n\t\t\treq := req.Clone(req.Context())\n\t\t\treq.Method = http.MethodHead\n\t\t\terr := Do(cli, req, func(resp *http.Response) error {\n\t\t\t\tif resp.StatusCode != http.StatusOK {\n\t\t\t\t\treturn fmt.Errorf(\"stat: status code %d\", resp.StatusCode)\n\t\t\t\t}\n\t\t\t\tif !strings.EqualFold(resp.Header.Get(\"Accept-Ranges\"), \"bytes\") {\n\t\t\t\t\treturn fmt.Errorf(\"stat: not support range download\")\n\t\t\t\t}\n\t\t\t\tl = resp.ContentLength\n\t\t\t\treturn nil\n\t\t\t})\n\t\t\tif err != nil {\n\t\t\t\treturn nil, fmt.Errorf(\"stat: do head request: %w\", err)\n\t\t\t}\n\t\t} else {\n\t\t\treq := req.Clone(req.Context())\n\t\t\terr := Do(cli, req, func(resp *http.Response) error {\n\t\t\t\tif resp.StatusCode != http.StatusOK {\n\t\t\t\t\treturn fmt.Errorf(\"stat: status code %d\", resp.StatusCode)\n\t\t\t\t}\n\t\t\t\tl = resp.ContentLength\n\t\t\t\treturn nil\n\t\t\t})\n\t\t\tif err != nil {\n\t\t\t\treturn nil, 
fmt.Errorf(\"stat: do get request: %w\", err)\n\t\t\t}\n\t\t}\n\t\tswitch sz := int64(o.size); {\n\t\tcase sz > l:\n\t\t\treturn nil, fmt.Errorf(\"size %d is greater than limit %d\", o.size, l)\n\t\tcase sz <= 0:\n\t\tdefault:\n\t\t\tl = sz\n\t\t}\n\t}\n\n\tb := ringbuffer.New(o.bufSize).WithCancel(req.Context())\n\treturn &SeekerFile{cli: cli, req: req, b: b, c: 1<<63 - 1, l: l}, nil\n}\n\nfunc (f *SeekerFile) Close() error {\n\tif f.b != nil {\n\t\tf.b.CloseWriter()\n\t}\n\treturn nil\n}\n\nfunc (f *SeekerFile) Len() int64 {\n\treturn f.l\n}\n\nfunc (f *SeekerFile) ReadAt(p []byte, off int64) (int, error) {\n\tif off < 0 {\n\t\treturn 0, syscall.EINVAL\n\t}\n\tif off > f.Len() {\n\t\treturn 0, io.EOF\n\t}\n\n\t// Sync and move to new offset, if backward or empty buffer.\n\tif f.c > off || f.b.IsEmpty() {\n\t\tif err := f.sync(off, true); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t}\n\n\tvar (\n\t\tremain   = int64(f.b.Length())\n\t\tcapacity = int64(f.b.Capacity())\n\t\tneed     = int64(len(p))\n\t)\n\n\tswitch {\n\tcase f.c+remain >= off+need: // Skip and move to new offset, if enough to forward.\n\t\tif err := f.skip(off - f.c); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\treturn f.Read(p)\n\tcase f.c+capacity >= off+need: // Sync and move to new offset, if enough to forward after synced.\n\t\tif err := f.sync(f.c+remain, false); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\tif err := f.skip(off - f.c); err != nil {\n\t\t\treturn 0, err\n\t\t}\n\t\treturn f.Read(p)\n\tdefault:\n\t}\n\n\t// Otherwise, read directly.\n\n\tf.b.Reset()\n\tf.c = off\n\n\t// Request remain needing.\n\tlim := off + int64(len(p)) - 1\n\tif lim > f.Len() {\n\t\tlim = f.Len()\n\t}\n\treq := f.req.Clone(f.req.Context())\n\treq.Header.Set(\"Range\", fmt.Sprintf(\"bytes=%d-%d\", off, lim))\n\tresp, err := f.cli.Do(req)\n\tif err != nil {\n\t\treturn 0, err\n\t}\n\tdefer Close(resp)\n\tif resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK {\n\t\treturn 0, 
errors.New(resp.Status)\n\t}\n\tn, err := resp.Body.Read(p)\n\tf.c += int64(n)\n\treturn n, err\n}\n\nfunc (f *SeekerFile) Read(p []byte) (int, error) {\n\tn, err := f.b.Read(p)\n\tf.c += int64(n)\n\treturn n, err\n}\n\nfunc (f *SeekerFile) sync(off int64, reset bool) error {\n\tlim := off + int64(f.b.Free()) - 1\n\tif lim > f.Len() {\n\t\tlim = f.Len()\n\t}\n\treq := f.req.Clone(f.req.Context())\n\treq.Header.Set(\"Range\", fmt.Sprintf(\"bytes=%d-%d\", off, lim))\n\n\tresp, err := f.cli.Do(req)\n\tif err != nil {\n\t\treturn err\n\t}\n\tdefer Close(resp)\n\tif resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK {\n\t\treturn errors.New(resp.Status)\n\t}\n\n\tbuf := bytex.GetBytes()\n\tdefer bytex.Put(buf)\n\tif reset {\n\t\tf.b.Reset()\n\t\tf.c = off\n\t}\n\n\t_, err = io.CopyBuffer(_WriterOnly{w: f.b}, resp.Body, buf)\n\tif err != nil {\n\t\treturn err\n\t}\n\n\treturn nil\n}\n\nfunc (f *SeekerFile) skip(dif int64) error {\n\tif dif <= 0 {\n\t\treturn nil\n\t}\n\n\tbuf := bytex.GetBytes(uint64(dif))\n\tdefer bytex.Put(buf)\n\tn, err := f.b.Read(buf)\n\tf.c += int64(n)\n\tif err != nil {\n\t\treturn err\n\t}\n\treturn nil\n}\n\n// _WriterOnly is a wrapper to expose the io.Writer method only,\n// which to avoid calling the io.ReaderFrom method.\ntype _WriterOnly struct {\n\tw io.Writer\n}\n\nfunc (w _WriterOnly) Write(p []byte) (int, error) {\n\treturn w.w.Write(p)\n}\n"
  },
  {
    "path": "util/httpx/file_options.go",
    "content": "package httpx\n\ntype SeekerFileOption struct {\n\tbufSize                 int\n\tsize                    int\n\tskipRangeDownloadDetect bool\n}\n\nfunc SeekerFileOptions() *SeekerFileOption {\n\treturn &SeekerFileOption{\n\t\tbufSize: 4 * 1024 * 1024, // 4mb\n\t}\n}\n\n// WithBufferSize sets the size of the buffer to read the file,\n//\n// Default is 4mb.\nfunc (o *SeekerFileOption) WithBufferSize(bufSize int) *SeekerFileOption {\n\tif o == nil || bufSize <= 0 {\n\t\treturn o\n\t}\n\to.bufSize = bufSize\n\treturn o\n}\n\n// WithSize sets the size of the file to read,\n//\n// If the size is greater than the content size of the file, it will return an error.\nfunc (o *SeekerFileOption) WithSize(size int) *SeekerFileOption {\n\tif o == nil || size <= 0 {\n\t\treturn o\n\t}\n\to.size = size\n\treturn o\n}\n\n// WithoutRangeDownloadDetect disables range download detection.\n//\n// Usually, OpenSeekerFile sends a \"HEAD\" HTTP request to destination to get the content size from the \"Content-Length\" header,\n// and confirms whether supports range download via the \"Accept-Ranges\" header.\n// However, some servers may not support the \"HEAD\" method, or the \"Accept-Ranges\" header is not set correctly.\n//\n// With this option, OpenSeekerFile sends \"GET\" HTTP request to get the content size as usual,\n// and does not confirm whether supports range download. But during the seeking read,\n// it still uses the \"Range\" header to read the file.\nfunc (o *SeekerFileOption) WithoutRangeDownloadDetect() *SeekerFileOption {\n\tif o == nil {\n\t\treturn o\n\t}\n\to.skipRangeDownloadDetect = true\n\treturn o\n}\n\n// If is a conditional option,\n// which receives a boolean condition to trigger the given function or not.\nfunc (o *SeekerFileOption) If(condition bool, then func(*SeekerFileOption) *SeekerFileOption) *SeekerFileOption {\n\tif condition {\n\t\treturn then(o)\n\t}\n\treturn o\n}\n"
  },
  {
    "path": "util/httpx/proxy.go",
    "content": "package httpx\n\nimport (\n\t\"net\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"strings\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n)\n\nvar noProxies []*net.IPNet\n\nfunc init() {\n\tnoProxyEnv := osx.Getenv(\"NO_PROXY\", osx.Getenv(\"no_proxy\"))\n\tnoProxyRules := strings.Split(noProxyEnv, \",\")\n\tfor i := range noProxyRules {\n\t\t_, cidr, _ := net.ParseCIDR(noProxyRules[i])\n\t\tif cidr != nil {\n\t\t\tnoProxies = append(noProxies, cidr)\n\t\t}\n\t}\n}\n\n// ProxyFromEnvironment is similar to http.ProxyFromEnvironment,\n// but it also respects the NO_PROXY environment variable.\nfunc ProxyFromEnvironment(r *http.Request) (*url.URL, error) {\n\tif ip := net.ParseIP(r.URL.Hostname()); ip != nil {\n\t\tfor i := range noProxies {\n\t\t\tif noProxies[i].Contains(ip) {\n\t\t\t\treturn nil, nil\n\t\t\t}\n\t\t}\n\t}\n\n\treturn http.ProxyFromEnvironment(r)\n}\n"
  },
  {
    "path": "util/httpx/resolver.go",
    "content": "package httpx\n\nimport (\n\t\"context\"\n\t\"net\"\n)\n\nfunc DNSCacheDialContext(dialer *net.Dialer) func(context.Context, string, string) (net.Conn, error) {\n\tcs := map[string][]net.IP{}\n\n\treturn func(ctx context.Context, nw, addr string) (conn net.Conn, err error) {\n\t\th, p, err := net.SplitHostPort(addr)\n\t\tif err != nil {\n\t\t\treturn nil, err\n\t\t}\n\t\tips, ok := cs[h]\n\t\tif !ok {\n\t\t\tips, err = net.DefaultResolver.LookupIP(ctx, \"ip4\", h)\n\t\t\tif len(ips) == 0 {\n\t\t\t\tips, err = net.DefaultResolver.LookupIP(ctx, \"ip\", h)\n\t\t\t}\n\t\t\tif err != nil {\n\t\t\t\treturn nil, err\n\t\t\t}\n\t\t\tcs[h] = ips\n\t\t}\n\t\t// Try to connect to each IP address in order.\n\t\tfor _, ip := range ips {\n\t\t\tconn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip.String(), p))\n\t\t\tif err == nil {\n\t\t\t\tbreak\n\t\t\t}\n\t\t}\n\t\treturn conn, err\n\t}\n}\n"
  },
  {
    "path": "util/httpx/transport.go",
    "content": "package httpx\n\nimport (\n\t\"net/http\"\n)\n\n// DefaultTransport is similar to the default http.DefaultTransport used by the package.\nvar DefaultTransport http.RoundTripper = Transport()\n\n// DefaultInsecureTransport is the default http.DefaultTransport used by the package,\n// with TLS insecure skip verify.\nvar DefaultInsecureTransport http.RoundTripper = Transport(TransportOptions().WithoutInsecureVerify())\n\n// Transport returns a new http.Transport with the given options,\n// the result http.Transport is used for constructing http.Client.\nfunc Transport(opts ...*TransportOption) *http.Transport {\n\tvar o *TransportOption\n\tif len(opts) > 0 {\n\t\to = opts[0]\n\t} else {\n\t\to = TransportOptions()\n\t}\n\n\treturn o.transport\n}\n"
  },
  {
    "path": "util/httpx/transport_options.go",
    "content": "package httpx\n\nimport (\n\t\"crypto/tls\"\n\t\"net\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"time\"\n)\n\ntype TransportOption struct {\n\tdialer    *net.Dialer\n\ttransport *http.Transport\n}\n\nfunc TransportOptions() *TransportOption {\n\tdialer := &net.Dialer{\n\t\tTimeout:   30 * time.Second,\n\t\tKeepAlive: 30 * time.Second,\n\t}\n\ttransport := &http.Transport{\n\t\tProxy: ProxyFromEnvironment,\n\t\tTLSClientConfig: &tls.Config{\n\t\t\tMinVersion: tls.VersionTLS12,\n\t\t},\n\t\tDialContext:           DNSCacheDialContext(dialer),\n\t\tForceAttemptHTTP2:     true,\n\t\tMaxIdleConns:          100,\n\t\tIdleConnTimeout:       90 * time.Second,\n\t\tTLSHandshakeTimeout:   10 * time.Second,\n\t\tExpectContinueTimeout: 1 * time.Second,\n\t}\n\n\treturn &TransportOption{\n\t\tdialer:    dialer,\n\t\ttransport: transport,\n\t}\n}\n\n// WithProxy sets the proxy.\nfunc (o *TransportOption) WithProxy(proxy func(*http.Request) (*url.URL, error)) *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.transport.Proxy = proxy\n\treturn o\n}\n\n// WithoutProxy disables the proxy.\nfunc (o *TransportOption) WithoutProxy() *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.transport.Proxy = nil\n\treturn o\n}\n\n// WithKeepalive sets the keepalive.\nfunc (o *TransportOption) WithKeepalive(timeoutAndKeepalive ...time.Duration) *TransportOption {\n\tif o == nil || o.transport == nil || o.dialer == nil {\n\t\treturn o\n\t}\n\ttak := [2]time.Duration{30 * time.Second, 30 * time.Second}\n\tif len(timeoutAndKeepalive) > 0 {\n\t\ttak[0] = timeoutAndKeepalive[0]\n\t\tif len(timeoutAndKeepalive) > 1 {\n\t\t\ttak[1] = timeoutAndKeepalive[1]\n\t\t}\n\t}\n\to.dialer.Timeout, o.dialer.KeepAlive = tak[0], tak[1]\n\to.transport.MaxIdleConns = 100\n\to.transport.IdleConnTimeout = 90 * time.Second\n\treturn o\n}\n\n// WithoutKeepalive disables the keepalive.\nfunc (o *TransportOption) WithoutKeepalive() *TransportOption 
{\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.dialer.KeepAlive = -1\n\to.transport.MaxIdleConns = 0\n\to.transport.IdleConnTimeout = 0\n\treturn o\n}\n\n// WithInsecureVerify verifies the insecure connection.\nfunc (o *TransportOption) WithInsecureVerify() *TransportOption {\n\tif o == nil || o.transport == nil || o.transport.TLSClientConfig == nil {\n\t\treturn o\n\t}\n\to.transport.TLSClientConfig.InsecureSkipVerify = false\n\treturn o\n}\n\n// WithoutInsecureVerify skips the insecure connection verify.\nfunc (o *TransportOption) WithoutInsecureVerify() *TransportOption {\n\tif o == nil || o.transport == nil || o.transport.TLSClientConfig == nil {\n\t\treturn o\n\t}\n\to.transport.TLSClientConfig.InsecureSkipVerify = true\n\treturn o\n}\n\n// TimeoutForDial sets the timeout for network dial.\n//\n// This timeout controls the [network dial] only.\n//\n// Use 0 to disable timeout.\nfunc (o *TransportOption) TimeoutForDial(timeout time.Duration) *TransportOption {\n\tif o == nil || o.dialer == nil {\n\t\treturn o\n\t}\n\to.dialer.Timeout = timeout\n\treturn o\n}\n\n// TimeoutForResponseHeader sets the timeout for response header.\n//\n// This timeout controls the [response header reading] only.\n//\n// Use 0 to disable timeout.\nfunc (o *TransportOption) TimeoutForResponseHeader(timeout time.Duration) *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.transport.ResponseHeaderTimeout = timeout\n\treturn o\n}\n\n// TimeoutForTLSHandshake sets the timeout for tls handshake.\n//\n// This timeout controls the [tls handshake] only.\n//\n// Use 0 to disable timeout.\nfunc (o *TransportOption) TimeoutForTLSHandshake(timeout time.Duration) *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.transport.TLSHandshakeTimeout = timeout\n\treturn o\n}\n\n// TimeoutForIdleConn sets the timeout for idle connection.\n//\n// This timeout controls the [idle connection lifetime] only.\n//\n// Use 0 to disable 
timeout.\nfunc (o *TransportOption) TimeoutForIdleConn(timeout time.Duration) *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.transport.IdleConnTimeout = timeout\n\treturn o\n}\n\n// WithTLSClientConfig sets the tls.Config.\nfunc (o *TransportOption) WithTLSClientConfig(config *tls.Config) *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.transport.TLSClientConfig = config\n\treturn o\n}\n\n// WithoutDNSCache disables the dns cache.\nfunc (o *TransportOption) WithoutDNSCache() *TransportOption {\n\tif o == nil || o.transport == nil || o.dialer == nil {\n\t\treturn o\n\t}\n\to.transport.DialContext = o.dialer.DialContext\n\treturn o\n}\n\n// WithDialer sets the dialer.\nfunc (o *TransportOption) WithDialer(dialer *net.Dialer) *TransportOption {\n\tif o == nil || o.transport == nil || dialer == nil {\n\t\treturn o\n\t}\n\to.dialer = dialer\n\to.transport.DialContext = DNSCacheDialContext(o.dialer)\n\treturn o\n}\n\n// Customize sets the transport.\nfunc (o *TransportOption) Customize(fn func(*http.Transport)) *TransportOption {\n\tif o == nil || o.transport == nil {\n\t\treturn o\n\t}\n\to.dialer = nil\n\tfn(o.transport)\n\treturn o\n}\n\n// If is a conditional option,\n// which receives a boolean condition to trigger the given function or not.\nfunc (o *TransportOption) If(condition bool, then func(*TransportOption) *TransportOption) *TransportOption {\n\tif condition {\n\t\treturn then(o)\n\t}\n\treturn o\n}\n"
  },
  {
    "path": "util/json/common.go",
    "content": "package json\n\nimport (\n\tstdjson \"encoding/json\"\n\t\"fmt\"\n)\n\ntype RawMessage = stdjson.RawMessage\n\nvar (\n\tMarshalIndent = stdjson.MarshalIndent\n\tIndent        = stdjson.Indent\n\tNewEncoder    = stdjson.NewEncoder\n\tValid         = stdjson.Valid\n)\n\n// MustMarshal is similar to Marshal,\n// but panics if found error.\nfunc MustMarshal(v any) []byte {\n\tbs, err := Marshal(v)\n\tif err != nil {\n\t\tpanic(fmt.Errorf(\"error marshaling json: %w\", err))\n\t}\n\n\treturn bs\n}\n\n// MustUnmarshal is similar to Unmarshal,\n// but panics if found error.\nfunc MustUnmarshal(data []byte, v any) {\n\terr := Unmarshal(data, v)\n\tif err != nil {\n\t\tpanic(fmt.Errorf(\"error unmarshaling json: %w\", err))\n\t}\n}\n\n// MustMarshalIndent is similar to MarshalIndent,\n// but panics if found error.\nfunc MustMarshalIndent(v any, prefix, indent string) []byte {\n\tbs, err := MarshalIndent(v, prefix, indent)\n\tif err != nil {\n\t\tpanic(fmt.Errorf(\"error marshaling indent json: %w\", err))\n\t}\n\n\treturn bs\n}\n\n// ShouldMarshal is similar to Marshal,\n// but never return error.\nfunc ShouldMarshal(v any) []byte {\n\tbs, _ := Marshal(v)\n\treturn bs\n}\n\n// ShouldUnmarshal is similar to Unmarshal,\n// but never return error.\nfunc ShouldUnmarshal(data []byte, v any) {\n\t_ = Unmarshal(data, v)\n}\n\n// ShouldMarshalIndent is similar to MarshalIndent,\n// but never return error.\nfunc ShouldMarshalIndent(v any, prefix, indent string) []byte {\n\tbs, _ := MarshalIndent(v, prefix, indent)\n\treturn bs\n}\n"
  },
  {
    "path": "util/json/jsoniter.go",
    "content": "//go:build !stdjson\n\npackage json\n\nimport (\n\tstdjson \"encoding/json\"\n\t\"strconv\"\n\t\"unsafe\"\n\n\tjsoniter \"github.com/json-iterator/go\"\n)\n\nvar json = jsoniter.ConfigCompatibleWithStandardLibrary\n\nfunc init() {\n\t// borrowed from https://github.com/json-iterator/go/issues/145#issuecomment-323483602\n\tdecodeNumberAsInt64IfPossible := func(ptr unsafe.Pointer, iter *jsoniter.Iterator) {\n\t\tswitch iter.WhatIsNext() {\n\t\tcase jsoniter.NumberValue:\n\t\t\tvar number stdjson.Number\n\n\t\t\titer.ReadVal(&number)\n\t\t\ti, err := strconv.ParseInt(string(number), 10, 64)\n\n\t\t\tif err == nil {\n\t\t\t\t*(*any)(ptr) = i\n\t\t\t\treturn\n\t\t\t}\n\n\t\t\tf, err := strconv.ParseFloat(string(number), 64)\n\t\t\tif err == nil {\n\t\t\t\t*(*any)(ptr) = f\n\t\t\t\treturn\n\t\t\t}\n\t\tdefault:\n\t\t\t*(*any)(ptr) = iter.Read()\n\t\t}\n\t}\n\tjsoniter.RegisterTypeDecoderFunc(\"interface {}\", decodeNumberAsInt64IfPossible)\n\tjsoniter.RegisterTypeDecoderFunc(\"any\", decodeNumberAsInt64IfPossible)\n}\n\nvar (\n\tMarshal    = json.Marshal\n\tUnmarshal  = json.Unmarshal\n\tNewDecoder = json.NewDecoder\n)\n"
  },
  {
    "path": "util/json/stdjson.go",
    "content": "//go:build stdjson\n\npackage json\n\nimport (\n\t\"encoding/json\"\n)\n\nvar (\n\tMarshal    = json.Marshal\n\tUnmarshal  = json.Unmarshal\n\tNewDecoder = json.NewDecoder\n)\n"
  },
  {
    "path": "util/osx/env.go",
    "content": "package osx\n\nimport (\n\t\"os\"\n)\n\n// ExistEnv checks if the environment variable named by the key exists.\nfunc ExistEnv(key string) bool {\n\t_, ok := os.LookupEnv(key)\n\treturn ok\n}\n\n// Getenv retrieves the value of the environment variable named by the key.\n// It returns the default, which will be empty if the variable is not present.\n// To distinguish between an empty value and an unset value, use LookupEnv.\nfunc Getenv(key string, def ...string) string {\n\te, ok := os.LookupEnv(key)\n\tif !ok && len(def) != 0 {\n\t\treturn def[0]\n\t}\n\n\treturn e\n}\n\n// ExpandEnv is similar to Getenv,\n// but replaces ${var} or $var in the result.\nfunc ExpandEnv(key string, def ...string) string {\n\treturn os.ExpandEnv(Getenv(key, def...))\n}\n"
  },
  {
    "path": "util/osx/file.go",
    "content": "package osx\n\nimport (\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"strings\"\n)\n\n// InlineTilde replaces the leading ~ with the home directory.\nfunc InlineTilde(path string) string {\n\tif path == \"\" {\n\t\treturn path\n\t}\n\tif strings.HasPrefix(path, \"~\"+string(filepath.Separator)) {\n\t\thd, err := os.UserHomeDir()\n\t\tif err == nil {\n\t\t\tpath = filepath.Join(hd, path[2:])\n\t\t}\n\t}\n\treturn path\n}\n\n// Open is similar to os.Open but supports ~ as the home directory.\nfunc Open(path string) (*os.File, error) {\n\tp := filepath.Clean(path)\n\tp = InlineTilde(p)\n\treturn os.Open(p)\n}\n\n// Exists checks if the given path exists.\nfunc Exists(path string, checks ...func(os.FileInfo) bool) bool {\n\tp := filepath.Clean(path)\n\tp = InlineTilde(p)\n\n\tstat, err := os.Lstat(p)\n\tif err != nil {\n\t\treturn false\n\t}\n\n\tfor i := range checks {\n\t\tif checks[i] == nil {\n\t\t\tcontinue\n\t\t}\n\n\t\tif !checks[i](stat) {\n\t\t\treturn false\n\t\t}\n\t}\n\n\treturn true\n}\n\n// ExistsDir checks if the given path exists and is a directory.\nfunc ExistsDir(path string) bool {\n\treturn Exists(path, func(stat os.FileInfo) bool {\n\t\treturn stat.Mode().IsDir()\n\t})\n}\n\n// ExistsLink checks if the given path exists and is a symbolic link.\nfunc ExistsLink(path string) bool {\n\treturn Exists(path, func(stat os.FileInfo) bool {\n\t\treturn stat.Mode()&os.ModeSymlink != 0\n\t})\n}\n\n// ExistsFile checks if the given path exists and is a regular file.\nfunc ExistsFile(path string) bool {\n\treturn Exists(path, func(stat os.FileInfo) bool {\n\t\treturn stat.Mode().IsRegular()\n\t})\n}\n\n// ExistsSocket checks if the given path exists and is a socket.\nfunc ExistsSocket(path string) bool {\n\treturn Exists(path, func(stat os.FileInfo) bool {\n\t\treturn stat.Mode()&os.ModeSocket != 0\n\t})\n}\n\n// ExistsDevice checks if the given path exists and is a device.\nfunc ExistsDevice(path string) bool {\n\treturn Exists(path, func(stat 
os.FileInfo) bool {\n\t\treturn stat.Mode()&os.ModeDevice != 0\n\t})\n}\n\n// Close closes the given io.Closer without error.\nfunc Close(c io.Closer) {\n\tif c == nil {\n\t\treturn\n\t}\n\t_ = c.Close()\n}\n\n// WriteFile is similar to os.WriteFile but supports ~ as the home directory,\n// and also supports the parent directory creation.\nfunc WriteFile(name string, data []byte, perm os.FileMode) error {\n\tp := filepath.Clean(name)\n\tp = InlineTilde(p)\n\n\tif err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil {\n\t\treturn err\n\t}\n\n\treturn os.WriteFile(p, data, perm)\n}\n\n// CreateFile is similar to os.Create but supports ~ as the home directory,\n// and also supports the parent directory creation.\nfunc CreateFile(name string, perm os.FileMode) (*os.File, error) {\n\tp := filepath.Clean(name)\n\tp = InlineTilde(p)\n\n\tif err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC, perm)\n}\n\n// OpenFile is similar to os.OpenFile but supports ~ as the home directory,\n// and also supports the parent directory creation.\nfunc OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) {\n\tp := filepath.Clean(name)\n\tp = InlineTilde(p)\n\n\tif err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil {\n\t\treturn nil, err\n\t}\n\n\treturn os.OpenFile(p, flag, perm)\n}\n"
  },
  {
    "path": "util/osx/file_mmap.go",
    "content": "// Copyright 2018 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\npackage osx\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"runtime/debug\"\n\t\"syscall\"\n)\n\ntype MmapFile struct {\n\tf *os.File\n\tb []byte\n}\n\nfunc OpenMmapFile(path string) (*MmapFile, error) {\n\treturn OpenMmapFileWithSize(path, 0)\n}\n\nfunc OpenMmapFileWithSize(path string, size int) (*MmapFile, error) {\n\tp := filepath.Clean(path)\n\tp = InlineTilde(p)\n\n\tf, err := os.Open(p)\n\tif err != nil {\n\t\treturn nil, fmt.Errorf(\"try lock file: %w\", err)\n\t}\n\tif size <= 0 {\n\t\tinfo, err := f.Stat()\n\t\tif err != nil {\n\t\t\tClose(f)\n\t\t\treturn nil, fmt.Errorf(\"stat: %w\", err)\n\t\t}\n\t\tsize = int(info.Size())\n\t}\n\n\tb, err := mmap(f, size)\n\tif err != nil {\n\t\tClose(f)\n\t\treturn nil, fmt.Errorf(\"mmap, size %d: %w\", size, err)\n\t}\n\n\treturn &MmapFile{f: f, b: b}, nil\n}\n\nfunc (f *MmapFile) Close() error {\n\terr0 := munmap(f.b)\n\terr1 := f.f.Close()\n\n\tif err0 != nil {\n\t\treturn err0\n\t}\n\treturn err1\n}\n\nfunc (f *MmapFile) Bytes() []byte {\n\treturn f.b\n}\n\nfunc (f *MmapFile) Len() int64 {\n\treturn int64(len(f.b))\n}\n\nvar ErrPageFault = errors.New(\"page fault occurred while reading from memory map\")\n\nfunc (f *MmapFile) ReadAt(p []byte, off int64) (_ int, err error) {\n\tif off < 0 {\n\t\treturn 0, syscall.EINVAL\n\t}\n\tif off > f.Len() {\n\t\treturn 0, 
io.EOF\n\t}\n\n\told := debug.SetPanicOnFault(true)\n\tdefer func() {\n\t\tdebug.SetPanicOnFault(old)\n\t\tif recover() != nil {\n\t\t\terr = ErrPageFault\n\t\t}\n\t}()\n\n\tn := copy(p, f.b[off:])\n\tif n < len(p) {\n\t\terr = io.EOF\n\t}\n\treturn n, err\n}\n"
  },
  {
    "path": "util/osx/file_mmap_js.go",
    "content": "// Copyright 2022 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\npackage osx\n\nimport (\n\t\"errors\"\n\t\"os\"\n)\n\nfunc mmap(f *os.File, length int) ([]byte, error) {\n\treturn nil, errors.New(\"unsupported\")\n}\n\nfunc munmap(b []byte) (err error) {\n\treturn errors.New(\"unsupported\")\n}\n"
  },
  {
    "path": "util/osx/file_mmap_unix.go",
    "content": "// Copyright 2017 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris\n\npackage osx\n\nimport (\n\t\"os\"\n\n\t\"golang.org/x/sys/unix\"\n)\n\nfunc mmap(f *os.File, length int) ([]byte, error) {\n\treturn unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ, unix.MAP_SHARED)\n}\n\nfunc munmap(b []byte) (err error) {\n\treturn unix.Munmap(b)\n}\n"
  },
  {
    "path": "util/osx/file_mmap_windows.go",
    "content": "package osx\n\nimport (\n\t\"os\"\n\t\"syscall\"\n\t\"unsafe\"\n)\n\nfunc mmap(f *os.File, size int) ([]byte, error) {\n\tlow, high := uint32(size), uint32(size>>32)\n\th, errno := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil)\n\tif h == 0 {\n\t\treturn nil, os.NewSyscallError(\"CreateFileMapping\", errno)\n\t}\n\n\taddr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(size))\n\tif addr == 0 {\n\t\treturn nil, os.NewSyscallError(\"MapViewOfFile\", errno)\n\t}\n\n\tif err := syscall.CloseHandle(h); err != nil {\n\t\treturn nil, os.NewSyscallError(\"CloseHandle\", err)\n\t}\n\n\treturn (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil\n}\n\nfunc munmap(b []byte) error {\n\tif err := syscall.UnmapViewOfFile((uintptr)(unsafe.Pointer(&b[0]))); err != nil {\n\t\treturn os.NewSyscallError(\"UnmapViewOfFile\", err)\n\t}\n\treturn nil\n}\n"
  },
  {
    "path": "util/osx/file_mmap_windows_386.go",
    "content": "// Copyright 2018 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\npackage osx\n\nconst maxMapSize = 0x7FFFFFFF // 2GB\n"
  },
  {
    "path": "util/osx/file_mmap_windows_non386.go",
    "content": "// Copyright 2018 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may not use this file except in compliance with the License.\n// You may obtain a copy of the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS,\n// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n// See the License for the specific language governing permissions and\n// limitations under the License.\n\n//go:build windows && !386\n\npackage osx\n\nconst maxMapSize = 0xFFFFFFFFFFFF // 256TB\n"
  },
  {
    "path": "util/osx/homedir.go",
    "content": "package osx\n\nimport (\n\t\"os\"\n\t\"path/filepath\"\n\t\"time\"\n)\n\n// UserHomeDir is similar to os.UserHomeDir,\n// but returns the temp dir if the home dir is not found.\nfunc UserHomeDir() string {\n\thd, err := os.UserHomeDir()\n\tif err != nil {\n\t\thd = filepath.Join(os.TempDir(), time.Now().Format(time.DateOnly))\n\t}\n\treturn hd\n}\n"
  },
  {
    "path": "util/ptr/pointer.go",
    "content": "package ptr\n\nimport (\n\t\"time\"\n\n\t\"golang.org/x/exp/constraints\"\n)\n\nfunc Int(v int) *int {\n\treturn Ref(v)\n}\n\nfunc IntDeref(v *int, def int) int {\n\treturn Deref(v, def)\n}\n\nfunc Int8(v int8) *int8 {\n\treturn Ref(v)\n}\n\nfunc Int8Deref(v *int8, def int8) int8 {\n\treturn Deref(v, def)\n}\n\nfunc Int16(v int16) *int16 {\n\treturn Ref(v)\n}\n\nfunc Int16Deref(v *int16, def int16) int16 {\n\treturn Deref(v, def)\n}\n\nfunc Int32(v int32) *int32 {\n\treturn Ref(v)\n}\n\nfunc Int32Deref(v *int32, def int32) int32 {\n\treturn Deref(v, def)\n}\n\nfunc Int64(v int64) *int64 {\n\treturn Ref(v)\n}\n\nfunc Int64Deref(v *int64, def int64) int64 {\n\treturn Deref(v, def)\n}\n\nfunc Uint(v uint) *uint {\n\treturn Ref(v)\n}\n\nfunc UintDeref(v *uint, def uint) uint {\n\treturn Deref(v, def)\n}\n\nfunc Uint8(v uint8) *uint8 {\n\treturn Ref(v)\n}\n\nfunc Uint8Deref(v *uint8, def uint8) uint8 {\n\treturn Deref(v, def)\n}\n\nfunc Uint16(v uint16) *uint16 {\n\treturn Ref(v)\n}\n\nfunc Uint16Deref(v *uint16, def uint16) uint16 {\n\treturn Deref(v, def)\n}\n\nfunc Uint32(v uint32) *uint32 {\n\treturn Ref(v)\n}\n\nfunc Uint32Deref(v *uint32, def uint32) uint32 {\n\treturn Deref(v, def)\n}\n\nfunc Uint64(v uint64) *uint64 {\n\treturn Ref(v)\n}\n\nfunc Uint64Deref(v *uint64, def uint64) uint64 {\n\treturn Deref(v, def)\n}\n\nfunc Float32(v float32) *float32 {\n\treturn Ref(v)\n}\n\nfunc Float32Deref(v *float32, def float32) float32 {\n\treturn Deref(v, def)\n}\n\nfunc Float64(v float64) *float64 {\n\treturn Ref(v)\n}\n\nfunc Float64Deref(v *float64, def float64) float64 {\n\treturn Deref(v, def)\n}\n\nfunc String(v string) *string {\n\treturn Ref(v)\n}\n\nfunc StringDeref(v *string, def string) string {\n\treturn Deref(v, def)\n}\n\nfunc Bool(v bool) *bool {\n\treturn Ref(v)\n}\n\nfunc BoolDeref(v *bool, def bool) bool {\n\treturn Deref(v, def)\n}\n\nfunc Duration(v time.Duration) *time.Duration {\n\treturn Ref(v)\n}\n\nfunc DurationDeref(v 
*time.Duration, def time.Duration) time.Duration {\n\treturn Deref(v, def)\n}\n\nfunc Time(v time.Time) *time.Time {\n\treturn Ref(v)\n}\n\nfunc TimeDeref(v *time.Time, def time.Time) time.Time {\n\treturn Deref(v, def)\n}\n\ntype Pointerable interface {\n\tconstraints.Ordered | ~bool | time.Time\n}\n\nfunc Ref[T Pointerable](v T) *T {\n\treturn &v\n}\n\nfunc To[T Pointerable](v T) *T {\n\treturn Ref(v)\n}\n\nfunc Deref[T Pointerable](ptr *T, def T) T {\n\tif ptr != nil {\n\t\treturn *ptr\n\t}\n\n\treturn def\n}\n\nfunc Equal[T Pointerable](a, b *T) bool {\n\tif a != nil && b != nil {\n\t\treturn *a == *b\n\t}\n\n\treturn false\n}\n"
  },
  {
    "path": "util/signalx/handler.go",
    "content": "package signalx\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"os/signal\"\n)\n\nvar registered = make(chan struct{})\n\n// Handler registers for signals and returns a context.\nfunc Handler() context.Context {\n\tclose(registered) // Panics when called twice.\n\n\tsigChan := make(chan os.Signal, len(sigs))\n\tctx, cancel := context.WithCancel(context.Background())\n\n\t// Register for signals.\n\tsignal.Notify(sigChan, sigs...)\n\n\t// Process signals.\n\tgo func() {\n\t\tvar exited bool\n\t\tfor range sigChan {\n\t\t\tif exited {\n\t\t\t\tos.Exit(1)\n\t\t\t}\n\t\t\tcancel()\n\t\t\texited = true\n\t\t}\n\t}()\n\n\treturn ctx\n}\n"
  },
  {
    "path": "util/signalx/handler_unix.go",
    "content": "//go:build !windows\n\npackage signalx\n\nimport (\n\t\"os\"\n\t\"syscall\"\n)\n\nvar sigs = []os.Signal{syscall.SIGINT, syscall.SIGTERM}\n"
  },
  {
    "path": "util/signalx/handler_windows.go",
    "content": "package signalx\n\nimport (\n\t\"os\"\n\t\"syscall\"\n)\n\nvar sigs = []os.Signal{syscall.SIGINT}\n"
  },
  {
    "path": "util/slicex/search.go",
    "content": "package slicex\n\nimport \"golang.org/x/exp/constraints\"\n\n// UpperBound returns an index of the first element that is greater than value.\nfunc UpperBound[T constraints.Integer | constraints.Float](s []T, e T) int {\n\tl, r := 0, len(s)\n\tfor l < r {\n\t\tm := l + (r-l)/2\n\t\tif s[m] <= e {\n\t\t\tl = m + 1\n\t\t} else {\n\t\t\tr = m\n\t\t}\n\t}\n\treturn l\n}\n"
  },
  {
    "path": "util/stringx/bytes.go",
    "content": "package stringx\n\nimport \"unsafe\"\n\n// FromBytes converts a byte slice to a string.\nfunc FromBytes(b *[]byte) string {\n\treturn unsafe.String(unsafe.SliceData(*b), len(*b))\n}\n\n// ToBytes converts a string to a byte slice,\n// which is impossible to modify the item of slice.\nfunc ToBytes(s *string) (bs []byte) {\n\treturn unsafe.Slice(unsafe.StringData(*s), len(*s))\n}\n"
  },
  {
    "path": "util/stringx/random.go",
    "content": "package stringx\n\n// Borrowed from github.com/thanhpk/randstr.\n\nimport (\n\t\"bytes\"\n\t\"crypto/rand\"\n\t\"encoding/binary\"\n\t\"encoding/hex\"\n)\n\n// list of default letters that can be used to make a random string when calling RandomString\n// function with no letters provided.\nvar defLetters = []rune(\"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ\")\n\n// RandomBytes generates n random bytes.\nfunc RandomBytes(n int) []byte {\n\tb := make([]byte, n)\n\n\t_, err := rand.Read(b)\n\tif err != nil {\n\t\tpanic(err)\n\t}\n\n\treturn b\n}\n\n// RandomHex generates a random hex string with length of n\n// e.g: 67aab2d956bd7cc621af22cfb169cba8.\nfunc RandomHex(n int) string { return hex.EncodeToString(RandomBytes(n)) }\n\n// RandomString generates a random string using only letters provided in the letters parameter\n// if user omit letters parameters, this function will use defLetters instead.\nfunc RandomString(n int, letters ...string) string {\n\tvar (\n\t\tletterRunes []rune\n\t\tbb          bytes.Buffer\n\t)\n\n\tif len(letters) == 0 {\n\t\tletterRunes = defLetters\n\t} else {\n\t\tletterRunes = []rune(letters[0])\n\t}\n\n\tbb.Grow(n)\n\n\tl := uint32(len(letterRunes))\n\t// On each loop, generate one random rune and append to output.\n\tfor i := 0; i < n; i++ {\n\t\tbb.WriteRune(letterRunes[binary.BigEndian.Uint32(RandomBytes(4))%l])\n\t}\n\n\treturn bb.String()\n}\n\n// RandomBase64 generates a random base64 string with length of n,\n// safe for URL.\nfunc RandomBase64(n int) string {\n\treturn RandomString(n, \"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_\")\n}\n"
  },
  {
    "path": "util/stringx/strings.go",
    "content": "package stringx\n\nimport \"strings\"\n\n// CutFromLeft is the same as strings.Cut,\n// which starts from left to right,\n// slices s around the first instance of sep,\n// returning the text before and after sep.\n// The found result reports whether sep appears in s.\n// If sep does not appear in s, cut returns s, \"\", false.\nfunc CutFromLeft(s, sep string) (before, after string, found bool) {\n\treturn strings.Cut(s, sep)\n}\n\n// CutFromRight takes the same arguments as CutFromLeft,\n// but starts from right to left,\n// slices s around the last instance of sep,\n// return the text before and after sep.\n// The found result reports whether sep appears in s.\n// If sep does not appear in s, cut returns s, \"\", false.\nfunc CutFromRight(s, sep string) (before, after string, found bool) {\n\tif i := strings.LastIndex(s, sep); i >= 0 {\n\t\treturn s[:i], s[i+len(sep):], true\n\t}\n\treturn s, \"\", false\n}\n\n// ReplaceAllFunc is similar to strings.ReplaceAll,\n// but it replaces each rune in s with the result of f(r).\nfunc ReplaceAllFunc(s string, f func(rune) rune) string {\n\tvar b strings.Builder\n\tfor _, r := range s {\n\t\tb.WriteRune(f(r))\n\t}\n\treturn b.String()\n}\n\n// HasSuffixes checks if s has any of the suffixes in prefixes.\nfunc HasSuffixes(s string, suffixes ...string) bool {\n\tfor _, suffix := range suffixes {\n\t\tif strings.HasSuffix(s, suffix) {\n\t\t\treturn true\n\t\t}\n\t}\n\treturn false\n}\n"
  },
  {
    "path": "util/stringx/sum.go",
    "content": "package stringx\n\nimport (\n\t\"crypto/sha256\"\n\t\"encoding/hex\"\n\t\"hash/fnv\"\n)\n\n// SumByFNV64a sums up the string(s) by FNV-64a hash algorithm.\nfunc SumByFNV64a(s string, ss ...string) string {\n\th := fnv.New64a()\n\n\t_, _ = h.Write(ToBytes(&s))\n\tfor i := range ss {\n\t\t_, _ = h.Write(ToBytes(&ss[i]))\n\t}\n\n\tsum := h.Sum(nil)\n\treturn hex.EncodeToString(sum)\n}\n\n// SumBytesByFNV64a sums up the byte slice(s) by FNV-64a hash algorithm.\nfunc SumBytesByFNV64a(bs []byte, bss ...[]byte) string {\n\th := fnv.New64a()\n\n\t_, _ = h.Write(bs)\n\tfor i := range bss {\n\t\t_, _ = h.Write(bss[i])\n\t}\n\n\tsum := h.Sum(nil)\n\treturn hex.EncodeToString(sum)\n}\n\n// SumBySHA256 sums up the string(s) by SHA256 hash algorithm.\nfunc SumBySHA256(s string, ss ...string) string {\n\th := sha256.New()\n\n\t_, _ = h.Write(ToBytes(&s))\n\tfor i := range ss {\n\t\t_, _ = h.Write(ToBytes(&ss[i]))\n\t}\n\n\tsum := h.Sum(nil)\n\treturn hex.EncodeToString(sum)\n}\n\n// SumBytesBySHA256 sums up the byte slice(s) by SHA256 hash algorithm.\nfunc SumBytesBySHA256(bs []byte, bss ...[]byte) string {\n\th := sha256.New()\n\n\t_, _ = h.Write(bs)\n\tfor i := range bss {\n\t\t_, _ = h.Write(bss[i])\n\t}\n\n\tsum := h.Sum(nil)\n\treturn hex.EncodeToString(sum)\n}\n\n// SumBySHA224 sums up the string(s) by SHA224 hash algorithm.\nfunc SumBySHA224(s string, ss ...string) string {\n\th := sha256.New224()\n\n\t_, _ = h.Write(ToBytes(&s))\n\tfor i := range ss {\n\t\t_, _ = h.Write(ToBytes(&ss[i]))\n\t}\n\n\tsum := h.Sum(nil)\n\treturn hex.EncodeToString(sum)\n}\n\n// SumBytesBySHA224 sums up the byte slice(s) by SHA224 hash algorithm.\nfunc SumBytesBySHA224(bs []byte, bss ...[]byte) string {\n\th := sha256.New224()\n\n\t_, _ = h.Write(bs)\n\tfor i := range bss {\n\t\t_, _ = h.Write(bss[i])\n\t}\n\n\tsum := h.Sum(nil)\n\treturn hex.EncodeToString(sum)\n}\n"
  },
  {
    "path": "zz_generated.diffusion_model_memory_usage.regression.go",
    "content": "package gguf_parser\n\nimport \"math\"\n\n// GuessSD1DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessSD1DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{7876368.5672, 161.4230198633, 0.0078124893}\n\tdegree := 2\n\tx := float64(width * height)\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessSD2DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessSD2DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{-355043979.0562, -1193.3271458642, 0.0054023818}\n\tdegree := 2\n\tx := float64(width * height)\n\n\tif flashAttention {\n\t\tcoefficients = []float64{3780681.28078, 513.2102510935}\n\t\tdegree = 1\n\t}\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessSDXLDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessSDXLDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{55541290.3893, 138.3196116655, 0.0006109455}\n\tdegree := 2\n\tx := float64(width * height)\n\n\tif flashAttention {\n\t\tcoefficients = []float64{-5958802.78052, 500.0687898915}\n\t\tdegree = 1\n\t}\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessSDXLRefinerDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear 
regression or polynomial regression.\nfunc GuessSDXLRefinerDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{49395992.3449, 155.2477810191, 0.0007351736}\n\tdegree := 2\n\tx := float64(width * height)\n\n\tif flashAttention {\n\t\tcoefficients = []float64{7031343.31998, 599.4137437227}\n\t\tdegree = 1\n\t}\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessSD3MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessSD3MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{16529921.3700, 234.6656247718, 0.0014648995}\n\tdegree := 2\n\tx := float64(width * height)\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessSD35MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessSD35MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{17441103.4726, 281.6956819806, 0.0014651233}\n\tdegree := 2\n\tx := float64(width * height)\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessSD35LargeDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessSD35LargeDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{23204369.2029, 410.3731196298, 0.0023195947}\n\tdegree := 2\n\tx := float64(width * height)\n\n\ty := 
float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n\n// GuessFLUXDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,\n// which is calculated by linear regression or polynomial regression.\nfunc GuessFLUXDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {\n\tcoefficients := []float64{46511668.6742, 997.7758807792, 0.0014573393}\n\tdegree := 2\n\tx := float64(width * height)\n\n\ty := float64(0)\n\tfor i := 0; i <= degree; i++ {\n\t\ty += coefficients[i] * math.Pow(x, float64(i))\n\t}\n\treturn uint64(y)\n}\n"
  },
  {
    "path": "zz_generated.ggmltype.stringer.go",
    "content": "// Code generated by \"stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLType\"; DO NOT EDIT.\n\npackage gguf_parser\n\nimport \"strconv\"\n\nfunc _() {\n\t// An \"invalid array index\" compiler error signifies that the constant values have changed.\n\t// Re-run the stringer command to generate them again.\n\tvar x [1]struct{}\n\t_ = x[GGMLTypeF32-0]\n\t_ = x[GGMLTypeF16-1]\n\t_ = x[GGMLTypeQ4_0-2]\n\t_ = x[GGMLTypeQ4_1-3]\n\t_ = x[GGMLTypeQ4_2-4]\n\t_ = x[GGMLTypeQ4_3-5]\n\t_ = x[GGMLTypeQ5_0-6]\n\t_ = x[GGMLTypeQ5_1-7]\n\t_ = x[GGMLTypeQ8_0-8]\n\t_ = x[GGMLTypeQ8_1-9]\n\t_ = x[GGMLTypeQ2_K-10]\n\t_ = x[GGMLTypeQ3_K-11]\n\t_ = x[GGMLTypeQ4_K-12]\n\t_ = x[GGMLTypeQ5_K-13]\n\t_ = x[GGMLTypeQ6_K-14]\n\t_ = x[GGMLTypeQ8_K-15]\n\t_ = x[GGMLTypeIQ2_XXS-16]\n\t_ = x[GGMLTypeIQ2_XS-17]\n\t_ = x[GGMLTypeIQ3_XXS-18]\n\t_ = x[GGMLTypeIQ1_S-19]\n\t_ = x[GGMLTypeIQ4_NL-20]\n\t_ = x[GGMLTypeIQ3_S-21]\n\t_ = x[GGMLTypeIQ2_S-22]\n\t_ = x[GGMLTypeIQ4_XS-23]\n\t_ = x[GGMLTypeI8-24]\n\t_ = x[GGMLTypeI16-25]\n\t_ = x[GGMLTypeI32-26]\n\t_ = x[GGMLTypeI64-27]\n\t_ = x[GGMLTypeF64-28]\n\t_ = x[GGMLTypeIQ1_M-29]\n\t_ = x[GGMLTypeBF16-30]\n\t_ = x[GGMLTypeQ4_0_4_4-31]\n\t_ = x[GGMLTypeQ4_0_4_8-32]\n\t_ = x[GGMLTypeQ4_0_8_8-33]\n\t_ = x[GGMLTypeTQ1_0-34]\n\t_ = x[GGMLTypeTQ2_0-35]\n\t_ = x[GGMLTypeIQ4_NL_4_4-36]\n\t_ = x[GGMLTypeIQ4_NL_4_8-37]\n\t_ = x[GGMLTypeIQ4_NL_8_8-38]\n\t_ = x[GGMLTypeMXFP4-39]\n\t_ = x[_GGMLTypeCount-40]\n}\n\nconst _GGMLType_name = \"F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8MXFP4Unknown\"\n\nvar _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 201, 208}\n\nfunc (i GGMLType) String() string {\n\tif i 
>= GGMLType(len(_GGMLType_index)-1) {\n\t\treturn \"GGMLType(\" + strconv.FormatInt(int64(i), 10) + \")\"\n\t}\n\treturn _GGMLType_name[_GGMLType_index[i]:_GGMLType_index[i+1]]\n}\n"
  },
  {
    "path": "zz_generated.gguffiletype.stringer.go",
    "content": "// Code generated by \"stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix GGUFFileType\"; DO NOT EDIT.\n\npackage gguf_parser\n\nimport \"strconv\"\n\nfunc _() {\n\t// An \"invalid array index\" compiler error signifies that the constant values have changed.\n\t// Re-run the stringer command to generate them again.\n\tvar x [1]struct{}\n\t_ = x[GGUFFileTypeMostlyF32-0]\n\t_ = x[GGUFFileTypeMostlyF16-1]\n\t_ = x[GGUFFileTypeMostlyQ4_0-2]\n\t_ = x[GGUFFileTypeMostlyQ4_1-3]\n\t_ = x[GGUFFileTypeMostlyQ4_1_SOME_F16-4]\n\t_ = x[GGUFFileTypeMostlyQ4_2-5]\n\t_ = x[GGUFFileTypeMostlyQ4_3-6]\n\t_ = x[GGUFFileTypeMostlyQ8_0-7]\n\t_ = x[GGUFFileTypeMostlyQ5_0-8]\n\t_ = x[GGUFFileTypeMostlyQ5_1-9]\n\t_ = x[GGUFFileTypeMostlyQ2_K-10]\n\t_ = x[GGUFFileTypeMostlyQ3_K_S-11]\n\t_ = x[GGUFFileTypeMostlyQ3_K_M-12]\n\t_ = x[GGUFFileTypeMostlyQ3_K_L-13]\n\t_ = x[GGUFFileTypeMostlyQ4_K_S-14]\n\t_ = x[GGUFFileTypeMostlyQ4_K_M-15]\n\t_ = x[GGUFFileTypeMostlyQ5_K_S-16]\n\t_ = x[GGUFFileTypeMostlyQ5_K_M-17]\n\t_ = x[GGUFFileTypeMostlyQ6_K-18]\n\t_ = x[GGUFFileTypeMostlyIQ2_XXS-19]\n\t_ = x[GGUFFileTypeMostlyIQ2_XS-20]\n\t_ = x[GGUFFileTypeMostlyQ2_K_S-21]\n\t_ = x[GGUFFileTypeMostlyIQ3_XS-22]\n\t_ = x[GGUFFileTypeMostlyIQ3_XXS-23]\n\t_ = x[GGUFFileTypeMostlyIQ1_S-24]\n\t_ = x[GGUFFileTypeMostlyIQ4_NL-25]\n\t_ = x[GGUFFileTypeMostlyIQ3_S-26]\n\t_ = x[GGUFFileTypeMostlyIQ3_M-27]\n\t_ = x[GGUFFileTypeMostlyIQ2_S-28]\n\t_ = x[GGUFFileTypeMostlyIQ2_M-29]\n\t_ = x[GGUFFileTypeMostlyIQ4_XS-30]\n\t_ = x[GGUFFileTypeMostlyIQ1_M-31]\n\t_ = x[GGUFFileTypeMostlyBF16-32]\n\t_ = x[GGUFFileTypeMostlyQ4_0_4_4-33]\n\t_ = x[GGUFFileTypeMostlyQ4_0_4_8-34]\n\t_ = x[GGUFFileTypeMostlyQ4_0_8_8-35]\n\t_ = x[GGUFFileTypeMostlyTQ1_0-36]\n\t_ = x[GGUFFileTypeMostlyTQ2_0-37]\n\t_ = x[GGUFFileTypeMostlyMXFP4-38]\n\t_ = x[_GGUFFileTypeCount-39]\n}\n\nconst _GGUFFileType_name = 
\"MOSTLY_F32MOSTLY_F16MOSTLY_Q4_0MOSTLY_Q4_1MOSTLY_Q4_1_SOME_F16MOSTLY_Q4_2MOSTLY_Q4_3MOSTLY_Q8_0MOSTLY_Q5_0MOSTLY_Q5_1MOSTLY_Q2_KMOSTLY_Q3_K_SMOSTLY_Q3_K_MMOSTLY_Q3_K_LMOSTLY_Q4_K_SMOSTLY_Q4_K_MMOSTLY_Q5_K_SMOSTLY_Q5_K_MMOSTLY_Q6_KMOSTLY_IQ2_XXSMOSTLY_IQ2_XSMOSTLY_Q2_K_SMOSTLY_IQ3_XSMOSTLY_IQ3_XXSMOSTLY_IQ1_SMOSTLY_IQ4_NLMOSTLY_IQ3_SMOSTLY_IQ3_MMOSTLY_IQ2_SMOSTLY_IQ2_MMOSTLY_IQ4_XSMOSTLY_IQ1_MMOSTLY_BF16MOSTLY_Q4_0_4_4MOSTLY_Q4_0_4_8MOSTLY_Q4_0_8_8MOSTLY_TQ1_0MOSTLY_TQ2_0MOSTLY_MXFP4Unknown\"\n\nvar _GGUFFileType_index = [...]uint16{0, 10, 20, 31, 42, 62, 73, 84, 95, 106, 117, 128, 141, 154, 167, 180, 193, 206, 219, 230, 244, 257, 270, 283, 297, 309, 322, 334, 346, 358, 370, 383, 395, 406, 421, 436, 451, 463, 475, 487, 494}\n\nfunc (i GGUFFileType) String() string {\n\tif i >= GGUFFileType(len(_GGUFFileType_index)-1) {\n\t\treturn \"GGUFFileType(\" + strconv.FormatInt(int64(i), 10) + \")\"\n\t}\n\treturn _GGUFFileType_name[_GGUFFileType_index[i]:_GGUFFileType_index[i+1]]\n}\n"
  },
  {
    "path": "zz_generated.ggufmagic.stringer.go",
    "content": "// Code generated by \"stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic\"; DO NOT EDIT.\n\npackage gguf_parser\n\nimport \"strconv\"\n\nfunc _() {\n\t// An \"invalid array index\" compiler error signifies that the constant values have changed.\n\t// Re-run the stringer command to generate them again.\n\tvar x [1]struct{}\n\t_ = x[GGUFMagicGGML-1734831468]\n\t_ = x[GGUFMagicGGMF-1734831462]\n\t_ = x[GGUFMagicGGJT-1734830708]\n\t_ = x[GGUFMagicGGUFLe-1179993927]\n\t_ = x[GGUFMagicGGUFBe-1195857222]\n}\n\nconst (\n\t_GGUFMagic_name_0 = \"GGUF\"\n\t_GGUFMagic_name_1 = \"GGUF\"\n\t_GGUFMagic_name_2 = \"GGJT\"\n\t_GGUFMagic_name_3 = \"GGMF\"\n\t_GGUFMagic_name_4 = \"GGML\"\n)\n\nfunc (i GGUFMagic) String() string {\n\tswitch {\n\tcase i == 1179993927:\n\t\treturn _GGUFMagic_name_0\n\tcase i == 1195857222:\n\t\treturn _GGUFMagic_name_1\n\tcase i == 1734830708:\n\t\treturn _GGUFMagic_name_2\n\tcase i == 1734831462:\n\t\treturn _GGUFMagic_name_3\n\tcase i == 1734831468:\n\t\treturn _GGUFMagic_name_4\n\tdefault:\n\t\treturn \"GGUFMagic(\" + strconv.FormatInt(int64(i), 10) + \")\"\n\t}\n}\n"
  },
  {
    "path": "zz_generated.ggufmetadatavaluetype.stringer.go",
    "content": "// Code generated by \"stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.stringer.go -trimprefix GGUFMetadataValueType\"; DO NOT EDIT.\n\npackage gguf_parser\n\nimport \"strconv\"\n\nfunc _() {\n\t// An \"invalid array index\" compiler error signifies that the constant values have changed.\n\t// Re-run the stringer command to generate them again.\n\tvar x [1]struct{}\n\t_ = x[GGUFMetadataValueTypeUint8-0]\n\t_ = x[GGUFMetadataValueTypeInt8-1]\n\t_ = x[GGUFMetadataValueTypeUint16-2]\n\t_ = x[GGUFMetadataValueTypeInt16-3]\n\t_ = x[GGUFMetadataValueTypeUint32-4]\n\t_ = x[GGUFMetadataValueTypeInt32-5]\n\t_ = x[GGUFMetadataValueTypeFloat32-6]\n\t_ = x[GGUFMetadataValueTypeBool-7]\n\t_ = x[GGUFMetadataValueTypeString-8]\n\t_ = x[GGUFMetadataValueTypeArray-9]\n\t_ = x[GGUFMetadataValueTypeUint64-10]\n\t_ = x[GGUFMetadataValueTypeInt64-11]\n\t_ = x[GGUFMetadataValueTypeFloat64-12]\n\t_ = x[_GGUFMetadataValueTypeCount-13]\n}\n\nconst _GGUFMetadataValueType_name = \"Uint8Int8Uint16Int16Uint32Int32Float32BoolStringArrayUint64Int64Float64Unknown\"\n\nvar _GGUFMetadataValueType_index = [...]uint8{0, 5, 9, 15, 20, 26, 31, 38, 42, 48, 53, 59, 64, 71, 78}\n\nfunc (i GGUFMetadataValueType) String() string {\n\tif i >= GGUFMetadataValueType(len(_GGUFMetadataValueType_index)-1) {\n\t\treturn \"GGUFMetadataValueType(\" + strconv.FormatInt(int64(i), 10) + \")\"\n\t}\n\treturn _GGUFMetadataValueType_name[_GGUFMetadataValueType_index[i]:_GGUFMetadataValueType_index[i+1]]\n}\n"
  },
  {
    "path": "zz_generated.ggufversion.stringer.go",
    "content": "// Code generated by \"stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion\"; DO NOT EDIT.\n\npackage gguf_parser\n\nimport \"strconv\"\n\nfunc _() {\n\t// An \"invalid array index\" compiler error signifies that the constant values have changed.\n\t// Re-run the stringer command to generate them again.\n\tvar x [1]struct{}\n\t_ = x[GGUFVersionV1-1]\n\t_ = x[GGUFVersionV2-2]\n\t_ = x[GGUFVersionV3-3]\n}\n\nconst _GGUFVersion_name = \"V1V2V3\"\n\nvar _GGUFVersion_index = [...]uint8{0, 2, 4, 6}\n\nfunc (i GGUFVersion) String() string {\n\ti -= 1\n\tif i >= GGUFVersion(len(_GGUFVersion_index)-1) {\n\t\treturn \"GGUFVersion(\" + strconv.FormatInt(int64(i+1), 10) + \")\"\n\t}\n\treturn _GGUFVersion_name[_GGUFVersion_index[i]:_GGUFVersion_index[i+1]]\n}\n"
  }
]