Repository: gpustack/gguf-parser-go Branch: main Commit: 35c4501b75f7 Files: 85 Total size: 593.2 KB Directory structure: gitextract_sklq_6lp/ ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── ci.yml │ ├── cmd.yml │ ├── prune.yml │ └── sync.yml ├── .gitignore ├── .golangci.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cache.go ├── cmd/ │ └── gguf-parser/ │ ├── README.md │ ├── go.mod │ ├── go.sum │ └── main.go ├── file.go ├── file_architecture.go ├── file_architecture_test.go ├── file_estimate__llamacpp.go ├── file_estimate__llamacpp_test.go ├── file_estimate__stablediffusioncpp.go ├── file_estimate__stablediffusioncpp_test.go ├── file_estimate_option.go ├── file_from_distro.go ├── file_from_remote.go ├── file_metadata.go ├── file_metadata_test.go ├── file_option.go ├── file_test.go ├── file_tokenizer.go ├── file_tokenizer_test.go ├── filename.go ├── filename_test.go ├── gen.go ├── gen.regression.go ├── gen.stringer.go ├── ggml.go ├── go.mod ├── go.sum ├── ollama_model.go ├── ollama_model_option.go ├── ollama_model_test.go ├── ollama_registry_authenticate.go ├── scalar.go ├── scalar_test.go ├── util/ │ ├── anyx/ │ │ └── any.go │ ├── bytex/ │ │ └── pool.go │ ├── funcx/ │ │ └── error.go │ ├── httpx/ │ │ ├── client.go │ │ ├── client_helper.go │ │ ├── client_options.go │ │ ├── file.go │ │ ├── file_options.go │ │ ├── proxy.go │ │ ├── resolver.go │ │ ├── transport.go │ │ └── transport_options.go │ ├── json/ │ │ ├── common.go │ │ ├── jsoniter.go │ │ └── stdjson.go │ ├── osx/ │ │ ├── env.go │ │ ├── file.go │ │ ├── file_mmap.go │ │ ├── file_mmap_js.go │ │ ├── file_mmap_unix.go │ │ ├── file_mmap_windows.go │ │ ├── file_mmap_windows_386.go │ │ ├── file_mmap_windows_non386.go │ │ └── homedir.go │ ├── ptr/ │ │ └── pointer.go │ ├── signalx/ │ │ ├── handler.go │ │ ├── handler_unix.go │ │ └── handler_windows.go │ ├── slicex/ │ │ └── search.go │ └── stringx/ │ ├── bytes.go │ ├── random.go │ ├── strings.go │ └── sum.go ├── zz_generated.diffusion_model_memory_usage.regression.go ├── zz_generated.ggmltype.stringer.go ├── zz_generated.gguffiletype.stringer.go ├── zz_generated.ggufmagic.stringer.go ├── zz_generated.ggufmetadatavaluetype.stringer.go └── zz_generated.ggufversion.stringer.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ * text=auto eol=lf **/go.sum linguist-generated=true **/zz_generated.*.go linguist-generated=true ================================================ FILE: .github/workflows/ci.yml ================================================ name: ci permissions: contents: read pull-requests: read actions: read defaults: run: shell: bash on: push: branches: - 'main' - 'branch-v*.*' paths-ignore: - "docs/**" - "**.md" - "**.mdx" - "**.png" - "**.jpg" - ".github/workflows/cmd.yml" - ".github/workflows/prune.yml" - ".github/workflows/sync.yml" pull_request: branches: - 'main' paths-ignore: - "docs/**" - "**.md" - "**.mdx" - "**.png" - "**.jpg" - ".github/workflows/cmd.yml" - ".github/workflows/prune.yml" - ".github/workflows/sync.yml" jobs: ci: timeout-minutes: 15 runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 persist-credentials: false - name: Setup Go timeout-minutes: 15 uses: actions/setup-go@v5 with: go-version: "1.22.9" cache-dependency-path: | **/go.sum - name: Setup Toolbox timeout-minutes: 5 uses: actions/cache@v4 with: key: toolbox-${{ 
runner.os }} path: | ${{ github.workspace }}/.sbin - name: Make run: make ci env: LINT_DIRTY: "true" ================================================ FILE: .github/workflows/cmd.yml ================================================ name: cmd permissions: contents: write actions: read id-token: write defaults: run: shell: bash on: push: branches: - 'main' - 'branch-v*.*' paths-ignore: - "docs/**" - "**.md" - "**.mdx" - "**.png" - "**.jpg" - ".github/workflows/ci.yml" - ".github/workflows/prune.yml" - ".github/workflows/sync.yml" tags: - "v*.*.*" jobs: build: timeout-minutes: 15 runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 persist-credentials: false - name: Setup Go timeout-minutes: 15 uses: actions/setup-go@v5 with: go-version: "1.22.9" cache-dependency-path: | cmd/**/go.sum - name: Make run: make build env: VERSION: "${{ github.ref_name }}" - name: Upload Artifact uses: actions/upload-artifact@v4 with: include-hidden-files: true path: ${{ github.workspace }}/.dist/* - name: Release if: ${{ startsWith(github.ref, 'refs/tags/') }} uses: softprops/action-gh-release@v2 with: fail_on_unmatched_files: true tag_name: "${{ github.ref_name }}" prerelease: ${{ contains(github.ref, 'rc') }} files: ${{ github.workspace }}/.dist/* publish: needs: - build permissions: contents: write actions: read id-token: write timeout-minutes: 15 runs-on: ubuntu-22.04 env: PACKAGE_REGISTRY: "gpustack" PACKAGE_IMAGE: "gguf-parser" steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 persist-credentials: false - name: Setup QEMU uses: docker/setup-qemu-action@v3 with: image: tonistiigi/binfmt:qemu-v9.2.2 platforms: "arm64" - name: Setup Buildx uses: docker/setup-buildx-action@v3 - name: Login DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.CI_DOCKERHUB_USERNAME }} password: ${{ secrets.CI_DOCKERHUB_PASSWORD }} - name: Download Artifact uses: actions/download-artifact@v4 with: path: ${{ github.workspace }}/.dist merge-multiple: true - name: Get Metadata id: metadata uses: docker/metadata-action@v5 with: images: "${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}" - name: Package uses: docker/build-push-action@v6 with: push: true file: ${{ github.workspace }}/Dockerfile context: ${{ github.workspace }} platforms: "linux/amd64,linux/arm64" tags: ${{ steps.metadata.outputs.tags }} labels: ${{ steps.metadata.outputs.labels }} cache-from: | type=registry,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache cache-to: | type=registry,mode=max,compression=gzip,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache,ignore-error=true provenance: true sbom: true ================================================ FILE: .github/workflows/prune.yml ================================================ name: prune permissions: contents: write pull-requests: write actions: write issues: write defaults: run: shell: bash on: workflow_dispatch: inputs: prune: description: 'Prune all caches' required: false type: boolean default: false schedule: - cron: "0 0 * * *" # every day at 00:00 UTC jobs: close-stale-issues-and-prs: uses: gpustack/.github/.github/workflows/close-stale-issues-and-prs.yml@main clean-stale-caches: uses: gpustack/.github/.github/workflows/clean-stale-caches.yml@main with: # allow to prune all caches on demand prune: ${{ github.event_name != 'schedule' && inputs.prune || false }} ================================================ FILE: .github/workflows/sync.yml ================================================ name: 
sync permissions: contents: read pull-requests: read actions: read defaults: run: shell: bash on: workflow_dispatch: inputs: max_releases: description: "Maximum number of latest releases to sync" required: false default: 1 type: number specific_release_tag: description: "Specific release tag to sync" required: false default: "" type: string dry_run: description: "Skip the actual sync" required: false default: false type: boolean schedule: - cron: "0 */12 * * *" # every 12 hours jobs: gitcode: runs-on: ubuntu-22.04 timeout-minutes: 240 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: false - name: Sync uses: gpustack/.github/.github/actions/mirror-release-gitcode@main with: gitcode-username: "${{ secrets.CI_GITCODE_USERNAME }}" gitcode-password: "${{ secrets.CI_GITCODE_PASSWORD }}" gitcode-token: "${{ secrets.CI_GITCODE_TOKEN }}" max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}" specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}" code-only: true dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}" gitee: runs-on: ubuntu-22.04 timeout-minutes: 120 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: false - name: Sync uses: gpustack/.github/.github/actions/mirror-release-gitee@main with: gitee-username: "${{ secrets.CI_GITEE_USERNAME }}" gitee-token: "${{ secrets.CI_GITEE_TOKEN }}" max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}" specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}" code-only: true dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}" tencent-cos: runs-on: ubuntu-22.04 timeout-minutes: 120 steps: - name: Sync uses: gpustack/.github/.github/actions/mirror-release-tencent-cos@main with: tencent-secret-id: "${{ secrets.CI_TECENTCOS_SECRET_ID }}" tencent-secret-key: "${{ secrets.CI_TECENTCOS_SECRET_KEY }}" tencent-cos-region: "ap-guangzhou" tencent-cos-bucket: "gpustack-1303613262" max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}" specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}" dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}" ================================================ FILE: .gitignore ================================================ # Files .DS_Store *.lock *.test *.out *.swp *.swo *.db *.exe *.exe~ *.dll *.so *.dylib *.log go.work go.work.* # Dirs /.idea /.vscode /.kube /.terraform /.vagrant /.bundle /.cache /.docker /.entc /.sbin /.dist /log /certs ================================================ FILE: .golangci.yaml ================================================ version: "1" run: timeout: 10m tests: true modules-download-mode: readonly go: "1.22" # output configuration options output: print-issued-lines: true print-linter-name: true path-prefix: "" sort-results: true linters: disable-all: true enable: - asciicheck - bidichk - decorder - durationcheck - errcheck - errname - errorlint - copyloopvar - godot - goconst - gocritic - gosimple - gosec - govet - gofumpt - gofmt - ineffassign - importas - lll - makezero - misspell - nakedret - nilerr - prealloc - predeclared - revive - staticcheck - stylecheck - typecheck - unconvert - unparam - unused - usestdlibvars - whitespace linters-settings: decorder: dec-order: - const - var - func disable-init-func-first-check: false disable-dec-order-check: true errorlint: errorf: true asserts: true comparison: true godot: 
scope: all exclude: - "(?i)^ FIXME:" - "(?i)^ TODO:" - "(?i)^ SPDX\\-License\\-Identifier:" - "(?i)^ +" period: true capital: false goconst: min-len: 3 min-occurrences: 10 gosimple: checks: [ "all" ] gosec: severity: "low" confidence: "low" excludes: - G101 - G107 - G112 - G115 - G404 gofumpt: extra-rules: true gofmt: simplify: true rewrite-rules: - pattern: 'interface{}' replacement: 'any' - pattern: 'a[b:len(a)]' replacement: 'a[b:]' importas: no-unaliased: true lll: line-length: 150 tab-width: 1 makezero: always: false misspell: locale: US nakedret: max-func-lines: 60 revive: rules: - name: var-naming disabled: true arguments: - [ "HTTP", "ID", "TLS", "TCP", "UDP", "API", "CA", "URL", "DNS" ] staticcheck: checks: [ "all", "-SA1019", "-SA2002", "-SA5008" ] stylecheck: checks: [ "all", "-ST1003" ] unparam: check-exported: false unused: field-writes-are-uses: true post-statements-are-reads: true exported-fields-are-used: true parameters-are-used: true local-variables-are-used: true generated-is-used: true usestdlibvars: http-method: true http-status-code: true time-weekday: true time-month: true time-layout: true crypto-hash: true issues: uniq-by-line: true exclude-files: - "doc.go" - "zz_generated.*.go" - "gen.*.go" exclude-rules: - path: _test\.go linters: - errcheck - gosec - makezero - lll ================================================ FILE: Dockerfile ================================================ FROM scratch ARG TARGETOS ARG TARGETARCH COPY --chmod=755 .dist/gguf-parser-${TARGETOS}-${TARGETARCH} /bin/gguf-parser ENTRYPOINT ["/bin/gguf-parser"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 gguf-parser-go authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ .SILENT: .DEFAULT_GOAL := ci SHELL := /bin/bash SRCDIR := $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))) GOOS := $(shell go env GOOS) GOARCH := $(shell go env GOARCH) LINT_DIRTY ?= false VERSION ?= $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '[:upper:]' '[:lower:]' || echo "unknown") DEPS_UPDATE ?= false deps: @echo "+++ $@ +++" cd $(SRCDIR) && go mod tidy && go mod download cd $(SRCDIR)/cmd/gguf-parser && go mod tidy && go mod download if [[ "$(DEPS_UPDATE)" == "true" ]]; then \ cd $(SRCDIR) && go get -u -v ./...; \ cd $(SRCDIR)/cmd/gguf-parser && go get -u -v ./...; \ fi @echo "--- $@ ---" generate: @echo "+++ $@ +++" cd $(SRCDIR) && go generate ./... cd $(SRCDIR)/cmd/gguf-parser && go generate ./... @echo "--- $@ ---" lint: @echo "+++ $@ +++" [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin" [[ -f "$(SRCDIR)/.sbin/goimports-reviser" ]] || \ curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.8.2/goimports-reviser_3.8.2_$(GOOS)_$(GOARCH).tar.gz" \ | tar -zxvf - --directory "$(SRCDIR)/.sbin" --no-same-owner --exclude ./LICENSE --exclude ./README.md && chmod +x "$(SRCDIR)/.sbin/goimports-reviser" cd $(SRCDIR) && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \ | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 cd $(SRCDIR)/cmd/gguf-parser && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \ | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 [[ -f "$(SRCDIR)/.sbin/golangci-lint" ]] || \ curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \ | sh -s -- -b "$(SRCDIR)/.sbin" "v1.63.4" cd $(SRCDIR) && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... cd $(SRCDIR)/cmd/gguf-parser && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... if [[ "$(LINT_DIRTY)" == "true" ]]; then \ if [[ -n $$(git status --porcelain) ]]; then \ echo "Code tree is dirty."; \ git diff --exit-code; \ fi; \ fi @echo "--- $@ ---" test: @echo "+++ $@ +++" go test -v -failfast -race -cover -timeout=30m $(SRCDIR)/... @echo "--- $@ ---" benchmark: @echo "+++ $@ +++" go test -v -failfast -run="^Benchmark[A-Z]+" -bench=. -benchmem -timeout=30m $(SRCDIR)/... 
@echo "--- $@ ---" gguf-parser: [[ -d "$(SRCDIR)/.dist" ]] || mkdir -p "$(SRCDIR)/.dist" cd "$(SRCDIR)/cmd/gguf-parser" && for os in darwin linux windows; do \ tags="netgo"; \ if [[ $$os == "windows" ]]; then \ suffix=".exe"; \ tags="netcgo"; \ else \ suffix=""; \ fi; \ for arch in amd64 arm64; do \ echo "Building gguf-parser for $$os-$$arch $(VERSION)"; \ GOOS="$$os" GOARCH="$$arch" CGO_ENABLED=1 go build \ -trimpath \ -ldflags="-w -s -X main.Version=$(VERSION)" \ -tags="urfave_cli_no_docs $$tags" \ -o $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix; \ done; \ if [[ $$os == "darwin" ]]; then \ [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin"; \ [[ -f "$(SRCDIR)/.sbin/lipo" ]] || \ GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.2; \ "$(SRCDIR)/.sbin/lipo" -create -output $(SRCDIR)/.dist/gguf-parser-darwin-universal $(SRCDIR)/.dist/gguf-parser-darwin-amd64 $(SRCDIR)/.dist/gguf-parser-darwin-arm64; \ fi;\ if [[ $$os == "$(GOOS)" ]] && [[ $$arch == "$(GOARCH)" ]]; then \ cp -rf $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix $(SRCDIR)/.dist/gguf-parser$$suffix; \ fi; \ done build: gguf-parser PACKAGE_PUBLISH ?= false PACKAGE_REGISTRY ?= "gpustack" PACKAGE_IMAGE ?= "gguf-parser" package: build @echo "+++ $@ +++" if [[ -z $$(command -v docker) ]]; then \ echo "Docker is not installed."; \ exit 1; \ fi; \ platform="linux/amd64,linux/arm64"; \ image="$(PACKAGE_IMAGE):$(VERSION)"; \ if [[ -n "$(PACKAGE_REGISTRY)" ]]; then \ image="$(PACKAGE_REGISTRY)/$$image"; \ fi; \ if [[ "$(PACKAGE_PUBLISH)" == "true" ]]; then \ if [[ -z $$(docker buildx inspect --builder "gguf-parser") ]]; then \ docker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2 --install $$platform; \ docker buildx create --name "gguf-parser" --driver "docker-container" --buildkitd-flags "--allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host" --bootstrap; \ fi; \ docker buildx build --progress=plain --platform=$$platform --builder="gguf-parser" --output="type=image,name=$$image,push=true" "$(SRCDIR)"; \ else \ platform="linux/$(GOARCH)"; \ docker buildx build --progress=plain --platform=$$platform --output="type=docker,name=$$image" "$(SRCDIR)"; \ fi @echo "--- $@ ---" ci: deps generate lint test build ================================================ FILE: README.md ================================================ # GGUF Parser > tl;dr, Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files and estimate the memory > usage. [![Go Report Card](https://goreportcard.com/badge/github.com/gpustack/gguf-parser-go)](https://goreportcard.com/report/github.com/gpustack/gguf-parser-go) [![CI](https://img.shields.io/github/actions/workflow/status/gpustack/gguf-parser-go/cmd.yml?label=ci)](https://github.com/gpustack/gguf-parser-go/actions) [![License](https://img.shields.io/github/license/gpustack/gguf-parser-go?label=license)](https://github.com/gpustack/gguf-parser-go#license) [![Download](https://img.shields.io/github/downloads/gpustack/gguf-parser-go/total)](https://github.com/gpustack/gguf-parser-go/releases) [![Docker Pulls](https://img.shields.io/docker/pulls/gpustack/gguf-parser)](https://hub.docker.com/r/gpustack/gguf-parser) [![Release](https://img.shields.io/github/v/release/gpustack/gguf-parser-go)](https://github.com/gpustack/gguf-parser-go/releases/latest) [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a file format for storing models for inference with GGML and executors based on GGML. 
GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.

GGUF Parser helps in reviewing and estimating the usage and maximum tokens per second of a GGUF format model without downloading it.

## Key Features

- **No File Required**: GGUF Parser uses chunked reading to parse the metadata of a remote GGUF file, which means you don't need to download and load the entire file.
- **Accurate Prediction**: The evaluation results of GGUF Parser usually deviate from the actual usage by only about 100 MiB.
- **Quick Verification**: You can provide device metrics to calculate the maximum tokens per second (TPS) without running the model.
- **Type Screening**: GGUF Parser can distinguish what the GGUF file is used for, such as Embedding, Reranking, LoRA, etc.
- **Fast**: GGUF Parser is written in Go, which is fast and efficient.

## Agenda

- [Notes](#notes)
- [Installation](#installation)
- [Overview](#overview)
    + [Parse](#parse)
        * [Local File](#parse-local-file)
        * [Remote File](#parse-remote-file)
        * [From HuggingFace](#parse-from-huggingface)
        * [From ModelScope](#parse-from-modelscope)
        * [From Ollama Library](#parse-from-ollama-library)
        * [Others](#others)
            * [Image Model](#parse-image-model)
            * [None Model](#parse-none-model)
    + [Estimate](#estimate)
        * [Across Multiple GPU devices](#across-multiple-gpu-devices)
        * [Maximum Tokens Per Second](#maximum-tokens-per-second)
        * [Full Layers Offload (default)](#full-layers-offload-default)
        * [Zero Layers Offload](#zero-layers-offload)
        * [Specific Layers Offload](#specific-layers-offload)
        * [Specific Context Size](#specific-context-size)
        * [Enable Flash Attention](#enable-flash-attention)
        * [Disable MMap](#disable-mmap)
        * [With Adapter](#with-adapter)
        * [Get Proper Offload Layers](#get-proper-offload-layers)

## Notes

- **Since v0.20.0**, GGUF Parser supports leveraging `--override-tensor` to indicate how to place the model tensors.
- **Since v0.19.0**, GGUF Parser supports estimating audio projector model files, like the Ultravox series, Qwen2 Audio series, etc.
- **Since v0.18.0**, GGUF Parser supports estimating SWA-supported (sliding window attention) model files, like the LLaMA 4 series, Gemma2/3 series, etc.
- **Since v0.17.0**, GGUF Parser aligns the `QUANTIZATION` (aka [`general.file_type`](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#general-metadata)) with [HuggingFace processing](https://github.com/huggingface/huggingface.js/blob/2475d6d316135c0a4fceff6b3fe2aed0dde36ac1/packages/gguf/src/types.ts#L11-L48), but there are still many model files whose naming does not fully follow `general.file_type`.
- **Since v0.16.0**, GGUF Parser supports estimating MLA-supported model files, like the DeepSeek series.
- **Since v0.14.0 (BREAKING CHANGE)**, GGUF Parser parses `*.feed_forward_length` metadata as `[]uint64`, which means the architecture `feedForwardLength` is a list of integers.
- **Since v0.13.0 (BREAKING CHANGE)**, GGUF Parser can parse files for [StableDiffusion.Cpp](https://github.com/leejet/stable-diffusion.cpp) or StableDiffusion.Cpp-like applications.
    + [LLaMA Box](https://github.com/gpustack/llama-box) is able to offload different components of the all-in-one model to different devices, e.g. with `-ts 1,1,1`, GGUF Parser returns the usage of the Text Encoder Models on the 1st device, the VAE Model on the 2nd device, and the Diffusion Model on the 3rd device.
- Experimentally, GGUF Parser can estimate the maximum tokens per second (`MAX TPS`) for a (V)LM model according to the `--device-metric` options.
- GGUF Parser distinguishes the remote devices from `--tensor-split` via `--rpc`.
    + For one host with multiple GPU devices, you can use `--tensor-split` to get the estimated memory usage of each GPU.
    + For multiple hosts with multiple GPU devices, you can use `--tensor-split` and `--rpc` to get the estimated memory usage of each GPU. Since v0.11.0, the `--rpc` flag masks the devices specified at the front of `--tensor-split`.
- Table result usage:
    + `DISTRIBUTABLE` indicates whether the GGUF file supports distributed inference; if the file doesn't support it, you cannot offload it with [RPC servers](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc).
    + `RAM` indicates the system memory usage.
    + `VRAM *` indicates the local GPU memory usage.
    + `RPC * (V)RAM` indicates the remote memory usage. The kind of memory is determined by which backend the RPC server uses; check the running logs for more details.
    + `UMA` indicates the memory usage on Apple macOS only. `NONUMA` covers the other cases, including non-GPU devices.
    + `LAYERS` (`I`/`T`/`O`) indicates the count of input layers, transformer layers, and output layers. Input layers are not offloaded at present.
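Besides the CLI, the parser can be embedded as a Go library. The sketch below is illustrative only: it assumes the exported names suggested by this repository's file layout (`ParseGGUFFile` in `file.go`, the `Metadata()` and `Architecture()` views in `file_metadata.go` and `file_architecture.go`), so verify them against the package documentation before relying on it.

```go
package main

import (
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Assumed API: ParseGGUFFile reads a local GGUF file (mmap'd when possible)
	// and returns a parsed view of its header, metadata, and tensor infos.
	f, err := gguf.ParseGGUFFile("DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf")
	if err != nil {
		log.Fatal(err)
	}

	// Assumed API: Metadata() and Architecture() derive the same summaries that
	// the CLI renders in the METADATA and ARCHITECTURE tables below.
	md := f.Metadata()
	arch := f.Architecture()
	fmt.Printf("name=%s arch=%s\n", md.Name, md.Architecture)
	fmt.Printf("layers=%d max-context=%d\n", arch.BlockCount, arch.MaximumContextLength)
}
```

The `file_estimate__*.go` files suggest that the memory estimates shown in the ESTIMATE tables are exposed programmatically as well.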
## Installation

Install from [releases](https://github.com/gpustack/gguf-parser-go/releases).

## Overview

### Parse

#### Parse Local File

```shell
$ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf
+-----------------------------------------------------------------------------------------------------------+
|                                                 METADATA                                                  |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
| TYPE  |          NAME           | ARCH  | QUANTIZATION | LITTLE ENDIAN |   SIZE   | PARAMETERS |   BPW    |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
| model | DeepSeek R1 Distill ... | qwen2 |    Q4_K_M    |     true      | 4.36 GiB |   7.62 B   | 4.91 bpw |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
|                                                            ARCHITECTURE                                                            |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
|     131072      |     3584      |       true       |         28         |   28   |      18944       |     0      |     152064     |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                        TOKENIZER                                                                       |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2  |  2.47 MiB   |   152064   |       N/A        |  151646   |  151643   |    N/A    |    N/A    |      N/A      |       N/A       |    151654     |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                    ESTIMATE                                                                                                                     |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+
| ARCH  | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                     RAM                      |                VRAM 0                 |
|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+----------+-----------+
|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |    UMA     |   NONUMA   | LAYERS (T + O) |   UMA    |  NONUMA   |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
| qwen2 |    131072    |     2048 / 512     |    Disabled     |  Enabled  |       No       | Unsupported |   Supported   |  29 (28 + 1)   |      Yes       |     1 + 0 + 0      | 677.44 MiB | 827.44 MiB |     28 + 1     | 7.30 GiB | 18.89 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+

$ # Retrieve the model's metadata via split file,
$ # which requires that all split files have been downloaded.
$ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q8_0-00001-of-00003.gguf +-------------------------------------------------------------------------------------------------------+ | METADATA | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ | model | qwen2.5-7b-instruct | qwen2 | Q8_0 | true | 7.54 GiB | 7.62 B | 8.50 bpw | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ | qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 
1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 21.82 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse Remote File ```shell $ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/resolve/main/Qwen2.5-72B-Instruct-Q4_K_M.gguf" +---------------------------------------------------------------------------------------------------------+ | METADATA | +-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Qwen2.5 72B Instruct | qwen2 | Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw | +-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 32768 | 8192 | true | 64 | 80 | 29568 | 0 | 152064 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+-----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T 
+ O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ | qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 426.57 MiB | 576.57 MiB | 80 + 1 | 10.31 GiB | 58.18 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ $ # Retrieve the model's metadata via split file $ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/main/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf" +----------------------------------------------------------------------------------------------------------+ | METADATA | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ | model | DeepSeek R1 BF16 | deepseek2 | IQ1_S | true | 130.60 GiB | 671.03 B | 1.67 bpw | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 163840 | 7168 | true | N/A | 61 | 18432 | 256 | 129280 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.21 MiB | 129280 | N/A | 0 | 1 | N/A | N/A | N/A | N/A | 128815 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | 
+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------------------------------+--------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+-----------+-----------+----------------+------------+--------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ | deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.03 GiB | 13.18 GiB | 61 + 1 | 762.76 GiB | 1 TB | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ ``` #### Parse From HuggingFace > [!NOTE] > > Allow using `HF_ENDPOINT` to override the default HuggingFace endpoint: `https://huggingface.co`. ```shell $ gguf-parser --hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-VL-2B-Instruct-f16.gguf" --hf-mmproj-file="mmproj-Qwen2-VL-2B-Instruct-f32.gguf" --visual-max-image-size 1344 +-----------------------------------------------------------------------------------------------------------+ | METADATA | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ | model | Qwen2 VL 2B Instruct | qwen2vl | F16 | true | 2.88 GiB | 1.54 B | 16.00 bpw | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 32768 | 1536 | true | 12 | 28 | 8960 | 0 | 151936 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | 
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 151936 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ | qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 236.87 MiB | 386.87 MiB | 28 + 1 | 3.65 GiB | 12.86 GiB | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ $ # Retrieve the model's metadata via split file $ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf-file="openbuddy-llama3.3-70b-v24.1-131k-Q4_0.gguf" +------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Openbuddy Llama3.3 7... 
| llama | Q4_0 | true | 37.35 GiB | 70.55 B | 4.55 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128048 | N/A | N/A | N/A | N/A | 128044 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ | llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 93.62 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ ``` #### Parse From ModelScope > [!NOTE] > > Allow using `MS_ENDPOINT` to override the default ModelScope endpoint: `https://modelscope.cn`. 
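Both hosted-repo flows, HuggingFace above and ModelScope below, can also be driven from Go. A minimal sketch follows; the helper names are assumptions read off `file_from_remote.go` and may differ from the exported API, so double-check them before depending on this.

```go
package main

import (
	"context"
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	// Assumed API: fetches only the metadata chunks over HTTP range requests,
	// mirroring the CLI's --hf-repo/--hf-file flags (HF_ENDPOINT is honored).
	hf, err := gguf.ParseGGUFFileFromHuggingFace(ctx,
		"bartowski/Qwen2-VL-2B-Instruct-GGUF", "Qwen2-VL-2B-Instruct-f16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("huggingface:", hf.Metadata().Name)

	// Assumed API: the ModelScope counterpart of the call above,
	// mirroring --ms-repo/--ms-file (MS_ENDPOINT is honored).
	ms, err := gguf.ParseGGUFFileFromModelScope(ctx,
		"unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF", "DeepSeek-R1-Distill-Qwen-7B-F16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("modelscope:", ms.Metadata().Name)
}
```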
```shell $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="DeepSeek-R1-Distill-Qwen-7B-F16.gguf" +-------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ | model | DeepSeek R1 Distill ... | qwen2 | F16 | true | 14.19 GiB | 7.62 B | 16.00 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ | qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | 
Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 27.99 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse From Ollama Library > [!NOTE] > > Allow using `--ol-base-url` to override the default Ollama registry endpoint: `https://registry.ollama.ai`. ```shell $ gguf-parser --ol-model="llama3.3" +------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ | 
| | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ | llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 95.86 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ $ # Ollama Model includes the preset params and other artifacts, like multimodal projectors or LoRA adapters, $ # you can get the usage of Ollama running by using `--ol-usage` option. +------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | 
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+------------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ | llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 255.27 MiB | 405.27 MiB | 80 + 1 | 906.50 MiB | 40.49 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ ``` #### Others ##### Parse Image Model ```shell $ # Parse FLUX.1-dev Model $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 41.15 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model without offload Conditioner and Autoencoder $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" 
--hf-file="FLUX.1-dev-FP16.gguf" --clip-on-cpu --vae-on-cpu +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +-------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +--------+-----------------+-------------+---------------+----------------+-----------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | | | | | | +-----------+-----------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 16.44 GiB | 16.59 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ $ # Parse FLUX.1-dev Model with Autoencoder tiling $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --vae-tiling +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | 
+--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 36.28 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model with multiple devices offloading $ # Support by LLaMA Box v0.0.106+, https://github.com/gpustack/llama-box. $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --tensor-split="1,1,1" +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +--------+-----------------+-------------+---------------+----------------+-------------------------+---------------------+---------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | | | | | | +------------+------------+----------+----------+------------+--------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ ``` ##### Parse None Model ```shell $ # Parse Multi-Modal Projector $ gguf-parser --hf-repo="unsloth/Qwen2.5-Omni-3B-GGUF" --hf-file="mmproj-F32.gguf" +-------------------------------------------------------------------------------------------------------+ | METADATA | 
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
| projector | Qwen2.5-Omni-3B | clip | F32 | true | 4.86 GiB | 1.31 B | 31.93 bpw |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
+-------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+-------------------------------+-----------------+-------------------------------------+----------------+
| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER |
| +---------------+---------------+--------+--------+------------------+------------------+ |
| | VISION | AUDIO | VISION | AUDIO | VISION | AUDIO | |
+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+
| qwen2.5o | 1280 | 1280 | 32 | 32 | 1280 | 5120 | Vision & Audio |
+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+

$ # Parse LoRA Adapter
$ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"
+---------------------------------------------------------------------------------------------+
| METADATA |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
| adapter | N/A | llama | F16 | true | 168.08 MiB | 88.12 M | 16.00 bpw |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
+---------------------------+
| ARCHITECTURE |
+--------------+------------+
| ADAPTER TYPE | LORA ALPHA |
+--------------+------------+
| lora | 32 |
+--------------+------------+
```

### Estimate

#### Across Multiple GPU Devices

Imagine you're preparing to run the [hierholzer/Llama-3.1-70B-Instruct-GGUF](https://huggingface.co/hierholzer/Llama-3.1-70B-Instruct-GGUF) model file across several hosts in your local network. Some of these hosts are equipped with GPU devices, while others do not have any GPU capabilities.

```mermaid
flowchart TD
    subgraph host4["Windows 11 (host4)"]
        ram40(["11GiB RAM remaining"])
    end
    subgraph host3["Apple macOS (host3)"]
        gpu10["Apple M1 Max (6GiB VRAM remaining)"]
    end
    subgraph host2["Windows 11 (host2)"]
        gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
    end
    subgraph host1["Ubuntu (host1)"]
        gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"]
        gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
    end
```

##### Single Host Multiple GPU Devices

Let's assume you plan to run the model on `host1` only.
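`host1` exposes two GPUs with 8 GiB and 10 GiB of free VRAM, so the command below passes `--tensor-split="8,10"`. As a rough mental model (a simplification, not the parser's exact placement algorithm), the offloadable layers are divided in proportion to the split fractions:

$$
Layers_i \approx Total\ Layers \times \frac{Split_i}{\sum_j Split_j}
$$

For the 81 offloadable layers here (80 transformer layers plus the output layer), that gives roughly $81 \times 8/18 = 36$ layers on the first GPU and the remaining $45$ on the second, which matches the `36 + 0` and `44 + 1` layer columns in the output below.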
```mermaid flowchart TD subgraph host1["Ubuntu (host1)"] gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"] gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] end ``` ```shell $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+----------------------------------------+ | RAM | VRAM 0 | VRAM 1 | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ | 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.27 GiB | 22.83 GiB | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ ``` Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| | host1 | ENOUGH | 399.27 MiB | | | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 17.83 GiB | | | host1 (NVIDIA 4080 1) | | | 10 GiB | 22.83 GiB | | It appears that running the model on `host1` alone is not feasible. ##### Multiple Hosts Multiple GPU Devices Next, let's consider the scenario where you plan to run the model on `host4`, while offloading all layers to `host1`, `host2`, and `host3`. 
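When mixing local GPUs and RPC servers, GGUF Parser still infers the device count from `--tensor-split`: the leading split fractions map to the `--rpc` servers in the order they are listed, and any remaining fractions map to the local GPU devices. A minimal sketch of the pairing (all other flags elided, host names and ports as in the scenarios of this section):

```shell
$ # 5 split fractions but only 4 RPC servers: the first 4 fractions map to the
$ # RPC servers in order, and the trailing fraction maps to the local GPU.
$ gguf-parser ... \
    --tensor-split="11,12,8,10,6" \
    --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
```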
```mermaid flowchart TD host4 -->|TCP| gpu10 host4 -->|TCP| gpu20 host4 -->|TCP| gpu30 host4 -->|TCP| gpu31 subgraph host4["Windows 11 (host4)"] ram40(["11GiB RAM remaining"]) end subgraph host3["Apple macOS (host3)"] gpu10["Apple M1 Max (6GiB VRAM remaining)"] end subgraph host2["Windows 11 (host2)"] gpu20["NVIDIA 4090 (12GiB VRAM remaining)"] end subgraph host1["Ubuntu (host1)"] gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"] gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] end ``` ```shell $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+ | RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ | 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 7.13 GiB | 7.64 GiB | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| | host4 | 11 GiB | 399.27 MiB | | | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 9.28 GiB | | | host1 (NVIDIA 4080 1) | | | 10 GiB | 11.32 GiB | | | host2 (NVIDIA 4090) | | | 12 GiB | 13.19 GiB | | | host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 7.13 GiB | | It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`. We should consider a different approach: running the model on `host3` while offloading all layers to `host1`, `host2`, and `host4`. 
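Before moving on, a note on reading these tables: the UMA columns apply to devices with unified memory (for example, Apple silicon, where CPU and GPU share one memory pool), while the NONUMA columns apply to discrete GPUs with dedicated VRAM. That is why the feasibility checks compare the NVIDIA cards against the NONUMA values (e.g., 9.28 GiB for RPC 0) but the Apple M1 Max against the UMA value (7.13 GiB for RPC 3).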
```mermaid flowchart TD host3 -->|TCP| ram40 host3 -->|TCP| gpu20 host3 -->|TCP| gpu30 host3 -->|TCP| gpu31 subgraph host4["Windows 11 (host4)"] ram40(["11GiB RAM remaining"]) end subgraph host3["Apple macOS (host3)"] gpu10["Apple M1 Max (6GiB VRAM remaining)"] end subgraph host2["Windows 11 (host2)"] gpu20["NVIDIA 4090 (12GiB VRAM remaining)"] end subgraph host1["Ubuntu (host1)"] gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"] gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] end ``` ```shell $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --estimate --in-short +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------+ | RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ | 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 302.50 MiB | 6.16 GiB | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| | host3 (Apple M1 Max) | ENOUGH | 249.27 MiB | | | :thumbsup: | | host4 | 11 GiB | 9.79 GiB | | | :thumbsup: | | host2 (NVIDIA 4090) | | | 12 GiB | 10.35 GiB | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 7.01 GiB | :thumbsup: | | host1 (NVIDIA 4080 1) | | | 10 GiB | 8.54 GiB | :thumbsup: | | host3 (Apple M1 Max) | | | 6 GiB | 302.50 MiB | :thumbsup: | Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`. 
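The same estimation workflow can also be scripted against the Go package directly, which is handy when automating placement decisions like the ones above. A minimal sketch: `ParseGGUFFileFromHuggingFace` and the `TensorInfos` field come from this repository, error handling is kept deliberately simple, and the memory-estimate entry points live in `file_estimate__llamacpp.go`/`file_estimate_option.go` (not shown here).

```go
package main

import (
	"context"
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Fetch the GGUF header and metadata straight from Hugging Face;
	// the tensor data itself is never downloaded.
	gf, err := gguf.ParseGGUFFileFromHuggingFace(
		context.Background(),
		"hierholzer/Llama-3.1-70B-Instruct-GGUF",
		"Llama-3.1-70B-Instruct-Q4_K_M.gguf",
	)
	if err != nil {
		panic(err)
	}

	// TensorInfos holds one entry per tensor in the file.
	fmt.Printf("parsed %d tensors\n", len(gf.TensorInfos))

	// Memory estimation (context size, tensor split, RPC servers, ...) is
	// exposed via the Estimate* methods; see file_estimate__llamacpp.go.
}
```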
#### Maximum Tokens Per Second

The maximum TPS estimation of GGUF Parser is determined by the model's parameter size, context size, number of offloaded layers, and the devices on which the model runs. Among these factors, the devices' specifications are particularly important.

Inspired by [LLM inference speed of light](https://zeux.io/2024/03/15/llm-inference-sol/), GGUF Parser uses the **FLOPS** and **bandwidth** of a device as evaluation metrics:

- When the device is a CPU, FLOPS refers to the performance of that CPU, while bandwidth corresponds to the DRAM bandwidth.
- When the device is an (i)GPU, FLOPS indicates the performance of that (i)GPU, and bandwidth corresponds to the VRAM bandwidth.
- When the device is a specific host, FLOPS depends on whether the CPU or (i)GPU of that host is being used, while bandwidth corresponds to the bandwidth connecting the main node to that host. **After all, a chain is only as strong as its weakest link.** If the connection bandwidth between the main node and the host is equal to or greater than the host's own (V)RAM bandwidth, the (V)RAM bandwidth value is used instead.

##### CPU FLOPS Calculation

The floating-point performance of a CPU can be calculated using the following formula:

$$
CPU\ FLOPS = Number\ of\ Cores \times Core\ Frequency \times Floating\ Point\ Operations\ per\ Cycle
$$

The Apple M1 Max CPU features a total of 10 cores, consisting of 8 performance cores and 2 efficiency cores. The performance cores operate at a clock speed of 3.2 GHz, while the efficiency cores run at 2.2 GHz. All cores support the [ARM NEON instruction set](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)), which enables 128-bit SIMD operations, allowing multiple floating-point numbers to be processed simultaneously within a single CPU cycle. Specifically, using single-precision (32-bit) floating-point numbers, each cycle can handle 4 floating-point operations.

The peak floating-point performance of a single performance core is calculated as follows:

$$
Peak\ Performance = 3.2\ GHz \times 4\ FLOPs/Cycle = 12.8\ GFLOPS
$$

For a single efficiency core, the calculation is:

$$
Peak\ Performance = 2.2\ GHz \times 4\ FLOPs/Cycle = 8.8\ GFLOPS
$$

Thus, the overall peak floating-point performance of the entire CPU can be determined by combining the contributions of both core types:

$$
Peak\ Performance = 8\ Cores \times 12.8\ GFLOPS + 2\ Cores \times 8.8\ GFLOPS = 120\ GFLOPS
$$

> This results in an average performance of 12 GFLOPS per core. It is evident that the average performance achieved by
> utilizing both performance and efficiency cores is lower than that obtained by exclusively using performance cores.

##### Run LLaMA2-7B-Chat with Apple Silicon M-series

Take [TheBloke/Llama-2-7B-Chat-GGUF](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) as an example and estimate the maximum tokens per second for the Apple Silicon M-series using GGUF Parser.
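Conceptually, the estimates below follow the "speed of light" argument: for every generated token, the device must both read (roughly) all active weights once and execute a matching number of floating-point operations, so the token rate can exceed neither limit. As a first-order bound (a simplification; GGUF Parser's actual estimate also accounts for the context size, the offload split, and the inter-device links):

$$
Max\ TPS \le \min\left(\frac{FLOPS}{FLOPs\ per\ Token}, \frac{Bandwidth}{Bytes\ per\ Token}\right)
$$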
```shell
$ # Estimate full offloaded Q8_0 model
$ # Fill the <...> placeholders with the values of the target variant from the table below.
$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --estimate --in-short \
  -c 512 \
  --device-metric "<CPU FLOPS>;<RAM Bandwidth>" \
  --device-metric "<iGPU FLOPS>;<VRAM Bandwidth>"

$ # Estimate full offloaded Q4_0 model
$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --estimate --in-short \
  -c 512 \
  --device-metric "<CPU FLOPS>;<RAM Bandwidth>" \
  --device-metric "<iGPU FLOPS>;<VRAM Bandwidth>"
```

| Variant | CPU FLOPS (Performance Core) | iGPU FLOPS | (V)RAM Bandwidth | Q8_0 Max TPS | Q4_0 Max TPS |
|----------|------------------------------|------------------------|------------------|--------------|--------------|
| M1 | 51.2 GFLOPS (4 cores) | 2.6 TFLOPS (8 cores) | 68.3 GBps | 8.68 | 14.56 |
| M1 Pro | 102.4 GFLOPS (8 cores) | 5.2 TFLOPS (16 cores) | 204.8 GBps | 26.04 | 43.66 |
| M1 Max | 102.4 GFLOPS (8 cores) | 10.4 TFLOPS (32 cores) | 409.6 GBps | 52.08 | 87.31 |
| M1 Ultra | 204.8 GFLOPS (16 cores) | 21 TFLOPS (64 cores) | 819.2 GBps | 104.16 | 174.62 |
| M2 | 56 GFLOPS (4 cores) | 3.6 TFLOPS (10 cores) | 102.4 GBps | 13.02 | 21.83 |
| M2 Pro | 112 GFLOPS (8 cores) | 6.8 TFLOPS (19 cores) | 204.8 GBps | 26.04 | 43.66 |
| M2 Max | 112 GFLOPS (8 cores) | 13.6 TFLOPS (38 cores) | 409.6 GBps | 52.08 | 87.31 |
| M2 Ultra | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 104.16 | 174.62 |
| M3 | 64.96 GFLOPS (4 cores) | 4.1 TFLOPS (10 cores) | 102.4 GBps | 13.02 | 21.83 |
| M3 Pro | 97.44 GFLOPS (6 cores) | 7.4 TFLOPS (18 cores) | 153.6 GBps | 19.53 | 32.74 |
| M3 Max | 194.88 GFLOPS (12 cores) | 16.4 TFLOPS (40 cores) | 409.6 GBps | 52.08 | 87.31 |
| M4 | 70.56 GFLOPS (4 cores) | 4.1 TFLOPS | 120 GBps | 15.26 | 25.58 |

> References:
> - https://www.cpu-monkey.com/en/cpu_family-apple_m_series
> - https://nanoreview.net/
> - https://en.wikipedia.org/wiki/Apple_M1#Variants
> - https://en.wikipedia.org/wiki/Apple_M2#Variants
> - https://en.wikipedia.org/wiki/Apple_M3#Variants
> - https://en.wikipedia.org/wiki/Apple_M4#Variants

You can further verify the above results in [Performance of llama.cpp on Apple Silicon M-series](https://github.com/ggerganov/llama.cpp/discussions/4167#user-content-fn-1-e9a4caf2848534167e450e18fc4ede7f).

##### Run LLaMA3.1-405B-Instruct with Apple Mac Studio devices combined with Thunderbolt

This example uses [leafspark/Meta-Llama-3.1-405B-Instruct-GGUF](https://huggingface.co/leafspark/Meta-Llama-3.1-405B-Instruct-GGUF) to estimate the maximum tokens per second for three Apple Mac Studio devices connected via Thunderbolt.
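Note the mixed bandwidth units in the table below: `--device-metric` accepts both byte-based (GBps) and bit-based (Gbps) units, and once converted, the Thunderbolt link is by far the weakest link of this cluster:

$$
40\ Gbps = \frac{40}{8}\ GBps = 5\ GBps \ll 819.2\ GBps
$$

This is exactly the weakest-link situation described above, which is why adding more hosts over Thunderbolt does not scale the maximum TPS linearly in the result table further down.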
| Device | CPU FLOPS (Performance Core) | iGPU FLOPS | (V)RAM Bandwidth | Thunderbolt Bandwidth | Role |
|-------------------------------|------------------------------|------------------------|------------------|-----------------------|------------|
| Apple Mac Studio (M2 Ultra) 0 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | Main |
| Apple Mac Studio (M2 Ultra) 1 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | RPC Server |
| Apple Mac Studio (M2 Ultra) 2 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | RPC Server |

Get the maximum tokens per second with the following command:

```shell
$ # Explain the command:
$ # --tensor-split "<Tensor Split>"               <-- one of the splits from the result table below, e.g. "1,1,1"
$ # --device-metric "224GFLOPS;819.2GBps"         <-- Apple Mac Studio 0 CPU FLOPS and RAM Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 1 (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 2 (RPC 1) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps"        <-- Apple Mac Studio 0 iGPU FLOPS and VRAM Bandwidth
$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --estimate --in-short \
  --no-mmap \
  -c 512 \
  --rpc host1:port,host2:port \
  --tensor-split "<Tensor Split>" \
  --device-metric "224GFLOPS;819.2GBps" \
  --device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
  --device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
  --device-metric "27.2TFLOPS;819.2GBps"
```

| Tensor Split | Apple Mac Studio 0 RAM | Apple Mac Studio 1 VRAM (RPC 0) | Apple Mac Studio 2 VRAM (RPC 1) | Apple Mac Studio 0 VRAM | Q4_0 Max TPS |
|--------------|------------------------|---------------------------------|---------------------------------|-------------------------|--------------|
| 1,1,1 | 1.99 GiB | 72.74 GiB | 71.04 GiB | 70.96 GiB | 10.71 |
| 2,1,1 | 1.99 GiB | 108.26 GiB | 54.13 GiB | 52.35 GiB | 11.96 |
| 3,1,1 | 1.99 GiB | 130.25 GiB | 42.29 GiB | 42.20 GiB | 9.10 |
| 4,1,1 | 1.99 GiB | 143.78 GiB | 35.52 GiB | 35.44 GiB | 7.60 |

##### Run Qwen2.5-72B-Instruct with NVIDIA RTX 4080 and remote RPC by Apple Mac Studio (M2)

This example uses [Qwen/Qwen2.5-72B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF) to estimate the maximum tokens per second for NVIDIA RTX 4080 cards, with an Apple Mac Studio (M2) attached as a remote RPC server.
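The two RTX 4080 cards in this setup sit on different PCIe links, so their interconnect bandwidths differ even though the GPUs themselves are identical. With roughly 2 GBps per lane for PCIe 4.0 and 4 GBps per lane for PCIe 5.0, the per-slot figures used in the command below work out to:

$$
PCIe\ 5.0 \times 16 \approx 16 \times 4\ GBps = 64\ GBps, \qquad PCIe\ 4.0 \times 4 \approx 4 \times 2\ GBps = 8\ GBps
$$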
| Hardware | FLOPS | Bandwidth | |---------------------------------------------|--------------|------------| | Intel i5-14600k | 510.4 GFLOPS | | | 2 x Corsair Vengeance RGB DDR5-6000 (32GiB) | | 96 GBps | | 2 x NVIDIA GeForce RTX 4080 | 48.74 TFLOPS | 736.3 GBps | | Apple Mac Studio (M2) | 27.2 TFLOPS | 819.2 GBps | ```shell $ # Explain the command: $ # --tensor-split 20369,12935,13325 <-- Available Memory in MiB for each device $ # --device-metric "510.4GFLOPS;96GBps" <-- Intel i5-14600k CPU FLOPS and RAM Bandwidth $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio (M2) (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;64GBps" <-- NVIDIA GeForce RTX 0 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 5.0 x16 Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;8GBps" <-- NVIDIA GeForce RTX 1 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 4.0 x4 Bandwidth $ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --estimate --in-short \ --no-mmap \ -c 8192 \ --rpc host:port \ --tensor-split 20369,12935,13325 \ --device-metric "510.4GFLOPS;96GBps" \ --device-metric "27.2TFLOPS;819.2GBps;40Gbps" \ --device-metric "48.74TFLOPS;736.3GBps;64GBps" \ --device-metric "48.74TFLOPS;736.3GBps;8GBps" +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-----------+------------------------------------------+----------------------------------------------+----------------------------------------+----------------------------------------+ | MAX TPS | RAM | RPC 0 (V)RAM | VRAM 0 | VRAM 1 | | +--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+ | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+ | 51.82 tps | 1 + 0 + 0 | 1.19 GiB | 1.34 GiB | 36 + 0 | 18.85 GiB | 20.17 GiB | 22 + 0 | 11.34 GiB | 12.66 GiB | 22 + 1 | 12.65 GiB | 13.97 GiB | +-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+ ``` #### Full Layers Offload (default) ```shell $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +------------------------------------------+------------------------------------------+ | RAM | VRAM 0 | +--------------------+----------+----------+----------------+------------+------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+----------+----------+----------------+------------+------------+ | 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | +--------------------+----------+----------+----------------+------------+------------+ ``` #### Zero Layers Offload ```shell $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=0 
--estimate --in-short
+------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+-------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------+-----------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 33.62 GiB |
+--------------------+------------+------------+----------------+--------+-----------+
```

#### Specific Layers Offload

```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=10 --estimate --in-short
+----------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+-----------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------+---------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------+---------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB |
+--------------------+------------+------------+----------------+--------+---------+
```

#### Specific Context Size

By default, the context size is retrieved from the model's metadata. Use `--ctx-size` to specify a different context size.

```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --ctx-size=4096 --estimate --in-short
+--------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+---------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+-----------+
| 1 + 0 + 0 | 404.53 MiB | 554.53 MiB | 126 + 1 | 3.94 GiB | 93.28 GiB |
+--------------------+------------+------------+----------------+----------+-----------+
```

#### Enable Flash Attention

By default, LLaMA.cpp disables Flash Attention. Enabling Flash Attention reduces VRAM usage but increases GPU/CPU usage. Use `--flash-attention` to enable Flash Attention. Note that not all models support Flash Attention; if the model does not support it, "FLASH ATTENTION" shows "Disabled" even if you enable it.
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --flash-attention --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 215.98 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```

#### Disable MMap

By default, LLaMA.cpp loads the model weights via memory-mapping (mmap). On Apple macOS, memory-mapping is an efficient way to load the model and results in lower VRAM usage; on other platforms, it only affects the first-time model loading speed. Use `--no-mmap` to disable loading the model via memory-mapping. Note that some models require loading the whole weights into memory; if the model does not support mmap, "MMAP LOAD" shows "Unsupported".

```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --no-mmap --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 2.97 GiB | 3.12 GiB | 126 + 1 | 214.24 GiB | 246.86 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```

#### With Adapter

Use `--lora`/`--control-vector` to estimate the usage when loading a model with adapters.

```shell
$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+--------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+----------+
| 1 + 0 + 0 | 210.80 MiB | 360.80 MiB | 32 + 1 | 1.25 GiB | 7.04 GiB |
+--------------------+------------+------------+----------------+----------+----------+

$ # With a LoRA adapter.
$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+ | RAM | VRAM 0 | +--------------------+------------+------------+----------------+----------+----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+----------+----------+ | 1 + 0 + 0 | 223.91 MiB | 373.91 MiB | 32 + 1 | 1.42 GiB | 7.20 GiB | +--------------------+------------+------------+----------------+----------+----------+ ``` #### Get Proper Offload Layers Use `--gpu-layers-step` to get the proper offload layers number when the model is too large to fit into the GPUs memory. ```shell $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers-step=6 --estimate --in-short +-----------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+------------------------------------------+ | RAM | VRAM 0 | +--------------------+------------+------------+----------------+------------+------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 120 + 1 | 121.90 GiB | 122.05 GiB | 6 + 0 | 6 GiB | 44.68 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 114 + 1 | 115.90 GiB | 116.05 GiB | 12 + 0 | 12 GiB | 54.74 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 108 + 1 | 109.90 GiB | 110.05 GiB | 18 + 0 | 18 GiB | 64.80 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 102 + 1 | 103.90 GiB | 104.05 GiB | 24 + 0 | 24 GiB | 74.86 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 96 + 1 | 97.90 GiB | 98.05 GiB | 30 + 0 | 30 GiB | 84.93 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 90 + 1 | 91.90 GiB | 92.05 GiB | 36 + 0 | 36 GiB | 94.99 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 84 + 1 | 85.90 GiB | 86.05 GiB | 42 + 0 | 42 GiB | 105.05 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 78 + 1 | 79.90 GiB | 80.05 GiB | 48 + 0 | 48 GiB | 115.11 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 72 + 1 | 73.90 GiB | 74.05 GiB | 54 + 0 | 54 GiB | 125.17 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 66 + 1 | 67.90 GiB | 68.05 GiB | 60 + 0 | 60 GiB | 135.23 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 60 + 1 | 61.90 GiB | 62.05 GiB | 66 + 0 | 66 GiB | 145.29 GiB | 
+--------------------+------------+------------+----------------+------------+------------+ | 1 + 54 + 1 | 55.90 GiB | 56.05 GiB | 72 + 0 | 72 GiB | 155.35 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 48 + 1 | 49.90 GiB | 50.05 GiB | 78 + 0 | 78 GiB | 165.42 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 42 + 1 | 43.90 GiB | 44.05 GiB | 84 + 0 | 84 GiB | 175.48 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 36 + 1 | 37.90 GiB | 38.05 GiB | 90 + 0 | 90 GiB | 185.54 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 30 + 1 | 31.90 GiB | 32.05 GiB | 96 + 0 | 96 GiB | 195.60 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 24 + 1 | 25.90 GiB | 26.05 GiB | 102 + 0 | 102 GiB | 205.66 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 18 + 1 | 19.90 GiB | 20.05 GiB | 108 + 0 | 108 GiB | 215.72 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 12 + 1 | 13.90 GiB | 14.05 GiB | 114 + 0 | 114 GiB | 226.05 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 6 + 1 | 7.90 GiB | 8.05 GiB | 120 + 0 | 120 GiB | 236.64 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 0 + 1 | 1.90 GiB | 2.05 GiB | 126 + 0 | 126 GiB | 246.24 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | +--------------------+------------+------------+----------------+------------+------------+ ``` ## License MIT ================================================ FILE: cache.go ================================================ package gguf_parser import ( "errors" "fmt" "os" "path/filepath" "time" "github.com/gpustack/gguf-parser-go/util/json" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/stringx" ) var ( ErrGGUFFileCacheDisabled = errors.New("GGUF file cache disabled") ErrGGUFFileCacheMissed = errors.New("GGUF file cache missed") ErrGGUFFileCacheCorrupted = errors.New("GGUF file cache corrupted") ) type GGUFFileCache string func (c GGUFFileCache) getKeyPath(key string) string { k := stringx.SumByFNV64a(key) p := filepath.Join(string(c), k[:1], k) return p } func (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, error) { if c == "" { return nil, ErrGGUFFileCacheDisabled } if key == "" { return nil, ErrGGUFFileCacheMissed } p := c.getKeyPath(key) if !osx.Exists(p, func(stat os.FileInfo) bool { if !stat.Mode().IsRegular() { return false } return exp == 0 || time.Since(stat.ModTime()) < exp }) { return nil, ErrGGUFFileCacheMissed } var gf GGUFFile { bs, err := os.ReadFile(p) if err != nil { return nil, fmt.Errorf("GGUF file cache get: %w", err) } if err = json.Unmarshal(bs, &gf); err != nil { return nil, fmt.Errorf("GGUF file cache get: %w", err) } } if len(gf.TensorInfos) == 0 { _ = os.Remove(p) return nil, ErrGGUFFileCacheCorrupted } return &gf, nil } func (c GGUFFileCache) Put(key string, gf *GGUFFile) error { if c == "" { return ErrGGUFFileCacheDisabled } if key == "" || gf == nil { return nil } bs, err := json.Marshal(gf) if err != nil { return fmt.Errorf("GGUF file cache put: %w", 
err) } p := c.getKeyPath(key) if err = osx.WriteFile(p, bs, 0o600); err != nil { return fmt.Errorf("GGUF file cache put: %w", err) } return nil } func (c GGUFFileCache) Delete(key string) error { if c == "" { return ErrGGUFFileCacheDisabled } if key == "" { return ErrGGUFFileCacheMissed } p := c.getKeyPath(key) if !osx.ExistsFile(p) { return ErrGGUFFileCacheMissed } if err := os.Remove(p); err != nil { return fmt.Errorf("GGUF file cache delete: %w", err) } return nil } ================================================ FILE: cmd/gguf-parser/README.md ================================================ # GGUF Parser Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files, estimate the memory usage for [llama.cpp](https://github.com/ggerganov/llama.cpp), [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) and [llama-box](https://github.com/gpustack/llama-box). See [GGUF Parser detail introduction](https://github.com/gpustack/gguf-parser-go) for more information. ## Usage ```shell $ gguf-parser --help NAME: gguf-parser - Review/Check GGUF files and estimate the memory usage. USAGE: gguf-parser [GLOBAL OPTIONS] VERSION: ... GLOBAL OPTIONS: --debug Enable debugging, verbosity. (default: false) --help, -h Print the usage. --version, -v Print the version. Estimate --device-metric value [ --device-metric value ] Specify the device metrics, which is used to estimate the throughput, in form of "FLOPS;Up Bandwidth[;Down Bandwidth]". The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. For example, "--device-metric 10TFLOPS;400GBps" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, "--device-metric 10TFLOPS;400GBps;5000MBps" means the device has 5000MBps Down bandwidth. If the quantity specified by "--device-metric" is less than the number of estimation devices(determined by "--tensor-split" and "--rpc" to infer the device count), then replicate the last "--device-metric" to meet the required number of evaluation devices. --flash-attention, --flash-attn, --fa, --diffusion-fa Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false) --gpu-layers value, --ngl value, --n-gpu-layers value Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default: -1) --main-gpu value, --mg value Specify the GPU to use for the model (with "--split-mode=none") or for intermediate results and KV (with "--split-mode=row"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, "--main-gpu" only works when "--tensor-split" is set. (default: 0) --no-flash-attention, --no-flash-attn Specify disabling Flash Attention. (default: false) --override-tensor value, --ot value [ --override-tensor value, --ot value ] Override tensor buffer type, for example, use --override-tensor "[2-9][0-9]\.ffn_.*_exps\.=CPU" to keep experts of layers 20-99 in the CPU --parallel-size value, --parallel value, --np value, --threads-http value Specify the number of parallel sequences to decode, which is used to estimate the usage. 
(default: 1) --platform-footprint value Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is "150,250". Different platform always gets different RAM and VRAM footprints, for example, within CUDA, "cudaMemGetInfo" or "cudaSetDevice" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250") --rpc value Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Woks with "--tensor-split". --tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set "--tensor-split" to indicate how many devices are used. To declare the devices belong to RPC servers, set "--rpc" please. Estimate/LLaMACpp --batch-size value, -b value Specify the logical batch size, which is used to estimate the usage. (default: 2048) --cache-type-k value, --ctk value Specify the type of Key cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16") --cache-type-v value, --ctv value Specify the type of Value cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16") --ctx-size value, -c value Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default: 0) --gpu-layers-draft value, --ngld value, --n-gpu-layers-draft value Specify how many layers of the draft model to offload, which is used to estimate the usage, default is full offloaded. (default: -1) --gpu-layers-step value Specify the step of layers to offload, works with "--gpu-layers". (default: 0) --in-max-ctx-size Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size. (default: false) --max-projected-cache value, --visual-max-image-cache value Specify how many projected embedding to be cached. (default: 0) --mmap Specify enabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false) --no-kv-offload, --nkvo Specify disabling Key-Value offloading, which is used to estimate the usage. Disable Key-Value offloading can reduce the usage of VRAM. (default: false) --no-mmap Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false) --rope-freq-base value RoPE base frequency, used by NTK-aware scaling. (default: 0) --rope-freq-scale value RoPE frequency scaling factor, expands context by a factor of 1/N. (default: 0) --rope-scale value RoPE context scaling factor, expands context by a factor of N. (default: 0) --rope-scaling value RoPE frequency scaling method, defaults to linear unless specified by the model, select from [none, linear, yarn]. --split-mode value, --sm value Specify how to split the model across multiple devices, which is used to estimate the usage, select from [layer, row, none]. Since gguf-parser always estimates the usage of VRAM, "none" is meaningless here, keep for compatibility. (default: "layer") --swa-full Specify using full-size SWA cache. 
(default: false) --ubatch-size value, --ub value Specify the physical maximum batch size, which is used to estimate the usage. (default: 512) --visual-max-image-size value Specify maximum image size when completion with vision model. (default: 0) --yarn-orig-ctx value YaRN original context size of model, defaults to model training context size. (default: 0) Estimate/StableDiffusionCpp --image-autoencoder-tiling, --vae-tiling, --image-vae-tiling Specify to enable tiling for the vae model. (default: false) --image-batch-count value, --batch-count value, --image-max-batch value Specify the batch(generation) count of the image. (default: 1) --image-free-compute-memory-immediately Specify to free the compute memory immediately after the generation, which burst using VRAM. (default: false) --image-height value, --height value, --image-max-height value Specify the (maximum) height of the image. (default: 1024) --image-no-autoencoder-offload, --vae-on-cpu, --image-no-vae-model-offload Specify to offload the vae model to CPU. (default: false) --image-no-autoencoder-tiling, --image-no-vae-tiling Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling. (default: false) --image-no-conditioner-offload, --clip-on-cpu, --image-no-text-encoder-model-offload Specify to offload the text encoder model to CPU. (default: false) --image-no-control-net-offload, --control-net-cpu, --image-no-control-net-model-offload Specify to offload the control net model to CPU. (default: false) --image-width value, --width value, --image-max-width value Specify the (maximum) width of the image. (default: 1024) Load --cache-expiration value Specify the expiration of cache, works with "--url/--hf-*/--ms-*/--ol-*". (default: 24h0m0s) --cache-path value Cache the read result to the path, works with "--url/--hf-*/--ms-*/--ol-*". (default: "/Users/thxcode/.cache/gguf-parser") --skip-cache Skip cache, works with "--url/--hf-*/--ms-*/--ol-*", default is caching the read result. (default: false) [$SKIP_CACHE] --skip-dns-cache Skip DNS cache, works with "--url/--hf-*/--ms-*/--ol-*", default is caching the DNS lookup result. (default: false) [$SKIP_DNS_CACHE] --skip-proxy Skip proxy settings, works with "--url/--hf-*/--ms-*/--ol-*", default is respecting the environment variables "HTTP_PROXY/HTTPS_PROXY/NO_PROXY". (default: false) [$SKIP_PROXY] --skip-range-download-detect Skip range download detect, works with "--url/--hf-*/--ms-*/--ol-*", default is detecting the range download support. (default: false) [$SKIP_RANGE_DOWNLOAD_DETECT] --skip-tls-verify Skip TLS verification, works with "--url/--hf-*/--ms-*/--ol-*", default is verifying the TLS certificate on HTTPs request. (default: false) [$SKIP_TLS_VERIFY] Model/Local --control-net-path value, --control-net value, --image-control-net-model value Path where the GGUF file to load for the Control Net model, optional. --control-vector-path value, --control-vector value [ --control-vector-path value, --control-vector value ] Path where the GGUF file to load for the Control Vector adapter, optional. --draft-path value, --model-draft value, --md value Path where the GGUF file to load for the draft model, optional, e.g. "~/.cache/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF/Qwen2-1.5B-Instruct.Q5_K_M.gguf". --lora-path value, --lora value [ --lora-path value, --lora value ] Path where the GGUF file to load for the LoRA adapter, optional. 
--mmproj-path value, --mmproj value Path where the GGUF file to load for the multimodal projector, optional. --path value, --model value, -m value Path where the GGUF file to load for the main model, e.g. "~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf". --upscale-path value, --upscale-model value, --image-upscale-model value Path where the GGUF file to load for the Upscale model, optional. Model/Remote --control-net-url value Url where the GGUF file to load for the Control Net model, optional. --control-vector-url value [ --control-vector-url value ] Url where the GGUF file to load for the Control Vector adapter, optional. --draft-url value Url where the GGUF file to load for the draft model, optional, e.g. "https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf". Note that gguf-parser does not need to download the entire GGUF file. --header value [ --header value ] Custom HTTP header in "Key: Value" format, works with "--url/--draft-url". --lora-url value [ --lora-url value ] Url where the GGUF file to load for the LoRA adapter, optional. --mmproj-url value Url where the GGUF file to load for the multimodal projector, optional. --token value Bearer auth token to load GGUF file, optional, works with "--url/--draft-url". [$TOKEN] --upscale-url value Url where the GGUF file to load for the Upscale model, optional. --url value, --model-url value, --mu value Url where the GGUF file to load for the main model, e.g. "https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf". Note that gguf-parser does not need to download the entire GGUF file. Model/Remote/HuggingFace --hf-control-net-file value Model file below the "--hf-control-net-repo", optional. --hf-control-net-repo value Repository of HuggingFace which the GGUF file store for the Control Net model, optional, works with "--hf-control-net-file". --hf-control-vector-file value [ --hf-control-vector-file value ] Control Vector adapter file below the "--hf-repo". --hf-draft-file value Model file below the "--hf-draft-repo", optional, e.g. "Qwen2-1.5B-Instruct.Q5_K_M.gguf". --hf-draft-repo value Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. "QuantFactory/Qwen2-1.5B-Instruct-GGUF", works with "--hf-draft-file". --hf-file value, --hff value Model file below the "--hf-repo", e.g. "Qwen2-7B-Instruct.Q5_K_M.gguf". --hf-lora-file value [ --hf-lora-file value ] LoRA adapter file below the "--hf-repo". --hf-mmproj-file value Multimodal projector file below the "--hf-repo". --hf-repo value, --hfr value Repository of HuggingFace which the GGUF file store for the main model, e.g. "QuantFactory/Qwen2-7B-Instruct-GGUF", works with "--hf-file". --hf-token value, --hft value User access token of HuggingFace, optional, works with "--hf-repo/--hf-file pair" or "--hf-draft-repo/--hf-draft-file" pair. See https://huggingface.co/settings/tokens. [$HF_TOKEN] --hf-upscale-file value Model file below the "--hf-upscale-repo", optional. --hf-upscale-repo value Repository of HuggingFace which the GGUF file store for the Upscale model, optional, works with "--hf-upscale-file". Model/Remote/ModelScope --ms-control-net-file value Model file below the "--ms-control-net-repo", optional. --ms-control-net-repo value Repository of ModelScope which the GGUF file store for the Control Net model, optional, works with "--ms-control-net-file". 
--ms-control-vector-file value [ --ms-control-vector-file value ] Control Vector adapter file below the "--ms-repo". --ms-draft-file value Model file below the "--ms-draft-repo", optional, e.g. "qwen1_5-1_8b-chat-q5_k_m.gguf". --ms-draft-repo value Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. "qwen/Qwen1.5-1.8B-Chat-GGUF", works with "--ms-draft-file". --ms-file value Model file below the "--ms-repo", e.g. "qwen1_5-7b-chat-q5_k_m.gguf". --ms-lora-file value [ --ms-lora-file value ] LoRA adapter file below the "--ms-repo". --ms-mmproj-file value Multimodal projector file below the "--ms-repo". --ms-repo value Repository of ModelScope which the GGUF file store for the main model, e.g. "qwen/Qwen1.5-7B-Chat-GGUF", works with "--ms-file". --ms-token value Git access token of ModelScope, optional, works with "--ms-repo/--ms-file" pair or "--ms-draft-repo/--ms-draft-file" pair. See https://modelscope.cn/my/myaccesstoken. [$HF_TOKEN, $MS_TOKEN] --ms-upscale-file value Model file below the "--ms-upscale-repo", optional. --ms-upscale-repo value Repository of ModelScope which the GGUF file store for the Upscale model, optional, works with "--ms-upscale-file". Model/Remote/Ollama --ol-base-url value Model base URL of Ollama, e.g. https://registry.ollama.ai. (default: "https://registry.ollama.ai") --ol-model value Model name of Ollama, e.g. "gemma2". --ol-usage Specify respecting the extending layers introduced by Ollama, works with "--ol-model", which affects the usage estimation. (default: false) Output --estimate Skip all the information except the estimate result. (default: false) --in-mib Display the estimated result in table with MiB. (default: false) --in-short Display the estimated result in table in short form. (default: false) --json Output as JSON. (default: false) --json-pretty Works with "--json", to output pretty format JSON. (default: true) --raw Output the GGUF file information as JSON only, skip anything. (default: false) --raw-output value Works with "--raw", to save the result to the file --skip-architecture Skip to display architecture. (default: false) --skip-estimate Skip to estimate. By default, gguf-parser always estimates the file which types with "model". (default: false) --skip-metadata Skip to display metadata. (default: false) --skip-tokenizer Skip to display tokenizer. By default, gguf-parser always displays the tokenizer of the file which types with "model". (default: false) ``` ### Environment Variables Support - `TOKEN`: The bearer auth token to load GGUF file, works with `--url/--draft-url`. - `HF_ENDPOINT`: The HuggingFace endpoint, default is `https://huggingface.co`. - `HF_TOKEN`: The HuggingFace token, see [HuggingFace token](https://huggingface.co/settings/tokens). - `MS_ENDPOINT`: The ModelScope endpoint, default is `https://modelscope.cn`. - `MS_TOKEN`: The ModelScope token, see [ModelScope token](https://modelscope.cn/my/myaccesstoken). 
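### Library Usage

The CLI is a thin wrapper over the parent `github.com/gpustack/gguf-parser-go` module, so the same parse-then-estimate flow can be scripted directly in Go. Below is a minimal, illustrative sketch: it reuses the HuggingFace repository and file from the flag examples above, the 8192-token context size is an arbitrary choice, and the option names are those this CLI itself calls.

```go
package main

import (
	"context"
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	// Parse a remote GGUF file; only the header and metadata are read over
	// ranged requests, the weights themselves are never downloaded.
	gf, err := parser.ParseGGUFFileFromHuggingFace(ctx,
		"QuantFactory/Qwen2-7B-Instruct-GGUF", "Qwen2-7B-Instruct.Q5_K_M.gguf",
		parser.SkipLargeMetadata(), parser.UseCache())
	if err != nil {
		panic(err)
	}

	// Estimate the memory usage as llama.cpp would run the model.
	e := gf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(8192))

	// Summarize with mmap enabled and no extra platform footprint.
	s := e.Summarize(true, 0, 0)
	fmt.Println("RAM (UMA):", s.Items[0].RAM.UMA)
	for _, v := range s.Items[0].VRAMs {
		fmt.Printf("VRAM %d (NonUMA): %v\n", v.Position, v.NonUMA)
	}
}
```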
## License MIT ================================================ FILE: cmd/gguf-parser/go.mod ================================================ module github.com/gpustack/gguf-parser-go/cmd/gguf-parser go 1.22.0 toolchain go1.22.9 replace github.com/gpustack/gguf-parser-go => ../../ require ( github.com/gpustack/gguf-parser-go v0.6.0 github.com/jedib0t/go-pretty/v6 v6.6.1 github.com/urfave/cli/v2 v2.27.5 ) require ( github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect github.com/henvic/httpretty v0.1.4 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect golang.org/x/crypto v0.29.0 // indirect golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect golang.org/x/mod v0.22.0 // indirect golang.org/x/sync v0.9.0 // indirect golang.org/x/sys v0.27.0 // indirect golang.org/x/tools v0.27.0 // indirect gonum.org/v1/gonum v0.15.1 // indirect ) ================================================ FILE: cmd/gguf-parser/go.sum ================================================ github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU= github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM= github.com/jedib0t/go-pretty/v6 v6.6.1 h1:iJ65Xjb680rHcikRj6DSIbzCex2huitmc7bDtxYVWyc= github.com/jedib0t/go-pretty/v6 v6.6.1/go.mod h1:zbn98qrYlh95FIhwwsbIip0LYpwSG8SUOScs+v9/t0E= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg 
v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o= golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: cmd/gguf-parser/main.go ================================================ package main import ( "errors" "fmt" "net" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "time" "github.com/gpustack/gguf-parser-go/util/anyx" "github.com/gpustack/gguf-parser-go/util/json" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/signalx" "github.com/jedib0t/go-pretty/v6/table" "github.com/jedib0t/go-pretty/v6/text" "github.com/urfave/cli/v2" . 
"github.com/gpustack/gguf-parser-go" // nolint: stylecheck ) var Version = "v0.0.0" func init() { cli.VersionFlag = &cli.BoolFlag{ Name: "version", Aliases: []string{"v"}, Usage: "Print the version.", DisableDefaultText: true, } cli.HelpFlag = &cli.BoolFlag{ Name: "help", Aliases: []string{"h"}, Usage: "Print the usage.", DisableDefaultText: true, } } func main() { name := filepath.Base(os.Args[0]) app := &cli.App{ Name: name, Usage: "Review/Check GGUF files and estimate the memory usage and provide optimization suggestions.", UsageText: name + " [GLOBAL OPTIONS]", Version: Version, Reader: os.Stdin, Writer: os.Stdout, ErrWriter: os.Stderr, HideHelpCommand: true, OnUsageError: func(c *cli.Context, _ error, _ bool) error { return cli.ShowAppHelp(c) }, Flags: []cli.Flag{ &cli.BoolFlag{ Destination: &debug, Value: debug, Name: "debug", Usage: "Enable debugging, verbosity.", }, &cli.StringFlag{ Destination: &path, Value: path, Category: "Model/Local", Name: "path", Aliases: []string{ // LLaMACpp compatibility "model", "m", }, Usage: "Path where the GGUF file to load for the main model, e.g. \"~/.cache" + "/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF" + "/Qwen2-7B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &draftPath, Value: draftPath, Category: "Model/Local", Name: "draft-path", Aliases: []string{ // LLaMACpp compatibility "model-draft", "md", }, Usage: "Path where the GGUF file to load for the draft model, optional, e.g. \"~/.cache" + "/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF" + "/Qwen2-1.5B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &mmprojPath, Value: mmprojPath, Category: "Model/Local", Name: "mmproj-path", Aliases: []string{ // LLaMACpp compatibility "mmproj", }, Usage: "Path where the GGUF file to load for the multimodal projector, optional.", }, &cli.StringSliceFlag{ Destination: &loraPaths, Category: "Model/Local", Name: "lora-path", Aliases: []string{ // LLaMACpp compatibility "lora", }, Usage: "Path where the GGUF file to load for the LoRA adapter, optional.", }, &cli.StringSliceFlag{ Destination: &controlVectorPaths, Category: "Model/Local", Name: "control-vector-path", Aliases: []string{ // LLaMACpp compatibility "control-vector", }, Usage: "Path where the GGUF file to load for the Control Vector adapter, optional.", }, &cli.StringFlag{ Destination: &upscalePath, Value: upscalePath, Category: "Model/Local", Name: "upscale-path", Aliases: []string{ "upscale-model", // StableDiffusionCpp compatibility "image-upscale-model", // LLaMABox compatibility }, Usage: "Path where the GGUF file to load for the Upscale model, optional.", }, &cli.StringFlag{ Destination: &controlNetPath, Value: controlNetPath, Category: "Model/Local", Name: "control-net-path", Aliases: []string{ "control-net", // StableDiffusionCpp compatibility "image-control-net-model", // LLaMABox compatibility }, Usage: "Path where the GGUF file to load for the Control Net model, optional.", }, &cli.StringFlag{ Destination: &url, Value: url, Category: "Model/Remote", Name: "url", Aliases: []string{ "model-url", "mu", }, Usage: "Url where the GGUF file to load for the main model, e.g. " + "\"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF" + "/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf\". " + "Note that gguf-parser does not need to download the entire GGUF file.", }, &cli.StringFlag{ Destination: &draftUrl, Value: draftUrl, Category: "Model/Remote", Name: "draft-url", Usage: "Url where the GGUF file to load for the draft model, optional, e.g. 
" + "\"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF" + "/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf\". " + "Note that gguf-parser does not need to download the entire GGUF file.", }, &cli.StringFlag{ Destination: &mmprojUrl, Value: mmprojUrl, Category: "Model/Remote", Name: "mmproj-url", Usage: "Url where the GGUF file to load for the multimodal projector, optional.", }, &cli.StringSliceFlag{ Destination: &loraUrls, Category: "Model/Remote", Name: "lora-url", Usage: "Url where the GGUF file to load for the LoRA adapter, optional.", }, &cli.StringSliceFlag{ Destination: &controlVectorUrls, Category: "Model/Remote", Name: "control-vector-url", Usage: "Url where the GGUF file to load for the Control Vector adapter, optional.", }, &cli.StringFlag{ Destination: &upscaleUrl, Value: upscaleUrl, Category: "Model/Remote", Name: "upscale-url", Usage: "Url where the GGUF file to load for the Upscale model, optional.", }, &cli.StringFlag{ Destination: &controlNetUrl, Value: controlNetUrl, Category: "Model/Remote", Name: "control-net-url", Usage: "Url where the GGUF file to load for the Control Net model, optional.", }, &cli.StringFlag{ Destination: &token, Value: token, Category: "Model/Remote", Name: "token", EnvVars: []string{ "TOKEN", }, Usage: "Bearer auth token to load GGUF file, optional, " + "works with \"--url/--draft-url\".", }, &cli.StringSliceFlag{ Destination: &headers, Category: "Model/Remote", Name: "header", Usage: "Custom HTTP header in \"Key: Value\" format, " + "works with \"--url/--draft-url\".", }, &cli.StringFlag{ Destination: &hfRepo, Value: hfRepo, Category: "Model/Remote/HuggingFace", Name: "hf-repo", Aliases: []string{ // LLaMACpp compatibility "hfr", }, Usage: "Repository of HuggingFace which the GGUF file store for the main model, e.g. " + "\"QuantFactory/Qwen2-7B-Instruct-GGUF\", works with \"--hf-file\".", }, &cli.StringFlag{ Destination: &hfFile, Value: hfFile, Category: "Model/Remote/HuggingFace", Name: "hf-file", Aliases: []string{ // LLaMACpp compatibility "hff", }, Usage: "Model file below the \"--hf-repo\", e.g. " + "\"Qwen2-7B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &hfDraftRepo, Value: hfDraftRepo, Category: "Model/Remote/HuggingFace", Name: "hf-draft-repo", Usage: "Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. " + "\"QuantFactory/Qwen2-1.5B-Instruct-GGUF\", works with \"--hf-draft-file\".", }, &cli.StringFlag{ Destination: &hfDraftFile, Value: hfDraftFile, Category: "Model/Remote/HuggingFace", Name: "hf-draft-file", Usage: "Model file below the \"--hf-draft-repo\", optional, e.g. 
" + "\"Qwen2-1.5B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &hfMMProjFile, Value: hfMMProjFile, Category: "Model/Remote/HuggingFace", Name: "hf-mmproj-file", Usage: "Multimodal projector file below the \"--hf-repo\".", }, &cli.StringSliceFlag{ Destination: &hfLoRAFiles, Category: "Model/Remote/HuggingFace", Name: "hf-lora-file", Usage: "LoRA adapter file below the \"--hf-repo\".", }, &cli.StringSliceFlag{ Destination: &hfControlVectorFiles, Category: "Model/Remote/HuggingFace", Name: "hf-control-vector-file", Usage: "Control Vector adapter file below the \"--hf-repo\".", }, &cli.StringFlag{ Destination: &hfUpscaleRepo, Value: hfUpscaleRepo, Category: "Model/Remote/HuggingFace", Name: "hf-upscale-repo", Usage: "Repository of HuggingFace which the GGUF file store for the Upscale model, optional, " + "works with \"--hf-upscale-file\".", }, &cli.StringFlag{ Destination: &hfUpscaleFile, Value: hfUpscaleFile, Category: "Model/Remote/HuggingFace", Name: "hf-upscale-file", Usage: "Model file below the \"--hf-upscale-repo\", optional.", }, &cli.StringFlag{ Destination: &hfControlNetRepo, Value: hfControlNetRepo, Category: "Model/Remote/HuggingFace", Name: "hf-control-net-repo", Usage: "Repository of HuggingFace which the GGUF file store for the Control Net model, optional, " + "works with \"--hf-control-net-file\".", }, &cli.StringFlag{ Destination: &hfControlNetFile, Value: hfControlNetFile, Category: "Model/Remote/HuggingFace", Name: "hf-control-net-file", Usage: "Model file below the \"--hf-control-net-repo\", optional.", }, &cli.StringFlag{ Destination: &hfToken, Value: hfToken, Category: "Model/Remote/HuggingFace", Name: "hf-token", Aliases: []string{ // LLaMACpp compatibility "hft", }, EnvVars: []string{ "HF_TOKEN", }, Usage: "User access token of HuggingFace, optional, " + "works with \"--hf-repo/--hf-file pair\" or \"--hf-draft-repo/--hf-draft-file\" pair. " + "See https://huggingface.co/settings/tokens.", }, &cli.StringFlag{ Destination: &msRepo, Value: msRepo, Category: "Model/Remote/ModelScope", Name: "ms-repo", Usage: "Repository of ModelScope which the GGUF file store for the main model, e.g. " + "\"qwen/Qwen1.5-7B-Chat-GGUF\", works with \"--ms-file\".", }, &cli.StringFlag{ Destination: &msFile, Value: msFile, Category: "Model/Remote/ModelScope", Name: "ms-file", Usage: "Model file below the \"--ms-repo\", e.g. " + "\"qwen1_5-7b-chat-q5_k_m.gguf\".", }, &cli.StringFlag{ Destination: &msDraftRepo, Value: msDraftRepo, Category: "Model/Remote/ModelScope", Name: "ms-draft-repo", Usage: "Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. " + "\"qwen/Qwen1.5-1.8B-Chat-GGUF\", works with \"--ms-draft-file\".", }, &cli.StringFlag{ Destination: &msDraftFile, Value: msDraftFile, Category: "Model/Remote/ModelScope", Name: "ms-draft-file", Usage: "Model file below the \"--ms-draft-repo\", optional, e.g. 
" + "\"qwen1_5-1_8b-chat-q5_k_m.gguf\".", }, &cli.StringFlag{ Destination: &msMMProjFile, Value: msMMProjFile, Category: "Model/Remote/ModelScope", Name: "ms-mmproj-file", Usage: "Multimodal projector file below the \"--ms-repo\".", }, &cli.StringSliceFlag{ Destination: &msLoRAFiles, Category: "Model/Remote/ModelScope", Name: "ms-lora-file", Usage: "LoRA adapter file below the \"--ms-repo\".", }, &cli.StringSliceFlag{ Destination: &msControlVectorFiles, Category: "Model/Remote/ModelScope", Name: "ms-control-vector-file", Usage: "Control Vector adapter file below the \"--ms-repo\".", }, &cli.StringFlag{ Destination: &msUpscaleRepo, Value: msUpscaleRepo, Category: "Model/Remote/ModelScope", Name: "ms-upscale-repo", Usage: "Repository of ModelScope which the GGUF file store for the Upscale model, optional, " + "works with \"--ms-upscale-file\".", }, &cli.StringFlag{ Destination: &msUpscaleFile, Value: msUpscaleFile, Category: "Model/Remote/ModelScope", Name: "ms-upscale-file", Usage: "Model file below the \"--ms-upscale-repo\", optional.", }, &cli.StringFlag{ Destination: &msControlNetRepo, Value: msControlNetRepo, Category: "Model/Remote/ModelScope", Name: "ms-control-net-repo", Usage: "Repository of ModelScope which the GGUF file store for the Control Net model, optional, " + "works with \"--ms-control-net-file\".", }, &cli.StringFlag{ Destination: &msControlNetFile, Value: msControlNetFile, Category: "Model/Remote/ModelScope", Name: "ms-control-net-file", Usage: "Model file below the \"--ms-control-net-repo\", optional.", }, &cli.StringFlag{ Destination: &msToken, Value: msToken, Category: "Model/Remote/ModelScope", Name: "ms-token", EnvVars: []string{ "HF_TOKEN", // Compatible with HuggingFace "MS_TOKEN", }, Usage: "Git access token of ModelScope, optional, " + "works with \"--ms-repo/--ms-file\" pair or \"--ms-draft-repo/--ms-draft-file\" pair. " + "See https://modelscope.cn/my/myaccesstoken.", }, &cli.StringFlag{ Destination: &olBaseURL, Value: olBaseURL, Category: "Model/Remote/Ollama", Name: "ol-base-url", Usage: "Model base URL of Ollama, e.g. " + "https://registry.ollama.ai.", }, &cli.StringFlag{ Destination: &olModel, Value: olModel, Category: "Model/Remote/Ollama", Name: "ol-model", Usage: "Model name of Ollama, e.g. 
" + "\"gemma2\".", }, &cli.BoolFlag{ Destination: &olUsage, Value: olUsage, Category: "Model/Remote/Ollama", Name: "ol-usage", Usage: "Specify respecting the extending layers introduced by Ollama, " + "works with \"--ol-model\", which affects the usage estimation.", }, &cli.BoolFlag{ Destination: &skipProxy, Value: skipProxy, Category: "Load", Name: "skip-proxy", EnvVars: []string{ "SKIP_PROXY", }, Usage: "Skip proxy settings, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is respecting the environment variables \"HTTP_PROXY/HTTPS_PROXY/NO_PROXY\".", }, &cli.BoolFlag{ Destination: &skipTLSVerify, Value: skipTLSVerify, Category: "Load", Name: "skip-tls-verify", EnvVars: []string{ "SKIP_TLS_VERIFY", }, Usage: "Skip TLS verification, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is verifying the TLS certificate on HTTPs request.", }, &cli.BoolFlag{ Destination: &skipDNSCache, Value: skipDNSCache, Category: "Load", Name: "skip-dns-cache", EnvVars: []string{ "SKIP_DNS_CACHE", }, Usage: "Skip DNS cache, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is caching the DNS lookup result.", }, &cli.BoolFlag{ Destination: &skipRangDownloadDetect, Value: skipRangDownloadDetect, Category: "Load", Name: "skip-range-download-detect", EnvVars: []string{ "SKIP_RANGE_DOWNLOAD_DETECT", }, Usage: "Skip range download detect, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is detecting the range download support.", }, &cli.DurationFlag{ Destination: &cacheExpiration, Value: cacheExpiration, Category: "Load", Name: "cache-expiration", Usage: "Specify the expiration of cache, " + "works with \"--url/--hf-*/--ms-*/--ol-*\".", }, &cli.StringFlag{ Destination: &cachePath, Value: cachePath, Category: "Load", Name: "cache-path", Usage: "Cache the read result to the path, " + "works with \"--url/--hf-*/--ms-*/--ol-*\".", }, &cli.BoolFlag{ Destination: &skipCache, Value: skipCache, Category: "Load", Name: "skip-cache", EnvVars: []string{ "SKIP_CACHE", }, Usage: "Skip cache, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is caching the read result.", }, &cli.IntFlag{ Destination: ¶llelSize, Value: parallelSize, Category: "Estimate", Name: "parallel-size", Aliases: []string{ // LLaMACpp compatibility "parallel", "np", "threads-http", // LLaMABox v0.0.140+ compatibility }, Usage: "Specify the number of parallel sequences to decode, " + "which is used to estimate the usage.", }, &cli.BoolFlag{ Destination: &flashAttention, Value: flashAttention, Category: "Estimate", Name: "flash-attention", Aliases: []string{ "flash-attn", "fa", "diffusion-fa", // StableDiffusionCpp compatibility }, Usage: "Specify enabling Flash Attention, " + "which is used to estimate the usage. " + "Flash Attention can reduce the usage of RAM/VRAM.", }, &cli.BoolFlag{ // LLaMABox compatibility Category: "Estimate", Name: "no-flash-attention", Aliases: []string{ "no-flash-attn", }, Usage: "Specify disabling Flash Attention.", Action: func(context *cli.Context, b bool) error { flashAttention = !b return nil }, }, &cli.UintFlag{ Destination: &mainGPU, Value: mainGPU, Category: "Estimate", Name: "main-gpu", Aliases: []string{ // LLaMACpp compatibility "mg", }, Usage: "Specify the GPU to use for the model (with \"--split-mode=none\") " + "or for intermediate results and KV (with \"--split-mode=row\"), " + "which is used to estimate the usage. 
" + "Since gguf-parser cannot recognize the host GPU devices or RPC servers, " + "\"--main-gpu\" only works when \"--tensor-split\" is set.", }, &cli.StringFlag{ Destination: &rpcServers, Value: rpcServers, Category: "Estimate", Name: "rpc", Usage: "Specify the RPC servers, " + "which is used to estimate the usage, " + "it is a comma-separated list of host:port. " + "Woks with \"--tensor-split\".", }, &cli.StringFlag{ Destination: &tensorSplit, Value: tensorSplit, Category: "Estimate", Name: "tensor-split", Aliases: []string{ // LLaMACpp compatibility "ts", }, Usage: "Specify the fraction of the model to offload to each device, " + "which is used to estimate the usage, " + "it is a comma-separated list of integer. " + "Since gguf-parser cannot recognize the host GPU devices or RPC servers, " + "must explicitly set \"--tensor-split\" to indicate how many devices are used. " + "To declare the devices belong to RPC servers, set \"--rpc\" please.", }, &cli.IntFlag{ Destination: &offloadLayers, Value: offloadLayers, Category: "Estimate", Name: "gpu-layers", Aliases: []string{ // LLaMACpp compatibility "ngl", "n-gpu-layers", }, Usage: "Specify how many layers of the main model to offload, " + "which is used to estimate the usage, " + "default is full offloaded.", }, &cli.StringSliceFlag{ Destination: &overrideTensors, Category: "Estimate", Name: "override-tensor", Aliases: []string{ // LLaMACpp compatibility "ot", }, Usage: "Override tensor buffer type, " + "for example, use --override-tensor \"[2-9][0-9]\\.ffn_.*_exps\\.=CPU\" to keep experts of layers 20-99 in the CPU", }, &cli.StringSliceFlag{ Destination: &deviceMetrics, Category: "Estimate", Name: "device-metric", Usage: "Specify the device metrics, " + "which is used to estimate the throughput, in form of \"FLOPS;Up Bandwidth[;Down Bandwidth]\". " + "The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. " + "The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. " + "Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, " + "and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. " + "For example, \"--device-metric 10TFLOPS;400GBps\" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, " + "\"--device-metric 10TFLOPS;400GBps;5000MBps\" means the device has 5000MBps Down bandwidth. " + "If the quantity specified by \"--device-metric\" is less than the number of estimation devices(" + "determined by \"--tensor-split\" and \"--rpc\" to infer the device count), " + "then replicate the last \"--device-metric\" to meet the required number of evaluation devices.", }, &cli.StringFlag{ Destination: &platformFootprint, Value: platformFootprint, Category: "Estimate", Name: "platform-footprint", Usage: "Specify the platform footprint(RAM,VRAM) of running host in MiB, " + "which is used to estimate the NonUMA usage, " + "default is \"150,250\". 
" + "Different platform always gets different RAM and VRAM footprints, " + "for example, within CUDA, \"cudaMemGetInfo\" or \"cudaSetDevice\" would occupy some RAM and VRAM, " + "see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.", }, &cli.IntFlag{ Destination: &lmcCtxSize, Value: lmcCtxSize, Category: "Estimate/LLaMACpp", Name: "ctx-size", Aliases: []string{ // LLaMACpp compatibility "c", }, Usage: "Specify the size of prompt context, " + "which is used to estimate the usage, " + "default is equal to the model's maximum context size.", }, &cli.StringFlag{ Destination: &lmcRoPEScalingType, Category: "Estimate/LLaMACpp", Name: "rope-scaling", Usage: "RoPE frequency scaling method, " + "defaults to linear unless specified by the model, select from [none, linear, yarn].", }, &cli.Float64Flag{ Category: "Estimate/LLaMACpp", Name: "rope-scale", Usage: "RoPE context scaling factor, " + "expands context by a factor of N.", Action: func(context *cli.Context, f float64) error { if f != 0 { lmcRoPEFreqScale = 1 / f } return nil }, }, &cli.Float64Flag{ Destination: &lmcRoPEFreqBase, Category: "Estimate/LLaMACpp", Name: "rope-freq-base", Usage: "RoPE base frequency, " + "used by NTK-aware scaling.", }, &cli.Float64Flag{ Destination: &lmcRoPEFreqScale, Category: "Estimate/LLaMACpp", Name: "rope-freq-scale", Usage: "RoPE frequency scaling factor, " + "expands context by a factor of 1/N.", }, &cli.IntFlag{ Destination: &lmcRoPEScalingOrigCtxSize, Category: "Estimate/LLaMACpp", Name: "yarn-orig-ctx", Usage: "YaRN original context size of model, " + "defaults to model training context size.", }, &cli.BoolFlag{ Destination: &lmcInMaxCtxSize, Value: lmcInMaxCtxSize, Category: "Estimate/LLaMACpp", Name: "in-max-ctx-size", Usage: "Limit the context size to the maximum context size of the model, " + "if the context size is larger than the maximum context size.", }, &cli.IntFlag{ Destination: &lmcLogicalBatchSize, Value: lmcLogicalBatchSize, Category: "Estimate/LLaMACpp", Name: "batch-size", Aliases: []string{ // LLaMACpp compatibility "b", }, Usage: "Specify the logical batch size, " + "which is used to estimate the usage.", }, &cli.IntFlag{ Destination: &lmcPhysicalBatchSize, Value: lmcPhysicalBatchSize, Category: "Estimate/LLaMACpp", Name: "ubatch-size", Aliases: []string{ // LLaMACpp compatibility "ub", }, Usage: "Specify the physical maximum batch size, " + "which is used to estimate the usage.", }, &cli.StringFlag{ Destination: &lmcCacheKeyType, Value: lmcCacheKeyType, Category: "Estimate/LLaMACpp", Name: "cache-type-k", Aliases: []string{ // LLaMACpp compatibility "ctk", }, Usage: "Specify the type of Key cache, " + "which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].", }, &cli.StringFlag{ Destination: &lmcCacheValueType, Value: lmcCacheValueType, Category: "Estimate/LLaMACpp", Name: "cache-type-v", Aliases: []string{ // LLaMACpp compatibility "ctv", }, Usage: "Specify the type of Value cache, " + "which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].", }, &cli.BoolFlag{ Destination: &lmcNoKVOffload, Value: lmcNoKVOffload, Category: "Estimate/LLaMACpp", Name: "no-kv-offload", Aliases: []string{ // LLaMACpp compatibility "nkvo", }, Usage: "Specify disabling Key-Value offloading, " + "which is used to estimate the usage. 
" + "Disable Key-Value offloading can reduce the usage of VRAM.", }, &cli.StringFlag{ Destination: &lmcSplitMode, Value: lmcSplitMode, Category: "Estimate/LLaMACpp", Name: "split-mode", Aliases: []string{ // LLaMACpp compatibility "sm", }, Usage: "Specify how to split the model across multiple devices, " + "which is used to estimate the usage, select from [layer, row, none]. " + "Since gguf-parser always estimates the usage of VRAM, " + "\"none\" is meaningless here, keep for compatibility.", }, &cli.BoolFlag{ Destination: &lmcSWAFull, Value: lmcSWAFull, Category: "Estimate/LLaMACpp", Name: "swa-full", Usage: "Specify using full-size SWA cache.", }, &cli.BoolFlag{ Destination: &lmcNoMMap, Value: lmcNoMMap, Category: "Estimate/LLaMACpp", Name: "no-mmap", Usage: "Specify disabling Memory-Mapped using, " + "which is used to estimate the usage. " + "Memory-Mapped can avoid loading the entire model weights into RAM.", }, &cli.BoolFlag{ // LLaMABox compatibility Category: "Estimate/LLaMACpp", Name: "mmap", Usage: "Specify enabling Memory-Mapped using, " + "which is used to estimate the usage. " + "Memory-Mapped can avoid loading the entire model weights into RAM.", Action: func(context *cli.Context, b bool) error { lmcNoMMap = !b return nil }, }, &cli.UintFlag{ // LLaMABox compatibility Destination: &lmcVisualMaxImageSize, Value: lmcVisualMaxImageSize, Category: "Estimate/LLaMACpp", Name: "visual-max-image-size", Usage: "Specify maximum image size when completion with vision model.", }, &cli.UintFlag{ // LLaMABox compatibility Destination: &lmcMaxProjectedCache, Value: lmcMaxProjectedCache, Category: "Estimate/LLaMACpp", Name: "max-projected-cache", Aliases: []string{ "visual-max-image-cache", // Deprecated argument name }, Usage: "Specify how many projected embedding to be cached.", }, &cli.IntFlag{ Destination: &lmcOffloadLayersDraft, Value: lmcOffloadLayersDraft, Category: "Estimate/LLaMACpp", Name: "gpu-layers-draft", Aliases: []string{ // LLaMACpp compatibility "ngld", "n-gpu-layers-draft", }, Usage: "Specify how many layers of the draft model to offload, " + "which is used to estimate the usage, " + "default is full offloaded.", }, &cli.Uint64Flag{ Destination: &lmcOffloadLayersStep, Value: lmcOffloadLayersStep, Category: "Estimate/LLaMACpp", Name: "gpu-layers-step", Usage: "Specify the step of layers to offload, " + "works with \"--gpu-layers\".", }, &cli.UintFlag{ Destination: &sdcBatchCount, Value: sdcBatchCount, Category: "Estimate/StableDiffusionCpp", Name: "image-batch-count", Aliases: []string{ "batch-count", // StableDiffusionCpp compatibility "image-max-batch", // LLaMABox compatibility }, Usage: "Specify the batch(generation) count of the image.", }, &cli.UintFlag{ Destination: &sdcHeight, Value: sdcHeight, Category: "Estimate/StableDiffusionCpp", Name: "image-height", Aliases: []string{ "height", // StableDiffusionCpp compatibility "image-max-height", // LLaMABox compatibility }, Usage: "Specify the (maximum) height of the image.", }, &cli.UintFlag{ Destination: &sdcWidth, Value: sdcWidth, Category: "Estimate/StableDiffusionCpp", Name: "image-width", Aliases: []string{ "width", // StableDiffusionCpp compatibility "image-max-width", // LLaMABox compatibility }, Usage: "Specify the (maximum) width of the image.", }, &cli.BoolFlag{ Destination: &sdcNoConditionerOffload, Value: sdcNoConditionerOffload, Category: "Estimate/StableDiffusionCpp", Name: "image-no-conditioner-offload", Aliases: []string{ "clip-on-cpu", // StableDiffusionCpp compatibility 
"image-no-text-encoder-model-offload", // LLaMABox compatibility }, Usage: "Specify to offload the text encoder model to CPU.", }, &cli.BoolFlag{ Destination: &sdcNoAutoencoderOffload, Value: sdcNoAutoencoderOffload, Category: "Estimate/StableDiffusionCpp", Name: "image-no-autoencoder-offload", Aliases: []string{ "vae-on-cpu", // StableDiffusionCpp compatibility "image-no-vae-model-offload", // LLaMABox compatibility }, Usage: "Specify to offload the vae model to CPU.", }, &cli.BoolFlag{ Destination: &sdcNoControlNetOffload, Value: sdcNoControlNetOffload, Category: "Estimate/StableDiffusionCpp", Name: "image-no-control-net-offload", Aliases: []string{ "control-net-cpu", // StableDiffusionCpp compatibility "image-no-control-net-model-offload", // LLaMABox compatibility }, Usage: "Specify to offload the control net model to CPU.", }, &cli.BoolFlag{ Destination: &sdcAutoencoderTiling, Value: sdcAutoencoderTiling, Category: "Estimate/StableDiffusionCpp", Name: "image-autoencoder-tiling", Aliases: []string{ "vae-tiling", // StableDiffusionCpp compatibility "image-vae-tiling", // LLaMABox compatibility }, Usage: "Specify to enable tiling for the vae model.", }, &cli.BoolFlag{ Destination: &sdcNoAutoencoderTiling, Value: sdcNoAutoencoderTiling, Category: "Estimate/StableDiffusionCpp", Name: "image-no-autoencoder-tiling", Aliases: []string{ "image-no-vae-tiling", // LLaMABox compatibility }, Usage: "Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling.", }, &cli.BoolFlag{ Destination: &sdcFreeComputeMemoryImmediately, Value: sdcFreeComputeMemoryImmediately, Category: "Estimate/StableDiffusionCpp", Name: "image-free-compute-memory-immediately", // LLaMABox compatibility Usage: "Specify to free the compute memory immediately after the generation, which burst using VRAM.", }, &cli.BoolFlag{ Destination: &raw, Value: raw, Category: "Output", Name: "raw", Usage: "Output the GGUF file information as JSON only, skip anything.", }, &cli.StringFlag{ Destination: &rawOutput, Value: rawOutput, Category: "Output", Name: "raw-output", Usage: "Works with \"--raw\", to save the result to the file", }, &cli.BoolFlag{ Destination: &skipMetadata, Value: skipMetadata, Category: "Output", Name: "skip-metadata", Usage: "Skip to display metadata.", }, &cli.BoolFlag{ Destination: &skipArchitecture, Value: skipArchitecture, Category: "Output", Name: "skip-architecture", Usage: "Skip to display architecture.", }, &cli.BoolFlag{ Destination: &skipTokenizer, Value: skipTokenizer, Category: "Output", Name: "skip-tokenizer", Usage: "Skip to display tokenizer. " + "By default, gguf-parser always displays the tokenizer of the file which types with \"model\".", }, &cli.BoolFlag{ Destination: &skipEstimate, Value: skipEstimate, Category: "Output", Name: "skip-estimate", Usage: "Skip to estimate. 
" + "By default, gguf-parser always estimates the file which types with \"model\".", }, &cli.BoolFlag{ Category: "Output", Name: "estimate", Usage: "Skip all the information except the estimate result.", Action: func(_ *cli.Context, estimateOnly bool) error { if estimateOnly { skipMetadata = true skipArchitecture = true skipTokenizer = true } return nil }, }, &cli.BoolFlag{ Destination: &inShort, Value: inShort, Category: "Output", Name: "in-short", Usage: "Display the estimated result in table in short form.", }, &cli.BoolFlag{ Destination: &inMib, Value: inMib, Category: "Output", Name: "in-mib", Usage: "Display the estimated result in table with MiB.", }, &cli.BoolFlag{ Destination: &inJson, Value: inJson, Category: "Output", Name: "json", Usage: "Output as JSON.", }, &cli.BoolFlag{ Destination: &inPrettyJson, Value: inPrettyJson, Category: "Output", Name: "json-pretty", Usage: "Works with \"--json\", to output pretty format JSON.", }, }, Action: mainAction, } if err := app.RunContext(signalx.Handler(), os.Args); err != nil { _, _ = fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } } var ( // model options path string draftPath string // for estimate mmprojPath string // for estimate loraPaths cli.StringSlice // for estimate controlVectorPaths cli.StringSlice // for estimate upscalePath string // for estimate controlNetPath string // for estimate url string draftUrl string // for estimate mmprojUrl string // for estimate loraUrls cli.StringSlice // for estimate controlVectorUrls cli.StringSlice // for estimate upscaleUrl string // for estimate controlNetUrl string // for estimate token string headers cli.StringSlice hfRepo string hfFile string hfDraftRepo string // for estimate hfDraftFile string // for estimate hfMMProjFile string // for estimate hfLoRAFiles cli.StringSlice // for estimate hfControlVectorFiles cli.StringSlice // for estimate hfUpscaleRepo string // for estimate hfUpscaleFile string // for estimate hfControlNetRepo string // for estimate hfControlNetFile string // for estimate hfToken string msRepo string msFile string msDraftRepo string // for estimate msDraftFile string // for estimate msMMProjFile string // for estimate msLoRAFiles cli.StringSlice // for estimate msControlVectorFiles cli.StringSlice // for estimate msUpscaleRepo string // for estimate msUpscaleFile string // for estimate msControlNetRepo string // for estimate msControlNetFile string // for estimate msToken string olBaseURL = "https://registry.ollama.ai" olModel string olUsage bool // load options debug bool skipProxy bool skipTLSVerify bool skipDNSCache bool skipRangDownloadDetect bool cacheExpiration = 24 * time.Hour cachePath = DefaultCachePath() skipCache bool // estimate options parallelSize = 1 flashAttention bool mainGPU uint rpcServers string tensorSplit string offloadLayers = -1 overrideTensors cli.StringSlice deviceMetrics cli.StringSlice platformFootprint = "150,250" // estimate options for llama.cpp lmcCtxSize = 0 lmcRoPEFreqBase float64 lmcRoPEFreqScale float64 lmcRoPEScalingType string lmcRoPEScalingOrigCtxSize int lmcInMaxCtxSize bool lmcLogicalBatchSize = 2048 lmcPhysicalBatchSize = 512 lmcCacheKeyType = "f16" lmcCacheValueType = "f16" lmcNoKVOffload bool lmcSplitMode = "layer" lmcSWAFull = false lmcNoMMap bool lmcVisualMaxImageSize uint lmcMaxProjectedCache uint lmcOffloadLayersDraft = -1 lmcOffloadLayersStep uint64 // estimate options for stable-diffusion.cpp sdcBatchCount uint = 1 sdcHeight uint = 1024 sdcWidth uint = 1024 sdcNoConditionerOffload bool sdcNoAutoencoderOffload bool 
sdcNoControlNetOffload bool sdcAutoencoderTiling bool sdcNoAutoencoderTiling bool sdcFreeComputeMemoryImmediately bool // output options raw bool rawOutput string inShort bool skipMetadata bool skipArchitecture bool skipTokenizer bool skipEstimate bool inMib bool inJson bool inPrettyJson = true ) func mainAction(c *cli.Context) error { ctx := c.Context // Prepare options. ropts := []GGUFReadOption{ SkipLargeMetadata(), UseMMap(), UseCache(), } if hs := headers.Value(); len(hs) > 0 { hm := make(map[string]string, len(hs)) for _, h := range hs { parts := strings.SplitN(h, ":", 2) if len(parts) == 2 { hm[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1]) } } if len(hm) > 0 { ropts = append(ropts, UseHeaders(hm)) } } if token != "" { ropts = append(ropts, UseBearerAuth(token)) } if debug { ropts = append(ropts, UseDebug()) } if skipProxy { ropts = append(ropts, SkipProxy()) } if skipTLSVerify { ropts = append(ropts, SkipTLSVerification()) } if skipDNSCache { ropts = append(ropts, SkipDNSCache()) } if skipRangDownloadDetect { ropts = append(ropts, SkipRangeDownloadDetection()) } if cacheExpiration >= 0 { ropts = append(ropts, UseCacheExpiration(cacheExpiration)) } if cachePath != "" { ropts = append(ropts, UseCachePath(cachePath)) } if skipCache { ropts = append(ropts, SkipCache()) } eopts := []GGUFRunEstimateOption{ WithLLaMACppCacheValueType(GGMLTypeF16), WithLLaMACppCacheKeyType(GGMLTypeF16), } if parallelSize > 0 { eopts = append(eopts, WithParallelSize(int32(parallelSize))) } if flashAttention { eopts = append(eopts, WithFlashAttention()) } if tensorSplit != "" { tss := strings.Split(tensorSplit, ",") if len(tss) > 128 { return errors.New("--tensor-split exceeds the number of devices") } var vs float64 vv := make([]float64, len(tss)) vf := make([]float64, len(tss)) for i, s := range tss { s = strings.TrimSpace(s) v, err := strconv.ParseFloat(s, 64) if err != nil { return errors.New("--tensor-split has invalid integer") } vs += v vv[i] = vs } for i, v := range vv { vf[i] = v / vs } eopts = append(eopts, WithTensorSplitFraction(vf)) if mainGPU < uint(len(vv)) { eopts = append(eopts, WithMainGPUIndex(int(mainGPU))) } else { return errors.New("--main-gpu must be less than item size of --tensor-split") } if rpcServers != "" { rss := strings.Split(rpcServers, ",") if len(rss) > len(tss) { return errors.New("--rpc has more items than --tensor-split") } rpc := make([]string, len(rss)) for i, s := range rss { s = strings.TrimSpace(s) if _, _, err := net.SplitHostPort(s); err != nil { return errors.New("--rpc has invalid host:port") } rpc[i] = s } eopts = append(eopts, WithRPCServers(rpc)) } } if otss := overrideTensors.Value(); len(otss) > 0 { var ots []GGUFRunOverriddenTensor for i := range otss { pots := strings.Split(otss[i], ",") for j := range pots { ss := strings.SplitN(strings.TrimSpace(pots[j]), "=", 2) if len(ss) != 2 { return errors.New("--override-tensor has invalid format") } pr, err := regexp.Compile(strings.TrimSpace(ss[0])) if err != nil { return fmt.Errorf("--override-tensor has invalid pattern: %w", err) } bt := strings.TrimSpace(ss[1]) if bt == "" { return errors.New("--override-tensor has empty buffer type") } ots = append(ots, GGUFRunOverriddenTensor{ PatternRegex: pr, BufferType: bt, }) } } eopts = append(eopts, WithOverriddenTensors(ots)) } if dmss := deviceMetrics.Value(); len(dmss) > 0 { dms := make([]GGUFRunDeviceMetric, len(dmss)) for i := range dmss { ss := strings.Split(dmss[i], ";") if len(ss) < 2 { return errors.New("--device-metric has invalid format") } var 
err error dms[i].FLOPS, err = ParseFLOPSScalar(strings.TrimSpace(ss[0])) if err != nil { return fmt.Errorf("--device-metric has invalid FLOPS: %w", err) } dms[i].UpBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[1])) if err != nil { return fmt.Errorf("--device-metric has invalid Up Bandwidth: %w", err) } if len(ss) > 2 { dms[i].DownBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[2])) if err != nil { return fmt.Errorf("--device-metric has invalid Down Bandwidth: %w", err) } } else { dms[i].DownBandwidth = dms[i].UpBandwidth } } eopts = append(eopts, WithDeviceMetrics(dms)) } if lmcCtxSize > 0 { eopts = append(eopts, WithLLaMACppContextSize(int32(lmcCtxSize))) } if lmcRoPEFreqBase > 0 || lmcRoPEFreqScale > 0 || lmcRoPEScalingType != "" || lmcRoPEScalingOrigCtxSize > 0 { eopts = append(eopts, WithLLaMACppRoPE(lmcRoPEFreqBase, lmcRoPEFreqScale, lmcRoPEScalingType, int32(lmcRoPEScalingOrigCtxSize))) } if lmcInMaxCtxSize { eopts = append(eopts, WithinLLaMACppMaxContextSize()) } if lmcLogicalBatchSize > 0 { eopts = append(eopts, WithLLaMACppLogicalBatchSize(int32(max(32, lmcLogicalBatchSize)))) } if lmcPhysicalBatchSize > 0 { if lmcPhysicalBatchSize > lmcLogicalBatchSize { return errors.New("--ubatch-size must be less than or equal to --batch-size") } eopts = append(eopts, WithLLaMACppPhysicalBatchSize(int32(lmcPhysicalBatchSize))) } if lmcCacheKeyType != "" { eopts = append(eopts, WithLLaMACppCacheKeyType(toGGMLType(lmcCacheKeyType))) } if lmcCacheValueType != "" { eopts = append(eopts, WithLLaMACppCacheValueType(toGGMLType(lmcCacheValueType))) } if lmcNoKVOffload { eopts = append(eopts, WithoutLLaMACppOffloadKVCache()) } switch lmcSplitMode { case "row": eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeRow)) case "none": eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeNone)) default: eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeLayer)) } if lmcSWAFull { eopts = append(eopts, WithLLaMACppFullSizeSWACache()) } if lmcVisualMaxImageSize > 0 { eopts = append(eopts, WithLLaMACppVisualMaxImageSize(uint32(lmcVisualMaxImageSize))) } if lmcMaxProjectedCache > 0 { eopts = append(eopts, WithLLaMACppMaxProjectedCache(uint32(lmcMaxProjectedCache))) } if sdcBatchCount > 1 { eopts = append(eopts, WithStableDiffusionCppBatchCount(int32(sdcBatchCount))) } if sdcHeight > 0 { eopts = append(eopts, WithStableDiffusionCppHeight(uint32(sdcHeight))) } if sdcWidth > 0 { eopts = append(eopts, WithStableDiffusionCppWidth(uint32(sdcWidth))) } if sdcNoConditionerOffload { eopts = append(eopts, WithoutStableDiffusionCppOffloadConditioner()) } if sdcNoAutoencoderOffload { eopts = append(eopts, WithoutStableDiffusionCppOffloadAutoencoder()) } if sdcAutoencoderTiling && !sdcNoAutoencoderTiling { eopts = append(eopts, WithStableDiffusionCppAutoencoderTiling()) } if sdcFreeComputeMemoryImmediately { eopts = append(eopts, WithStableDiffusionCppFreeComputeMemoryImmediately()) } if offloadLayers >= 0 { eopts = append(eopts, WithLLaMACppOffloadLayers(uint64(offloadLayers)), WithStableDiffusionCppOffloadLayers(uint64(offloadLayers))) } // Parse GGUF file. var ( // Common. gf *GGUFFile adapterGfs []*GGUFFile // LLaMACpp specific. lmcProjectGf *GGUFFile lmcDrafterGf *GGUFFile // StableDiffusionCpp specific. sdcControlNetGf *GGUFFile sdcUpscaleGf *GGUFFile ) { var err error ropts := ropts[:len(ropts):len(ropts)] // Main model. switch { default: return errors.New("no model specified") case path != "": gf, err = ParseGGUFFile(path, ropts...) 
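// NOTE: the remote sources below are read with HTTP range requests where the
// server supports them (see "--skip-range-download-detect"), so only the GGUF
// header and metadata are fetched, never the full weights.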
case url != "": gf, err = ParseGGUFFileRemote(ctx, url, ropts...) case hfRepo != "" && hfFile != "": if hfToken != "" { ropts = append(ropts, UseBearerAuth(hfToken)) } gf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...) case msRepo != "" && msFile != "": if msToken != "" { ropts = append(ropts, UseBearerAuth(msToken)) } gf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msFile, ropts...) case olModel != "": om := ParseOllamaModel(olModel, SetOllamaModelBaseURL(olBaseURL)) gf, err = ParseGGUFFileFromOllamaModel(ctx, om, ropts...) if err == nil && om != nil && olUsage { // Parameters override. { ps, _ := om.Params(ctx, nil) if v, ok := ps["num_ctx"]; ok { eopts = append(eopts, WithLLaMACppContextSize(anyx.Number[int32](v))) } else if lmcCtxSize <= 0 { eopts = append(eopts, WithLLaMACppContextSize(2048)) } if v, ok := ps["use_mmap"]; ok && !anyx.Bool(v) { lmcNoMMap = true } if v, ok := ps["num_gpu"]; ok { offloadLayers = anyx.Number[int](v) } } // Multimodal projector overlap. { mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.projector$`)) if len(mls) > 0 { lmcProjectGf, err = ParseGGUFFileRemote(ctx, mls[len(mls)-1].BlobURL().String(), ropts...) if err != nil { return fmt.Errorf("failed to parse GGUF file: %w", err) } } } // Adapter overlap. { als := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.adapter$`)) if len(als) > 0 { var adpgf *GGUFFile for i := range als { adpgf, err = ParseGGUFFileRemote(ctx, als[i].BlobURL().String(), ropts...) if err != nil { return fmt.Errorf("failed to parse GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } } } } if err != nil { return fmt.Errorf("failed to parse GGUF file: %w", err) } // Adapter. { // LoRA. for _, loraPath := range loraPaths.Value() { adpgf, err := ParseGGUFFile(loraPath, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } for _, loraUrl := range loraUrls.Value() { adpgf, err := ParseGGUFFileRemote(ctx, loraUrl, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } if hfRepo != "" { for _, hfLoRAFile := range hfLoRAFiles.Value() { adpgf, err := ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfLoRAFile, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } if msRepo != "" { for _, msLoRAFile := range msLoRAFiles.Value() { adpgf, err := ParseGGUFFileFromModelScope(ctx, msRepo, msLoRAFile, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } // Control Vector. for _, cvPath := range controlVectorPaths.Value() { adpgf, err := ParseGGUFFile(cvPath, ropts...) if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } for _, cvUrl := range controlVectorUrls.Value() { adpgf, err := ParseGGUFFileRemote(ctx, cvUrl, ropts...) if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } if hfRepo != "" { for _, hfCvFile := range hfControlVectorFiles.Value() { adpgf, err := ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfCvFile, ropts...) 
if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } if msRepo != "" { for _, msCvFile := range msControlVectorFiles.Value() { adpgf, err := ParseGGUFFileFromModelScope(ctx, msRepo, msCvFile, ropts...) if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } } // Drafter for LLaMACpp. switch { case draftPath != "": lmcDrafterGf, err = ParseGGUFFile(draftPath, ropts...) case draftUrl != "": lmcDrafterGf, err = ParseGGUFFileRemote(ctx, draftUrl, ropts...) case hfDraftRepo != "" && hfDraftFile != "": lmcDrafterGf, err = ParseGGUFFileFromHuggingFace(ctx, hfDraftRepo, hfDraftFile, ropts...) case msDraftRepo != "" && msDraftFile != "": lmcDrafterGf, err = ParseGGUFFileFromModelScope(ctx, msDraftRepo, msDraftFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse draft GGUF file: %w", err) } // Projector for LLaMACpp. switch { case mmprojPath != "": lmcProjectGf, err = ParseGGUFFile(mmprojPath, ropts...) case mmprojUrl != "": lmcProjectGf, err = ParseGGUFFileRemote(ctx, mmprojUrl, ropts...) case hfRepo != "" && hfMMProjFile != "": lmcProjectGf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfMMProjFile, ropts...) case msRepo != "" && msMMProjFile != "": lmcProjectGf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msMMProjFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse multimodal projector GGUF file: %w", err) } // ControlNet for StableDiffusionCpp. switch { case controlNetPath != "": sdcControlNetGf, err = ParseGGUFFile(controlNetPath, ropts...) case controlNetUrl != "": sdcControlNetGf, err = ParseGGUFFileRemote(ctx, controlNetUrl, ropts...) case hfControlNetRepo != "" && hfControlNetFile != "": sdcControlNetGf, err = ParseGGUFFileFromHuggingFace(ctx, hfControlNetRepo, hfControlNetFile, ropts...) case msControlNetRepo != "" && msControlNetFile != "": sdcControlNetGf, err = ParseGGUFFileFromModelScope(ctx, msControlNetRepo, msControlNetFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse control net GGUF file: %w", err) } // Upscaler for StableDiffusionCpp. switch { case upscalePath != "": sdcUpscaleGf, err = ParseGGUFFile(upscalePath, ropts...) case upscaleUrl != "": sdcUpscaleGf, err = ParseGGUFFileRemote(ctx, upscaleUrl, ropts...) case hfUpscaleRepo != "" && hfUpscaleFile != "": sdcUpscaleGf, err = ParseGGUFFileFromHuggingFace(ctx, hfUpscaleRepo, hfUpscaleFile, ropts...) case msUpscaleRepo != "" && msUpscaleFile != "": sdcUpscaleGf, err = ParseGGUFFileFromModelScope(ctx, msUpscaleRepo, msUpscaleFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse upscaler GGUF file: %w", err) } } // Output raw. if raw { w := os.Stdout if rawOutput != "" { f, err := osx.CreateFile(rawOutput, 0o666) if err != nil { return fmt.Errorf("failed to create file: %w", err) } defer osx.Close(f) w = f } if err := json.NewEncoder(w).Encode(gf); err != nil { return fmt.Errorf("failed to encode JSON: %w", err) } return nil } // Otherwise, display the metadata and estimate the usage. 
var ( m = gf.Metadata() a = gf.Architecture() t = gf.Tokenizer() lme LLaMACppRunEstimate sde StableDiffusionCppRunEstimate ) skipArchitecture = skipArchitecture || m.Type == "imatrix" skipTokenizer = skipTokenizer || t.Model == "" skipEstimate = skipEstimate || m.Type != "model" if !skipEstimate && m.Architecture != "diffusion" { if lmcDrafterGf != nil { dlmceopts := eopts[:len(eopts):len(eopts)] if lmcOffloadLayersDraft >= 0 { dlmceopts = append(dlmceopts, WithLLaMACppOffloadLayers(uint64(lmcOffloadLayersDraft))) } dlmceopts = append(dlmceopts, WithLLaMACppCacheKeyType(GGMLTypeF16), WithLLaMACppCacheValueType(GGMLTypeF16)) de := lmcDrafterGf.EstimateLLaMACppRun(dlmceopts...) eopts = append(eopts, WithLLaMACppDrafter(&de)) } if lmcProjectGf != nil { plmceopts := eopts[:len(eopts):len(eopts)] me := lmcProjectGf.EstimateLLaMACppRun(plmceopts...) eopts = append(eopts, WithLLaMACppProjector(&me)) } if len(adapterGfs) > 0 { adps := make([]LLaMACppRunEstimate, len(adapterGfs)) almceopts := eopts[:len(eopts):len(eopts)] for i, adpgf := range adapterGfs { ae := adpgf.EstimateLLaMACppRun(almceopts...) adps[i] = ae } eopts = append(eopts, WithLLaMACppAdapters(adps)) } lme = gf.EstimateLLaMACppRun(eopts...) } if !skipEstimate && m.Architecture == "diffusion" { if sdcUpscaleGf != nil { sdceopts := eopts[:len(eopts):len(eopts)] ue := sdcUpscaleGf.EstimateStableDiffusionCppRun(sdceopts...) eopts = append(eopts, WithStableDiffusionCppUpscaler(&ue)) } if sdcControlNetGf != nil { sdceopts := eopts[:len(eopts):len(eopts)] if sdcNoControlNetOffload { sdceopts = append(sdceopts, WithStableDiffusionCppOffloadLayers(0)) } ce := sdcControlNetGf.EstimateStableDiffusionCppRun(sdceopts...) eopts = append(eopts, WithStableDiffusionCppControlNet(&ce)) } sde = gf.EstimateStableDiffusionCppRun(eopts...) } // Then, output as JSON or table. 
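// NOTE: "--platform-footprint" is a "RAM,VRAM" pair of MiB values; each part
// is converted to bytes below, and a part that fails to parse simply leaves
// the corresponding footprint at zero.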
var ( mmap = !lmcNoMMap platformRAM, platformVRAM uint64 ) { if platformFootprint != "" { parts := strings.Split(platformFootprint, ",") if len(parts) == 2 { if v, err := strconv.ParseUint(parts[0], 10, 64); err == nil { platformRAM = v * 1024 * 1024 } if v, err := strconv.ParseUint(parts[1], 10, 64); err == nil { platformVRAM = v * 1024 * 1024 } } } } if inJson { o := map[string]any{} if !skipMetadata { o["metadata"] = m } if !skipArchitecture { o["architecture"] = a } if !skipTokenizer { o["tokenizer"] = t } if !skipEstimate && m.Architecture != "diffusion" { lmes := lme.Summarize(mmap, platformRAM, platformVRAM) switch { case lmcOffloadLayersStep > lme.OffloadLayers: lmcOffloadLayersStep = lme.OffloadLayers case lmcOffloadLayersStep <= 0: lmcOffloadLayersStep = lme.OffloadLayers } if lmcOffloadLayersStep < lme.OffloadLayers { cnt := lme.OffloadLayers/lmcOffloadLayersStep + 1 if lme.OffloadLayers%lmcOffloadLayersStep != 0 || lme.FullOffloaded { cnt++ } esis := make([]LLaMACppRunEstimateSummaryItem, cnt) var wg sync.WaitGroup for i := 0; i < cap(esis); i++ { wg.Add(1) go func(i int) { defer wg.Done() lmeopts := eopts[:len(eopts):len(eopts)] lmeopts = append(lmeopts, WithLLaMACppOffloadLayers(uint64(i)*lmcOffloadLayersStep)) esis[i] = gf.EstimateLLaMACppRun(lmeopts...).SummarizeItem(mmap, platformRAM, platformVRAM) }(i) } wg.Wait() esis[cap(esis)-1] = lmes.Items[0] lmes.Items = esis } o["estimate"] = lmes } if !skipEstimate && m.Architecture == "diffusion" { sdes := sde.Summarize(mmap, platformRAM, platformVRAM) o["estimate"] = sdes } enc := json.NewEncoder(os.Stdout) if inPrettyJson { enc.SetIndent("", " ") } if err := enc.Encode(o); err != nil { return fmt.Errorf("failed to encode JSON: %w", err) } return nil } GGUFBytesScalarStringInMiBytes = inMib if !skipMetadata { tprint( "Metadata", [][]any{ { "Type", "Name", "Arch", "Quantization", "Little Endian", "Size", "Parameters", "BPW", }, }, [][]any{ { m.Type, sprintf(tenary(len(m.Name) == 0, "N/A", tenary(len([]rune(m.Name)) <= 20, m.Name, string([]rune(m.Name)[:20])+"..."))), m.Architecture, m.FileTypeDescriptor, sprintf(m.LittleEndian), sprintf(m.Size), sprintf(m.Parameters), sprintf(m.BitsPerWeight), }, }) } if !skipArchitecture { var ( hds [][]any bds [][]any ) switch a.Type { case "projector": hds = [][]any{ { "Projector Type", "Embedding Len", "Layers", "Feed Forward Len", "Encoder", }, } switch { case a.ClipHasVisionEncoder && a.ClipHasAudioEncoder: hds = [][]any{ { "Projector Type", "Embedding Len", "Embedding Len", "Layers", "Layers", "Feed Forward Len", "Feed Forward Len", "Encoder", }, { "Projector Type", "Vision", "Audio", "Vision", "Audio", "Vision", "Audio", "Encoder", }, } bds = [][]any{ { sprintf(a.ClipProjectorType), sprintf(a.ClipVisionEmbeddingLength), sprintf(a.ClipAudioEmbeddingLength), sprintf(a.ClipVisionBlockCount), sprintf(a.ClipAudioBlockCount), sprintf(tenary( a.ClipVisionFeedForwardLength[0] == a.ClipVisionFeedForwardLength[1], a.ClipVisionFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipVisionFeedForwardLength[0], a.ClipVisionFeedForwardLength[1]))), sprintf(tenary( a.ClipAudioFeedForwardLength[0] == a.ClipAudioFeedForwardLength[1], a.ClipAudioFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipAudioFeedForwardLength[0], a.ClipAudioFeedForwardLength[1]))), "Vision & Audio", }, } case a.ClipHasVisionEncoder: bds = [][]any{ { sprintf(a.ClipProjectorType), sprintf(a.ClipVisionEmbeddingLength), sprintf(a.ClipVisionBlockCount), sprintf(tenary( a.ClipVisionFeedForwardLength[0] == 
a.ClipVisionFeedForwardLength[1], a.ClipVisionFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipVisionFeedForwardLength[0], a.ClipVisionFeedForwardLength[1]))), "Vision", }, } default: bds = [][]any{ { sprintf(a.ClipProjectorType), sprintf(a.ClipAudioEmbeddingLength), sprintf(a.ClipAudioBlockCount), sprintf(tenary( a.ClipAudioFeedForwardLength[0] == a.ClipAudioFeedForwardLength[1], a.ClipAudioFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipAudioFeedForwardLength[0], a.ClipAudioFeedForwardLength[1]))), "Audio", }, } } case "adapter": hds = [][]any{ { "Adapter Type", }, } bds = [][]any{ { sprintf(a.AdapterType), }, } if a.AdapterType == "lora" { hds[0] = append(hds[0], "LoRA Alpha") bds[0] = append(bds[0], sprintf(a.AdapterLoRAAlpha)) } else { hds[0] = append(hds[0], "ControlVector Layers") bds[0] = append(bds[0], sprintf(a.AdapterControlVectorLayerCount)) } default: if a.Architecture == "diffusion" { hds = [][]any{ { "Diffusion Arch", "Conditioners", "Autoencoder", }, } bds = [][]any{ { sprintf(tenary(a.DiffusionArchitecture != "", a.DiffusionArchitecture, "N/A")), sprintf(tenary(a.DiffusionHasConditioners(), a.DiffusionConditioners, "N/A")), sprintf(tenary(a.DiffusionHasAutoencoder(), a.DiffusionAutoencoder, "N/A")), }, } } else { hds = [][]any{ { "Max Context Len", "Embedding Len", "Attention Causal", "Attention Head Cnt", "Layers", tenary(a.ExpertFeedForwardLength != 0, "Expert Feed Forward Len", "Feed Forward Len"), "Expert Cnt", "Vocabulary Len", }, } bds = [][]any{ { sprintf(a.MaximumContextLength), sprintf(a.EmbeddingLength), sprintf(a.AttentionCausal), sprintf(tenary( a.AttentionHeadCountKV == 0 || a.AttentionHeadCountKV == a.AttentionHeadCount, "N/A", a.AttentionHeadCount)), sprintf(a.BlockCount), sprintf(tenaryFunc( a.ExpertFeedForwardLength != 0, func() any { return a.ExpertFeedForwardLength }, func() any { switch { case len(a.FeedForwardLength) == 0: return "N/A" case len(a.FeedForwardLength) == 1: return a.FeedForwardLength[0] case a.FeedForwardLength[0] == a.FeedForwardLength[1]: return a.FeedForwardLength[0] default: return sprintf("[%d, %d, ...]", a.FeedForwardLength[0], a.FeedForwardLength[1]) } }, )), sprintf(a.ExpertCount), sprintf(a.VocabularyLength), }, } } } tprint( "ARCHITECTURE", hds, bds) } if !skipTokenizer { tprint( "TOKENIZER", [][]any{ { "Model", "Tokens Size", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "EOT Token", "EOM Token", "Unknown Token", "Separator Token", "Padding Token", }, }, [][]any{ { t.Model, sprintf(tenary(t.TokensSize <= 0, "N/A", GGUFBytesScalar(t.TokensSize))), sprintf(tenary(t.TokensLength <= 0, "N/A", t.TokensLength)), sprintf(tenary(t.AddedTokensLength <= 0, "N/A", t.AddedTokensLength)), sprintf(tenary(t.BOSTokenID < 0, "N/A", t.BOSTokenID)), sprintf(tenary(t.EOSTokenID < 0, "N/A", t.EOSTokenID)), sprintf(tenary(t.EOTTokenID < 0, "N/A", t.EOTTokenID)), sprintf(tenary(t.EOMTokenID < 0, "N/A", t.EOMTokenID)), sprintf(tenary(t.UnknownTokenID < 0, "N/A", t.UnknownTokenID)), sprintf(tenary(t.SeparatorTokenID < 0, "N/A", t.SeparatorTokenID)), sprintf(tenary(t.PaddingTokenID < 0, "N/A", t.PaddingTokenID)), }, }) } if !skipEstimate && m.Architecture != "diffusion" { hds := make([][]any, 2) lmes := lme.Summarize(mmap, platformRAM, platformVRAM) if !inShort { hds[0] = []any{ "Arch", "Context Size", "Batch Size (L / P)", "Flash Attention", "MMap Load", "Embedding Only", "Reranking", "Distributable", "Offload Layers", "Full Offloaded", } hds[1] = []any{ "Arch", "Context Size", "Batch Size (L / P)", "Flash Attention", "MMap 
Load", "Embedding Only", "Reranking", "Distributable", "Offload Layers", "Full Offloaded", } } if lmes.Items[0].MaximumTokensPerSecond != nil { hds[0] = append(hds[0], "Max TPS") hds[1] = append(hds[1], "Max TPS") } hds[0] = append(hds[0], "RAM", "RAM", "RAM") hds[1] = append(hds[1], "Layers (I + T + O)", "UMA", "NonUMA") for _, v := range lmes.Items[0].VRAMs { var hd string if v.Remote { hd = fmt.Sprintf("RPC %d (V)RAM", v.Position) } else { hd = fmt.Sprintf("VRAM %d", v.Position) } hds[0] = append(hds[0], hd, hd, hd) hds[1] = append(hds[1], "Layers (T + O)", "UMA", "NonUMA") } switch { case lmcOffloadLayersStep > lme.OffloadLayers: lmcOffloadLayersStep = lme.OffloadLayers case lmcOffloadLayersStep <= 0: lmcOffloadLayersStep = lme.OffloadLayers } if lmcOffloadLayersStep < lme.OffloadLayers { cnt := lme.OffloadLayers/lmcOffloadLayersStep + 1 if lme.OffloadLayers%lmcOffloadLayersStep != 0 || lme.FullOffloaded { cnt++ } esis := make([]LLaMACppRunEstimateSummaryItem, cnt) var wg sync.WaitGroup for i := 0; i < cap(esis); i++ { wg.Add(1) go func(i int) { defer wg.Done() lmeopts := eopts[:len(eopts):len(eopts)] lmeopts = append(lmeopts, WithLLaMACppOffloadLayers(uint64(i)*lmcOffloadLayersStep)) esis[i] = gf.EstimateLLaMACppRun(lmeopts...).SummarizeItem(mmap, platformRAM, platformVRAM) }(i) } wg.Wait() esis[cap(esis)-1] = lmes.Items[0] lmes.Items = esis } bds := make([][]any, len(lmes.Items)) for i := range lmes.Items { if !inShort { bds[i] = []any{ sprintf(tenary(lmes.Architecture != "", lmes.Architecture, "N/A")), sprintf(lmes.ContextSize), sprintf("%d / %d", lmes.LogicalBatchSize, lmes.PhysicalBatchSize), sprintf(tenary(flashAttention, tenary(lmes.FlashAttention, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(mmap, tenary(!lmes.NoMMap, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(lmes.EmbeddingOnly, "Yes", "No")), sprintf(tenary(lmes.Reranking, "Supported", "Unsupported")), sprintf(tenary(lmes.Architecture != "" && lmes.Distributable, "Supported", "Unsupported")), sprintf(tenary(lmes.Items[i].FullOffloaded, sprintf("%d (%d + 1)", lmes.Items[i].OffloadLayers, lmes.Items[i].OffloadLayers-1), lmes.Items[i].OffloadLayers)), sprintf(tenary(lmes.Items[i].FullOffloaded, "Yes", "No")), } } if lmes.Items[i].MaximumTokensPerSecond != nil { bds[i] = append(bds[i], sprintf(*lmes.Items[i].MaximumTokensPerSecond)) } bds[i] = append(bds[i], sprintf("1 + %d + %d", lmes.Items[i].RAM.HandleLayers, tenary(lmes.Items[i].RAM.HandleOutputLayer, 1, 0)), sprintf(lmes.Items[i].RAM.UMA), sprintf(lmes.Items[i].RAM.NonUMA)) for _, v := range lmes.Items[i].VRAMs { bds[i] = append(bds[i], sprintf("%d + %d", v.HandleLayers, tenary(v.HandleOutputLayer, 1, 0)), sprintf(v.UMA), sprintf(v.NonUMA)) } } tprint( "ESTIMATE", hds, bds) } if !skipEstimate && m.Architecture == "diffusion" { hds := make([][]any, 2) sdes := sde.Summarize(mmap, platformRAM, platformVRAM) if !inShort { hds[0] = []any{ "Arch", "Flash Attention", "MMap Load", "Distributable", "Full Offloaded", } hds[1] = []any{ "Arch", "Flash Attention", "MMap Load", "Distributable", "Full Offloaded", } } hds[0] = append(hds[0], "RAM", "RAM") hds[1] = append(hds[1], "UMA", "NonUMA") for _, v := range sdes.Items[0].VRAMs { var hd string if v.Remote { hd = fmt.Sprintf("RPC %d (V)RAM", v.Position) } else { hd = fmt.Sprintf("VRAM %d", v.Position) } hds[0] = append(hds[0], hd, hd) hds[1] = append(hds[1], "UMA", "NonUMA") } bds := make([][]any, len(sdes.Items)) for i := range sdes.Items { if !inShort { bds[i] = []any{ sprintf(tenary(sdes.Architecture != "", 
sdes.Architecture, "N/A")), sprintf(tenary(flashAttention, tenary(sdes.FlashAttention, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(mmap, tenary(!sdes.NoMMap, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(sdes.Architecture != "" && sdes.Distributable, "Supported", "Unsupported")), sprintf(tenary(sdes.Items[i].FullOffloaded, "Yes", "No")), } } bds[i] = append(bds[i], sprintf(sdes.Items[i].RAM.UMA), sprintf(sdes.Items[i].RAM.NonUMA)) for _, v := range sdes.Items[i].VRAMs { bds[i] = append(bds[i], sprintf(v.UMA), sprintf(v.NonUMA)) } } tprint( "ESTIMATE", hds, bds) } return nil } func sprintf(f any, a ...any) string { if v, ok := f.(string); ok { if len(a) != 0 { return fmt.Sprintf(v, a...) } return v } return anyx.String(f) } func tprint(title string, headers, bodies [][]any) { tw := table.NewWriter() tw.SetOutputMirror(os.Stdout) tw.SetTitle(strings.ToUpper(title)) for i := range headers { tw.AppendHeader(headers[i], table.RowConfig{AutoMerge: true, AutoMergeAlign: text.AlignCenter}) } for i := range bodies { tw.AppendRow(bodies[i]) } tw.SetColumnConfigs(func() (r []table.ColumnConfig) { r = make([]table.ColumnConfig, len(headers[0])) for i := range r { r[i].Number = i + 1 r[i].AutoMerge = true if len(headers) > 1 && (strings.HasPrefix(headers[1][i].(string), "Layers") || headers[1][i] == "UMA" || headers[1][i] == "NonUMA") { r[i].AutoMerge = false } r[i].Align = text.AlignCenter r[i].AlignHeader = text.AlignCenter } return r }()) tw.Style().Options.SeparateRows = true tw.Render() fmt.Println() } func tenary(c bool, t, f any) any { if c { return t } return f } func tenaryFunc(c bool, t, f func() any) any { if c { return t() } return f() } func toGGMLType(s string) GGMLType { t := GGMLTypeF16 switch s { case "f32": t = GGMLTypeF32 case "f16": t = GGMLTypeF16 case "q8_0": t = GGMLTypeQ8_0 case "q4_0": t = GGMLTypeQ4_0 case "q4_1": t = GGMLTypeQ4_1 case "iq4_nl": t = GGMLTypeIQ4_NL case "q5_0": t = GGMLTypeQ5_0 case "q5_1": t = GGMLTypeQ5_1 } return t } ================================================ FILE: file.go ================================================ package gguf_parser import ( "bytes" "encoding/binary" "errors" "fmt" "io" "regexp" "strings" "golang.org/x/exp/constraints" "github.com/gpustack/gguf-parser-go/util/anyx" "github.com/gpustack/gguf-parser-go/util/bytex" "github.com/gpustack/gguf-parser-go/util/funcx" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/stringx" ) // GGUFFile represents a GGUF file, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. // // Compared with the complete GGUF file, // this structure lacks the tensor data part. type GGUFFile struct { /* Basic */ // Header is the header of the GGUF file. Header GGUFHeader `json:"header"` // TensorInfos are the tensor infos of the GGUF file, // the size of TensorInfos is equal to `Header.TensorCount`. TensorInfos GGUFTensorInfos `json:"tensorInfos"` // Padding is the padding size of the GGUF file, // which is used to split Header and TensorInfos from tensor data. Padding int64 `json:"padding"` // SplitPaddings holds the padding size slice of the GGUF file splits, // each item represents splitting Header and TensorInfos from tensor data. // // The length of SplitPaddings is the number of split files. SplitPaddings []int64 `json:"splitPaddings,omitempty"` // TensorDataStartOffset is the offset in bytes of the tensor data in this file. // // The offset is the start of the file. 
TensorDataStartOffset int64 `json:"tensorDataStartOffset"` // SplitTensorDataStartOffsets holds the offset slice in bytes of the tensor data of the GGUF file splits, // each item represents the offset of the tensor data in the split file. // // The length of SplitTensorDataStartOffsets is the number of split files. SplitTensorDataStartOffsets []int64 `json:"splitTensorDataStartOffsets,omitempty"` /* Appendix */ // Size is the size of the GGUF file, // if the file is split, the size is the sum of all split files. Size GGUFBytesScalar `json:"size"` // SplitSizes holds the size slice of the GGUF file splits, // each item represents the size of the split file. // // The length of SplitSizes is the number of split files. SplitSizes []GGUFBytesScalar `json:"splitSizes,omitempty"` // ModelSize is the size of the model when loading. ModelSize GGUFBytesScalar `json:"modelSize"` // SplitModelSizes holds the size slice of the model, // each item represents a size when loading of the split file. // // The length of SplitModelSizes is the number of split files. SplitModelSizes []GGUFBytesScalar `json:"splitModelSizes,omitempty"` // ModelParameters is the number of the model parameters. ModelParameters GGUFParametersScalar `json:"modelParameters"` // ModelBitsPerWeight is the bits per weight of the model, // which describes how many bits are used to store a weight, // higher is better. ModelBitsPerWeight GGUFBitsPerWeightScalar `json:"modelBitsPerWeight"` } // GGUFMagic is a magic number of GGUF file, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#historical-state-of-affairs. type GGUFMagic uint32 // GGUFMagic constants. const ( GGUFMagicGGML GGUFMagic = 0x67676d6c GGUFMagicGGMF GGUFMagic = 0x67676d66 GGUFMagicGGJT GGUFMagic = 0x67676a74 GGUFMagicGGUFLe GGUFMagic = 0x46554747 // GGUF GGUFMagicGGUFBe GGUFMagic = 0x47475546 // GGUF ) // GGUFVersion is a version of GGUF file format, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#version-history. type GGUFVersion uint32 // GGUFVersion constants. const ( GGUFVersionV1 GGUFVersion = iota + 1 GGUFVersionV2 GGUFVersionV3 ) // GGUFHeader represents the header of a GGUF file. type GGUFHeader struct { // Magic is a magic number that announces that this is a GGUF file. Magic GGUFMagic `json:"magic"` // Version is a version of the GGUF file format. Version GGUFVersion `json:"version"` // TensorCount is the number of tensors in the file. TensorCount uint64 `json:"tensorCount"` // MetadataKVCount is the number of key-value pairs in the metadata. MetadataKVCount uint64 `json:"metadataKVCount"` // MetadataKV are the key-value pairs in the metadata, MetadataKV GGUFMetadataKVs `json:"metadataKV"` } // GGUFMetadataValueType is a type of GGUF metadata value, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. type GGUFMetadataValueType uint32 // GGUFMetadataValueType constants. const ( GGUFMetadataValueTypeUint8 GGUFMetadataValueType = iota GGUFMetadataValueTypeInt8 GGUFMetadataValueTypeUint16 GGUFMetadataValueTypeInt16 GGUFMetadataValueTypeUint32 GGUFMetadataValueTypeInt32 GGUFMetadataValueTypeFloat32 GGUFMetadataValueTypeBool GGUFMetadataValueTypeString GGUFMetadataValueTypeArray GGUFMetadataValueTypeUint64 GGUFMetadataValueTypeInt64 GGUFMetadataValueTypeFloat64 _GGUFMetadataValueTypeCount // Unknown ) // Types for GGUFMetadataKV. type ( // GGUFMetadataKV is a key-value pair in the metadata of a GGUF file. 
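// NOTE (editor): the two GGUF magic constants above are simply the ASCII bytes "GGUF" read as a
// uint32 in either byte order, which is how the parser detects endianness. A minimal sketch using
// only encoding/binary:
//
//	b := []byte("GGUF")
//	_ = binary.LittleEndian.Uint32(b) // 0x46554747 == GGUFMagicGGUFLe
//	_ = binary.BigEndian.Uint32(b)    // 0x47475546 == GGUFMagicGGUFBe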
GGUFMetadataKV struct { // Key is the key of the metadata key-value pair, // which is no larger than 64 bytes long. Key string `json:"key"` // ValueType is the type of the metadata value. ValueType GGUFMetadataValueType `json:"valueType"` // Value is the value of the metadata key-value pair. Value any `json:"value"` } // GGUFMetadataKVArrayValue is a value of a GGUFMetadataKV with type GGUFMetadataValueTypeArray. GGUFMetadataKVArrayValue struct { /* Basic */ // Type is the type of the array item. Type GGUFMetadataValueType `json:"type"` // Len is the length of the array. Len uint64 `json:"len"` // Array holds all array items. Array []any `json:"array,omitempty"` /* Appendix */ // StartOffset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file. // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` // Size is the size of the array in bytes. Size int64 `json:"size"` } // GGUFMetadataKVs is a list of GGUFMetadataKV. GGUFMetadataKVs []GGUFMetadataKV ) // Types for GGUFTensorInfo. type ( // GGUFTensorInfo represents a tensor info in a GGUF file. GGUFTensorInfo struct { /* Basic */ // Name is the name of the tensor, // which is no larger than 64 bytes long. Name string `json:"name"` // NDimensions is the number of dimensions of the tensor. NDimensions uint32 `json:"nDimensions"` // Dimensions is the dimensions of the tensor, // the length is NDimensions. Dimensions []uint64 `json:"dimensions"` // Type is the type of the tensor. Type GGMLType `json:"type"` // Offset is the offset in bytes of the tensor's data in this file. // // The offset is relative to tensor data, not to the start of the file. Offset uint64 `json:"offset"` /* Appendix */ // StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file. // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` } // GGUFTensorInfos is a list of GGUFTensorInfo. GGUFTensorInfos []GGUFTensorInfo ) var ErrGGUFFileInvalidFormat = errors.New("invalid GGUF format") // ParseGGUFFile parses a GGUF file from the local given path, // and returns the GGUFFile, or an error if any. 
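// NOTE (editor): a minimal usage sketch of ParseGGUFFile; the path is illustrative, shard siblings
// (e.g. "*-00001-of-00002.gguf") are completed automatically, and error handling is shortened:
//
//	gf, err := ParseGGUFFile("/path/to/model.gguf")
//	if err != nil {
//		panic(err)
//	}
//	fmt.Println(gf.Header.Version, gf.ModelParameters, gf.ModelBitsPerWeight)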
func ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, error) { var o _GGUFReadOptions for _, opt := range opts { opt(&o) } var paths []string { rs := CompleteShardGGUFFilename(path) if rs != nil { paths = rs } else { paths = []string{path} } } fs := make([]_GGUFFileReadSeeker, 0, len(paths)) defer func() { for i := range fs { osx.Close(fs[i]) } }() for i := range paths { if o.MMap { mf, err := osx.OpenMmapFile(paths[i]) if err != nil { return nil, fmt.Errorf("open mmap file: %w", err) } fs = append(fs, _GGUFFileReadSeeker{ Closer: mf, ReadSeeker: io.NewSectionReader(mf, 0, mf.Len()), Size: mf.Len(), }) continue } ff, err := osx.Open(paths[i]) if err != nil { return nil, fmt.Errorf("open file: %w", err) } fs = append(fs, _GGUFFileReadSeeker{ Closer: ff, ReadSeeker: ff, Size: funcx.MustNoError(ff.Stat()).Size(), }) } return parseGGUFFile(fs, o) } type _GGUFFileReadSeeker struct { io.Closer io.ReadSeeker Size int64 } func _validateCountWithRemaining(f _GGUFFileReadSeeker, count uint64, version GGUFVersion, what string) error { if count == 0 { return nil } var minItemSize int64 switch strings.ToLower(what) { case "metadatakvcount": if version <= GGUFVersionV1 { minItemSize = 12 // key length (uint32) + value type (uint32) + min value (string length uint32) } else { minItemSize = 20 // key length (uint64) + value type (uint32) + min value (string length uint64) } case "tensor": if version <= GGUFVersionV1 { minItemSize = 20 // name length (uint32) + n_dims (uint32) + type (uint32) + offset (uint64) } else { minItemSize = 24 // name length (uint64) + n_dims (uint32) + type (uint32) + offset (uint64) } } if minItemSize <= 0 { return fmt.Errorf("invalid min item size for %s: %d", what, minItemSize) } pos, err := f.Seek(0, io.SeekCurrent) if err != nil { return fmt.Errorf("seek %s count position: %w", what, err) } remaining := f.Size - pos if remaining < 0 { return fmt.Errorf("invalid file size: %d", f.Size) } maxCount := uint64(remaining / minItemSize) if maxCount < count { return fmt.Errorf("%s count too large for remaining bytes: %d", what, count) } return nil } func parseGGUFFile(fs []_GGUFFileReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, err error) { var gf GGUFFile for _, f := range fs { var bo binary.ByteOrder = binary.LittleEndian // magic var magic GGUFMagic if err = binary.Read(f, bo, &magic); err != nil { return nil, fmt.Errorf("read magic: %w", err) } switch magic { default: return nil, ErrGGUFFileInvalidFormat case GGUFMagicGGML, GGUFMagicGGMF, GGUFMagicGGJT: return nil, fmt.Errorf("unsupported format: %s", magic) case GGUFMagicGGUFLe: case GGUFMagicGGUFBe: bo = binary.BigEndian } gf.Header.Magic = magic // version var version GGUFVersion if err = binary.Read(f, bo, &version); err != nil { return nil, fmt.Errorf("read version: %w", err) } if version > GGUFVersionV3 { return nil, fmt.Errorf("unsupported GGUF version: %d (supported: %d-%d)", version, GGUFVersionV1, GGUFVersionV3) } gf.Header.Version = version rd := _GGUFReader{v: version, o: o, f: f, bo: bo} // tensor count var tensorCount uint64 if version <= GGUFVersionV1 { tensorCount, err = rd.ReadUint64FromUint32() } else { tensorCount, err = rd.ReadUint64() } if err != nil { return nil, fmt.Errorf("read tensor count: %w", err) } if err := _validateCountWithRemaining(f, tensorCount, version, "tensor"); err != nil { return nil, err } gf.Header.TensorCount += tensorCount // metadata kv count var metadataKVCount uint64 if version <= GGUFVersionV1 { metadataKVCount, err = rd.ReadUint64FromUint32() } else { metadataKVCount, 
err = rd.ReadUint64() } if err != nil { return nil, fmt.Errorf("read metadata kv count: %w", err) } if err := _validateCountWithRemaining(f, metadataKVCount, version, "metadatakvcount"); err != nil { return nil, err } gf.Header.MetadataKVCount += metadataKVCount // metadata kv { rd := _GGUFMetadataReader{_GGUFReader: rd} kvs := make(GGUFMetadataKVs, metadataKVCount) for i := uint64(0); i < metadataKVCount; i++ { kvs[i], err = rd.Read() if err != nil { return nil, fmt.Errorf("read metadata kv %d: %w", i, err) } } for i := range kvs { if kvs[i].Key == "split.no" { gf.Header.MetadataKVCount-- continue } gf.Header.MetadataKV = append(gf.Header.MetadataKV, kvs[i]) } } // tensor infos if gf.TensorInfos == nil { tc, ok := gf.Header.MetadataKV.Get("split.tensors.count") if ok { gf.TensorInfos = make(GGUFTensorInfos, 0, anyx.Number[int](tc.Value)) } else { // avoid preallocating with tensorCount (could be huge); start empty and append gf.TensorInfos = make(GGUFTensorInfos, 0) } } { rd := _GGUFTensorInfoReader{_GGUFReader: rd} tis := make(GGUFTensorInfos, 0) for i := uint64(0); i < tensorCount; i++ { ti, err := rd.Read() if err != nil { return nil, fmt.Errorf("read tensor info %d: %w", i, err) } tis = append(tis, ti) } gf.TensorInfos = append(gf.TensorInfos, tis...) } pds, err := f.Seek(0, io.SeekCurrent) if err != nil { return nil, fmt.Errorf("seek padding start: %w", err) } // padding var padding int64 { // The global alignment to use, as described above. // This can vary to allow for different alignment schemes, but it must be a multiple of 8. // Some writers may not write the alignment. // If the alignment is not specified, assume it is 32. var ag uint32 = 32 if v, ok := gf.Header.MetadataKV.Get("general.alignment"); ok { ag = v.ValueUint32() } padding = int64(ag) - (pds % int64(ag)) } if len(fs) == 1 { gf.Padding = padding } gf.SplitPaddings = append(gf.SplitPaddings, padding) // tensor data offset tensorDataStartOffset := pds + padding if len(fs) == 1 { gf.TensorDataStartOffset = tensorDataStartOffset } gf.SplitTensorDataStartOffsets = append(gf.SplitTensorDataStartOffsets, tensorDataStartOffset) // size size := GGUFBytesScalar(f.Size) gf.Size += size gf.SplitSizes = append(gf.SplitSizes, size) // model size modelSize := GGUFBytesScalar(f.Size - tensorDataStartOffset) gf.ModelSize += modelSize gf.SplitModelSizes = append(gf.SplitModelSizes, modelSize) } // model parameters gf.ModelParameters = GGUFParametersScalar(gf.TensorInfos.Elements()) // bpw if gf.ModelParameters != 0 { gf.ModelBitsPerWeight = GGUFBitsPerWeightScalar(float64(gf.ModelSize) * 8 / float64(gf.ModelParameters)) } return &gf, nil } // Types for GGUF hierarchical tensors. type ( // GGUFTensorInfoFilter is a filter to filter out if the given tensor name matches. // Return true if the name matches, and false otherwise. GGUFTensorInfoFilter func(name string) bool // IGGUFTensorInfos is an interface for GGUF tensor infos, // which includes basic operations. IGGUFTensorInfos interface { // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. Get(name string) (info GGUFTensorInfo, found bool) // GetFileType returns the GGUFFileType. GetFileType() GGUFFileType // Match returns true if the name matches the given regex, and false otherwise. Match(nameRegex *regexp.Regexp) bool // Search returns a list of GGUFTensorInfo with the names that match the given regex. 
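// NOTE (editor): the padding computed in parseGGUFFile above rounds the position after the tensor
// infos up to the next multiple of general.alignment (default 32). A worked sketch of the
// arithmetic as written; note that a position already on the boundary still advances a full stride
// under this formula:
//
//	pds := int64(100)
//	padding := 32 - (pds % 32) // 28
//	start := pds + padding     // 128, a multiple of 32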
Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) // Elements returns the number of elements (parameters). Elements(filter ...GGUFTensorInfoFilter) uint64 // Bytes returns the number of bytes. Bytes(filter ...GGUFTensorInfoFilter) uint64 // Count returns the number of tensors. Count() uint64 } // GGUFLayerTensorInfos represents hierarchical tensor infos of a GGUF file, // it can hold GGUFNamedTensorInfos, GGUFTensorInfos, and GGUFTensorInfo. GGUFLayerTensorInfos []IGGUFTensorInfos // GGUFNamedTensorInfos is the namespace for relevant tensors, // which must have a name. GGUFNamedTensorInfos struct { // Name is the name of the namespace. Name string `json:"name"` // GGUFLayerTensorInfos can hold GGUFNamedTensorInfos, GGUFTensorInfos, or GGUFTensorInfo. // // If an item is of type GGUFTensorInfo, it must be a leaf node. // // Branch nodes are of type GGUFNamedTensorInfos or GGUFTensorInfos, // which can be nested. // // Branch nodes are stored as pointers. GGUFLayerTensorInfos `json:"items,omitempty"` } ) // Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos. func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos { return gf.TensorInfos.Layers(ignores...) } func (kv GGUFMetadataKV) ValueUint8() uint8 { if kv.ValueType != GGUFMetadataValueTypeUint8 { panic(fmt.Errorf("key %q try to get type Uint8 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint8](kv.Value) } func (kv GGUFMetadataKV) ValueInt8() int8 { if kv.ValueType != GGUFMetadataValueTypeInt8 { panic(fmt.Errorf("key %q try to get type Int8 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int8](kv.Value) } func (kv GGUFMetadataKV) ValueUint16() uint16 { if kv.ValueType != GGUFMetadataValueTypeUint16 { panic(fmt.Errorf("key %q try to get type Uint16 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint16](kv.Value) } func (kv GGUFMetadataKV) ValueInt16() int16 { if kv.ValueType != GGUFMetadataValueTypeInt16 { panic(fmt.Errorf("key %q try to get type Int16 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int16](kv.Value) } func (kv GGUFMetadataKV) ValueUint32() uint32 { if kv.ValueType != GGUFMetadataValueTypeUint32 { panic(fmt.Errorf("key %q try to get type Uint32 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint32](kv.Value) } func (kv GGUFMetadataKV) ValueInt32() int32 { if kv.ValueType != GGUFMetadataValueTypeInt32 { panic(fmt.Errorf("key %q try to get type Int32 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int32](kv.Value) } func (kv GGUFMetadataKV) ValueFloat32() float32 { if kv.ValueType != GGUFMetadataValueTypeFloat32 { panic(fmt.Errorf("key %q try to get type Float32 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[float32](kv.Value) } func (kv GGUFMetadataKV) ValueBool() bool { if kv.ValueType != GGUFMetadataValueTypeBool { panic(fmt.Errorf("key %q try to get type Bool but type %v", kv.Key, kv.ValueType)) } return anyx.Bool(kv.Value) } func (kv GGUFMetadataKV) ValueString() string { if kv.ValueType != GGUFMetadataValueTypeString { panic(fmt.Errorf("key %q try to get type String but type %v", kv.Key, kv.ValueType)) } return anyx.String(kv.Value) } func (kv GGUFMetadataKV) ValueArray() GGUFMetadataKVArrayValue { if kv.ValueType != GGUFMetadataValueTypeArray { panic(fmt.Errorf("key %q try to get type Array but type %v", kv.Key, kv.ValueType)) } switch t :=
kv.Value.(type) { case GGUFMetadataKVArrayValue: return t case map[string]any: return GGUFMetadataKVArrayValue{ Type: anyx.Number[GGUFMetadataValueType](t["type"]), Len: anyx.Number[uint64](t["len"]), Array: func() []any { if vv, ok := t["array"].([]any); ok { return vv } return nil }(), StartOffset: anyx.Number[int64](t["startOffset"]), Size: anyx.Number[int64](t["size"]), } default: panic(fmt.Errorf("key %q try to get type Array but type %T", kv.Key, kv.Value)) } } func (kv GGUFMetadataKV) ValueUint64() uint64 { if kv.ValueType != GGUFMetadataValueTypeUint64 { panic(fmt.Errorf("key %q try to get type Uint64 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint64](kv.Value) } func (kv GGUFMetadataKV) ValueInt64() int64 { if kv.ValueType != GGUFMetadataValueTypeInt64 { panic(fmt.Errorf("key %q try to get type Int64 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int64](kv.Value) } func (kv GGUFMetadataKV) ValueFloat64() float64 { if kv.ValueType != GGUFMetadataValueTypeFloat64 { panic(fmt.Errorf("key %q try to get type Float64 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[float64](kv.Value) } // ValueNumeric returns the numeric value of the GGUFMetadataKV, // and panics if the value type is not numeric. // // ValueNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float. // // Compared to the GGUFMetadataKV's Value* functions, // ValueNumeric will cast the original value to the target type. func ValueNumeric[T constraints.Integer | constraints.Float](kv GGUFMetadataKV) T { switch kv.ValueType { case GGUFMetadataValueTypeUint8: case GGUFMetadataValueTypeInt8: case GGUFMetadataValueTypeUint16: case GGUFMetadataValueTypeInt16: case GGUFMetadataValueTypeUint32: case GGUFMetadataValueTypeInt32: case GGUFMetadataValueTypeFloat32: case GGUFMetadataValueTypeUint64: case GGUFMetadataValueTypeInt64: case GGUFMetadataValueTypeFloat64: default: panic(fmt.Errorf("key %q try to get type Numeric but got type %v", kv.Key, kv.ValueType)) } return anyx.Number[T](kv.Value) } func (av GGUFMetadataKVArrayValue) ValuesUint8() []uint8 { if av.Type != GGUFMetadataValueTypeUint8 { panic(fmt.Errorf("try to get type Uint8 but got type %v", av.Type)) } v := make([]uint8, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint8](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesInt8() []int8 { if av.Type != GGUFMetadataValueTypeInt8 { panic(fmt.Errorf("try to get type Int8 but got type %v", av.Type)) } v := make([]int8, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int8](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesUint16() []uint16 { if av.Type != GGUFMetadataValueTypeUint16 { panic(fmt.Errorf("try to get type Uint16 but got type %v", av.Type)) } v := make([]uint16, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint16](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesInt16() []int16 { if av.Type != GGUFMetadataValueTypeInt16 { panic(fmt.Errorf("try to get type Int16 but got type %v", av.Type)) } v := make([]int16, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int16](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesUint32() []uint32 { if av.Type != GGUFMetadataValueTypeUint32 { panic(fmt.Errorf("try to get type Uint32 but got type %v", av.Type)) } v := make([]uint32, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint32](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue)
ValuesInt32() []int32 { if av.Type != GGUFMetadataValueTypeInt32 { panic(fmt.Errorf("try to get type Int32 but got type %v", av.Type)) } v := make([]int32, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int32](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesFloat32() []float32 { if av.Type != GGUFMetadataValueTypeFloat32 { panic(fmt.Errorf("try to get type Float32 but got type %v", av.Type)) } v := make([]float32, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[float32](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesBool() []bool { if av.Type != GGUFMetadataValueTypeBool { panic(fmt.Errorf("try to get type Bool but got type %v", av.Type)) } v := make([]bool, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Bool(av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesString() []string { if av.Type != GGUFMetadataValueTypeString { panic(fmt.Errorf("try to get type String but got type %v", av.Type)) } v := make([]string, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.String(av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesArray() []GGUFMetadataKVArrayValue { if av.Type != GGUFMetadataValueTypeArray { panic(fmt.Errorf("try to get type Array but got type %v", av.Type)) } v := make([]GGUFMetadataKVArrayValue, av.Len) for i := uint64(0); i < av.Len; i++ { switch t := av.Array[i].(type) { case GGUFMetadataKVArrayValue: v[i] = t case map[string]any: v[i] = GGUFMetadataKVArrayValue{ Type: anyx.Number[GGUFMetadataValueType](t["type"]), Len: anyx.Number[uint64](t["len"]), Array: func() []any { if vv, ok := t["array"].([]any); ok { return vv } return nil }(), StartOffset: anyx.Number[int64](t["startOffset"]), Size: anyx.Number[int64](t["size"]), } default: panic(fmt.Errorf("try to get type Array but got type %T", av.Array[i])) } } return v } func (av GGUFMetadataKVArrayValue) ValuesUint64() []uint64 { if av.Type != GGUFMetadataValueTypeUint64 { panic(fmt.Errorf("try to get type Uint64 but got type %v", av.Type)) } v := make([]uint64, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint64](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesInt64() []int64 { if av.Type != GGUFMetadataValueTypeInt64 { panic(fmt.Errorf("try to get type Int64 but got type %v", av.Type)) } v := make([]int64, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int64](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesFloat64() []float64 { if av.Type != GGUFMetadataValueTypeFloat64 { panic(fmt.Errorf("try to get type Float64 but got type %v", av.Type)) } v := make([]float64, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[float64](av.Array[i]) } return v } // ValuesNumeric returns the numeric values of the GGUFMetadataKVArrayValue, // and panics if the value type is not numeric. // // ValuesNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float. // // Compared to the GGUFMetadataKVArrayValue's Values* functions, // ValuesNumeric will cast the original value to the target type.
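// NOTE (editor): a sketch of pulling typed values out of parsed metadata with the accessors above;
// the key names follow common GGUF conventions but are illustrative only:
//
//	if kv, ok := gf.Header.MetadataKV.Get("general.architecture"); ok {
//		_ = kv.ValueString() // panics if the value is not a string
//	}
//	if kv, ok := gf.Header.MetadataKV.Get("llama.attention.head_count"); ok {
//		_ = ValueNumeric[uint64](kv) // casts any numeric value type to uint64
//	}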
func ValuesNumeric[T constraints.Integer | constraints.Float](av GGUFMetadataKVArrayValue) []T { v := make([]T, av.Len) for i := uint64(0); i < av.Len; i++ { switch av.Type { case GGUFMetadataValueTypeUint8: case GGUFMetadataValueTypeInt8: case GGUFMetadataValueTypeUint16: case GGUFMetadataValueTypeInt16: case GGUFMetadataValueTypeUint32: case GGUFMetadataValueTypeInt32: case GGUFMetadataValueTypeFloat32: case GGUFMetadataValueTypeUint64: case GGUFMetadataValueTypeInt64: case GGUFMetadataValueTypeFloat64: default: panic(fmt.Errorf("try to get type Numeric but got type %v", av.Type)) } if av.Array != nil { v[i] = anyx.Number[T](av.Array[i]) } } return v } // Get returns the GGUFMetadataKV with the given key, // and true if found, and false otherwise. func (kvs GGUFMetadataKVs) Get(key string) (value GGUFMetadataKV, found bool) { for i := range kvs { if kvs[i].Key == key { return kvs[i], true } } return GGUFMetadataKV{}, false } // Search returns a list of GGUFMetadataKV with the keys that match the given regex. func (kvs GGUFMetadataKVs) Search(keyRegex *regexp.Regexp) (values []GGUFMetadataKV) { for i := range kvs { if keyRegex.MatchString(kvs[i].Key) { values = append(values, kvs[i]) } } return values } // Index returns a map value to the GGUFMetadataKVs with the given keys, // and the number of keys found. func (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGUFMetadataKV, found int) { ks := make(map[string]struct{}, len(keys)) for i := range keys { ks[keys[i]] = struct{}{} } values = make(map[string]GGUFMetadataKV) for i := range kvs { if _, ok := ks[kvs[i].Key]; ok { values[kvs[i].Key] = kvs[i] found++ } if found == len(ks) { break } } return values, found } // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool) { if ti.Name == name { return ti, true } return GGUFTensorInfo{}, false } // GetFileType returns the GGUFFileType. func (ti GGUFTensorInfo) GetFileType() GGUFFileType { return GetFileType(map[GGMLType]int{ti.Type: 1}) } // Match returns true if the name of the GGUFTensorInfo matches the given regex. func (ti GGUFTensorInfo) Match(nameRegex *regexp.Regexp) bool { return nameRegex.MatchString(ti.Name) } // Search returns a list of GGUFTensorInfo with the names that match the given regex. func (ti GGUFTensorInfo) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) { if nameRegex.MatchString(ti.Name) { return []GGUFTensorInfo{ti} } return nil } // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, found int) { if len(names) == 0 { return nil, 0 } if names[0] == ti.Name { return map[string]GGUFTensorInfo{ti.Name: ti}, 1 } return nil, 0 } // Elements returns the number of elements of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2597-L2601. func (ti GGUFTensorInfo) Elements(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } for i := range filter { if filter[i] != nil && !filter[i](ti.Name) { return 0 } } ret := uint64(1) for i := uint32(0); i < ti.NDimensions; i++ { ret *= ti.Dimensions[i] } return ret } // Bytes returns the number of bytes of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2609-L2626. 
func (ti GGUFTensorInfo) Bytes(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } tt, ok := ti.Type.Trait() if !ok { panic(fmt.Errorf("invalid type: %v", ti.Type)) } for i := range filter { if filter[i] != nil && !filter[i](ti.Name) { return 0 } } // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L3210-L3214 nb := make([]uint64, 0, ti.NDimensions) { nb = append(nb, tt.TypeSize) nb = append(nb, nb[0]*(ti.Dimensions[0]/tt.BlockSize)) for i := uint32(2); i < ti.NDimensions; i++ { nb = append(nb, nb[i-1]*ti.Dimensions[i-1]) } } var ret uint64 if tt.BlockSize == 1 { ret = tt.TypeSize for i := uint32(0); i < ti.NDimensions; i++ { ret += (ti.Dimensions[i] - 1) * nb[i] } return ret } ret = ti.Dimensions[0] * nb[0] / tt.BlockSize for i := uint32(1); i < ti.NDimensions; i++ { ret += (ti.Dimensions[i] - 1) * nb[i] } return ret } // Count returns the number of GGUF tensors of the GGUFTensorInfo, // which is always 1. func (ti GGUFTensorInfo) Count() uint64 { return 1 } // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { for i := range tis { if tis[i].Name == name { return tis[i], true } } return GGUFTensorInfo{}, false } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the GGUFTensorInfos. func (tis GGUFTensorInfos) GetFileType() GGUFFileType { if len(tis) == 0 { return _GGUFFileTypeCount } cm := make(map[GGMLType]int) for i := range tis { cm[tis[i].Type]++ } return GetFileType(cm) } // Match returns true if a tensor of GGUFTensorInfos matches the given regex. func (tis GGUFTensorInfos) Match(nameRegex *regexp.Regexp) bool { for i := range tis { if nameRegex.MatchString(tis[i].Name) { return true } } return false } // Search returns a list of GGUFTensorInfo with the names that match the given regex. func (tis GGUFTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) { for i := range tis { if nameRegex.MatchString(tis[i].Name) { infos = append(infos, tis[i]) } } return infos } // Index returns a map value to the GGUFTensorInfos with the given names, // and the number of names found. func (tis GGUFTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int) { ns := make(map[string]struct{}, len(names)) for i := range names { ns[names[i]] = struct{}{} } infos = make(map[string]GGUFTensorInfo) for i := range tis { if _, ok := ns[tis[i].Name]; ok { infos[tis[i].Name] = tis[i] found++ } if found == len(ns) { break } } return infos, found } // Elements returns the number of elements of the GGUFTensorInfos. func (tis GGUFTensorInfos) Elements() uint64 { var ret uint64 for i := range tis { ret += tis[i].Elements() } return ret } // Bytes returns the number of bytes of the GGUFTensorInfos. func (tis GGUFTensorInfos) Bytes() uint64 { var ret uint64 for i := range tis { ret += tis[i].Bytes() } return ret } // Count returns the number of GGUF tensors of the GGUFTensorInfos. func (tis GGUFTensorInfos) Count() uint64 { return uint64(len(tis)) } // Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos. 
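// NOTE (editor): Elements multiplies a tensor's dimensions together, and Bytes scales by the GGML
// type's block layout. A worked sketch for a hypothetical contiguous 4096x4096 Q4_0 tensor, whose
// blocks pack 32 weights into 18 bytes:
//
//	elements := uint64(4096 * 4096)               // 16,777,216 weights
//	bytes := elements / 32 * 18                   // 9,437,184 bytes (9 MiB)
//	bpw := float64(bytes) * 8 / float64(elements) // 4.5 bits per weight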
func (tis GGUFTensorInfos) Layers(ignores ...string) GGUFLayerTensorInfos { if len(tis) == 0 { return nil } ls := tis.layers() if len(ignores) != 0 { _, ls, _ = ls.Cut(ignores) return ls } return ls } var numberRegex = regexp.MustCompile(`^\d+$`) func (tis GGUFTensorInfos) layers() GGUFLayerTensorInfos { var ret GGUFLayerTensorInfos pm := make(map[string]any) for i := range tis { ps := strings.Split(tis[i].Name, ".") if len(ps) < 2 { ret = append(ret, tis[i]) continue } switch { default: ret = append(ret, tis[i]) case ps[0] == "blk" || ps[0] == "block": // LLaMACpp. p := strings.Join([]string{ps[0], ps[1]}, ".") if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) case (ps[0] == "v" || ps[0] == "t" || ps[0] == "a") && ps[1] == "blk": // LLaMACpp CLIP. p := ps[0] if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ((ps[0] == "dec" || ps[0] == "enc") && ps[1] == "blk") || ((ps[0] == "decoder" || ps[0] == "encoder") && ps[1] == "block"): // BERT. p := ps[0] if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ps[0] == "first_stage_model": // StableDiffusionCpp Autoencoder. p := strings.Join([]string{ps[0], ps[1]}, ".") if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ps[0] == "cond_stage_model": // StableDiffusionCpp Conditioner. 
if len(ps) < 3 { ret = append(ret, tis[i]) continue } p := strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if !numberRegex.MatchString(ps[1]) { p = strings.Join([]string{ps[0], ps[1]}, ".") } if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 4 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2], ps[3]}, ".") if !numberRegex.MatchString(ps[1]) { p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") } if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ps[0] == "model" && ps[1] == "diffusion_model": // nolint: goconst // StableDiffusionCpp. p := "model.diffusion_model" if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{"model.diffusion_model", ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) } } return ret } // Get returns the IGGUFTensorInfos with the given name, // and true if found, and false otherwise. func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if v.Name == name { return v, true } case *GGUFNamedTensorInfos: info, found = v.GGUFLayerTensorInfos.Get(name) if found { return info, true } } } return GGUFTensorInfo{}, false } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) GetFileType() GGUFFileType { if len(ltis) == 0 { return _GGUFFileTypeCount } cm := make(map[GGMLType]int) for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: cm[v.Type]++ case *GGUFNamedTensorInfos: cm[v.GetFileType().GGMLType()]++ } } return GetFileType(cm) } // Match returns true if a tensor of GGUFLayerTensorInfos matches the given regex. func (ltis GGUFLayerTensorInfos) Match(nameRegex *regexp.Regexp) bool { for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if nameRegex.MatchString(v.Name) { return true } case *GGUFNamedTensorInfos: if v.Match(nameRegex) { return true } } } return false } // Search returns a list of GGUFTensorInfo with the names that match the given regex. func (ltis GGUFLayerTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) { for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if nameRegex.MatchString(v.Name) { infos = append(infos, v) } case *GGUFNamedTensorInfos: infos = append(infos, v.Search(nameRegex)...) } } return infos } // Index returns a map value to the GGUFTensorInfos with the given names, // and the number of names found. 
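// NOTE (editor): a sketch of carving up the layered view with Cut, whose trailing-asterisk glob
// matching is documented below; the tensor and namespace names are illustrative of a
// llama.cpp-style model:
//
//	ls := gf.Layers()
//	picked, rest, ok := ls.Cut([]string{"token_embd.weight", "blk.0", "output*"})
//	_ = ok // true if at least one name matched; picked holds the matches, rest the remainder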
func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int) { ns := make(map[string]struct{}, len(names)) for i := range names { ns[names[i]] = struct{}{} } infos = make(map[string]GGUFTensorInfo) for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if _, ok := ns[v.Name]; ok { infos[v.Name] = v found++ } case *GGUFNamedTensorInfos: inf, _ := v.Index(names) for k := range inf { infos[k] = inf[k] found++ } } if found == len(ns) { break } } return infos, found } // Elements returns the number of elements of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) Elements(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { ret += ltis[i].Elements(filter...) } return ret } // Bytes returns the number of bytes of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) Bytes(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { ret += ltis[i].Bytes(filter...) } return ret } // Count returns the number of GGUF tensors of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) Count() uint64 { var ret uint64 for i := range ltis { ret += ltis[i].Count() } return ret } // Cut splits the GGUFLayerTensorInfos into two parts, // and returns the GGUFLayerTensorInfos with the names that match the given names at first, // and the GGUFLayerTensorInfos without the names at second, // and true if the GGUFLayerTensorInfos with the names are found, and false otherwise. // // The given names support glob pattern, for example, "a*" matches "a", "ab", "abc", and so on. func (ltis GGUFLayerTensorInfos) Cut(names []string) (before, after GGUFLayerTensorInfos, found bool) { prefixes := make(map[string]struct{}) matches := make(map[string]struct{}) for i := range names { if strings.HasSuffix(names[i], "*") { prefixes[strings.TrimSuffix(names[i], "*")] = struct{}{} } else { matches[names[i]] = struct{}{} } } before = make(GGUFLayerTensorInfos, 0, len(names)) after = make(GGUFLayerTensorInfos, 0, len(ltis)) for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if len(matches) != 0 { if _, ok := matches[v.Name]; ok { before = append(before, v) continue } } if len(prefixes) != 0 { var check bool for prefix := range prefixes { if strings.HasPrefix(v.Name, prefix) { before = append(before, v) check = true break } } if check { continue } } after = append(after, v) case *GGUFNamedTensorInfos: if len(matches) != 0 { if _, ok := matches[v.Name]; ok { before = append(before, v) continue } } if len(prefixes) != 0 { var check bool for prefix := range prefixes { if strings.HasPrefix(v.Name, prefix) { before = append(before, v) check = true break } } if check { continue } } after = append(after, v) } } return before, after, len(before) > 0 } type _GGUFReader struct { v GGUFVersion o _GGUFReadOptions f io.ReadSeeker bo binary.ByteOrder } func (rd _GGUFReader) ReadUint8() (v uint8, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint8: %w", err) } return v, nil } func (rd _GGUFReader) ReadInt8() (v int8, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int8: %w", err) } return v, nil } func (rd _GGUFReader) ReadUint16() (v uint16, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint16: %w", err) } return v, nil } func (rd _GGUFReader) ReadInt16() (v int16, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int16: %w", err) } return v, 
nil } func (rd _GGUFReader) ReadUint32() (v uint32, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint32: %w", err) } return v, nil } func (rd _GGUFReader) ReadUint64FromUint32() (uint64, error) { v, err := rd.ReadUint32() return uint64(v), err } func (rd _GGUFReader) ReadInt32() (v int32, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int32: %w", err) } return v, nil } func (rd _GGUFReader) ReadFloat32() (v float32, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read float32: %w", err) } return v, nil } func (rd _GGUFReader) ReadBool() (v bool, err error) { b, err := rd.ReadUint8() if err != nil { return false, fmt.Errorf("read bool: %w", err) } return b != 0, nil } func (rd _GGUFReader) ReadString() (v string, err error) { var l uint64 if rd.v <= GGUFVersionV1 { l, err = rd.ReadUint64FromUint32() } else { l, err = rd.ReadUint64() } if err != nil { return "", fmt.Errorf("read string length: %w", err) } b := bytex.GetBytes(l) defer bytex.Put(b) if _, err = rd.f.Read(b); err != nil { return "", fmt.Errorf("read string: %w", err) } return string(bytes.TrimSpace(b)), nil } func (rd _GGUFReader) SkipReadingString() (err error) { var l uint64 if rd.v <= GGUFVersionV1 { l, err = rd.ReadUint64FromUint32() } else { l, err = rd.ReadUint64() } if err != nil { return fmt.Errorf("read string length: %w", err) } _, err = rd.f.Seek(int64(l), io.SeekCurrent) if err != nil { return fmt.Errorf("seek string: %w", err) } return nil } func (rd _GGUFReader) ReadArray(key string) (v GGUFMetadataKVArrayValue, err error) { v.StartOffset, err = rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("read array start: %w", err) } if err = binary.Read(rd.f, rd.bo, &v.Type); err != nil { return v, fmt.Errorf("read array item type: %w", err) } if rd.v <= GGUFVersionV1 { v.Len, err = rd.ReadUint64FromUint32() } else { v.Len, err = rd.ReadUint64() } if err != nil { return v, fmt.Errorf("read array length: %w", err) } itemStart, err := rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("seek array item start: %w", err) } if !rd.o.SkipLargeMetadata || stringx.HasSuffixes(key, ".feed_forward_length", ".attention.head_count") { v.Array = make([]any, v.Len) for i := uint64(0); i < v.Len; i++ { v.Array[i], err = rd.ReadValue(key, v.Type) if err != nil { return v, fmt.Errorf("read array item %d: %w", i, err) } } itemEnd, err := rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("seek array item end: %w", err) } v.Size = itemEnd - itemStart return v, nil } switch v.Type { case GGUFMetadataValueTypeUint8, GGUFMetadataValueTypeInt8, GGUFMetadataValueTypeBool: _, err = rd.f.Seek(int64(v.Len), io.SeekCurrent) case GGUFMetadataValueTypeUint16, GGUFMetadataValueTypeInt16: _, err = rd.f.Seek(int64(v.Len)*2, io.SeekCurrent) case GGUFMetadataValueTypeUint32, GGUFMetadataValueTypeInt32, GGUFMetadataValueTypeFloat32: _, err = rd.f.Seek(int64(v.Len)*4, io.SeekCurrent) case GGUFMetadataValueTypeUint64, GGUFMetadataValueTypeInt64, GGUFMetadataValueTypeFloat64: _, err = rd.f.Seek(int64(v.Len)*8, io.SeekCurrent) case GGUFMetadataValueTypeString: for i := uint64(0); i < v.Len; i++ { if err = rd.SkipReadingString(); err != nil { return v, fmt.Errorf("seek array[string] %d: %w", i, err) } } default: // Should not happen. 
panic(fmt.Errorf("invalid type: %v", v.Type)) } if err != nil { return v, fmt.Errorf("seek array end: %w", err) } itemEnd, err := rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("seek array item end: %w", err) } v.Size = itemEnd - itemStart return v, nil } func (rd _GGUFReader) ReadUint64() (v uint64, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint64: %w", err) } return v, nil } func (rd _GGUFReader) ReadInt64() (v int64, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int64: %w", err) } return v, nil } func (rd _GGUFReader) ReadFloat64() (v float64, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read float64: %w", err) } return v, nil } func (rd _GGUFReader) ReadValue(vk string, vt GGUFMetadataValueType) (v any, err error) { if vt >= _GGUFMetadataValueTypeCount { return nil, fmt.Errorf("invalid type: %v", vt) } switch vt { case GGUFMetadataValueTypeUint8: v, err = rd.ReadUint8() case GGUFMetadataValueTypeInt8: v, err = rd.ReadInt8() case GGUFMetadataValueTypeUint16: v, err = rd.ReadUint16() case GGUFMetadataValueTypeInt16: v, err = rd.ReadInt16() case GGUFMetadataValueTypeUint32: v, err = rd.ReadUint32() case GGUFMetadataValueTypeInt32: v, err = rd.ReadInt32() case GGUFMetadataValueTypeFloat32: v, err = rd.ReadFloat32() case GGUFMetadataValueTypeBool: v, err = rd.ReadBool() case GGUFMetadataValueTypeString: v, err = rd.ReadString() case GGUFMetadataValueTypeArray: v, err = rd.ReadArray(vk) case GGUFMetadataValueTypeUint64: v, err = rd.ReadUint64() case GGUFMetadataValueTypeInt64: v, err = rd.ReadInt64() case GGUFMetadataValueTypeFloat64: v, err = rd.ReadFloat64() default: // Should not happen. panic(fmt.Errorf("invalid type: %v", vt)) } if err != nil { return nil, err } return v, nil } type _GGUFMetadataReader struct { _GGUFReader } func (rd _GGUFMetadataReader) Read() (kv GGUFMetadataKV, err error) { kv.Key, err = rd.ReadString() if err != nil { return kv, fmt.Errorf("read key: %w", err) } { vt, err := rd.ReadUint32() if err != nil { return kv, fmt.Errorf("read value type: %w", err) } kv.ValueType = GGUFMetadataValueType(vt) if kv.ValueType >= _GGUFMetadataValueTypeCount { return kv, fmt.Errorf("invalid value type: %v", kv.ValueType) } } kv.Value, err = rd.ReadValue(kv.Key, kv.ValueType) if err != nil { return kv, fmt.Errorf("read %s value: %w", kv.Key, err) } return kv, nil } type _GGUFTensorInfoReader struct { _GGUFReader } func (rd _GGUFTensorInfoReader) Read() (ti GGUFTensorInfo, err error) { ti.StartOffset, err = rd.f.Seek(0, io.SeekCurrent) if err != nil { return ti, fmt.Errorf("seek tensor info start: %w", err) } ti.Name, err = rd.ReadString() if err != nil { return ti, fmt.Errorf("read name: %w", err) } ti.NDimensions, err = rd.ReadUint32() if err != nil { return ti, fmt.Errorf("read n dimensions: %w", err) } ti.Dimensions = make([]uint64, ti.NDimensions) for i := uint32(0); i < ti.NDimensions; i++ { if rd.v <= GGUFVersionV1 { ti.Dimensions[i], err = rd.ReadUint64FromUint32() } else { ti.Dimensions[i], err = rd.ReadUint64() } if err != nil { return ti, fmt.Errorf("read dimension %d: %w", i, err) } } { v, err := rd.ReadUint32() if err != nil { return ti, fmt.Errorf("read type: %w", err) } ti.Type = GGMLType(v) if ti.Type >= _GGMLTypeCount { return ti, fmt.Errorf("%v: This quantized type is currently unsupported", ti.Type) } } ti.Offset, err = rd.ReadUint64() if err != nil { return ti, fmt.Errorf("read offset: %w", err) } return ti, nil 
} ================================================ FILE: file_architecture.go ================================================ package gguf_parser import ( "regexp" "slices" "strings" ) // Types for the architecture metadata of a GGUF file. type ( // GGUFArchitecture represents the architecture metadata of a GGUF file. GGUFArchitecture struct { /* Basic */ // Type describes the type of the file, // default is "model". Type string `json:"type"` // Architecture describes what architecture this model implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // MaximumContextLength(n_ctx_train) is the maximum context length of the model. // // For most architectures, this is the hard limit on the length of the input. // Architectures, like RWKV, // that are not reliant on transformer-style attention may be able to handle larger inputs, // but this is not guaranteed. MaximumContextLength uint64 `json:"maximumContextLength,omitempty"` // EmbeddingLength(n_embd) is the length of the embedding layer. EmbeddingLength uint64 `json:"embeddingLength,omitempty"` // BlockCount(n_layer) is the number of blocks of attention and feed-forward layers, // i.e. the bulk of the LLM. // This does not include the input or embedding layers. BlockCount uint64 `json:"blockCount,omitempty"` // FeedForwardLength(n_ff) stores the length of each feed-forward layer. FeedForwardLength []uint64 `json:"feedForwardLength,omitempty"` // ExpertFeedForwardLength(expert_feed_forward_length) is the length of the feed-forward layer in the expert model. ExpertFeedForwardLength uint64 `json:"expertFeedForwardLength,omitempty"` // ExpertSharedFeedForwardLength(expert_shared_feed_forward_length) is the length of the shared feed-forward layer in the expert model. ExpertSharedFeedForwardLength uint64 `json:"expertSharedFeedForwardLength,omitempty"` // ExpertCount(n_expert) is the number of experts in MoE models. ExpertCount uint32 `json:"expertCount,omitempty"` // ExpertUsedCount(n_expert_used) is the number of experts used during each token evaluation in MoE models. ExpertUsedCount uint32 `json:"expertUsedCount,omitempty"` // ExpertSharedCount(n_expert_shared) is the number of shared experts in MoE models. ExpertSharedCount uint32 `json:"expertSharedCount,omitempty"` // AttentionHeadCount(n_head) is the number of attention heads. AttentionHeadCount uint64 `json:"attentionHeadCount,omitempty"` // AttentionHeadCountKV(n_head_kv) is the number of attention heads per group used in Grouped-Query-Attention. // // If not provided or equal to AttentionHeadCount, // the model does not use Grouped-Query-Attention. AttentionHeadCountKV uint64 `json:"attentionHeadCountKV,omitempty"` // AttentionSlidingWindowPattern is the pattern used in the sliding window attention. // // 0 means all layers are Sliding Window Attention. // 1 means all layers are none Sliding Window Attention. // N means every Nth layer is none Sliding Window Attention. AttentionSlidingWindowPattern uint32 `json:"attentionSlidingWindowPattern,omitempty"` // AttentionSlidingWindow is the size of the sliding window used in the attention layer. AttentionSlidingWindow uint64 `json:"attentionSlidingWindow,omitempty"` // AttentionMaxALiBIBias is the maximum bias to use for ALiBI. AttentionMaxALiBIBias float32 `json:"attentionMaxALiBIBias,omitempty"` // AttentionClampKQV describes a value `C`, // which is used to clamp the values of the `Q`, `K` and `V` tensors between `[-C, C]`. 
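// NOTE (editor): a sketch of how Grouped-Query-Attention surfaces in the two head-count fields
// above; a model with 32 query heads sharing 8 KV heads has a group size of 4, while an
// AttentionHeadCountKV of zero or equal to AttentionHeadCount means no GQA:
//
//	if a.AttentionHeadCountKV > 0 && a.AttentionHeadCountKV < a.AttentionHeadCount {
//		groupSize := a.AttentionHeadCount / a.AttentionHeadCountKV // e.g. 32/8 = 4
//		_ = groupSize
//	}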
		AttentionClampKQV float32 `json:"attentionClampKQV,omitempty"`
		// AttentionLayerNormEpsilon is the epsilon value used in the LayerNorm(Layer Normalization).
		AttentionLayerNormEpsilon float32 `json:"attentionLayerNormEpsilon,omitempty"`
		// AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(Root Mean Square Layer Normalization),
		// which is a simplification of the original LayerNorm.
		AttentionLayerNormRMSEpsilon float32 `json:"attentionLayerNormRMSEpsilon,omitempty"`
		// AttentionQueryLORARank is the LORA rank of the query matrix.
		//
		// Zero means no LORA.
		AttentionQueryLORARank uint32 `json:"attentionQueryLORARank,omitempty"`
		// AttentionKeyValueLORARank is the LORA rank of the key/value matrix.
		//
		// Zero means no LORA.
		AttentionKeyValueLORARank uint32 `json:"attentionKeyValueLORARank,omitempty"`
		// AttentionKeyLength(n_embd_head_k) is the size of a key head.
		//
		// Defaults to `EmbeddingLength / AttentionHeadCount`.
		AttentionKeyLength uint32 `json:"attentionKeyLength,omitempty"`
		// AttentionKeyLengthMLA(n_embd_head_k_mla) is the size of a key head in MLA(Multi-head Latent Attention).
		//
		// Zero means no MLA.
		AttentionKeyLengthMLA uint32 `json:"attentionKeyLengthMLA,omitempty"`
		// AttentionValueLength(n_embd_head_v) is the size of a value head.
		//
		// Defaults to `EmbeddingLength / AttentionHeadCount`.
		AttentionValueLength uint32 `json:"attentionValueLength,omitempty"`
		// AttentionValueLengthMLA(n_embd_head_v_mla) is the size of a value head in MLA(Multi-head Latent Attention).
		//
		// Zero means no MLA.
		AttentionValueLengthMLA uint32 `json:"attentionValueLengthMLA,omitempty"`
		// AttentionCausal is true if the attention is causal.
		AttentionCausal bool `json:"attentionCausal,omitempty"`
		// AttentionRecurrent is true if the attention is recurrent.
		//
		// Used in Mamba, RWKV, and similar architectures.
		AttentionRecurrent bool `json:"attentionRecurrent,omitempty"`
		// AttentionHybrid is true if the attention is hybrid (causal (self-attention) + recurrent).
		//
		// Used in Jamba, Falcon-H1, and similar architectures.
		AttentionHybrid bool `json:"attentionHybrid,omitempty"`
		// RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding).
		RoPEDimensionCount uint64 `json:"ropeDimensionCount,omitempty"`
		// RoPEFrequencyBase is the base frequency of the RoPE.
		RoPEFrequencyBase float32 `json:"ropeFrequencyBase,omitempty"`
		// RoPEFrequencyScale is the frequency scale of the RoPE.
		RoPEFrequencyScale float32 `json:"ropeFrequencyScale,omitempty"`
		// RoPEScalingType is the type of the RoPE scaling.
		RoPEScalingType string `json:"ropeScalingType,omitempty"`
		// RoPEScalingFactor is the scaling factor of the RoPE.
		RoPEScalingFactor float32 `json:"ropeScalingFactor,omitempty"`
		// RoPEScalingOriginalContextLength is the original context length of the RoPE scaling.
		RoPEScalingOriginalContextLength uint64 `json:"ropeScalingOriginalContextLength,omitempty"`
		// RoPEScalingFinetuned is true if the RoPE scaling is fine-tuned.
		RoPEScalingFinetuned bool `json:"ropeScalingFinetuned,omitempty"`
		// PoolingType is the type of pooling used in the model.
		PoolingType uint32 `json:"poolingType,omitempty"`
		// SSMConvolutionKernel is the size of the convolution kernel used in the Selective State Space Model (SSM) and similar architectures.
		SSMConvolutionKernel uint32 `json:"ssmConvolutionKernel,omitempty"`
		// SSMInnerSize is the embedding size of the state in SSM and similar architectures.
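		// For Mamba-style models this is typically 2 * EmbeddingLength
		// (i.e. an expand factor of 2).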
		SSMInnerSize uint32 `json:"ssmInnerSize,omitempty"`
		// SSMStateSize is the size of the recurrent state in SSM and similar architectures.
		SSMStateSize uint32 `json:"ssmStateSize,omitempty"`
		// SSMTimeStepRank is the rank of the time steps in SSM and similar architectures.
		SSMTimeStepRank uint32 `json:"ssmTimeStepRank,omitempty"`
		// SSMGroupCount is the number of groups in the SSM and similar architectures.
		SSMGroupCount uint32 `json:"ssmGroupCount,omitempty"`
		// RWKVHeadSize is the size of the head in RWKV and similar architectures.
		RWKVHeadSize uint32 `json:"rwkvHeadSize,omitempty"`
		// RWKVRescaleEveryNLayers is the number of layers after which the rescaling is applied in RWKV and similar architectures.
		RWKVRescaleEveryNLayers uint32 `json:"rwkvRescaleEveryNLayers,omitempty"`
		// RWKVTimeMixExtraDimension is the size of the extra dimension used for time mixing in RWKV and similar architectures.
		RWKVTimeMixExtraDimension uint32 `json:"rwkvTimeMixExtraDimension,omitempty"`
		// RWKVTimeDecayExtraDimension is the size of the extra dimension used for time decay in RWKV and similar architectures.
		RWKVTimeDecayExtraDimension uint32 `json:"rwkvTimeDecayExtraDimension,omitempty"`
		// RWKVTokenShiftCount is the number of token shifts used in RWKV and similar architectures.
		RWKVTokenShiftCount uint32 `json:"rwkvTokenShiftCount,omitempty"`
		// VocabularyLength is the size of the vocabulary.
		//
		// VocabularyLength is the same as the tokenizer's token size.
		VocabularyLength uint64 `json:"vocabularyLength,omitempty"`

		/* Appendix */

		// ClipProjectorType is the type of the projector used in the clip model.
		//
		// Only used when Architecture is "clip".
		ClipProjectorType string `json:"clipProjectorType,omitempty"`
		// ClipHasLLaVAProjector indicates whether the clip model has LLaVA projector or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasLLaVAProjector bool `json:"clipHasLLaVAProjector,omitempty"`
		// ClipHasMiniCPMVProjector indicates whether the clip model has MiniCPMV projector or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasMiniCPMVProjector bool `json:"clipHasMiniCPMVProject,omitempty"`
		// ClipMiniCPMVVersion is the version of the MiniCPMV projector.
		//
		// Only used when Architecture is "clip".
		ClipMiniCPMVVersion int32 `json:"clipMiniCPMVVersion,omitempty"`
		// ClipMiniCPMVQueryNum is the number of queries used in the MiniCPMV projector.
		//
		// Only used when Architecture is "clip".
		ClipMiniCPMVQueryNum int32 `json:"clipMiniCPMVQueryNum,omitempty"`
		// ClipHasGLMProjector indicates whether the clip model has GLM projector or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasGLMProjector bool `json:"clipHasGLMProjector,omitempty"`
		// ClipHasQwen2VLMerger indicates whether the clip model has Qwen2VL merger or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"`
		// ClipHasVisionEncoder indicates whether the clip model has vision encoder or not.
		//
		// Only used when Architecture is "clip".
		ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"`
		// ClipVisionEmbeddingLength indicates the embedding length of vision encoder.
		//
		// Only used when Architecture is "clip" and ClipHasVisionEncoder is true.
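		// For example, ViT-L/14-based CLIP encoders commonly report 1024 here.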
ClipVisionEmbeddingLength uint64 `json:"clipVisionEmbeddingLength,omitempty"` // ClipVisionBlockCount indicates the number of blocks in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionBlockCount uint64 `json:"clipVisionBlockCount,omitempty"` // ClipVisionFeedForwardLength indicates the feed-forward length of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionFeedForwardLength []uint64 `json:"clipVisionFeedForwardLength,omitempty"` // ClipVisionAttentionHeadCount indicates the number of attention heads in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionAttentionHeadCount uint64 `json:"clipVisionAttentionHeadCount,omitempty"` // ClipVisionAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionAttentionLayerNormRMSEpsilon float32 `json:"clipVisionAttentionLayerNormRMSEpsilon,omitempty"` // ClipVisionProjectionDim indicates the projection dimension of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectionDim uint32 `json:"clipVisionProjectionDim,omitempty"` // ClipVisionProjectorScaleFactor is the scale factor of the projector. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectorScaleFactor uint32 `json:"clipVisionProjectorScaleFactor,omitempty"` // ClipVisionImageSize indicates the image size of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` // ClipVisionPatchSize indicates the patch size of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` // ClipVisionMMPatchMergeType indicates the merge type of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionMMPatchMergeType string `json:"clipVisionMMPatchMergeType,omitempty"` // ClipVisionSpatialMergeSize is the spatial merge size of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionSpatialMergeSize uint32 `json:"clipVisionSpatialMergeSize,omitempty"` // ClipVisionWindowAttentionPattern is the Window Attention pattern used in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionWindowAttentionPattern uint32 `json:"clipVisionWindowAttentionPattern,omitempty"` // ClipHasAudioEncoder indicates whether the clip model has audio encoder or not. // // Only used when Architecture is "clip". ClipHasAudioEncoder bool `json:"clipHasAudioEncoder,omitempty"` // ClipAudioEmbeddingLength indicates the embedding length of audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioEmbeddingLength uint64 `json:"clipAudioEmbeddingLength,omitempty"` // ClipAudioBlockCount indicates the number of blocks in the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioBlockCount uint64 `json:"clipAudioBlockCount,omitempty"` // ClipAudioFeedForwardLength indicates the feed-forward length of the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. 
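		// Like ClipVisionFeedForwardLength, this is stored per block;
		// a scalar metadata value is broadcast to every block
		// (see clipArchitecture below).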
ClipAudioFeedForwardLength []uint64 `json:"clipAudioFeedForwardLength,omitempty"` // ClipAudioAttentionHeadCount indicates the number of attention heads in the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioAttentionHeadCount uint64 `json:"clipAudioAttentionHeadCount,omitempty"` // ClipAudioAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioAttentionLayerNormRMSEpsilon float32 `json:"clipAudioAttentionLayerNormRMSEpsilon,omitempty"` // ClipAudioProjectionDim indicates the projection dimension of audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioProjectionDim uint32 `json:"clipAudioProjectionDim,omitempty"` // ClipAudioProjectorStackFactor is the scale factor of the projector. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioProjectorStackFactor uint32 `json:"clipAudioProjectorStackFactor,omitempty"` // ClipAudioNumMelBins is the number of mel bins used in the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioNumMelBins uint32 `json:"clipAudioNumMelBins,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // AdapterLoRAAlpha is the alpha value of the LoRA adapter. // // Only used when AdapterType is "lora". AdapterLoRAAlpha float32 `json:"adapterLoRAAlpha,omitempty"` // AdapterControlVectorLayerCount is the number of layers in the control vector. // // Only used when Architecture is "control_vector". AdapterControlVectorLayerCount uint32 `json:"adapterControlVectorLayerCount,omitempty"` // DiffusionArchitecture is the actual architecture of the diffusion model. // // Only used when Architecture is "diffusion". DiffusionArchitecture string `json:"diffusionArchitecture,omitempty"` // DiffusionTransformer indicates whether the diffusion model is a diffusion transformer or not. // DiffusionTransformer bool `json:"diffusionTransformer,omitempty"` // DiffusionConditioners is the list of diffusion conditioners. // // Only used when Architecture is "diffusion". DiffusionConditioners GGUFArchitectureDiffusionConditioners `json:"diffusionConditioners,omitempty"` // DiffusionAutoencoder represents the autoencoder of the diffusion model. // // Only used when Architecture is "diffusion". DiffusionAutoencoder *GGUFArchitectureDiffusionAutoencoder `json:"diffusionAutoencoder,omitempty"` } // GGUFArchitectureDiffusionConditioners is the list of GGUFArchitectureDiffusionConditioner. GGUFArchitectureDiffusionConditioners []GGUFArchitectureDiffusionConditioner // GGUFArchitectureDiffusionConditioner represents the conditioner metadata of the diffusion architecture. GGUFArchitectureDiffusionConditioner struct { // Architecture is the architecture of the diffusion conditioner. Architecture string `json:"architecture"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` } // GGUFArchitectureDiffusionAutoencoder represents the autoencoder metadata of the diffusion architecture. GGUFArchitectureDiffusionAutoencoder struct { // Architecture is the architecture of the diffusion autoencoder. // // Currently, only "VAE" is supported. 
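		// For example, "Stable Diffusion XL VAE",
		// as composed by diffuserArchitecture in this file.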
Architecture string `json:"architecture"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` } ) // DiffusionHasConditioners returns true if the diffusion model has conditioners. func (ga GGUFArchitecture) DiffusionHasConditioners() bool { return len(ga.DiffusionConditioners) > 0 } // DiffusionHasAutoencoder returns true if the diffusion model has an autoencoder. func (ga GGUFArchitecture) DiffusionHasAutoencoder() bool { return ga.DiffusionAutoencoder != nil && ga.DiffusionAutoencoder.Architecture != "" } func (gacs GGUFArchitectureDiffusionConditioners) String() string { var sb strings.Builder for i, gac := range gacs { if i > 0 { sb.WriteString(", ") } sb.WriteString(gac.String()) } return sb.String() } func (gac GGUFArchitectureDiffusionConditioner) String() string { return gac.Architecture + " (" + gac.FileType.String() + ")" } func (gaa GGUFArchitectureDiffusionAutoencoder) String() string { return gaa.Architecture + " (" + gaa.FileType.String() + ")" } // Architecture returns the architecture metadata of the GGUF file. func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { if gf.TensorInfos.Match(re) { return gf.diffuserArchitecture() } } var ( generalTypeKey = "general.type" generalArchitectureKey = "general.architecture" controlVectorModelHintKey = "controlvector.model_hint" ) m, _ := gf.Header.MetadataKV.Index([]string{ generalTypeKey, generalArchitectureKey, controlVectorModelHintKey, }) typ, arch := "model", "llama" // nolint: goconst { if v, ok := m[generalTypeKey]; ok { typ = v.ValueString() } if v, ok := m[generalArchitectureKey]; ok { arch = v.ValueString() } } switch { case arch == "clip": return gf.clipArchitecture() case arch == "controlvector": arch = "llama" if v, ok := m[controlVectorModelHintKey]; ok { arch = v.ValueString() } return gf.adapterArchitecture(arch) case typ == "adapter": return gf.adapterArchitecture(arch) case typ == "imatrix": return gf.imatrixArchitecture(arch) } return gf.transformerArchitecture(arch) } func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { const ( // Diffusion sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x sdKey2 = "output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL sdXlKey2 = "output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner sdXlRefinerKey2 = "output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" sd3Key = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x sd3Key2 = "joint_blocks.23.x_block.attn.proj.weight" sdInPaintFeatureKey = "model.diffusion_model.input_blocks.0.0.weight" // SD in-paint feature sdInPaintFeatureKey2 = "input_blocks.0.0.weight" fluxKey = "model.diffusion_model.double_blocks.0.txt_attn.proj.weight" // FLUX.1 fluxKey2 = "double_blocks.0.txt_attn.proj.weight" fluxFillFeatureKey = "model.diffusion_model.img_in.weight" // FLUX.1 Fill feature fluxFillFeatureKey2 = "img_in.weight" // Conditioner openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14 openAiClipVitL14Key2 = "text_model.encoder.layers.11.self_attn.k_proj.weight" openClipVitH14Key = 
"cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14 openClipVitH14Key2 = "text_model.encoder.layers.22.self_attn.k_proj.weight" openClipVitG14Key = "cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14 openClipVitG14Key2 = "text_model.encoder.layers.31.self_attn.k_proj.weight" t5xxlKey = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl t5xxlKey2 = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" t5xxlKey3 = "encoder.block.23.layer.0.SelfAttention.k.weight" ) tis, _ := gf.TensorInfos.Index([]string{ sdKey, sdKey2, sdXlKey, sdXlKey2, sdXlRefinerKey, sdXlRefinerKey2, sd3Key, sd3Key2, sdInPaintFeatureKey, sdInPaintFeatureKey2, fluxKey, fluxKey2, fluxFillFeatureKey, fluxFillFeatureKey2, openAiClipVitL14Key, openAiClipVitL14Key2, openClipVitH14Key, openClipVitH14Key2, openClipVitG14Key, openClipVitG14Key2, t5xxlKey, t5xxlKey2, t5xxlKey3, }) ga.Type = "model" ga.Architecture = "diffusion" if ti, ok := tis[sdKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion 1.x" if ti.Dimensions[0] == 1024 { ga.DiffusionArchitecture = "Stable Diffusion 2.x" } if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sdKey2]; ok { ga.DiffusionArchitecture = "Stable Diffusion 1.x" if ti.Dimensions[0] == 1024 { ga.DiffusionArchitecture = "Stable Diffusion 2.x" } if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sdXlKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL" if _, ok = tis[sdXlRefinerKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" } if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sdXlKey2]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL" if _, ok = tis[sdXlRefinerKey2]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" } if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sd3Key]; ok { ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true } else if _, ok := tis[sd3Key2]; ok { ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true } if _, ok := tis[fluxKey]; ok { ga.DiffusionArchitecture = "FLUX.1" ga.DiffusionTransformer = true if ti, ok := tis[fluxFillFeatureKey]; ok && ti.Dimensions[0] == 384 { ga.DiffusionArchitecture += " Fill" } } else if _, ok := tis[fluxKey2]; ok { ga.DiffusionArchitecture = "FLUX.1" ga.DiffusionTransformer = true if ti, ok := tis[fluxFillFeatureKey2]; ok && ti.Dimensions[0] == 384 { ga.DiffusionArchitecture += " Fill" } } if ti, ok := tis[openAiClipVitL14Key]; ok { cond := GGUFArchitectureDiffusionConditioner{ Architecture: "OpenAI CLIP ViT-L/14", FileType: ti.GetFileType(), } if ti, ok = tis[openClipVitH14Key]; ok { cond = GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-H/14", FileType: ti.GetFileType(), } } ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) } else if ti, ok := tis[openAiClipVitL14Key2]; ok { cond := GGUFArchitectureDiffusionConditioner{ Architecture: "OpenAI CLIP ViT-L/14", FileType: ti.GetFileType(), } if ti, ok = tis[openClipVitH14Key2]; ok { cond = GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-H/14", 
FileType: ti.GetFileType(), } } ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) } if ti, ok := tis[openClipVitG14Key]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-G/14", FileType: ti.GetFileType(), }) } else if ti, ok = tis[openClipVitG14Key2]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-G/14", FileType: ti.GetFileType(), }) } if ti, ok := tis[t5xxlKey]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) } else if ti, ok = tis[t5xxlKey2]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) } else if ti, ok = tis[t5xxlKey3]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) } for _, re := range []*regexp.Regexp{ regexp.MustCompile(`^first_stage_model\..*`), regexp.MustCompile(`^decoder\.conv_in\..*`), } { if tis := gf.TensorInfos.Search(re); len(tis) != 0 { ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{ Architecture: ga.DiffusionArchitecture + " VAE", FileType: GGUFTensorInfos(tis).GetFileType(), } break } } return ga } func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { const ( projectorTypeKey = "clip.projector_type" hasLLaVAProjectorKey = "clip.has_llava_projector" hasMiniCPMVProjector = "clip.has_minicpmv_projector" miniCPMVVersionKey = "clip.minicpmv_version" miniCPMVQueryNumKey = "clip.minicpmv_query_num" hasGLMProjectorKey = "clip.has_glm_projector" hasQwen2VLMergerKey = "clip.has_qwen2vl_merger" hasVisionEncoderKey = "clip.has_vision_encoder" visionEmbeddingLengthKey = "clip.vision.embedding_length" visionBlockCountKey = "clip.vision.block_count" visionFeedForwardLengthKey = "clip.vision.feed_forward_length" visionAttentionHeadCountKey = "clip.vision.attention.head_count" visionAttentionLayerNormRMSEpsilonKey = "clip.vision.attention.layer_norm_epsilon" visionProjectionDimKey = "clip.vision.projection_dim" visionProjectorScaleFactorKey = "clip.vision.projector.scale_factor" visionImageSizeKey = "clip.vision.image_size" visionPatchSizeKey = "clip.vision.patch_size" visionMMPatchMergeTypeKey = "clip.vision.mm_patch_merge_type" visioSpatialMergeSizeKey = "clip.vision.spatial_merge_size" visionWindowAttentionPatternKey = "clip.vision.n_wa_pattern" hasAudioEncoderKey = "clip.has_audio_encoder" audioEmbeddingLengthKey = "clip.audio.embedding_length" audioBlockCountKey = "clip.audio.block_count" audioFeedForwardLengthKey = "clip.audio.feed_forward_length" audioAttentionHeadCountKey = "clip.audio.attention.head_count" audioAttentionLayerNormRMSEpsilonKey = "clip.audio.attention.layer_norm_epsilon" audioProjectionDimKey = "clip.audio.projection_dim" audioProjectorStackFactorKey = "clip.audio.projector.stack_factor" audioNumMelBinsKey = "clip.audio.num_mel_bins" ) ga.Type = "projector" ga.Architecture = "clip" m, _ := gf.Header.MetadataKV.Index([]string{ projectorTypeKey, hasLLaVAProjectorKey, hasMiniCPMVProjector, miniCPMVVersionKey, miniCPMVQueryNumKey, hasGLMProjectorKey, hasQwen2VLMergerKey, // Vision hasVisionEncoderKey, visionEmbeddingLengthKey, visionBlockCountKey, visionFeedForwardLengthKey, visionAttentionHeadCountKey, 
visionAttentionLayerNormRMSEpsilonKey, visionProjectionDimKey, visionProjectorScaleFactorKey, visionImageSizeKey, visionPatchSizeKey, visionMMPatchMergeTypeKey, visioSpatialMergeSizeKey, visionWindowAttentionPatternKey, // Audio hasAudioEncoderKey, audioEmbeddingLengthKey, audioBlockCountKey, audioFeedForwardLengthKey, audioAttentionHeadCountKey, audioAttentionLayerNormRMSEpsilonKey, audioProjectionDimKey, audioProjectorStackFactorKey, audioNumMelBinsKey, }) if v, ok := m[projectorTypeKey]; ok { ga.ClipProjectorType = v.ValueString() } else { ga.ClipProjectorType = "mlp" } if v, ok := m[hasLLaVAProjectorKey]; ok { ga.ClipHasLLaVAProjector = v.ValueBool() } if v, ok := m[hasMiniCPMVProjector]; ok { ga.ClipHasMiniCPMVProjector = v.ValueBool() } if v, ok := m[miniCPMVVersionKey]; ok { ga.ClipMiniCPMVVersion = ValueNumeric[int32](v) } if v, ok := m[miniCPMVQueryNumKey]; ok { ga.ClipMiniCPMVQueryNum = ValueNumeric[int32](v) } if v, ok := m[hasGLMProjectorKey]; ok { ga.ClipHasGLMProjector = v.ValueBool() } if v, ok := m[hasQwen2VLMergerKey]; ok { ga.ClipHasQwen2VLMerger = v.ValueBool() } // Vision if v, ok := m[hasVisionEncoderKey]; ok { ga.ClipHasVisionEncoder = v.ValueBool() } if v, ok := m[visionEmbeddingLengthKey]; ok { ga.ClipVisionEmbeddingLength = ValueNumeric[uint64](v) } if v, ok := m[visionBlockCountKey]; ok { ga.ClipVisionBlockCount = ValueNumeric[uint64](v) } if v, ok := m[visionFeedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.ClipVisionFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) ga.ClipVisionFeedForwardLength = make([]uint64, ga.ClipVisionBlockCount) for i := range ga.ClipVisionFeedForwardLength { ga.ClipVisionFeedForwardLength[i] = vx } } } if v, ok := m[visionAttentionHeadCountKey]; ok { ga.ClipVisionAttentionHeadCount = ValueNumeric[uint64](v) } if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { ga.ClipVisionAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } if v, ok := m[visionImageSizeKey]; ok { ga.ClipVisionImageSize = ValueNumeric[uint32](v) } if v, ok := m[visionProjectionDimKey]; ok { ga.ClipVisionProjectionDim = ValueNumeric[uint32](v) } ga.ClipVisionProjectorScaleFactor = 1 if ga.ClipProjectorType == "gemma3" { ga.ClipVisionProjectorScaleFactor = 4 } if v, ok := m[visionProjectorScaleFactorKey]; ok { ga.ClipVisionProjectorScaleFactor = ValueNumeric[uint32](v) } ga.ClipVisionPatchSize = 1 if v, ok := m[visionPatchSizeKey]; ok { ga.ClipVisionPatchSize = ValueNumeric[uint32](v) } ga.ClipVisionMMPatchMergeType = "flat" if v, ok := m[visionMMPatchMergeTypeKey]; ok { ga.ClipVisionMMPatchMergeType = v.ValueString() } if v, ok := m[visioSpatialMergeSizeKey]; ok { ga.ClipVisionSpatialMergeSize = ValueNumeric[uint32](v) } if v, ok := m[visionWindowAttentionPatternKey]; ok { ga.ClipVisionWindowAttentionPattern = ValueNumeric[uint32](v) } // Audio if v, ok := m[hasAudioEncoderKey]; ok { ga.ClipHasAudioEncoder = v.ValueBool() } if v, ok := m[audioEmbeddingLengthKey]; ok { ga.ClipAudioEmbeddingLength = ValueNumeric[uint64](v) } if v, ok := m[audioBlockCountKey]; ok { ga.ClipAudioBlockCount = ValueNumeric[uint64](v) } if v, ok := m[audioFeedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.ClipAudioFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) ga.ClipAudioFeedForwardLength = make([]uint64, ga.ClipAudioBlockCount) for i := range ga.ClipAudioFeedForwardLength { ga.ClipAudioFeedForwardLength[i] = vx } } } if v, ok := 
m[audioAttentionHeadCountKey]; ok { ga.ClipAudioAttentionHeadCount = ValueNumeric[uint64](v) } if v, ok := m[audioAttentionLayerNormRMSEpsilonKey]; ok { ga.ClipAudioAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } if v, ok := m[audioProjectionDimKey]; ok { ga.ClipAudioProjectionDim = ValueNumeric[uint32](v) } ga.ClipAudioProjectorStackFactor = 1 if v, ok := m[audioProjectorStackFactorKey]; ok { ga.ClipAudioProjectorStackFactor = ValueNumeric[uint32](v) } if v, ok := m[audioNumMelBinsKey]; ok { ga.ClipAudioNumMelBins = ValueNumeric[uint32](v) } ga.AttentionHeadCountKV = ga.AttentionHeadCount return ga } func (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitecture) { var ( typeKey = "adapter.type" loraAlphaKey = "adapter.lora.alpha" controlVectorLayerCountKey = "adapter.control_vector.layer_count" controlVectorLayerCountKey2 = "control_vector.layer_count" ) ga.Type = "adapter" ga.Architecture = arch m, _ := gf.Header.MetadataKV.Index([]string{ typeKey, loraAlphaKey, controlVectorLayerCountKey, controlVectorLayerCountKey2, }) if v, ok := m[typeKey]; ok { ga.AdapterType = v.ValueString() } if v, ok := m[loraAlphaKey]; ok { ga.AdapterLoRAAlpha = ValueNumeric[float32](v) } if v, ok := m[controlVectorLayerCountKey]; ok { ga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v) } else if v, ok := m[controlVectorLayerCountKey2]; ok { ga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v) } return ga } func (gf *GGUFFile) imatrixArchitecture(_ string) (ga GGUFArchitecture) { ga.Type = "imatrix" ga.Architecture = "imatrix" return ga } func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { var ( contextLengthKey = arch + ".context_length" embeddingLengthKey = arch + ".embedding_length" blockCountKey = arch + ".block_count" feedForwardLengthKey = arch + ".feed_forward_length" expertFeedForwardLengthKey = arch + ".expert_feed_forward_length" expertSharedFeedForwardLengthKey = arch + ".expert_shared_feed_forward_length" expertCountKey = arch + ".expert_count" expertUsedCountKey = arch + ".expert_used_count" expertSharedCountKey = arch + ".expert_shared_count" attentionHeadCountKey = arch + ".attention.head_count" attentionHeadCountKVKey = arch + ".attention.head_count_kv" attentionSlidingWindowKey = arch + ".attention.sliding_window" attentionMaxALiBIBiasKey = arch + ".attention.max_alibi_bias" attentionMaxALiBIBiasKey2 = arch + ".attention.alibi_bias_max" attentionClampKQVKey = arch + ".attention.clamp_kqv" attentionClampKQVKey2 = arch + ".attention.clip_kqv" attentionLayerNormEpsilonKey = arch + ".attention.layer_norm_epsilon" attentionLayerNormRMSEpsilonKey = arch + ".attention.layer_norm_rms_epsilon" attentionQueryLORARankKey = arch + ".attention.q_lora_rank" attentionKeyValueLORARankKey = arch + ".attention.kv_lora_rank" attentionKeyLengthKey = arch + ".attention.key_length" attentionKeyLengthMLAKey = arch + ".attention.key_length_mla" attentionValueLengthKey = arch + ".attention.value_length" attentionValueLengthMLAKey = arch + ".attention.value_length_mla" attentionCausalKey = arch + ".attention.causal" ropeDimensionCountKey = arch + ".rope.dimension_count" ropeFrequencyBaseKey = arch + ".rope.freq_base" ropeFrequencyScaleKey = arch + ".rope.freq_scale" ropeScaleLinearKey = arch + ".rope.scale_linear" ropeScalingTypeKey = arch + ".rope.scaling.type" ropeScalingFactorKey = arch + ".rope.scaling.factor" ropeScalingOriginalContextKey = arch + ".rope.scaling.original_context_length" // uint32 maybe ropeScalingFinetunedKey = arch + 
".rope.scaling.finetuned" poolingTypeKey = arch + ".pooling_type" ssmConvolutionKernelKey = arch + ".ssm.conv_kernel" ssmInnerSizeKey = arch + ".ssm.inner_size" ssmStateSizeKey = arch + ".ssm.state_size" ssmTimeStepRankKey = arch + ".ssm.time_step_rank" ssmGroupCountKey = arch + ".ssm.group_count" rwkvHeadSizeKey = arch + ".wkv.head_size" rwkvRescaleEveryNLayersKey = arch + ".rescale_every_n_layers" rwkvTimeMixExtraDimensionKey = arch + ".time_mix_extra_dim" rwkvTimeDecayExtraDimensionKey = arch + ".time_decay_extra_dim" rwkvTokenShiftCountKey = arch + ".token_shift_count" vocabularyLengthKey = arch + ".vocab_size" tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" ) ga.Type = "model" ga.Architecture = arch m, _ := gf.Header.MetadataKV.Index([]string{ contextLengthKey, embeddingLengthKey, blockCountKey, feedForwardLengthKey, expertFeedForwardLengthKey, expertSharedFeedForwardLengthKey, expertCountKey, expertUsedCountKey, expertSharedCountKey, attentionHeadCountKey, attentionHeadCountKVKey, attentionSlidingWindowKey, attentionMaxALiBIBiasKey, attentionMaxALiBIBiasKey2, attentionClampKQVKey, attentionClampKQVKey2, attentionLayerNormEpsilonKey, attentionLayerNormRMSEpsilonKey, attentionQueryLORARankKey, attentionKeyValueLORARankKey, attentionKeyLengthKey, attentionKeyLengthMLAKey, attentionValueLengthKey, attentionValueLengthMLAKey, attentionCausalKey, ropeDimensionCountKey, ropeFrequencyBaseKey, ropeFrequencyScaleKey, ropeScaleLinearKey, ropeScalingTypeKey, ropeScalingFactorKey, ropeScalingOriginalContextKey, ropeScalingFinetunedKey, poolingTypeKey, ssmConvolutionKernelKey, ssmInnerSizeKey, ssmStateSizeKey, ssmTimeStepRankKey, ssmGroupCountKey, rwkvHeadSizeKey, rwkvRescaleEveryNLayersKey, rwkvTimeMixExtraDimensionKey, rwkvTimeDecayExtraDimensionKey, rwkvTokenShiftCountKey, vocabularyLengthKey, tokenizerGGMLTokensKey, }) if v, ok := m[contextLengthKey]; ok { ga.MaximumContextLength = ValueNumeric[uint64](v) } if v, ok := m[embeddingLengthKey]; ok { ga.EmbeddingLength = ValueNumeric[uint64](v) } if v, ok := m[blockCountKey]; ok { ga.BlockCount = ValueNumeric[uint64](v) } if v, ok := m[feedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) ga.FeedForwardLength = make([]uint64, ga.BlockCount) for i := range ga.FeedForwardLength { ga.FeedForwardLength[i] = vx } } } if v, ok := m[expertCountKey]; ok { ga.ExpertCount = ValueNumeric[uint32](v) } if v, ok := m[expertUsedCountKey]; ok { ga.ExpertUsedCount = ValueNumeric[uint32](v) } if v, ok := m[expertSharedCountKey]; ok { ga.ExpertSharedCount = ValueNumeric[uint32](v) } if v, ok := m[expertFeedForwardLengthKey]; ok { ga.ExpertFeedForwardLength = ValueNumeric[uint64](v) } if v, ok := m[expertSharedFeedForwardLengthKey]; ok { ga.ExpertSharedFeedForwardLength = ValueNumeric[uint64](v) } if v, ok := m[attentionHeadCountKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.AttentionHeadCount = ValuesNumeric[uint64](v.ValueArray())[0] } else { ga.AttentionHeadCount = ValueNumeric[uint64](v) } } if v, ok := m[attentionHeadCountKVKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.AttentionHeadCountKV = ValuesNumeric[uint64](v.ValueArray())[0] } else { ga.AttentionHeadCountKV = ValueNumeric[uint64](v) } } else { ga.AttentionHeadCountKV = ga.AttentionHeadCount } ga.AttentionSlidingWindowPattern = 1 if v, ok := m[attentionSlidingWindowKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.AttentionSlidingWindow = 
ValuesNumeric[uint64](v.ValueArray())[0] } else { ga.AttentionSlidingWindow = ValueNumeric[uint64](v) } } switch arch { case "llama4": if ga.AttentionSlidingWindow == 0 { ga.AttentionSlidingWindow = 8192 } ga.AttentionSlidingWindowPattern = 4 case "phi3": // See https://github.com/ggml-org/llama.cpp/pull/13676 ga.AttentionSlidingWindow = 0 case "gemma2": if ga.AttentionSlidingWindow == 0 { ga.AttentionSlidingWindow = 4096 } ga.AttentionSlidingWindowPattern = 2 case "gemma3": ga.AttentionSlidingWindowPattern = 6 case "cohere2": ga.AttentionSlidingWindowPattern = 4 } if v, ok := m[attentionMaxALiBIBiasKey]; ok { ga.AttentionMaxALiBIBias = ValueNumeric[float32](v) } else if v, ok := m[attentionMaxALiBIBiasKey2]; ok { ga.AttentionMaxALiBIBias = ValueNumeric[float32](v) } if v, ok := m[attentionClampKQVKey]; ok { ga.AttentionClampKQV = ValueNumeric[float32](v) } else if v, ok := m[attentionClampKQVKey2]; ok { ga.AttentionClampKQV = ValueNumeric[float32](v) } if v, ok := m[attentionLayerNormEpsilonKey]; ok { ga.AttentionLayerNormEpsilon = ValueNumeric[float32](v) } if v, ok := m[attentionLayerNormRMSEpsilonKey]; ok { ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } if v, ok := m[attentionQueryLORARankKey]; ok { ga.AttentionQueryLORARank = ValueNumeric[uint32](v) } if v, ok := m[attentionKeyValueLORARankKey]; ok { ga.AttentionKeyValueLORARank = ValueNumeric[uint32](v) } if v, ok := m[attentionKeyLengthKey]; ok { ga.AttentionKeyLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionKeyLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } if v, ok := m[attentionKeyLengthMLAKey]; ok { ga.AttentionKeyLengthMLA = ValueNumeric[uint32](v) } if v, ok := m[attentionValueLengthKey]; ok { ga.AttentionValueLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionValueLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } if v, ok := m[attentionValueLengthMLAKey]; ok { ga.AttentionValueLengthMLA = ValueNumeric[uint32](v) } if v, ok := m[attentionCausalKey]; ok { ga.AttentionCausal = v.ValueBool() } else { ga.AttentionCausal = true } // See https://github.com/ggml-org/llama.cpp/blob/6491d6e4f1caf0ad2221865b4249ae6938a6308c/src/llama-arch.cpp#L1913-L1924. ga.AttentionRecurrent = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. "mamba", "mamba2", "rwkv6", "rwkv6qwen2", "rwkv7", "arwkv7", }, ga.Architecture) // See https://github.com/ggml-org/llama.cpp/blob/a57d1bcb3c0165ac87b1f0dbb429839b0da69689/src/llama-arch.cpp#L2029-L2038. ga.AttentionHybrid = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. 
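		// Hybrid architectures keep both a causal KV cache and a recurrent state;
		// the llama.cpp estimator (file_estimate__llamacpp.go) accounts for this
		// by falling through from the recurrent KV-cache case to the causal one.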
"jamba", "falcon-h1", "granitehybrid", }, ga.Architecture) ga.AttentionRecurrent = ga.AttentionHybrid || ga.AttentionRecurrent if v, ok := m[ropeDimensionCountKey]; ok { ga.RoPEDimensionCount = ValueNumeric[uint64](v) } ga.RoPEFrequencyBase = 10000.0 if v, ok := m[ropeFrequencyBaseKey]; ok { ga.RoPEFrequencyBase = ValueNumeric[float32](v) } ga.RoPEFrequencyScale = 1.0 if v, ok := m[ropeFrequencyScaleKey]; ok { ga.RoPEFrequencyScale = ValueNumeric[float32](v) } if v, ok := m[ropeScalingTypeKey]; ok { ga.RoPEScalingType = v.ValueString() } if v, ok := m[ropeScaleLinearKey]; ok { ga.RoPEScalingType = "linear" ga.RoPEScalingFactor = ValueNumeric[float32](v) if ga.RoPEScalingFactor != 0 { ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor } } if v, ok := m[ropeScalingFactorKey]; ok { ga.RoPEScalingFactor = ValueNumeric[float32](v) if ga.RoPEScalingFactor != 0 { ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor } } if v, ok := m[ropeScalingOriginalContextKey]; ok { ga.RoPEScalingOriginalContextLength = ValueNumeric[uint64](v) } if v, ok := m[ropeScalingFinetunedKey]; ok { ga.RoPEScalingFinetuned = v.ValueBool() } if v, ok := m[poolingTypeKey]; ok { ga.PoolingType = v.ValueUint32() if ga.AttentionCausal && ga.PoolingType > 2 { ga.AttentionCausal = false } } if v, ok := m[ssmConvolutionKernelKey]; ok { ga.SSMConvolutionKernel = ValueNumeric[uint32](v) } if v, ok := m[ssmInnerSizeKey]; ok { ga.SSMInnerSize = ValueNumeric[uint32](v) } if v, ok := m[ssmStateSizeKey]; ok { ga.SSMStateSize = ValueNumeric[uint32](v) } if v, ok := m[ssmTimeStepRankKey]; ok { ga.SSMTimeStepRank = ValueNumeric[uint32](v) } if v, ok := m[ssmGroupCountKey]; ok { ga.SSMGroupCount = ValueNumeric[uint32](v) } if v, ok := m[rwkvHeadSizeKey]; ok { ga.RWKVHeadSize = ValueNumeric[uint32](v) } if v, ok := m[rwkvRescaleEveryNLayersKey]; ok { ga.RWKVRescaleEveryNLayers = ValueNumeric[uint32](v) } if v, ok := m[rwkvTimeMixExtraDimensionKey]; ok { ga.RWKVTimeMixExtraDimension = ValueNumeric[uint32](v) } if v, ok := m[rwkvTimeDecayExtraDimensionKey]; ok { ga.RWKVTimeDecayExtraDimension = ValueNumeric[uint32](v) } if v, ok := m[rwkvTokenShiftCountKey]; ok { ga.RWKVTokenShiftCount = ValueNumeric[uint32](v) } else if ga.AttentionRecurrent { ga.RWKVTokenShiftCount = 2 } if v, ok := m[vocabularyLengthKey]; ok { ga.VocabularyLength = ValueNumeric[uint64](v) } else if v, ok := m[tokenizerGGMLTokensKey]; ok { ga.VocabularyLength = v.ValueArray().Len } return ga } ================================================ FILE: file_architecture_test.go ================================================ package gguf_parser import ( "context" "os" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_Architecture(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f.Architecture()), "\n") } func BenchmarkGGUFFile_Architecture(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } f, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { b.Fatal(err) return } b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = f.Architecture() } } ================================================ FILE: file_estimate__llamacpp.go ================================================ package gguf_parser import ( "math" "regexp" "slices" "strings" 
"github.com/gpustack/gguf-parser-go/util/anyx" "github.com/gpustack/gguf-parser-go/util/ptr" "github.com/gpustack/gguf-parser-go/util/slicex" ) // Types for LLaMACpp estimation. type ( // LLaMACppRunEstimate represents the estimated result of loading the GGUF file in llama.cpp. LLaMACppRunEstimate struct { // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether support the mmap, // true for support. NoMMap bool `json:"noMMap"` // EmbeddingOnly is the flag to indicate whether the model is used for embedding only, // true for embedding only. EmbeddingOnly bool `json:"embeddingOnly"` // Reranking is the flag to indicate whether the model is used for reranking, // true for reranking. // // Only available when EmbeddingOnly is true. Reranking bool `json:"reranking"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // LogicalBatchSize is the logical batch size. LogicalBatchSize int32 `json:"logicalBatchSize"` // PhysicalBatchSize is the physical batch size. PhysicalBatchSize int32 `json:"physicalBatchSize"` // Devices represents the usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. Devices []LLaMACppRunDeviceUsage `json:"devices"` // Drafter is the estimated result of drafter. Drafter *LLaMACppRunEstimate `json:"drafter,omitempty"` // Projector is the estimated result of multimodal projector. Projector *LLaMACppRunEstimate `json:"projector,omitempty"` // Adapters is the estimated result of adapters. Adapters []LLaMACppRunEstimate `json:"adapters,omitempty"` // MaximumTokensPerSecond represents the maximum tokens per second for running the GGUF file. MaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:"maximumTokensPerSecond,omitempty"` } // LLaMACppRunDeviceUsage represents the usage for running the GGUF file in llama.cpp. LLaMACppRunDeviceUsage struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` // HandleSWALayers is the number of layers that the device can handle in sliding window attention (SWA), // the non SWA layers is `HandleLayers - HandleSWALayers`. HandleSWALayers uint64 `json:"handleSWALayers"` // HandleLastLayer is the index of the last layer the device can handle, // -1 means the device does not handle the last layer. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. 
HandleOutputLayer bool `json:"handleOutputLayer"` // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // Endpoint is the endpoint of the remote device, empty for local devices. Endpoint string `json:"endpoint,omitempty"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes. Parameter LLaMACppParameterUsage `json:"parameter"` // Weight is the memory usage of weights that the device loads. Weight LLaMACppWeightMemoryUsage `json:"weight"` // KVCache is the memory usage of kv that the device caches. KVCache LLaMACppKVCacheMemoryUsage `json:"kvCache"` // Computation is the memory usage of computation that the device processes. Computation LLaMACppComputationMemoryUsage `json:"computation"` } // LLaMACppParameterUsage represents the parameter usage for running the GGUF file in llama.cpp. LLaMACppParameterUsage struct { // KVCache is the parameter usage for caching previous KV. KVCache GGUFParametersScalar `json:"kvCache"` // Input is the parameter usage for input tensors. Input GGUFParametersScalar `json:"input"` // Compute is the parameter usage for compute tensors. Compute GGUFParametersScalar `json:"compute"` // ComputeOverridden is the parameter usage for overridden compute tensors. ComputeOverridden GGUFParametersScalar `json:"computeOverridden"` // Output is the parameter usage for output tensors. Output GGUFParametersScalar `json:"output"` } // LLaMACppWeightMemoryUsage represents the memory usage of loading weights in llama.cpp. LLaMACppWeightMemoryUsage struct { // Input is the memory usage for loading input tensors. Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for loading compute tensors. Compute GGUFBytesScalar `json:"compute"` // ComputeOverridden is the memory usage for loading overridden compute tensors. ComputeOverridden GGUFBytesScalar `json:"computeOverridden"` // Output is the memory usage for loading output tensors. Output GGUFBytesScalar `json:"output"` } // LLaMACppKVCacheMemoryUsage represents the memory usage of caching previous KV in llama.cpp. LLaMACppKVCacheMemoryUsage struct { // Key is the memory usage for caching previous keys. Key GGUFBytesScalar `json:"key"` // Value is the memory usage for caching previous values. Value GGUFBytesScalar `json:"value"` } // LLaMACppComputationMemoryUsage represents the memory usage of computation in llama.cpp. LLaMACppComputationMemoryUsage struct { // Footprint is the memory footprint for computation. Footprint GGUFBytesScalar `json:"footprint"` // Input is the memory usage for input. Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for computation. Compute GGUFBytesScalar `json:"graph"` // Output is the memory usage for output. Output GGUFBytesScalar `json:"output"` } ) // EstimateLLaMACppRun estimates the usages of the GGUF file in llama.cpp. 
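// A minimal usage sketch (illustrative only; the option constructors live in
// file_estimate_option.go and are elided here rather than guessed):
//
//	gf, _ := ParseGGUFFile("/path/to/model.gguf")
//	e := gf.EstimateLLaMACppRun( /* e.g. context-size or offload-layers options */ )
//	for i, d := range e.Devices {
//		fmt.Printf("device %d: weight=%v kv=%v\n", i, d.Weight, d.KVCache)
//	}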
func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate) { // Options var o _GGUFRunEstimateOptions for _, opt := range opts { opt(&o) } switch { case o.TensorSplitFraction == nil: o.TensorSplitFraction = []float64{1} o.MainGPUIndex = 0 case o.MainGPUIndex < 0 || o.MainGPUIndex >= len(o.TensorSplitFraction): panic("main device index must be range of 0 to the length of tensor split fraction") } if len(o.DeviceMetrics) > 0 { for i, j := 0, len(o.DeviceMetrics)-1; i < len(o.TensorSplitFraction)-j; i++ { o.DeviceMetrics = append(o.DeviceMetrics, o.DeviceMetrics[j]) } o.DeviceMetrics = o.DeviceMetrics[:len(o.TensorSplitFraction)+1] } if o.LMCCacheKeyType == nil { o.LMCCacheKeyType = ptr.To(GGMLTypeF16) } if o.LMCCacheValueType == nil { o.LMCCacheValueType = ptr.To(GGMLTypeF16) } if o.LMCOffloadKVCache == nil { o.LMCOffloadKVCache = ptr.To(true) } if o.LMCLogicalBatchSize == nil { o.LMCLogicalBatchSize = ptr.To(int32(2048)) } else { // See https://github.com/ggerganov/llama.cpp/blob/0bf16de07b0692e7df26b9a633e232bbd66e0360/src/llama.cpp#L16519-L16525. o.LMCLogicalBatchSize = ptr.To(max(32, *o.LMCLogicalBatchSize)) } if o.LMCPhysicalBatchSize == nil { o.LMCPhysicalBatchSize = ptr.To(int32(512)) } if *o.LMCPhysicalBatchSize > *o.LMCLogicalBatchSize { panic("physical batch size must be less than or equal to logical batch size") } if o.LMCSplitMode >= _LLAMACppSplitModeMax { panic("split mode must be less than max") } // Devices. e.Devices = make([]LLaMACppRunDeviceUsage, len(o.TensorSplitFraction)+1) for i := range e.Devices { e.Devices[i].HandleLastLayer = -1 } for j := range e.Devices[1:] { e.Devices[j+1].Remote = j < len(o.RPCServers) if e.Devices[j+1].Remote { e.Devices[j+1].Position = j e.Devices[j+1].Endpoint = o.RPCServers[j] } else { e.Devices[j+1].Position = j - len(o.RPCServers) } } // Metadata. a := gf.Architecture() e.Type = a.Type e.Architecture = a.Architecture e.ClipProjectorType = a.ClipProjectorType e.AdapterType = a.AdapterType switch a.Type { case "model": t := gf.Tokenizer() gf.estimateLLaMACppRunInModel(&o, &a, &t, &e) case "projector": // For projector model, // see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008. if ptr.Deref(o.LMCOffloadLayers, math.MaxUint64) != 0 { // Full offload. o.LMCOffloadLayers = ptr.To[uint64](math.MaxUint64) } else { // Zero offload. o.LMCOffloadLayers = ptr.To[uint64](0) } gf.estimateLLaMACppRunInProjector(&o, &a, &e) case "adapter": gf.estimateLLaMACppRunInAdapter(&o, &a, &e) case "imatrix": gf.estimateLLaMACppRunInIMatrix(&o, &a, &e) } return e } // estimateLLaMACppRunInModel estimates the usages of the GGUF file for model, // including the usages of footprint, weight, KV cache, and computation. func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, t *GGUFTokenizer, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "position_*", "token_*", "cls.*", "output.*", "output_*", "rope_factors_*", }) ipLs, opLs, _ := ioLs.Cut([]string{ "position_*", "token_*", }) if a.BlockCount == 0 { a.BlockCount = uint64(len(tfLs)) } // Using sliding window attention. 
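	// (A pattern of 1 means every layer is full attention, so no SWA-specific
	// cache sizing applies; LMCFullSizeSWACache likewise forces full-size caches
	// even when SWA layers exist.)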
usingSWA := a.AttentionSlidingWindowPattern != 1 && !o.LMCFullSizeSWACache // Full offload: nLoadLayers == 0 && isOffloadOutputLayer // Zero offload: nOffloadLayers == 0 // Partial offload: !Full offload && !Zero offload var ( nOffloadLayers uint64 nActualOffloadLayers uint64 nLoadLayers = a.BlockCount idxOutputDevice int fullOffload, zeroOffload bool nSWALoadLayers, nSWAOffloadLayers uint64 ) { var isOffloadOutputLayer bool switch v := o.LMCOffloadLayers; { case v == nil: o.LMCOffloadLayers = ptr.To(a.BlockCount) nOffloadLayers = a.BlockCount isOffloadOutputLayer = true case *v != 0: nOffloadLayers = *v if nOffloadLayers > a.BlockCount { isOffloadOutputLayer = true nOffloadLayers = a.BlockCount } } nActualOffloadLayers = nOffloadLayers if isOffloadOutputLayer { nActualOffloadLayers += 1 } nLoadLayers -= nOffloadLayers fullOffload = nLoadLayers == 0 && isOffloadOutputLayer zeroOffload = nOffloadLayers == 0 e.FullOffloaded = fullOffload e.OffloadLayers = nOffloadLayers for i, j, offloadStart := uint64(0), 0, a.BlockCount-nOffloadLayers; i < a.BlockCount; i++ { switch { case i < nLoadLayers: e.Devices[0].HandleLayers += 1 e.Devices[0].HandleLastLayer = int(i) if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { e.Devices[0].HandleSWALayers += 1 nSWALoadLayers += 1 } case i >= offloadStart: x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) e.Devices[j+1].HandleLayers += 1 e.Devices[j+1].HandleLastLayer = int(i) if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { e.Devices[j+1].HandleSWALayers += 1 nSWAOffloadLayers += 1 } if fullOffload && i == a.BlockCount-1 { idxOutputDevice = j + 1 } } } e.Devices[idxOutputDevice].HandleOutputLayer = true } // Flash attention. { // Grok is not compatible with flash attention, // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9566-L9569. if a.Architecture == "grok" { o.FlashAttention = false } // Fallback to FP16 if the value type is quantized when disabling flash attention, // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9576-L9579. if o.LMCCacheValueType.IsQuantized() && !o.FlashAttention { o.LMCCacheValueType = ptr.To(GGMLTypeF16) } e.FlashAttention = o.FlashAttention } // Embedding. if !a.AttentionCausal { ropeFrequencyBase := ptr.Deref(o.LMCRoPEFrequencyBase, a.RoPEFrequencyBase) ropeFrequencyScale := ptr.Deref(o.LMCRoPEFrequencyScale, a.RoPEFrequencyScale) ropeScalingType := ptr.Deref(o.LMCRoPEScalingType, a.RoPEScalingType) ropeScalingOriginalContextSize := ptr.Deref(o.LMCRoPEScalingOriginalContextSize, int32(a.RoPEScalingOriginalContextLength)) isRoPECustomized := ropeFrequencyBase != a.RoPEFrequencyBase || ropeFrequencyScale != a.RoPEFrequencyScale || ropeScalingType != a.RoPEScalingType || (ropeScalingType == "yarn" && ropeScalingOriginalContextSize != int32(a.RoPEScalingOriginalContextLength)) e.EmbeddingOnly = true o.LMCContextSize = ptr.To(ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength))) // Set context size/physical batch size/logical batch size to the training context size. if !isRoPECustomized { o.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), *o.LMCContextSize)) } o.LMCLogicalBatchSize = o.LMCContextSize o.LMCPhysicalBatchSize = o.LMCLogicalBatchSize // Reranking. 
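		// (A model is treated as a reranker when it ships rank-head tensors,
		// i.e. cls.bias/cls.weight, or declares pooling type 4, Rank.)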
if _, found := gf.TensorInfos.Index([]string{"cls.bias", "cls.weight"}); found > 0 { e.Reranking = true } if !e.Reranking && a.PoolingType == 4 { // 0: None, 1: Mean, 2: Cls, 3: Last, 4: Rank e.Reranking = true } } // Distributable, // fix by https://github.com/ggerganov/llama.cpp/pull/11047. e.Distributable = true // Batch size. e.LogicalBatchSize = *o.LMCLogicalBatchSize e.PhysicalBatchSize = *o.LMCPhysicalBatchSize // Padding alignment. paddingAlign := uint64(32) if o.FlashAttention { paddingAlign = 256 } // Init hyperparameters, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6957-L7000. var ( nContext uint64 nTokens uint64 nBatch uint64 nOutputs uint64 nSeq uint64 nKV uint64 ) { nContext = a.MaximumContextLength if o.LMCContextSize != nil { nContext = uint64(*o.LMCContextSize) } if o.LMCInMaxContextSize { nContext = min(nContext, a.MaximumContextLength) } // Padding context size, // see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002. nContext = GGMLPadding(nContext, paddingAlign) // Correct token size, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224. nTokens = min(nContext, uint64(*o.LMCPhysicalBatchSize)) nBatch = nTokens nOutputs = nTokens nSeq = uint64(ptr.Deref(o.ParallelSize, 1)) nKV = nContext e.ContextSize = nContext } // Footprint. { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ // Tokens, // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6380-L6384. fp := t.TokensLength * (4 /* token type */ + 4 /* token score*/) if t.Model == "gpt2" { fp += t.MergesLength * (48 /* key type */ + 56 /* value type */) } fp += t.TokensLength * (32 /* id to token vector */ + (24 + 32) /* token to id map*/) e.Devices[0].Footprint += GGUFBytesScalar(fp) // Output buffer, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. ob := a.EmbeddingLength * nOutputs * 4 /* float32 size */ if a.AttentionCausal { ob += a.VocabularyLength * nOutputs * 4 /* float32 size */ } if fullOffload { e.Devices[idxOutputDevice].Footprint += GGUFBytesScalar(ob) } else { e.Devices[0].Footprint += GGUFBytesScalar(ob) } } // Weight & Parameter. { filter := func(idx int) GGUFTensorInfoFilter { if len(o.OverriddenTensors) == 0 { return nil } return func(name string) bool { for _, ot := range o.OverriddenTensors { bt, bi := ot.ParseBufferType() switch { case bt == GGUFRunOverriddenTensorBufferTypeUnknown: continue case bt == GGUFRunOverriddenTensorBufferTypeCPU && idx == 0: continue case bt == GGUFRunOverriddenTensorBufferTypeGPU && (e.Devices[idx].Remote || anyx.Number[int](bi)+1 != idx): continue case bt == GGUFRunOverriddenTensorBufferTypeRPC && (!e.Devices[idx].Remote || e.Devices[idx].Endpoint != bi): continue } if ot.PatternRegex.MatchString(name) { return false } } return true } } // If overridden tensors are provided, // we need to search the tensors of the overridden pattern, // and place them in the correct device. 
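		// For example (hypothetical pattern), an entry equivalent to llama.cpp's
		// --override-tensor "blk\..*\.ffn_.*_exps\.weight=CPU" pins MoE expert
		// weights to host memory, so their bytes and elements are charged to
		// Devices[0].Weight.ComputeOverridden instead of a GPU device.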
if len(o.OverriddenTensors) != 0 { for _, ot := range o.OverriddenTensors { bt, bi := ot.ParseBufferType() if bt == GGUFRunOverriddenTensorBufferTypeUnknown { continue } var sls GGUFTensorInfos = ls.Search(ot.PatternRegex) if len(sls) == 0 { continue } switch bt { case GGUFRunOverriddenTensorBufferTypeCPU: e.Devices[0].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) e.Devices[0].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) case GGUFRunOverriddenTensorBufferTypeGPU: idx := anyx.Number[int](bi) + 1 e.Devices[idx].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) e.Devices[idx].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) default: for i, d := range e.Devices[1:] { if d.Endpoint == bi { e.Devices[i+1].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) e.Devices[i+1].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) break } } } } } // Compute. for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { idx := 0 if i >= offloadStart { x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) idx = j + 1 } f := filter(idx) e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes(f)) e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements(f)) } // IO, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. e.Devices[0].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) e.Devices[0].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) var ( wg GGUFBytesScalar ps GGUFParametersScalar ) if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } e.Devices[0].Weight.Output = wg if fullOffload { e.Devices[idxOutputDevice].Weight.Output = wg e.Devices[idxOutputDevice].Parameter.Output = ps } else { e.Devices[0].Parameter.Output = ps } } // KV cache. if a.AttentionCausal { switch { // Recurrent, // see https://github.com/ggml-org/llama.cpp/blob/704bb7a71c01dc07c1478b85f6322bf5dfde1eaf/src/llama-hparams.cpp#L68-L88. 
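// For recurrent models the "KV cache" is per-sequence state rather than per-token history: r bytes of convolution/token-shift state and s bytes of SSM/head state per layer, each scaled by the sequence count nSeq instead of the context length.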
case a.AttentionRecurrent: var r, s uint64 if a.RWKVHeadSize > 0 { r = uint64(a.RWKVTokenShiftCount) * a.EmbeddingLength s = uint64(a.RWKVHeadSize) * a.EmbeddingLength } else { r = uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) s = uint64(a.SSMStateSize * a.SSMInnerSize) } rps, sps := r*nSeq, s*nSeq rrs, srs := GGMLTypeF32.RowSizeOf([]uint64{rps}), GGMLTypeF32.RowSizeOf([]uint64{sps}) e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nLoadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nLoadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nLoadLayers) if !*o.LMCOffloadKVCache { e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nOffloadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nOffloadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nOffloadLayers) } else if !zeroOffload { for i, d := range e.Devices[1:] { e.Devices[i+1].KVCache.Key += GGUFBytesScalar(rrs * d.HandleLayers) e.Devices[i+1].KVCache.Value += GGUFBytesScalar(srs * d.HandleLayers) e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * d.HandleLayers) } } if !a.AttentionHybrid { break } fallthrough // Causal, // see https://github.com/ggml-org/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. default: akl, avl := uint64(a.AttentionKeyLength), uint64(a.AttentionValueLength) if a.AttentionKeyLengthMLA > 0 && a.AttentionValueLengthMLA > 0 { akl, avl = uint64(a.AttentionKeyLengthMLA), uint64(a.AttentionValueLengthMLA) } kGQA := akl * a.AttentionHeadCountKV vGQA := avl * a.AttentionHeadCountKV kps, vps := kGQA*nKV, vGQA*nKV krs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps}) if !usingSWA { e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nLoadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nLoadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nLoadLayers) if !*o.LMCOffloadKVCache { e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers) } else if !zeroOffload { for i, d := range e.Devices[1:] { e.Devices[i+1].KVCache.Key += GGUFBytesScalar(krs * d.HandleLayers) e.Devices[i+1].KVCache.Value += GGUFBytesScalar(vrs * d.HandleLayers) e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((kps + vps) * d.HandleLayers) } } } else { // Sliding window attention size, // see https://github.com/ggml-org/llama.cpp/blob/3079e9ac8e04ef6eddeb0c164d72edb6b6fd2df5/src/llama-kv-cache.cpp#L1640-L1642. 
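// SWA layers retain only a window of the cache: roughly sliding-window*nSeq plus one logical batch, padded to the cache alignment and capped at nKV, which is what the swas term below computes.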
swas := min(nKV, GGMLPadding(a.AttentionSlidingWindow*nSeq+uint64(*o.LMCLogicalBatchSize), paddingAlign)) swaKps, swaVps := kGQA*swas, vGQA*swas swaKrs, swaVrs := o.LMCCacheKeyType.RowSizeOf([]uint64{swaKps}), o.LMCCacheValueType.RowSizeOf([]uint64{swaVps}) nNonSWALoadLayers, nNonSWAOffloadLayers := nLoadLayers-nSWALoadLayers, nOffloadLayers-nSWAOffloadLayers e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWALoadLayers + krs*nNonSWALoadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWALoadLayers + vrs*nNonSWALoadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWALoadLayers + (kps+vps)*nNonSWALoadLayers) if !*o.LMCOffloadKVCache { e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWAOffloadLayers + krs*nNonSWAOffloadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWAOffloadLayers + vrs*nNonSWAOffloadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWAOffloadLayers + (kps+vps)*nNonSWAOffloadLayers) } else if !zeroOffload { for i, d := range e.Devices[1:] { e.Devices[i+1].KVCache.Key += GGUFBytesScalar(swaKrs*d.HandleSWALayers + krs*(d.HandleLayers-d.HandleSWALayers)) e.Devices[i+1].KVCache.Value += GGUFBytesScalar(swaVrs*d.HandleSWALayers + vrs*(d.HandleLayers-d.HandleSWALayers)) e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*d.HandleSWALayers + (kps+vps)*(d.HandleLayers-d.HandleSWALayers)) } } } } } // Computation. { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243. maxNodes := max(1024, uint64(8*len(gf.TensorInfos))) // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint = GGUFBytesScalar(cm) // Scheduler overhead, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) // Tensor usage, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. // // First, get the usage of input layer, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2279-L2290. 
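// Every graph input below is sized via GGMLType.RowSizeOf over its logical shape, e.g. GGMLTypeI32.RowSizeOf([]uint64{nBatch}) for the token ids; recurrent models swap the position/KQ-mask inputs for state-mask and state-sequence inputs.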
var ( inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] inpPos = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nOutputs}) // I32 [n_outputs], inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nKV, nBatch}) // F32 [n_kv, n_batch] inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nSeq}) // F32 [1, n_seq] inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nSeq, nBatch}) // I32 [n_seq, n_batch] ) if a.AttentionRecurrent { e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + 2*inpSMask + inpSSeq + inpOutIds) } else { e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) } { var v GGUFBytesScalar if a.AttentionRecurrent { v = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq) } else { v = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask) } if len(o.RPCServers) == 0 && len(o.TensorSplitFraction) > 1 { if a.ExpertCount > 0 { v *= 2 } else { v *= 4 } } for i := range e.Devices[1:] { e.Devices[i+1].Computation.Input += v } } // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. if a.AttentionRecurrent && !a.AttentionHybrid { if a.RWKVHeadSize > 0 { attnInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|attn_norm_2)\.weight`)) { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) attnInc += rs } ffnInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.time_mix_(lerp_x|receptance|decay_w2|key|value|gate|w2|output)\.weight`)) { // nolint: lll switch { case strings.HasSuffix(l.Name, ".time_mix_w2.weight"): rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, 1, nTokens, l.Dimensions[l.NDimensions-1]}) ffnInc += rs case strings.HasSuffix(l.Name, ".time_mix_output.weight"): rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch + uint64(a.RWKVHeadSize)*nSeq}) ffnInc += rs default: rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) ffnInc += rs } } cp := GGUFBytesScalar(attnInc + ffnInc) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp } } else { r := uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) convInc := GGMLTypeF32.RowSizeOf([]uint64{r, nSeq}) // F32 [n_embd_key_gqa, nSeq] reshape for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) convInc += rs continue } // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379. rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nSeq}) convInc += rs } ssmInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) { if !strings.HasSuffix(l.Name, ".ssm_a") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) ssmInc += rs continue } // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413. 
rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nSeq}) ssmInc += rs } cp := GGUFBytesScalar(convInc + ssmInc) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp } } } else { loadAttnInc, offloadAttnInc := uint64(0), uint64(0) { rs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) loadAttnInc = rs // k-? rs = o.LMCCacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV}) loadAttnInc += rs // v-? } if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. offloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens}) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv|q_b)\.weight`)) { if strings.HasSuffix(l.Name, ".attn_norm.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs continue } rs := l.Bytes() offloadAttnInc += rs } // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L6986-L6992. rs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) offloadAttnInc += rs // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7000-L7007. rs = o.LMCCacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV}) offloadAttnInc += rs } else { offloadAttnInc = uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv|q_b)\.weight`)) { var rs uint64 switch { default: // norm. rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs case strings.HasSuffix(l.Name, ".attn_q.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens}) offloadAttnInc += rs * 2 // Qcur. rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) offloadAttnInc += rs // kq. if !zeroOffload && !fullOffload { offloadAttnInc += loadAttnInc } case strings.HasSuffix(l.Name, ".attn_qkv.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens}) offloadAttnInc += rs * 2 // Qcur. rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) offloadAttnInc += rs // kq. rs = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, a.EmbeddingLength * 3}) offloadAttnInc += rs // wqkv. if !zeroOffload && !fullOffload { offloadAttnInc += loadAttnInc } case strings.HasSuffix(l.Name, ".attn_q_b.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs * 2 // q-? rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) offloadAttnInc += rs // kq. 
} } } ffnInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) ffnInc += rs } if a.ExpertCount > 0 || a.ExpertUsedCount > 0 { rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.ExpertCount), a.EmbeddingLength}) ffnInc += rs // ffn_gate_input rs = GGMLTypeF32.RowSizeOf([]uint64{uint64(a.ExpertCount), nTokens}) ffnInc += rs // ffn_moe_logits rs = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, uint64(a.ExpertUsedCount), nTokens}) ffnInc += rs // ffn_moe_down } if !zeroOffload { e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc + ffnInc) } else { e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc) } { cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc)) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp } if nLoadLayers > 1 { for i := range e.Devices[1:] { if e.Devices[i+1].Remote { continue } e.Devices[i+1].Computation.Compute += GGUFBytesScalar(loadAttnInc) break } } } } // Finally, get the usage of output layer. if a.AttentionCausal { var outInc uint64 if a.AttentionRecurrent { outInc += inpSMask + inpSSeq } if l, ok := opLs.Get("output_norm.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs } if l, ok := opLs.Get("output.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs } else if l, ok := ipLs.Get("token_embd.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs } e.Devices[idxOutputDevice].Computation.Output += GGUFBytesScalar(outInc) } } // Drafter. e.Drafter = o.LMCDrafter // Projector. e.Projector = o.LMCProjector // Adapters. e.Adapters = o.LMCAdapters // Maximum tokens per second. if ds, dmss := e.Devices, o.DeviceMetrics; len(dmss) != 0 { ltss := make([]float64, len(dmss)) bs := anyx.Number[float64](*o.LMCLogicalBatchSize) / float64(nBatch) for i, dm := range dmss { fl, upbw, dwbw := float64(max(dm.FLOPS, 1)), float64(max(dm.UpBandwidth, 1)), float64(max(dm.DownBandwidth, 1)) cmpops := float64(ds[i].Parameter.Compute+ds[i].Parameter.ComputeOverridden)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) // nolint: lll cmps := float64(ds[i].Weight.Sum()) cmplat := max(cmpops/fl, cmps/upbw) kvcops := float64(ds[i].Parameter.KVCache) * 2 /* FMA */ * bs kvcs := float64(ds[i].KVCache.Sum()) * bs kvclat := max(kvcops/fl, kvcs/upbw) ffs := float64(GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch})) ffslat := ffs / dwbw lays := float64(ds[i].HandleLayers) if ds[i].HandleOutputLayer { lays += 1 } ltss[i] = (cmplat + kvclat + ffslat) * lays / float64(a.BlockCount+2) } lt := float64(0) ltmax := slices.Max(ltss) for i := range ltss { lt += ltss[i] / ltmax * ltss[i] } e.MaximumTokensPerSecond = ptr.To(GGUFTokensPerSecondScalar(1 / lt)) } } // estimateLLaMACppRunInProjector estimates the usages of the GGUF file for projector. func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "mm.*", // Vision specific IO layers. "v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", "v.post_ln.*", "model.*", "resampler.*", // Audio specific IO layers. "a.position_embd.*", "a.conv1d.*", "a.post_ln.*", }) ipLs, opLs, _ := ioLs.Cut([]string{ // Vision specific Input layers. 
"v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", "model.*", // Audio specific Input layers. "a.position_embd.*", "a.conv1d.*", }) // Block count. if a.ClipHasVisionEncoder && a.ClipVisionBlockCount == 0 { if len(tfLs) == 1 { if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"v"}, ntfLs.Name) { a.ClipVisionBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) } } if a.ClipVisionBlockCount == 0 { a.ClipVisionBlockCount = uint64(len(tfLs)) } } if a.ClipHasAudioEncoder && a.ClipAudioBlockCount == 0 { if len(tfLs) == 1 { if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"a"}, ntfLs.Name) { a.ClipAudioBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) } } if a.ClipAudioBlockCount == 0 { a.ClipAudioBlockCount = uint64(len(tfLs)) } } // Offload layers. if *o.LMCOffloadLayers == math.MaxUint64 { e.FullOffloaded = true e.OffloadLayers = a.ClipVisionBlockCount + a.ClipAudioBlockCount o.LMCOffloadLayers = ptr.To(e.OffloadLayers) } else { e.FullOffloaded = false e.OffloadLayers = 0 } // Footprint. { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } idx := 0 // Default to the main host's RAM. if e.FullOffloaded { for i := 1; i < len(e.Devices); i++ { if !e.Devices[i].Remote { idx = i break } } } // Weight & Parameter. { // Compute. e.Devices[idx].HandleLayers = *o.LMCOffloadLayers e.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1) e.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes()) e.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements()) // IO. e.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) e.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) e.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes()) e.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements()) } if a.ClipHasVisionEncoder { // Init hyperparameters, // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636. var ( heightMaxSize uint64 // y widthMaxSize uint64 // x // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L3462. nPatches uint64 patchesMaxSize uint64 // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L4016. projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. ) // See https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L397-L411, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2323-L2345, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2767-L2794. heightMaxSize = uint64(a.ClipVisionImageSize) widthMaxSize = heightMaxSize if a.ClipHasQwen2VLMerger || a.ClipProjectorType == "qwen2vl_merger" || a.ClipProjectorType == "qwen2.5vl_merger" || a.ClipProjectorType == "qwen2.5o" || a.ClipProjectorType == "pixtral" { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L2217. 
heightMaxSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 1024)) widthMaxSize = heightMaxSize } nPatchSize := uint64(a.ClipVisionPatchSize) nPatchesHeight := heightMaxSize / nPatchSize nPatchesWidth := widthMaxSize / nPatchSize nPatches = nPatchesHeight * nPatchesWidth patchesMaxSize = 1 switch { case a.ClipHasLLaVAProjector || a.ClipProjectorType == "mlp" || a.ClipProjectorType == "mlp_norm" || a.ClipProjectorType == "ldp" || a.ClipProjectorType == "ldpv2": // LLaVA 1.6 uses up to 6 patches if a.ClipVisionMMPatchMergeType != "flat" { patchesMaxSize = 6 } case a.ClipHasMiniCPMVProjector || a.ClipProjectorType == "resampler": // MiniCPM-V uses up to 10 patches patchesMaxSize = 10 case a.ClipProjectorType == "adapter": // Granite vision uses up to 10 patches + base patch patchesMaxSize = 11 } if o.LMCMaxProjectedCache != nil { patchesMaxSize += uint64(*o.LMCMaxProjectedCache) } switch a.ClipProjectorType { case "ldp": nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.mb_block.1.block.2.1.bias"); ok { projectionDim = ti.Dimensions[0] } case "ldpv2": nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.peg.0.bias"); ok { projectionDim = ti.Dimensions[0] } case "mlp": if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "mlp_norm": if ti, ok := gf.TensorInfos.Get("mm.3.bias"); ok { projectionDim = ti.Dimensions[0] } case "resampler": if ti, ok := gf.TensorInfos.Get("resampler.query"); ok { nPatches = ti.Dimensions[1] projectionDim = ti.Dimensions[0] } case "adapter": nPatches /= 4 nPatches += 2 if ti, ok := gf.TensorInfos.Get("adapter.linear.dense_4h_to_h.weight"); ok { projectionDim = ti.Dimensions[1] } case "qwen2vl_merger", "qwen2.5vl_merger", "qwen2.5o": nSizePatch := uint64(a.ClipVisionPatchSize * 2) heightPatchSize := heightMaxSize / nSizePatch if heightMaxSize%nSizePatch > 0 { heightPatchSize++ } widthPatchSize := widthMaxSize / nSizePatch if widthMaxSize%nSizePatch > 0 { widthPatchSize++ } nPatches = heightPatchSize * widthPatchSize if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "gemma3": nPerSide := uint64(a.ClipVisionImageSize) / uint64(a.ClipVisionPatchSize) nPerSide2DPool := nPerSide / uint64(a.ClipVisionProjectorScaleFactor) nPatches = nPerSide2DPool * nPerSide2DPool if ti, ok := gf.TensorInfos.Get("mm.input_projection.weight"); ok { projectionDim = ti.Dimensions[0] } case "idefics3", "llama4": nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) if ti, ok := gf.TensorInfos.Get("mm.model.fc.weight"); ok { projectionDim = ti.Dimensions[1] } case "pixtral": heightPatchSize := heightMaxSize / uint64(a.ClipVisionPatchSize) if a.ClipVisionSpatialMergeSize > 0 { heightPatchSize /= uint64(a.ClipVisionSpatialMergeSize) } widthPatchSize := widthMaxSize / uint64(a.ClipVisionPatchSize) if a.ClipVisionSpatialMergeSize > 0 { widthPatchSize /= uint64(a.ClipVisionSpatialMergeSize) } nPatches = heightPatchSize*widthPatchSize + heightPatchSize - 1 /* [IMG_BREAK] per row */ if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "internvl": nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) if ti, ok := gf.TensorInfos.Get("mm.model.mlp.3.weight"); ok { projectionDim = ti.Dimensions[1] } } // Footprint { // Image Embed, // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407. 
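// The projected image embedding is held in host memory as float32, so the footprint below is patchesMaxSize * nPatches * projectionDim * 4 bytes.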
e.Devices[0].Footprint += GGUFBytesScalar(patchesMaxSize * nPatches * projectionDim * 4 /* float32 size */) } // Computation. { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. var maxNodes uint64 = 8192 // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) // Scheduler overhead, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipVisionBlockCount*3) e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) // Tensor usage. var ( hasClassEmbd bool nPositions uint64 nBatch uint64 nEmbd uint64 nHead uint64 ) { _, hasClassEmbd = ipLs.Get("v.class_embd") nPositions = nPatches if hasClassEmbd { nPositions += 1 } if a.ClipHasQwen2VLMerger || a.ClipProjectorType == "qwen2vl_merger" || a.ClipProjectorType == "qwen2.5vl_merger" || a.ClipProjectorType == "qwen2.5o" { nPositions *= 4 } nBatch = 1 nEmbd = a.ClipVisionEmbeddingLength nHead = a.ClipVisionAttentionHeadCount } // First, get the usage of input layer. { var ( inpRaw = GGMLTypeF32.RowSizeOf([]uint64{widthMaxSize, heightMaxSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch] inpRawCnt = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch}) // I32 [n_patches, n_embd, n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embd, n_positions, n_batch] inpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatches, nBatch}) // F32 [mmproj, n_patches, n_batch] inpPos = GGMLTypeI32.RowSizeOf([]uint64{nPositions}) // I32 [n_positions] inpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] ) e.Devices[idx].Computation.Input += GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches) if a.ClipHasMiniCPMVProjector || a.ClipProjectorType == "resampler" { e.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd) } if hasClassEmbd { e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) } if a.ClipVisionWindowAttentionPattern > 0 { // Qwen2.5 VL inpWindowIndex := GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] inpWindowMask := GGMLTypeI32.RowSizeOf([]uint64{nPositions, nPositions}) // I32 [n_positions, n_positions] e.Devices[idx].Computation.Input += GGUFBytesScalar(inpWindowIndex + inpWindowMask) } } // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of a certain layer. { compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2 compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) } } } if a.ClipHasAudioEncoder { // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/mtmd-audio.cpp#L311. var projectionDim uint64 // NB(thxCode): not sure if this is the correct name.
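// For the audio tower the position count is taken from the second dimension of "a.position_embd.weight"; unlike the vision path there is no patch geometry to derive.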
{ if ti, ok := gf.TensorInfos.Get("a.position_embd.weight"); ok { projectionDim = ti.Dimensions[1] } } // Computation. { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. var maxNodes uint64 = 8192 // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) // Scheduler overhead, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipAudioBlockCount*3) e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) // Tensor usage. var ( nPositions uint64 nBatch uint64 nEmbd uint64 nHead uint64 ) { nPositions = projectionDim nBatch = 1 nEmbd = a.ClipAudioEmbeddingLength nHead = a.ClipAudioAttentionHeadCount } // First, get the usage of input layer. { inpEmbd := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embed, n_positions, n_batch] e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) } // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of a certain layer. { compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) } } } } // estimateLLaMACppRunInAdapter estimates the usages of the GGUF file for adapter. 
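// Adapters (e.g. LoRA) contribute only weight and bootstrap footprint: they carry no KV cache or computation graph of their own, and are marked non-distributable below.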
func (gf *GGUFFile) estimateLLaMACppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "position_*", "token_*", "cls.*", "output.*", "output_*", }) ipLs, opLs, _ := ioLs.Cut([]string{ "position_*", "token_*", }) if a.BlockCount == 0 { a.BlockCount = uint64(len(tfLs)) } // Full offload: nLoadLayers == 0 && isOffloadOutputLayer // Zero offload: nOffloadLayers == 0 // Partial offload: !Full offload && !Zero offload var ( nOffloadLayers uint64 nActualOffloadLayers uint64 nLoadLayers = a.BlockCount idxOutputDevice int fullOffload bool ) { var isOffloadOutputLayer bool switch v := o.LMCOffloadLayers; { case v == nil: o.LMCOffloadLayers = ptr.To(a.BlockCount) nOffloadLayers = a.BlockCount isOffloadOutputLayer = true case *v != 0: nOffloadLayers = *v if nOffloadLayers > a.BlockCount { isOffloadOutputLayer = true nOffloadLayers = a.BlockCount } } nActualOffloadLayers = nOffloadLayers if isOffloadOutputLayer { nActualOffloadLayers += 1 } nLoadLayers -= nOffloadLayers fullOffload = nLoadLayers == 0 && isOffloadOutputLayer e.FullOffloaded = fullOffload e.OffloadLayers = nOffloadLayers for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { switch { case i < int(nLoadLayers): e.Devices[0].HandleLayers += 1 e.Devices[0].HandleLastLayer = i case i >= offloadStart: x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) e.Devices[j+1].HandleLayers += 1 e.Devices[j+1].HandleLastLayer = i if fullOffload && i == len(tfLs)-1 { idxOutputDevice = j + 1 } } } e.Devices[idxOutputDevice].HandleOutputLayer = true } // Distributable. e.Distributable = false // Footprint. { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } // Weight & Parameter. { // Compute. for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { idx := 0 if i >= offloadStart { x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) idx = j + 1 } e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements()) } // IO, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. e.Devices[0].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) e.Devices[0].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) var ( wg GGUFBytesScalar ps GGUFParametersScalar ) if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } e.Devices[0].Weight.Output = wg if fullOffload { e.Devices[idxOutputDevice].Weight.Output = wg e.Devices[idxOutputDevice].Parameter.Output = ps } else { e.Devices[0].Parameter.Output = ps } } } // estimateLLaMACppRunInIMatrix estimates the usages of the GGUF file for imatrix. func (gf *GGUFFile) estimateLLaMACppRunInIMatrix(_ *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() if a.BlockCount == 0 { a.BlockCount = uint64(len(ls)) } // Distributable. e.Distributable = false // Footprint. { // Bootstrap. 
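// Same bootstrap cost as the other estimators: a small fixed model-load buffer plus the non-tensor bytes of the file (gf.Size - gf.ModelSize).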
e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } // Weight & Parameter. { var ( wg GGUFBytesScalar ps GGUFParametersScalar ) wg = GGUFBytesScalar(ls.Bytes()) ps = GGUFParametersScalar(ls.Elements()) e.Devices[0].Weight.Compute = wg e.Devices[0].Parameter.Compute = ps } } // Types for LLaMACpp estimated summary. type ( // LLaMACppRunEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp. LLaMACppRunEstimateSummary struct { /* Basic */ // Items Items []LLaMACppRunEstimateSummaryItem `json:"items"` /* Appendix */ // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // EmbeddingOnly is the flag to indicate whether the model is used for embedding only, // true for embedding only. EmbeddingOnly bool `json:"embeddingOnly"` // Reranking is the flag to indicate whether the model is used for reranking, // true for reranking. // // Only available when EmbeddingOnly is true. Reranking bool `json:"reranking"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // LogicalBatchSize is the logical batch size. LogicalBatchSize int32 `json:"logicalBatchSize"` // PhysicalBatchSize is the physical batch size. PhysicalBatchSize int32 `json:"physicalBatchSize"` } // LLaMACppRunEstimateSummaryItem represents one summary item for loading the GGUF file in llama.cpp. LLaMACppRunEstimateSummaryItem struct { // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // MaximumTokensPerSecond is the maximum tokens per second for running the GGUF file. MaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:"maximumTokensPerSecond,omitempty"` // RAM is the memory usage for loading the GGUF file in RAM. RAM LLaMACppRunEstimateMemory `json:"ram"` // VRAMs is the memory usage for loading the GGUF file in VRAM per device. VRAMs []LLaMACppRunEstimateMemory `json:"vrams"` } // LLaMACppRunEstimateMemory represents the memory usage for loading the GGUF file in llama.cpp. LLaMACppRunEstimateMemory struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` // HandleLastLayer is the index of the last layer the device can handle. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. 
HandleOutputLayer bool `json:"handleOutputLayer"` // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA GGUFBytesScalar `json:"nonuma"` } ) // SummarizeItem returns the corresponding LLaMACppRunEstimateSummaryItem with the given options. func (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (emi LLaMACppRunEstimateSummaryItem) { emi.OffloadLayers, emi.FullOffloaded = e.OffloadLayers, e.FullOffloaded if emi.FullOffloaded { emi.OffloadLayers++ // The output layer is offloaded. } emi.MaximumTokensPerSecond = e.MaximumTokensPerSecond // RAM. { fp := e.Devices[0].Footprint wg := e.Devices[0].Weight.Sum() kv := e.Devices[0].KVCache.Sum() cp := e.Devices[0].Computation.Sum() emi.RAM.HandleLayers = e.Devices[0].HandleLayers emi.RAM.HandleLastLayer = e.Devices[0].HandleLastLayer emi.RAM.HandleOutputLayer = e.Devices[0].HandleOutputLayer // UMA. emi.RAM.UMA = fp + wg + kv + cp if !e.NoMMap && (mmap || e.FullOffloaded) { emi.RAM.UMA -= wg if !mmap { emi.RAM.UMA += e.Devices[0].Weight.Output emi.RAM.UMA += e.Devices[0].Weight.ComputeOverridden } } // NonUMA. emi.RAM.NonUMA = GGUFBytesScalar(nonUMARamFootprint) + emi.RAM.UMA } // VRAMs. emi.VRAMs = make([]LLaMACppRunEstimateMemory, len(e.Devices)-1) { for i, d := range e.Devices[1:] { fp := d.Footprint wg := d.Weight.Sum() kv := d.KVCache.Sum() cp := d.Computation.Sum() emi.VRAMs[i].HandleLayers = d.HandleLayers emi.VRAMs[i].HandleLastLayer = d.HandleLastLayer emi.VRAMs[i].HandleOutputLayer = d.HandleOutputLayer emi.VRAMs[i].Remote = d.Remote emi.VRAMs[i].Position = d.Position // UMA. emi.VRAMs[i].UMA = fp + wg + kv + /* cp */ 0 if !e.NoMMap && mmap { emi.VRAMs[i].UMA -= wg if d.Remote || d.Position > 0 && d.HandleLastLayer >= 0 || e.Type == "projector" { emi.VRAMs[i].UMA += wg } } // NonUMA. emi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + kv + cp if !d.Remote && d.Position > 0 && d.HandleLastLayer < 0 { emi.VRAMs[i].NonUMA -= wg + cp } } } // Add drafter's usage. if e.Drafter != nil { demi := e.Drafter.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += demi.RAM.UMA emi.RAM.NonUMA += demi.RAM.NonUMA for i, v := range demi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add projector's usage. if e.Projector != nil { pemi := e.Projector.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += pemi.RAM.UMA emi.RAM.NonUMA += pemi.RAM.NonUMA for i, v := range pemi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add adapters' usage. for i := range e.Adapters { aemi := e.Adapters[i].SummarizeItem(false, 0, 0) emi.RAM.UMA += aemi.RAM.UMA emi.RAM.NonUMA += aemi.RAM.NonUMA for j, v := range aemi.VRAMs { emi.VRAMs[j].UMA += v.UMA emi.VRAMs[j].NonUMA += v.NonUMA } } return emi } // Summarize returns the corresponding LLaMACppRunEstimateSummary with the given options. func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (es LLaMACppRunEstimateSummary) { // Items. 
es.Items = []LLaMACppRunEstimateSummaryItem{ e.SummarizeItem(mmap, nonUMARamFootprint, nonUMAVramFootprint), } // Just copy from the original estimate. es.Type = e.Type es.Architecture = e.Architecture es.ClipProjectorType = e.ClipProjectorType es.AdapterType = e.AdapterType es.ContextSize = e.ContextSize es.FlashAttention = e.FlashAttention es.NoMMap = e.NoMMap es.EmbeddingOnly = e.EmbeddingOnly es.Reranking = e.Reranking es.LogicalBatchSize = e.LogicalBatchSize es.PhysicalBatchSize = e.PhysicalBatchSize es.Distributable = e.Distributable return es } func (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar { return u.Input + u.Compute + u.ComputeOverridden + u.Output } func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar { return u.Key + u.Value } func (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar { return u.Footprint + u.Input + max(u.Compute, u.Output) } // ClipAligning returns the aligned value of x to the nearest multiple of n, // see https://github.com/ggml-org/llama.cpp/blob/cdf94a18023c92f41808ec874ba577d914674717/tools/mtmd/clip-impl.h#L114-L115. func ClipAligning(x, n uint64) uint64 { return ((x + n - 1) / n) * n } ================================================ FILE: file_estimate__llamacpp_test.go ================================================ package gguf_parser import ( "context" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_EstimateLLaMACppRun(t *testing.T) { ctx := context.Background() cases := []struct { name string given *GGUFFile }{ { name: "mixtral 7B", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "mixtral 8x7B", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF", "Nous-Hermes-2-Mixtral-8x7B-DPO.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "wizardlm 8x22B", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "MaziyarPanahi/WizardLM-2-8x22B-GGUF", "WizardLM-2-8x22B.IQ1_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { f := tc.given t.Log("\n", spew.Sdump(f.EstimateLLaMACppRun()), "\n") }) } } func TestGGUFFile_EstimateLLaMACppRun_ContextSize(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } cases := []struct { name string opts []GGUFRunEstimateOption }{ {"1024(fp16)", []GGUFRunEstimateOption{WithLLaMACppContextSize(1024)}}, {"1024(fp32)", []GGUFRunEstimateOption{WithLLaMACppContextSize(1024), WithLLaMACppCacheKeyType(GGMLTypeF32), WithLLaMACppCacheValueType(GGMLTypeF32)}}, {"4096(fp16)", []GGUFRunEstimateOption{WithLLaMACppContextSize(4096)}}, {"4096(fp32)", []GGUFRunEstimateOption{WithLLaMACppContextSize(4096), WithLLaMACppCacheKeyType(GGMLTypeF32), WithLLaMACppCacheValueType(GGMLTypeF32)}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { t.Log("\n", spew.Sdump(f.EstimateLLaMACppRun(tc.opts...)), "\n") }) } } func TestGGUFFile_EstimateLLaMACppRun_OffloadLayers(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", 
SkipLargeMetadata()) if err != nil { t.Fatal(err) return } cases := []struct { name string opts []GGUFRunEstimateOption }{ {"offload 0 layer", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(0)}}, {"offload 1 layer", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(1)}}, {"offload 10 layers", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(10)}}, {"offload all layers", []GGUFRunEstimateOption{}}, {"offload 33 layers", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(33)}}, // exceeds the number of layers } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { t.Log("\n", spew.Sdump(f.EstimateLLaMACppRun(tc.opts...)), "\n") }) } } ================================================ FILE: file_estimate__stablediffusioncpp.go ================================================ package gguf_parser import ( "math" "strings" "golang.org/x/exp/maps" "github.com/gpustack/gguf-parser-go/util/ptr" "github.com/gpustack/gguf-parser-go/util/stringx" ) // Types for StableDiffusionCpp estimation. type ( // StableDiffusionCppRunEstimate represents the estimated result of loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimate struct { // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // ImageOnly is the flag to indicate whether the model is used for generating image, // true for generating image only. ImageOnly bool `json:"imageOnly"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // Devices represents the usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. Devices []StableDiffusionCppRunDeviceUsage `json:"devices"` // Autoencoder is the estimated result of the autoencoder. Autoencoder *StableDiffusionCppRunEstimate `json:"autoencoder,omitempty"` // Conditioners is the estimated result of the conditioners. Conditioners []StableDiffusionCppRunEstimate `json:"conditioners,omitempty"` // Upscaler is the estimated result of the upscaler. Upscaler *StableDiffusionCppRunEstimate `json:"upscaler,omitempty"` // ControlNet is the estimated result of the control net. ControlNet *StableDiffusionCppRunEstimate `json:"controlNet,omitempty"` } // StableDiffusionCppRunDeviceUsage represents the usage for running the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunDeviceUsage struct { // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes.
Parameter GGUFParametersScalar `json:"parameter"` // Weight is the memory usage of weights that the device loads. Weight GGUFBytesScalar `json:"weight"` // Computation is the memory usage of computation that the device processes. Computation GGUFBytesScalar `json:"computation"` } ) // EstimateStableDiffusionCppRun estimates the usages of the GGUF file in stable-diffusion.cpp. func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate) { // Options var o _GGUFRunEstimateOptions for _, opt := range opts { opt(&o) } switch { case o.TensorSplitFraction == nil: o.TensorSplitFraction = []float64{1} o.MainGPUIndex = 0 case o.MainGPUIndex < 0 || o.MainGPUIndex >= len(o.TensorSplitFraction): panic("main device index must be range of 0 to the length of tensor split fraction") } if len(o.DeviceMetrics) > 0 { for i, j := 0, len(o.DeviceMetrics)-1; i < len(o.TensorSplitFraction)-j; i++ { o.DeviceMetrics = append(o.DeviceMetrics, o.DeviceMetrics[j]) } o.DeviceMetrics = o.DeviceMetrics[:len(o.TensorSplitFraction)+1] } if o.SDCOffloadLayers == nil { o.SDCOffloadLayers = ptr.To[uint64](math.MaxUint64) } if o.SDCBatchCount == nil { o.SDCBatchCount = ptr.To[int32](1) } if o.SDCHeight == nil { o.SDCHeight = ptr.To[uint32](1024) } if o.SDCWidth == nil { o.SDCWidth = ptr.To[uint32](1024) } if o.SDCOffloadConditioner == nil { o.SDCOffloadConditioner = ptr.To(true) } if o.SDCOffloadAutoencoder == nil { o.SDCOffloadAutoencoder = ptr.To(true) } if o.SDCAutoencoderTiling == nil { o.SDCAutoencoderTiling = ptr.To(false) } if o.SDCFreeComputeMemoryImmediately == nil { o.SDCFreeComputeMemoryImmediately = ptr.To(false) } // Devices. initDevices := func(e *StableDiffusionCppRunEstimate) { for j := range e.Devices[1:] { e.Devices[j+1].Remote = j < len(o.RPCServers) if e.Devices[j+1].Remote { e.Devices[j+1].Position = j } else { e.Devices[j+1].Position = j - len(o.RPCServers) } } } e.Devices = make([]StableDiffusionCppRunDeviceUsage, len(o.TensorSplitFraction)+1) initDevices(&e) // Metadata. a := gf.Architecture() e.Type = a.Type e.Architecture = normalizeArchitecture(a.DiffusionArchitecture) // Flash attention. if o.FlashAttention && !strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3") { // NB(thxCode): Stable Diffusion 3 doesn't support flash attention yet, // see https://github.com/leejet/stable-diffusion.cpp/pull/386. e.FlashAttention = true } // Distributable. e.Distributable = true // Offload. e.FullOffloaded = *o.SDCOffloadLayers > 0 // NoMMap. e.NoMMap = true // TODO: Implement this. // ImageOnly. e.ImageOnly = true // TODO: Implement this. // Autoencoder. if a.DiffusionAutoencoder != nil { ae := &StableDiffusionCppRunEstimate{ Type: "model", Architecture: e.Architecture + "_vae", FlashAttention: e.FlashAttention, Distributable: e.Distributable, FullOffloaded: e.FullOffloaded && *o.SDCOffloadAutoencoder, NoMMap: e.NoMMap, Devices: make([]StableDiffusionCppRunDeviceUsage, len(e.Devices)), } initDevices(ae) e.Autoencoder = ae } // Conditioners. 
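// Each text encoder declared by the diffusion model becomes its own sub-estimate, inheriting the flash-attention, distributable and offload settings resolved above.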
if len(a.DiffusionConditioners) != 0 { e.Conditioners = make([]StableDiffusionCppRunEstimate, 0, len(a.DiffusionConditioners)) for i := range a.DiffusionConditioners { cd := StableDiffusionCppRunEstimate{ Type: "model", Architecture: normalizeArchitecture(a.DiffusionConditioners[i].Architecture), FlashAttention: e.FlashAttention, Distributable: e.Distributable, FullOffloaded: e.FullOffloaded && *o.SDCOffloadConditioner, NoMMap: e.NoMMap, Devices: make([]StableDiffusionCppRunDeviceUsage, len(e.Devices)), } initDevices(&cd) e.Conditioners = append(e.Conditioners, cd) } } // Footprint { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(10*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } var cdLs, aeLs, dmLs GGUFLayerTensorInfos { ls := gf.Layers() cdLs, aeLs, _ = ls.Cut([]string{ "cond_stage_model.*", }) aeLs, dmLs, _ = aeLs.Cut([]string{ "first_stage_model.*", }) } var cdDevIdx, aeDevIdx, dmDevIdx int { if *o.SDCOffloadConditioner && *o.SDCOffloadLayers > 0 { cdDevIdx = 1 } if *o.SDCOffloadAutoencoder && *o.SDCOffloadLayers > 0 { aeDevIdx = 1 if len(e.Devices) > 3 { aeDevIdx = 2 } } if *o.SDCOffloadLayers > 0 { dmDevIdx = 1 switch { case len(e.Devices) > 3: dmDevIdx = 3 case len(e.Devices) > 2: dmDevIdx = 2 } } } // Weight & Parameter. { // Conditioners. for i := range cdLs { e.Conditioners[i].Devices[cdDevIdx].Weight = GGUFBytesScalar(cdLs[i].Bytes()) e.Conditioners[i].Devices[cdDevIdx].Parameter = GGUFParametersScalar(cdLs[i].Elements()) } // Autoencoder. if len(aeLs) != 0 { e.Autoencoder.Devices[aeDevIdx].Weight = GGUFBytesScalar(aeLs.Bytes()) e.Autoencoder.Devices[aeDevIdx].Parameter = GGUFParametersScalar(aeLs.Elements()) } // Model. e.Devices[dmDevIdx].Weight = GGUFBytesScalar(dmLs.Bytes()) e.Devices[dmDevIdx].Parameter = GGUFParametersScalar(dmLs.Elements()) } // Computation. { // See https://github.com/leejet/stable-diffusion.cpp/blob/10c6501bd05a697e014f1bee3a84e5664290c489/ggml_extend.hpp#L1058C9-L1058C23. var maxNodes uint64 = 32768 // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation = GGUFBytesScalar(cm) // Work context, // see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1467-L1481, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1572-L1586, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1675-L1679. // { zChannels := uint64(4) if a.DiffusionTransformer { zChannels = 16 } // See https://github.com/thxCode/stable-diffusion.cpp/blob/1ae97f8a8ca3615bdaf9c1fd32c13562e2471833/stable-diffusion.cpp#L2682-L2691. usage := uint64(128 * 1024 * 1024) /* 128MiB, LLaMA Box */ usage += uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * 3 /* output channels */ * 4 /* sizeof(float) */ * zChannels e.Devices[0].Computation += GGUFBytesScalar(usage * uint64(ptr.Deref(o.ParallelSize, 1)) /* max batch */) } // Encode usage, // see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L388-L391, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L758-L766, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L1083-L1085. 
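// Conditioner compute buffers are sized from the text-encoder output shapes (embedding width x token count) enumerated per architecture below, doubled to include the conditioner itself.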
{ var tes [][]uint64 switch { case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): // FLUX.1 tes = [][]uint64{ {768, 77}, {4096, 256}, } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): // SD 3.x tes = [][]uint64{ {768, 77}, {1280, 77}, {4096, 77}, } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion XL"): // SD XL/XL Refiner if strings.HasSuffix(a.DiffusionArchitecture, "Refiner") { tes = [][]uint64{ {1280, 77}, } } else { tes = [][]uint64{ {768, 77}, {1280, 77}, } } default: // SD 1.x/2.x tes = [][]uint64{ {768, 77}, } } for i := range cdLs { usage := GGMLTypeF32.RowSizeOf(tes[i]) * 2 /* include conditioner */ e.Conditioners[i].Devices[cdDevIdx].Computation += GGUFBytesScalar(usage) } // TODO VAE Encode } // Diffusing usage. if !*o.SDCFreeComputeMemoryImmediately { var usage uint64 switch { case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): // FLUX.1 usage = GuessFLUXDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): // SD 3.x const ( sd3MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3 Medium sd35MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.ln_k.weight" // SD 3.5 Medium sd35LargeKey = "model.diffusion_model.joint_blocks.37.x_block.attn.ln_k.weight" // SD 3.5 Large ) m, _ := dmLs.Index([]string{sd3MediumKey, sd35MediumKey, sd35LargeKey}) switch { case m[sd35LargeKey].Name != "": usage = GuessSD35LargeDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) case m[sd35MediumKey].Name != "": usage = GuessSD35MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) default: usage = GuessSD3MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion XL"): // SD XL/XL Refiner const ( sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner ) m, _ := dmLs.Index([]string{sdXlKey, sdXlRefinerKey}) if m[sdXlRefinerKey].Name != "" { usage = GuessSDXLRefinerDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } else { usage = GuessSDXLDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 2"): // SD 2.x usage = GuessSD2DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) default: // SD 1.x usage = GuessSD1DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } e.Devices[dmDevIdx].Computation += GGUFBytesScalar(usage) } // Decode usage. if len(aeLs) != 0 && !*o.SDCFreeComputeMemoryImmediately { // Bootstrap. 
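// The decoder reserves a fixed ~100 MiB bootstrap, then scratch proportional to the output resolution (or a fixed 512x512 tile when tiling is enabled) times the first-convolution width discovered below.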
e.Autoencoder.Devices[aeDevIdx].Footprint += GGUFBytesScalar(100 * 1024 * 1024) /* 100 MiB */ var convDim uint64 { m, _ := aeLs.Index([]string{ "first_stage_model.decoder.conv_in.weight", "decoder.conv_in.weight", }) tis := maps.Values(m) if len(tis) != 0 && tis[0].NDimensions > 3 { convDim = max(tis[0].Dimensions[0], tis[0].Dimensions[3]) } } var usage uint64 if !*o.SDCAutoencoderTiling { usage = uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * (3 /* output channels */ * 4 /* sizeof(float) */ + 1) * convDim } else { usage = 512 * 512 * (3 /* output channels */ * 4 /* sizeof(float) */ + 1) * convDim } e.Autoencoder.Devices[aeDevIdx].Computation += GGUFBytesScalar(usage) } } return e } // Types for StableDiffusionCpp estimated summary. type ( // StableDiffusionCppRunEstimateSummary represents the estimated summary of loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimateSummary struct { /* Basic */ // Items Items []StableDiffusionCppRunEstimateSummaryItem `json:"items"` /* Appendix */ // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // ImageOnly is the flag to indicate whether the model is used for generating image, // true for generating image only. ImageOnly bool `json:"imageOnly"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` } // StableDiffusionCppRunEstimateSummaryItem represents the estimated summary item of loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimateSummaryItem struct { // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // RAM is the memory usage for loading the GGUF file in RAM. RAM StableDiffusionCppRunEstimateMemory `json:"ram"` // VRAMs is the memory usage for loading the GGUF file in VRAM per device. VRAMs []StableDiffusionCppRunEstimateMemory `json:"vrams"` } // StableDiffusionCppRunEstimateMemory represents the memory usage for loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimateMemory struct { // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA GGUFBytesScalar `json:"nonuma"` } ) // SummarizeItem returns the corresponding StableDiffusionCppRunEstimateSummaryItem with the given options. func (e StableDiffusionCppRunEstimate) SummarizeItem( mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64, ) (emi StableDiffusionCppRunEstimateSummaryItem) { emi.FullOffloaded = e.FullOffloaded // RAM. { fp := e.Devices[0].Footprint wg := e.Devices[0].Weight cp := e.Devices[0].Computation // UMA.
emi.RAM.UMA = fp + wg + cp // NonUMA. emi.RAM.NonUMA = GGUFBytesScalar(nonUMARamFootprint) + emi.RAM.UMA } // VRAMs. emi.VRAMs = make([]StableDiffusionCppRunEstimateMemory, len(e.Devices)-1) { for i, d := range e.Devices[1:] { fp := d.Footprint wg := d.Weight cp := d.Computation emi.VRAMs[i].Remote = d.Remote emi.VRAMs[i].Position = d.Position // UMA. emi.VRAMs[i].UMA = fp + wg + /* cp */ 0 if d.Remote { emi.VRAMs[i].UMA += cp } // NonUMA. emi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + cp } } // Add autoencoder's usage. if e.Autoencoder != nil { aemi := e.Autoencoder.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += aemi.RAM.UMA emi.RAM.NonUMA += aemi.RAM.NonUMA for i, v := range aemi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add conditioners' usage. for i := range e.Conditioners { cemi := e.Conditioners[i].SummarizeItem(mmap, 0, 0) emi.RAM.UMA += cemi.RAM.UMA emi.RAM.NonUMA += cemi.RAM.NonUMA for i, v := range cemi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add upscaler's usage. if e.Upscaler != nil { uemi := e.Upscaler.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += uemi.RAM.UMA emi.RAM.NonUMA += uemi.RAM.NonUMA // NB(thxCode): all VRAMs should offload to the first device at present. var vramUMA, vramNonUMA GGUFBytesScalar for _, v := range uemi.VRAMs { vramUMA += v.UMA vramNonUMA += v.NonUMA } if e.Upscaler.FullOffloaded { emi.VRAMs[0].UMA += vramUMA emi.VRAMs[0].NonUMA += vramNonUMA } else { emi.RAM.UMA += vramUMA emi.RAM.NonUMA += vramNonUMA } } // Add control net's usage. if e.ControlNet != nil { cnemi := e.ControlNet.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += cnemi.RAM.UMA emi.RAM.NonUMA += cnemi.RAM.NonUMA // NB(thxCode): all VRAMs should offload to the first device at present. var vramUMA, vramNonUMA GGUFBytesScalar for _, v := range cnemi.VRAMs { vramUMA += v.UMA vramNonUMA += v.NonUMA } if e.ControlNet.FullOffloaded { emi.VRAMs[0].UMA += vramUMA emi.VRAMs[0].NonUMA += vramNonUMA } else { emi.RAM.UMA += vramUMA emi.RAM.NonUMA += vramNonUMA } } return emi } // Summarize returns the corresponding StableDiffusionCppRunEstimateSummary with the given options. func (e StableDiffusionCppRunEstimate) Summarize( mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64, ) (es StableDiffusionCppRunEstimateSummary) { // Items. es.Items = []StableDiffusionCppRunEstimateSummaryItem{ e.SummarizeItem(mmap, nonUMARamFootprint, nonUMAVramFootprint), } // Just copy from the original estimate. es.Type = e.Type es.Architecture = e.Architecture es.FlashAttention = e.FlashAttention es.NoMMap = e.NoMMap es.ImageOnly = e.ImageOnly es.Distributable = e.Distributable return es } func normalizeArchitecture(arch string) string { return stringx.ReplaceAllFunc(arch, func(r rune) rune { switch r { case ' ', '.', '-', '/', ':': return '_' // Replace with underscore. } if r >= 'A' && r <= 'Z' { r += 'a' - 'A' // Lowercase.
} return r }) } ================================================ FILE: file_estimate__stablediffusioncpp_test.go ================================================ package gguf_parser import ( "context" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_EstimateStableDiffusionRun(t *testing.T) { ctx := context.Background() cases := []struct { name string given *GGUFFile }{ { name: "sd 1.5", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-v1-5-GGUF", "stable-diffusion-v1-5-FP16.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "sd 2.1", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-v2-1-GGUF", "stable-diffusion-v2-1-Q8_0.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "sd xl", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-xl-base-1.0-GGUF", "stable-diffusion-xl-base-1.0-FP16.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "sd 3.5 large", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-v3-5-large-GGUF", "stable-diffusion-v3-5-large-Q4_0.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "flux .1 dev", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/FLUX.1-dev-GGUF", "FLUX.1-dev-Q4_0.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { f := tc.given t.Log("\n", spew.Sdump(f.EstimateStableDiffusionCppRun()), "\n") }) } } ================================================ FILE: file_estimate_option.go ================================================ package gguf_parser import ( "regexp" "slices" "strconv" "github.com/gpustack/gguf-parser-go/util/ptr" ) type ( _GGUFRunEstimateOptions struct { // Common ParallelSize *int32 FlashAttention bool MainGPUIndex int RPCServers []string TensorSplitFraction []float64 OverriddenTensors []*GGUFRunOverriddenTensor DeviceMetrics []GGUFRunDeviceMetric // LLaMACpp (LMC) specific LMCContextSize *int32 LMCRoPEFrequencyBase *float32 LMCRoPEFrequencyScale *float32 LMCRoPEScalingType *string LMCRoPEScalingOriginalContextSize *int32 LMCInMaxContextSize bool LMCLogicalBatchSize *int32 LMCPhysicalBatchSize *int32 LMCVisualMaxImageSize *uint32 LMCMaxProjectedCache *uint32 LMCCacheKeyType *GGMLType LMCCacheValueType *GGMLType LMCOffloadKVCache *bool LMCOffloadLayers *uint64 LMCSplitMode LLaMACppSplitMode LMCFullSizeSWACache bool LMCProjector *LLaMACppRunEstimate LMCDrafter *LLaMACppRunEstimate LMCAdapters []LLaMACppRunEstimate // StableDiffusionCpp (SDC) specific SDCOffloadLayers *uint64 SDCBatchCount *int32 SDCHeight *uint32 SDCWidth *uint32 SDCOffloadConditioner *bool SDCOffloadAutoencoder *bool SDCAutoencoderTiling *bool SDCFreeComputeMemoryImmediately *bool SDCUpscaler *StableDiffusionCppRunEstimate SDCControlNet *StableDiffusionCppRunEstimate } // GGUFRunOverriddenTensor holds the overridden tensor information for the estimate. // // When BufferType is CPU, // it indicates that the tensor should be loaded into the CPU memory, // even if it belongs to a GPU offload layer. GGUFRunOverriddenTensor struct { // PatternRegex is the regex pattern to match the tensor name. PatternRegex *regexp.Regexp // BufferType is the buffer type to override, // it can be "CPU", "CUDA0", "Metal" and others. 
BufferType string // _BufferType records the parsed buffer type, used internally. _BufferType GGUFRunOverriddenTensorBufferType // _Index records the parsed device index, used internally. _Index string } // GGUFRunDeviceMetric holds the device metric for the estimate. // // When the device represents a CPU, // FLOPS refers to the floating-point operations per second of that CPU, // while UpBandwidth indicates the bandwidth of the RAM (since SRAM is typically small and cannot hold all weights, // the RAM here refers to the bandwidth of DRAM, // unless the device's SRAM can accommodate the corresponding model weights). // // When the device represents a GPU, // FLOPS refers to the floating-point operations per second of that GPU, // while UpBandwidth indicates the bandwidth of the VRAM. // // When the device represents a specific node, // FLOPS depends on whether a CPU or GPU is being used, // while UpBandwidth refers to the network bandwidth between nodes. GGUFRunDeviceMetric struct { // FLOPS is the floating-point operations per second of the device. FLOPS FLOPSScalar // UpBandwidth is the bandwidth of the device to transmit data to calculate, // unit is Bps (bytes per second). UpBandwidth BytesPerSecondScalar // DownBandwidth is the bandwidth of the device to transmit calculated result to next layer, // unit is Bps (bytes per second). DownBandwidth BytesPerSecondScalar } // GGUFRunEstimateOption is an option for the estimate. GGUFRunEstimateOption func(*_GGUFRunEstimateOptions) ) // GGUFRunOverriddenTensorBufferType is the type of the overridden tensor buffer. type GGUFRunOverriddenTensorBufferType uint32 const ( _ GGUFRunOverriddenTensorBufferType = iota GGUFRunOverriddenTensorBufferTypeCPU GGUFRunOverriddenTensorBufferTypeGPU GGUFRunOverriddenTensorBufferTypeRPC GGUFRunOverriddenTensorBufferTypeUnknown ) var ( _GGUFRunOverriddenTensorBufferTypeCPURegex = regexp.MustCompile(`^(CPU|AMX)`) _GGUFRunOverriddenTensorBufferTypeUMAGPURegex = regexp.MustCompile(`^(Metal|OpenCL)`) _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex = regexp.MustCompile(`^(CUDA|CANN|ROCm|MUSA|SYCL|Vulkan|Kompute)(\d+)?`) _GGUFRunOverriddenTensorBufferTypeRPCRegex = regexp.MustCompile(`^RPC\[(.*)\]`) ) // ParseBufferType returns the parsed buffer type and the device index of the overridden tensor. // // The device index is used to determine which device the tensor belongs to, // and is derived from the buffer type description.
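//
// A brief sketch of what the regexes above imply (results are derived by reading the
// matching logic below, not from a verified run; the RPC endpoint is illustrative):
//
//	(&GGUFRunOverriddenTensor{BufferType: "CPU"}).ParseBufferType()             // CPU, "0"
//	(&GGUFRunOverriddenTensor{BufferType: "CUDA1"}).ParseBufferType()           // GPU, "1"
//	(&GGUFRunOverriddenTensor{BufferType: "RPC[host:50052]"}).ParseBufferType() // RPC, "host:50052"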
func (odt *GGUFRunOverriddenTensor) ParseBufferType() (GGUFRunOverriddenTensorBufferType, string) { if odt == nil { return GGUFRunOverriddenTensorBufferTypeUnknown, "" } if odt._BufferType == 0 { odt._BufferType = GGUFRunOverriddenTensorBufferTypeUnknown if ms := _GGUFRunOverriddenTensorBufferTypeCPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeCPU, "0" } if ms := _GGUFRunOverriddenTensorBufferTypeUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, "1" } if ms := _GGUFRunOverriddenTensorBufferTypeRPCRegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeRPC, ms[1] } if ms := _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 2 { if idx, err := strconv.ParseInt(ms[2], 10, 64); err == nil && idx >= 0 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, ms[2] } } } return odt._BufferType, odt._Index } // WithParallelSize sets the (decoding sequences) parallel size for the estimate. func WithParallelSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.ParallelSize = &size } } // WithFlashAttention sets the flash attention flag. func WithFlashAttention() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.FlashAttention = true } } // WithMainGPUIndex sets the main device for the estimate. // // When split mode is LLaMACppSplitModeNone, the main device is the only device. // When split mode is LLaMACppSplitModeRow, the main device handles the intermediate results and KV. // // WithMainGPUIndex needs to be combined with WithTensorSplitFraction. func WithMainGPUIndex(di int) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.MainGPUIndex = di } } // WithRPCServers sets the RPC servers for the estimate. func WithRPCServers(srvs []string) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(srvs) == 0 { return } o.RPCServers = srvs } } // WithTensorSplitFraction sets the tensor split cumulative fractions for the estimate. // // WithTensorSplitFraction accepts a slice of cumulative fractions, // all fraction values must be in the range of [0, 1], // and the last fraction must be 1. // // For example, WithTensorSplitFraction([]float64{0.2, 0.4, 0.6, 0.8, 1}) will split the tensors into five parts with 20% each. func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(fractions) == 0 { return } for _, f := range fractions { if f < 0 || f > 1 { return } } if fractions[len(fractions)-1] != 1 { return } o.TensorSplitFraction = fractions } } // WithOverriddenTensors sets the overridden tensors for the estimate. func WithOverriddenTensors(tensors []GGUFRunOverriddenTensor) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(tensors) == 0 { return } for _, t := range tensors { if t.PatternRegex == nil || t.BufferType == "" { return } } o.OverriddenTensors = make([]*GGUFRunOverriddenTensor, len(tensors)) for i := range tensors { o.OverriddenTensors[i] = &tensors[i] } } } // WithDeviceMetrics sets the device metrics for the estimate.
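//
// A hedged usage sketch; the numbers are illustrative only and assume the scalar
// types accept plain numeric literals:
//
//	WithDeviceMetrics([]GGUFRunDeviceMetric{
//		{FLOPS: 2e12, UpBandwidth: 50e9, DownBandwidth: 50e9},    // CPU backed by DRAM
//		{FLOPS: 80e12, UpBandwidth: 900e9, DownBandwidth: 900e9}, // GPU backed by VRAM
//	})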
func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(metrics) == 0 { return } o.DeviceMetrics = metrics } } // WithLLaMACppContextSize sets the context size for the estimate. func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.LMCContextSize = &size } } // WithLLaMACppRoPE sets the RoPE parameters for the estimate. func WithLLaMACppRoPE( frequencyBase float64, frequencyScale float64, scalingType string, scalingOriginalContextSize int32, ) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if frequencyBase > 0 { o.LMCRoPEFrequencyBase = ptr.Float32(float32(frequencyBase)) } if frequencyScale > 0 { o.LMCRoPEFrequencyScale = ptr.Float32(float32(frequencyScale)) } if slices.Contains([]string{"none", "linear", "yarn"}, scalingType) { o.LMCRoPEScalingType = &scalingType } if scalingOriginalContextSize > 0 { o.LMCRoPEScalingOriginalContextSize = ptr.To(scalingOriginalContextSize) } } } // WithinLLaMACppMaxContextSize limits the context size to the maximum, // if the context size is over the maximum. func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCInMaxContextSize = true } } // WithLLaMACppLogicalBatchSize sets the logical batch size for the estimate. func WithLLaMACppLogicalBatchSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.LMCLogicalBatchSize = &size } } // WithLLaMACppPhysicalBatchSize sets the physical batch size for the estimate. func WithLLaMACppPhysicalBatchSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.LMCPhysicalBatchSize = &size } } // _GGUFEstimateCacheTypeAllowList is the allow list of cache key and value types. var _GGUFEstimateCacheTypeAllowList = []GGMLType{ GGMLTypeF32, GGMLTypeF16, GGMLTypeBF16, GGMLTypeQ8_0, GGMLTypeQ4_0, GGMLTypeQ4_1, GGMLTypeIQ4_NL, GGMLTypeQ5_0, GGMLTypeQ5_1, } // WithLLaMACppCacheKeyType sets the cache key type for the estimate. func WithLLaMACppCacheKeyType(t GGMLType) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if slices.Contains(_GGUFEstimateCacheTypeAllowList, t) { o.LMCCacheKeyType = &t } } } // WithLLaMACppCacheValueType sets the cache value type for the estimate. func WithLLaMACppCacheValueType(t GGMLType) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if slices.Contains(_GGUFEstimateCacheTypeAllowList, t) { o.LMCCacheValueType = &t } } } // WithoutLLaMACppOffloadKVCache disables offloading the KV cache. func WithoutLLaMACppOffloadKVCache() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCOffloadKVCache = ptr.To(false) } } // WithLLaMACppOffloadLayers sets the number of layers to offload. func WithLLaMACppOffloadLayers(layers uint64) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCOffloadLayers = &layers } } // LLaMACppSplitMode is the split mode for LLaMACpp. type LLaMACppSplitMode uint const ( LLaMACppSplitModeLayer LLaMACppSplitMode = iota LLaMACppSplitModeRow LLaMACppSplitModeNone _LLAMACppSplitModeMax ) // WithLLaMACppSplitMode sets the split mode for the estimate. 
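//
// For example, to mirror llama.cpp's row-wise split, where the main GPU (see
// WithMainGPUIndex) keeps the intermediate results and KV:
//
//	WithLLaMACppSplitMode(LLaMACppSplitModeRow)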
func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if mode < _LLAMACppSplitModeMax { o.LMCSplitMode = mode } } } // WithLLaMACppFullSizeSWACache enables full size sliding window attention cache. func WithLLaMACppFullSizeSWACache() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCFullSizeSWACache = true } } // WithLLaMACppVisualMaxImageSize sets the visual maximum image size input for the estimate. func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size == 0 { return } o.LMCVisualMaxImageSize = &size } } // WithLLaMACppMaxProjectedCache sets the maximum projected embedding cache for the estimate. func WithLLaMACppMaxProjectedCache(cacheSize uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if cacheSize == 0 { return } o.LMCMaxProjectedCache = ptr.To(cacheSize) } } // WithLLaMACppDrafter sets the drafter estimate usage. func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCDrafter = dft } } // WithLLaMACppProjector sets the multimodal projector estimate usage. func WithLLaMACppProjector(prj *LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCProjector = prj } } // WithLLaMACppAdapters sets the adapters estimate usage. func WithLLaMACppAdapters(adp []LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(adp) == 0 { return } o.LMCAdapters = adp } } // WithStableDiffusionCppOffloadLayers sets the number of layers to offload. func WithStableDiffusionCppOffloadLayers(layers uint64) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCOffloadLayers = &layers } } // WithStableDiffusionCppBatchCount sets the batch count for the estimate. func WithStableDiffusionCppBatchCount(count int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if count == 0 { return } o.SDCBatchCount = ptr.To(count) } } // WithStableDiffusionCppHeight sets the image height for the estimate. func WithStableDiffusionCppHeight(height uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if height == 0 { return } o.SDCHeight = ptr.To(height) } } // WithStableDiffusionCppWidth sets the image width for the estimate. func WithStableDiffusionCppWidth(width uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if width == 0 { return } o.SDCWidth = ptr.To(width) } } // WithoutStableDiffusionCppOffloadConditioner disables offloading the conditioner(text encoder). func WithoutStableDiffusionCppOffloadConditioner() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCOffloadConditioner = ptr.To(false) } } // WithoutStableDiffusionCppOffloadAutoencoder disables offloading the autoencoder. func WithoutStableDiffusionCppOffloadAutoencoder() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCOffloadAutoencoder = ptr.To(false) } } // WithStableDiffusionCppAutoencoderTiling enables tiling for the autoencoder. func WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCAutoencoderTiling = ptr.To(true) } } // WithStableDiffusionCppFreeComputeMemoryImmediately enables freeing compute memory immediately. 
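//
// When enabled, the conditioning, diffusing, and decoding compute buffers are excluded
// from the estimate (see EstimateStableDiffusionCppRun), modeling a runtime that releases
// each compute graph as soon as it finishes. A minimal sketch, assuming the estimate
// entrypoint accepts options variadically:
//
//	e := f.EstimateStableDiffusionCppRun(WithStableDiffusionCppFreeComputeMemoryImmediately())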
func WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCFreeComputeMemoryImmediately = ptr.To(true) } } // WithStableDiffusionCppUpscaler sets the upscaler estimate usage. func WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCUpscaler = ups } } // WithStableDiffusionCppControlNet sets the control net estimate usage. func WithStableDiffusionCppControlNet(cn *StableDiffusionCppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCControlNet = cn } } ================================================ FILE: file_from_distro.go ================================================ package gguf_parser import ( "context" "errors" "fmt" "net/http" "path/filepath" "time" "github.com/gpustack/gguf-parser-go/util/httpx" ) var ( ErrOllamaInvalidModel = errors.New("ollama invalid model") ErrOllamaBaseLayerNotFound = errors.New("ollama base layer not found") ) // ParseGGUFFileFromOllama parses a GGUF file from Ollama model's base layer, // and returns a GGUFFile, or an error if any. func ParseGGUFFileFromOllama(ctx context.Context, model string, opts ...GGUFReadOption) (*GGUFFile, error) { return ParseGGUFFileFromOllamaModel(ctx, ParseOllamaModel(model), opts...) } // ParseGGUFFileFromOllamaModel is similar to ParseGGUFFileFromOllama, // but inputs an OllamaModel instead of a string. // // The given OllamaModel will be completed(fetching MediaType, Config and Layers) after calling this function. func ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaModel, opts ...GGUFReadOption) (gf *GGUFFile, err error) { if model == nil { return nil, ErrOllamaInvalidModel } opts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection()) var o _GGUFReadOptions for _, opt := range opts { opt(&o) } // Cache. { if o.CachePath != "" { o.CachePath = filepath.Join(o.CachePath, "distro", "ollama") } c := GGUFFileCache(o.CachePath) // Get from cache. if gf, err = c.Get(model.String(), o.CacheExpiration); err == nil { return gf, nil } // Put to cache. defer func() { if err == nil { _ = c.Put(model.String(), gf) } }() } var cli *http.Client cli = httpx.Client( httpx.ClientOptions(). WithUserAgent(OllamaUserAgent()). If(o.Debug, func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithDebug() }). WithTimeout(0). WithRetryBackoff(1*time.Second, 5*time.Second, 10). WithRetryIf(func(resp *http.Response, err error) bool { return httpx.DefaultRetry(resp, err) || OllamaRegistryAuthorizeRetry(resp, cli) }). WithTransport( httpx.TransportOptions(). WithoutKeepalive(). TimeoutForDial(10*time.Second). TimeoutForTLSHandshake(5*time.Second). If(o.SkipProxy, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutProxy() }). If(o.ProxyURL != nil, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithProxy(http.ProxyURL(o.ProxyURL)) }). If(o.SkipTLSVerification, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutInsecureVerify() }). 
If(o.SkipDNSCache, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutDNSCache() }))) var ml OllamaModelLayer { err := model.Complete(ctx, cli) if err != nil { return nil, fmt.Errorf("complete ollama model: %w", err) } var ok bool ml, ok = model.GetLayer("application/vnd.ollama.image.model") if !ok { return nil, ErrOllamaBaseLayerNotFound } } return parseGGUFFileFromRemote(ctx, cli, ml.BlobURL().String(), o) } ================================================ FILE: file_from_remote.go ================================================ package gguf_parser import ( "context" "fmt" "io" "net/http" "path/filepath" "strings" "time" "github.com/gpustack/gguf-parser-go/util/httpx" "github.com/gpustack/gguf-parser-go/util/osx" ) // ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face(https://huggingface.co/), // and returns a GGUFFile, or an error if any. func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) { ep := osx.Getenv("HF_ENDPOINT", "https://huggingface.co") return ParseGGUFFileRemote(ctx, fmt.Sprintf("%s/%s/resolve/main/%s", ep, repo, file), opts...) } // ParseGGUFFileFromModelScope parses a GGUF file from Model Scope(https://modelscope.cn/), // and returns a GGUFFile, or an error if any. func ParseGGUFFileFromModelScope(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) { ep := osx.Getenv("MS_ENDPOINT", "https://modelscope.cn") opts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection()) return ParseGGUFFileRemote(ctx, fmt.Sprintf("%s/models/%s/resolve/master/%s", ep, repo, file), opts...) } // ParseGGUFFileRemote parses a GGUF file from a remote BlobURL, // and returns a GGUFFile, or an error if any. func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (gf *GGUFFile, err error) { var o _GGUFReadOptions for _, opt := range opts { opt(&o) } // Cache. { if o.CachePath != "" { o.CachePath = filepath.Join(o.CachePath, "remote") if o.SkipLargeMetadata { o.CachePath = filepath.Join(o.CachePath, "brief") } } c := GGUFFileCache(o.CachePath) // Get from cache. if gf, err = c.Get(url, o.CacheExpiration); err == nil { return gf, nil } // Put to cache. defer func() { if err == nil { _ = c.Put(url, gf) } }() } cli := httpx.Client( httpx.ClientOptions(). WithUserAgent("gguf-parser-go"). If(o.Debug, func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithDebug() }, ). If(o.BearerAuthToken != "", func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithBearerAuth(o.BearerAuthToken) }, ). If(len(o.Headers) > 0, func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithHeaders(o.Headers) }, ). WithTimeout(0). WithTransport( httpx.TransportOptions(). WithoutKeepalive(). TimeoutForDial(5*time.Second). TimeoutForTLSHandshake(5*time.Second). TimeoutForResponseHeader(5*time.Second). If(o.SkipProxy, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutProxy() }, ). If(o.ProxyURL != nil, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithProxy(http.ProxyURL(o.ProxyURL)) }, ). If(o.SkipTLSVerification || !strings.HasPrefix(url, "https://"), func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutInsecureVerify() }, ). 
If(o.SkipDNSCache, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutDNSCache() }, ), ), ) return parseGGUFFileFromRemote(ctx, cli, url, o) } func parseGGUFFileFromRemote(ctx context.Context, cli *http.Client, url string, o _GGUFReadOptions) (*GGUFFile, error) { var urls []string { rs := CompleteShardGGUFFilename(url) if rs != nil { urls = rs } else { urls = []string{url} } } fs := make([]_GGUFFileReadSeeker, 0, len(urls)) defer func() { for i := range fs { osx.Close(fs[i]) } }() for i := range urls { req, err := httpx.NewGetRequestWithContext(ctx, urls[i]) if err != nil { return nil, fmt.Errorf("new request: %w", err) } sf, err := httpx.OpenSeekerFile(cli, req, httpx.SeekerFileOptions(). WithBufferSize(o.BufferSize). If(o.SkipRangeDownloadDetection, func(x *httpx.SeekerFileOption) *httpx.SeekerFileOption { return x.WithoutRangeDownloadDetect() }, ), ) if err != nil { return nil, fmt.Errorf("open http file: %w", err) } fs = append(fs, _GGUFFileReadSeeker{ Closer: sf, ReadSeeker: io.NewSectionReader(sf, 0, sf.Len()), Size: sf.Len(), }) } return parseGGUFFile(fs, o) } ================================================ FILE: file_metadata.go ================================================ package gguf_parser import ( "regexp" "slices" "sort" "strings" "golang.org/x/exp/maps" ) // GGUFMetadata represents the model metadata of a GGUF file. type GGUFMetadata struct { /* Basic */ // Type describes what type this GGUF file is, // default is "model". Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // QuantizationVersion describes the version of the quantization format. // // Not required if the model is not quantized (i.e. no tensors are quantized). // If any tensors are quantized, this must be present. // This is separate to the quantization scheme of the tensors itself, // the quantization version may change without changing the scheme's name, // e.g. the quantization scheme is Q5_K, and the QuantizationVersion is 4. QuantizationVersion uint32 `json:"quantizationVersion,omitempty"` // Alignment describes the alignment of the GGUF file. // // This can vary to allow for different alignment schemes, but it must be a multiple of 8. // Some writers may not write the alignment. // // Default is 32. Alignment uint32 `json:"alignment"` // Name to the model. // // This should be a human-readable name that can be used to identify the GGUF file. // It should be unique within the community that the model is defined in. Name string `json:"name,omitempty"` // Author to the model. Author string `json:"author,omitempty"` // URL to the model's homepage. // // This can be a GitHub repo, a paper, etc. URL string `json:"url,omitempty"` // Description to the model. Description string `json:"description,omitempty"` // License to the model. // // This is expressed as a SPDX license expression, e.g. "MIT OR Apache-2.0". License string `json:"license,omitempty"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` // FileTypeDescriptor describes the type of the GGUF file according to the FileType and trait layer. // // This supplies the FileType with more detail. FileTypeDescriptor string `json:"fileTypeDetail"` /* Appendix */ // LittleEndian is true if the GGUF file is little-endian, // and false for big-endian. LittleEndian bool `json:"littleEndian"` // FileSize is the size of the GGUF file in bytes. 
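//
// Note that FileSize covers the entire file, including the header, metadata, and alignment
// padding, whereas Size below is expected to reflect the tensor (model) data only.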
FileSize GGUFBytesScalar `json:"fileSize"` // Size is the model size. Size GGUFBytesScalar `json:"size"` // Parameters is the parameters of the GGUF file. Parameters GGUFParametersScalar `json:"parameters"` // BitsPerWeight is the bits per weight of the GGUF file. BitsPerWeight GGUFBitsPerWeightScalar `json:"bitsPerWeight"` } // GGUFFileType is a type of GGUF file, // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L419-L445, // and https://github.com/huggingface/huggingface.js/blob/d67a464473ca07fee9811a129e5fac8cc7487098/packages/tasks/src/gguf.ts#L4-L52. type GGUFFileType uint32 // GGUFFileType constants. // // GGUFFileTypeMostlyQ4_2, GGUFFileTypeMostlyQ4_3 are deprecated. // GGUFFileTypeMostlyQ4_0_4_4, GGUFFileTypeMostlyQ4_0_4_8, GGUFFileTypeMostlyQ4_0_8_8 are deprecated. // // GGUFFileTypeMostlyQ4_1_SOME_F16 is a special case where the majority of the tensors are Q4_1, // but 'token_embd.weight' and 'output.weight' tensors are F16. const ( GGUFFileTypeMostlyF32 GGUFFileType = iota // MOSTLY_F32 GGUFFileTypeMostlyF16 // MOSTLY_F16 GGUFFileTypeMostlyQ4_0 // MOSTLY_Q4_0 GGUFFileTypeMostlyQ4_1 // MOSTLY_Q4_1 GGUFFileTypeMostlyQ4_1_SOME_F16 // MOSTLY_Q4_1_SOME_F16 GGUFFileTypeMostlyQ4_2 // MOSTLY_Q4_2 GGUFFileTypeMostlyQ4_3 // MOSTLY_Q4_3 GGUFFileTypeMostlyQ8_0 // MOSTLY_Q8_0 GGUFFileTypeMostlyQ5_0 // MOSTLY_Q5_0 GGUFFileTypeMostlyQ5_1 // MOSTLY_Q5_1 GGUFFileTypeMostlyQ2_K // MOSTLY_Q2_K GGUFFileTypeMostlyQ3_K_S // MOSTLY_Q3_K_S GGUFFileTypeMostlyQ3_K_M // MOSTLY_Q3_K_M GGUFFileTypeMostlyQ3_K_L // MOSTLY_Q3_K_L GGUFFileTypeMostlyQ4_K_S // MOSTLY_Q4_K_S GGUFFileTypeMostlyQ4_K_M // MOSTLY_Q4_K_M GGUFFileTypeMostlyQ5_K_S // MOSTLY_Q5_K_S GGUFFileTypeMostlyQ5_K_M // MOSTLY_Q5_K_M GGUFFileTypeMostlyQ6_K // MOSTLY_Q6_K GGUFFileTypeMostlyIQ2_XXS // MOSTLY_IQ2_XXS GGUFFileTypeMostlyIQ2_XS // MOSTLY_IQ2_XS GGUFFileTypeMostlyQ2_K_S // MOSTLY_Q2_K_S GGUFFileTypeMostlyIQ3_XS // MOSTLY_IQ3_XS GGUFFileTypeMostlyIQ3_XXS // MOSTLY_IQ3_XXS GGUFFileTypeMostlyIQ1_S // MOSTLY_IQ1_S GGUFFileTypeMostlyIQ4_NL // MOSTLY_IQ4_NL GGUFFileTypeMostlyIQ3_S // MOSTLY_IQ3_S GGUFFileTypeMostlyIQ3_M // MOSTLY_IQ3_M GGUFFileTypeMostlyIQ2_S // MOSTLY_IQ2_S GGUFFileTypeMostlyIQ2_M // MOSTLY_IQ2_M GGUFFileTypeMostlyIQ4_XS // MOSTLY_IQ4_XS GGUFFileTypeMostlyIQ1_M // MOSTLY_IQ1_M GGUFFileTypeMostlyBF16 // MOSTLY_BF16 GGUFFileTypeMostlyQ4_0_4_4 // MOSTLY_Q4_0_4_4 GGUFFileTypeMostlyQ4_0_4_8 // MOSTLY_Q4_0_4_8 GGUFFileTypeMostlyQ4_0_8_8 // MOSTLY_Q4_0_8_8 GGUFFileTypeMostlyTQ1_0 // MOSTLY_TQ1_0 GGUFFileTypeMostlyTQ2_0 // MOSTLY_TQ2_0 GGUFFileTypeMostlyMXFP4 // MOSTLY_MXFP4 _GGUFFileTypeCount // Unknown ) // _GGUFPotentialDiffusionArchitectures holds a list representing the potential diffusion architectures. // // Since we will unify all diffusion architectures to "diffusion" during processing, // we can use this list to match the value in explicit `general.architecture`. var _GGUFPotentialDiffusionArchitectures = []string{ "flux", "sd", "sd2.5", "sd3", "stable-diffusion", } // _GGUFPotentialDiffusionArchitectureTensorsRegexes holds a list of regexes to match the potential diffusion architecture tensors. // // This is used to detect if the GGUF file is a diffusion model, // when the `general.architecture` is not set to a known diffusion architecture. 
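//
// For example, tensor names such as "model.diffusion_model.input_blocks.0.0.weight" (a typical
// SD UNet name) or "double_blocks.0.img_attn.qkv.weight" (a typical FLUX name) match the
// patterns below and mark the file as a diffusion model.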
var _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{ regexp.MustCompile(`^model\.diffusion_model\..*`), regexp.MustCompile(`^double_blocks\..*`), regexp.MustCompile(`^joint_blocks\..*`), regexp.MustCompile(`^decoder\..*`), regexp.MustCompile(`^encoder\..*`), regexp.MustCompile(`^text_model\..*`), } // Metadata returns the metadata of the GGUF file. func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { const ( typeKey = "general.type" architectureKey = "general.architecture" quantizationKey = "general.quantization_version" alignmentKey = "general.alignment" nameKey = "general.name" authorKey = "general.author" urlKey = "general.url" descriptionKey = "general.description" licenseKey = "general.license" controlVectorModelHintKey = "controlvector.model_hint" ) m, _ := gf.Header.MetadataKV.Index([]string{ typeKey, architectureKey, quantizationKey, alignmentKey, nameKey, authorKey, urlKey, descriptionKey, licenseKey, controlVectorModelHintKey, }) if v, ok := m[typeKey]; ok { gm.Type = v.ValueString() } else if _, ok = m[controlVectorModelHintKey]; ok { gm.Type = "adapter" } else { gm.Type = "model" } if v, ok := m[controlVectorModelHintKey]; ok { gm.Architecture = v.ValueString() } else if v, ok = m[architectureKey]; ok && !slices.Contains(_GGUFPotentialDiffusionArchitectures, v.ValueString()) { gm.Architecture = v.ValueString() if gm.Architecture == "clip" { gm.Type = "projector" } } else if gm.Type == "imatrix" { gm.Architecture = "imatrix" // Default to imatrix. } else { gm.Architecture = "llama" // Default to llama. for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { if gf.TensorInfos.Match(re) { gm.Architecture = "diffusion" break } } } if v, ok := m[quantizationKey]; ok { gm.QuantizationVersion = ValueNumeric[uint32](v) } if v, ok := m[alignmentKey]; ok { gm.Alignment = ValueNumeric[uint32](v) } else { gm.Alignment = 32 } if v, ok := m[nameKey]; ok { gm.Name = v.ValueString() } if v, ok := m[authorKey]; ok { gm.Author = v.ValueString() } if v, ok := m[urlKey]; ok { gm.URL = v.ValueString() } if v, ok := m[descriptionKey]; ok { gm.Description = v.ValueString() } if v, ok := m[licenseKey]; ok { gm.License = v.ValueString() } gm.FileType, gm.FileTypeDescriptor = gf.extractFileType(gm.Architecture) gm.LittleEndian = gf.Header.Version < GGUFVersionV3 || gf.Header.Magic == GGUFMagicGGUFLe gm.FileSize = gf.Size gm.Size = gf.ModelSize gm.Parameters = gf.ModelParameters gm.BitsPerWeight = gf.ModelBitsPerWeight return gm } // GGMLType returns the GGMLType of the GGUFFileType, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2730-L2763. 
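//
// For example, per the switch below:
//
//	GGUFFileTypeMostlyQ4_0.GGMLType()   // GGMLTypeQ4_0
//	GGUFFileTypeMostlyQ5_K_M.GGMLType() // GGMLTypeQ5_K
//	GGUFFileTypeMostlyIQ3_M.GGMLType()  // GGMLTypeIQ3_S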
func (t GGUFFileType) GGMLType() GGMLType { switch t { case GGUFFileTypeMostlyF32: return GGMLTypeF32 case GGUFFileTypeMostlyF16: return GGMLTypeF16 case GGUFFileTypeMostlyQ4_0: return GGMLTypeQ4_0 case GGUFFileTypeMostlyQ4_1: return GGMLTypeQ4_1 case GGUFFileTypeMostlyQ4_1_SOME_F16: return GGMLTypeQ4_1 case GGUFFileTypeMostlyQ4_2: return GGMLTypeQ4_2 case GGUFFileTypeMostlyQ4_3: return GGMLTypeQ4_3 case GGUFFileTypeMostlyQ8_0: return GGMLTypeQ8_0 case GGUFFileTypeMostlyQ5_0: return GGMLTypeQ5_0 case GGUFFileTypeMostlyQ5_1: return GGMLTypeQ5_1 case GGUFFileTypeMostlyQ2_K: return GGMLTypeQ2_K case GGUFFileTypeMostlyQ3_K_S: return GGMLTypeQ3_K case GGUFFileTypeMostlyQ3_K_M: return GGMLTypeQ4_K case GGUFFileTypeMostlyQ3_K_L: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ4_K_S: return GGMLTypeQ6_K case GGUFFileTypeMostlyQ4_K_M: return GGMLTypeQ4_K case GGUFFileTypeMostlyQ5_K_S: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ5_K_M: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ6_K: return GGMLTypeQ6_K case GGUFFileTypeMostlyIQ2_XXS: return GGMLTypeIQ2_XXS case GGUFFileTypeMostlyIQ2_XS: return GGMLTypeIQ2_XS case GGUFFileTypeMostlyQ2_K_S: return GGMLTypeQ2_K case GGUFFileTypeMostlyIQ3_XS: return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ3_XXS: return GGMLTypeIQ3_XXS case GGUFFileTypeMostlyIQ1_S: return GGMLTypeIQ1_S case GGUFFileTypeMostlyIQ4_NL: return GGMLTypeIQ4_NL case GGUFFileTypeMostlyIQ3_S: return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ3_M: return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ2_S: return GGMLTypeIQ2_XS case GGUFFileTypeMostlyIQ2_M: return GGMLTypeIQ2_S case GGUFFileTypeMostlyIQ4_XS: return GGMLTypeIQ4_XS case GGUFFileTypeMostlyIQ1_M: return GGMLTypeIQ1_M case GGUFFileTypeMostlyBF16: return GGMLTypeBF16 case GGUFFileTypeMostlyQ4_0_4_4: return GGMLTypeQ4_0_4_4 case GGUFFileTypeMostlyQ4_0_4_8: return GGMLTypeQ4_0_4_8 case GGUFFileTypeMostlyQ4_0_8_8: return GGMLTypeQ4_0_8_8 case GGUFFileTypeMostlyTQ1_0: return GGMLTypeTQ1_0 case GGUFFileTypeMostlyTQ2_0: return GGMLTypeTQ2_0 case GGUFFileTypeMostlyMXFP4: return GGMLTypeMXFP4 default: } return _GGMLTypeCount } // extractFileType extracts the GGUF file type from the metadata, // it tries to return the descriptor of the file type. func (gf *GGUFFile) extractFileType(arch string) (fileType GGUFFileType, fileTypeDescriptor string) { fileType, fileTypeDescriptor = _GGUFFileTypeCount, "Unknown" const fileTypeKey = "general.file_type" m, _ := gf.Header.MetadataKV.Index([]string{ fileTypeKey, }) if v, ok := m[fileTypeKey]; ok { fileType = GGUFFileType(ValueNumeric[uint32](v)) } if fileType == _GGUFFileTypeCount { // Guess. 
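// No usable general.file_type metadata, so guess: build a histogram of tensor types over a
// filtered set of representative tensors and let GetFileType pick the majority type.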
if len(gf.TensorInfos) != 0 { cm := make(map[GGMLType]int) for i := range gf.TensorInfos { switch { case arch != "diffusion" && !strings.HasPrefix(gf.TensorInfos[i].Name, "token_embd") && !strings.HasPrefix(gf.TensorInfos[i].Name, "blk.") && !strings.Contains(gf.TensorInfos[i].Name, "_norm") && !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): continue case arch == "diffusion" && !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): continue } cm[gf.TensorInfos[i].Type]++ } fileType = GetFileType(cm) } } if fileType == _GGUFFileTypeCount { return fileType, fileTypeDescriptor } fileTypeDescriptor = strings.TrimPrefix(fileType.String(), "MOSTLY_") const tokenEmbedWeightTensorName = "token_embd.weight" switch fileType { case GGUFFileTypeMostlyQ4_0: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { fileTypeDescriptor = "Q4_0_L" } } case GGUFFileTypeMostlyQ4_1: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { fileTypeDescriptor = "Q4_1_L" } } case GGUFFileTypeMostlyQ5_0: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q5_0_L" } } case GGUFFileTypeMostlyQ5_1: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q5_1_L" } } case GGUFFileTypeMostlyQ2_K: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ4_K { fileTypeDescriptor = "Q2_K_L" } } case GGUFFileTypeMostlyQ3_K_M: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q3_K_L" } } case GGUFFileTypeMostlyQ4_K_M: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q4_K_L" } } case GGUFFileTypeMostlyQ5_K_M: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q5_K_L" } } case GGUFFileTypeMostlyQ6_K: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q6_K_L" } } } return fileType, fileTypeDescriptor } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the given tensors counter. // // The input `cm` is a map of GGMLType to the count of tensors of that type. func GetFileType(cm map[GGMLType]int) GGUFFileType { if len(cm) == 0 { return _GGUFFileTypeCount } // Sort. ts := maps.Keys(cm) sort.Slice(ts, func(i, j int) bool { return cm[ts[i]] > cm[ts[j]] }) // Guess. 
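// The most frequent type drives the guess; a leading F32 usually comes from norms and biases,
// so fall back to the runner-up type, then disambiguate K-quant and IQ variants by the counts
// of their companion types.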
if ts[0] == GGMLTypeF32 { if len(ts) == 1 { return GGUFFileTypeMostlyF32 } ts[0] = ts[1] } switch ts[0] { case GGMLTypeF16: return GGUFFileTypeMostlyF16 case GGMLTypeQ4_0: return GGUFFileTypeMostlyQ4_0 case GGMLTypeQ4_1: return GGUFFileTypeMostlyQ4_1 case GGMLTypeQ4_2: return GGUFFileTypeMostlyQ4_2 case GGMLTypeQ4_3: return GGUFFileTypeMostlyQ4_3 case GGMLTypeQ5_0: return GGUFFileTypeMostlyQ5_0 case GGMLTypeQ5_1: return GGUFFileTypeMostlyQ5_1 case GGMLTypeQ8_0: return GGUFFileTypeMostlyQ8_0 case GGMLTypeQ2_K: if ts[len(ts)-1] == GGMLTypeQ5_K { return GGUFFileTypeMostlyQ2_K_S } return GGUFFileTypeMostlyQ2_K case GGMLTypeQ3_K: if cm[GGMLTypeQ8_0] > 0 || (cm[GGMLTypeQ5_K] > 1 && cm[GGMLTypeQ4_K] == 0) { return GGUFFileTypeMostlyQ3_K_L } if cm[GGMLTypeQ4_K] > 1 { return GGUFFileTypeMostlyQ3_K_M } return GGUFFileTypeMostlyQ3_K_S case GGMLTypeQ4_K: if cm[GGMLTypeQ6_K] > 1 { return GGUFFileTypeMostlyQ4_K_M } if cm[GGMLTypeQ3_K] > 1 { return GGUFFileTypeMostlyQ3_K_M } return GGUFFileTypeMostlyQ4_K_S case GGMLTypeQ5_K: if cm[GGMLTypeQ6_K] > 1 { return GGUFFileTypeMostlyQ5_K_M } return GGUFFileTypeMostlyQ5_K_S case GGMLTypeQ6_K: return GGUFFileTypeMostlyQ6_K case GGMLTypeIQ2_XXS: return GGUFFileTypeMostlyIQ2_XXS case GGMLTypeIQ2_XS: if cm[GGMLTypeIQ4_XS] > 1 { return GGUFFileTypeMostlyIQ2_S } return GGUFFileTypeMostlyIQ2_XS case GGMLTypeIQ2_S: return GGUFFileTypeMostlyIQ2_M case GGMLTypeIQ3_XXS: return GGUFFileTypeMostlyIQ3_XXS case GGMLTypeIQ3_S: if cm[GGMLTypeIQ3_XXS] > 1 { return GGUFFileTypeMostlyIQ3_XS } return GGUFFileTypeMostlyIQ3_S case GGMLTypeIQ1_S: return GGUFFileTypeMostlyIQ1_S case GGMLTypeIQ4_NL: return GGUFFileTypeMostlyIQ4_NL case GGMLTypeIQ4_XS: return GGUFFileTypeMostlyIQ4_XS case GGMLTypeIQ1_M: return GGUFFileTypeMostlyIQ1_M case GGMLTypeBF16: return GGUFFileTypeMostlyBF16 case GGMLTypeQ4_0_4_4: return GGUFFileTypeMostlyQ4_0_4_4 case GGMLTypeQ4_0_4_8: return GGUFFileTypeMostlyQ4_0_4_8 case GGMLTypeQ4_0_8_8: return GGUFFileTypeMostlyQ4_0_8_8 case GGMLTypeTQ1_0: return GGUFFileTypeMostlyTQ1_0 case GGMLTypeTQ2_0: return GGUFFileTypeMostlyTQ2_0 case GGMLTypeMXFP4: return GGUFFileTypeMostlyMXFP4 default: } return _GGUFFileTypeCount } ================================================ FILE: file_metadata_test.go ================================================ package gguf_parser import ( "context" "fmt" "os" "strings" "testing" "github.com/davecgh/go-spew/spew" "github.com/stretchr/testify/assert" ) func TestGGUFFile_Metadata(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f.Metadata()), "\n") } func BenchmarkGGUFFile_Metadata(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } f, err := ParseGGUFFile(mp, UseMMap(), SkipLargeMetadata()) if err != nil { b.Fatal(err) return } b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = f.Metadata() } } func TestGGUFFile_extractFileType(t *testing.T) { ctx := context.Background() repo := "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" cases := []string{ "Q2_K", "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q4_0", "Q4_K_M", "Q4_K_S", "Q5_0", "Q5_K_M", "Q5_K_S", "Q6_K", "Q8_0", } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("Hermes-2-Pro-Mistral-7B.%s.gguf", tc)) if err != nil { t.Fatal(err) return } md := 
gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } // Ignore unsupported cases for https://huggingface.co/Mungert/Qwen2.5-VL-3B-Instruct-GGUF/commit/42f8e463b233df7575f1e1e9a83cb5936db56d2a. repo = "Mungert/Qwen2.5-VL-3B-Instruct-GGUF" cases = []string{ "IQ2_M", "IQ2_S", "IQ2_XS", "IQ2_XXS", "IQ3_M", "IQ3_S", "IQ3_XS", "IQ3_XXS", "IQ4_NL", "IQ4_XS", // "Q2_K_L", "Q2_K_S", // "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q4_0", // "Q4_0_L", "Q4_1", // "Q4_1_L", // "Q4_K_L", "Q4_K_M", "Q4_K_S", "Q5_0", // "Q5_0_L", // "Q5_K_L", "Q5_K_M", "Q5_K_S", // "Q6_K_L", // "Q6_K_M", == "Q6_K" "Q8_0", } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("Qwen2.5-VL-3B-Instruct-%s.gguf", strings.ToLower(tc))) if err != nil { t.Fatal(err) return } md := gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } repo = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF" cases = []string{ "BF16", "Q2_K", "Q2_K_L", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("DeepSeek-R1-Distill-Qwen-1.5B-%s.gguf", tc)) if err != nil { t.Fatal(err) return } md := gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } repo = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF" cases = []string{ "IQ1_M", "IQ1_S", "IQ2_M", "IQ2_XXS", "IQ3_XXS", "IQ4_XS", // "Q2_K_XL" == "Q2_K_L" // "Q3_K_XL" == "Q3_K_M" // "Q4_K_XL" == "Q4_K_M" } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("DeepSeek-R1-Distill-Qwen-1.5B-UD-%s.gguf", tc)) if err != nil { t.Fatal(err) return } md := gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } } ================================================ FILE: file_option.go ================================================ package gguf_parser import ( "net/url" "path/filepath" "runtime" "strings" "time" "github.com/gpustack/gguf-parser-go/util/osx" ) type ( _GGUFReadOptions struct { Debug bool SkipLargeMetadata bool // Local. MMap bool // Remote. BearerAuthToken string Headers map[string]string ProxyURL *url.URL SkipProxy bool SkipTLSVerification bool SkipDNSCache bool BufferSize int SkipRangeDownloadDetection bool CachePath string CacheExpiration time.Duration } // GGUFReadOption is the option for reading the file. GGUFReadOption func(o *_GGUFReadOptions) ) // UseDebug uses debug mode to read the file. func UseDebug() GGUFReadOption { return func(o *_GGUFReadOptions) { o.Debug = true } } // SkipLargeMetadata skips reading large GGUFMetadataKV items, // which are not necessary for most cases. func SkipLargeMetadata() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipLargeMetadata = true } } // UseMMap uses mmap to read the local file. 
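//
// A hedged sketch (the path is illustrative):
//
//	f, err := ParseGGUFFile("/path/to/model.gguf", UseMMap(), SkipLargeMetadata())
//	if err != nil {
//		return err
//	}
//	_ = f.Metadata()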
func UseMMap() GGUFReadOption { return func(o *_GGUFReadOptions) { o.MMap = true } } // UseBearerAuth uses the given token as a bearer auth when reading from remote. func UseBearerAuth(token string) GGUFReadOption { return func(o *_GGUFReadOptions) { o.BearerAuthToken = token } } // UseHeaders uses the given headers when reading from remote. func UseHeaders(headers map[string]string) GGUFReadOption { return func(o *_GGUFReadOptions) { o.Headers = headers } } // UseProxy uses the given url as a proxy when reading from remote. func UseProxy(url *url.URL) GGUFReadOption { return func(o *_GGUFReadOptions) { o.ProxyURL = url } } // SkipProxy skips the proxy when reading from remote. func SkipProxy() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipProxy = true } } // SkipTLSVerification skips the TLS verification when reading from remote. func SkipTLSVerification() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipTLSVerification = true } } // SkipDNSCache skips the DNS cache when reading from remote. func SkipDNSCache() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipDNSCache = true } } // UseBufferSize sets the buffer size when reading from remote. func UseBufferSize(size int) GGUFReadOption { const minSize = 32 * 1024 if size < minSize { size = minSize } return func(o *_GGUFReadOptions) { o.BufferSize = size } } // SkipRangeDownloadDetection skips the range download detection when reading from remote. func SkipRangeDownloadDetection() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipRangeDownloadDetection = true } } // UseCache caches the remote reading result. func UseCache() GGUFReadOption { return func(o *_GGUFReadOptions) { o.CachePath = DefaultCachePath() o.CacheExpiration = 24 * time.Hour } } // SkipCache skips the cache when reading from remote. func SkipCache() GGUFReadOption { return func(o *_GGUFReadOptions) { o.CachePath = "" o.CacheExpiration = 0 } } // DefaultCachePath returns the default cache path. func DefaultCachePath() string { cd := filepath.Join(osx.UserHomeDir(), ".cache") if runtime.GOOS == "windows" { cd = osx.Getenv("APPDATA", cd) } return filepath.Join(cd, "gguf-parser") } // UseCachePath uses the given path to cache the remote reading result. func UseCachePath(path string) GGUFReadOption { path = strings.TrimSpace(filepath.Clean(osx.InlineTilde(path))) return func(o *_GGUFReadOptions) { if path == "" { return } o.CachePath = path } } // UseCacheExpiration uses the given expiration to cache the remote reading result. // // Disable cache expiration by setting it to 0. func UseCacheExpiration(expiration time.Duration) GGUFReadOption { if expiration < 0 { expiration = 0 } return func(o *_GGUFReadOptions) { o.CacheExpiration = expiration } } ================================================ FILE: file_test.go ================================================ package gguf_parser import ( "bytes" "context" "encoding/binary" "os" "testing" "time" "github.com/davecgh/go-spew/spew" ) func TestParseGGUFFile(t *testing.T) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { t.Skip("TEST_MODEL_PATH is not set") return } // Slow read. { f, err := ParseGGUFFile(mp) if err != nil { t.Fatal(err) return } s := spew.ConfigState{ Indent: " ", MaxDepth: 5, // Avoid console overflow. } t.Log("\n", s.Sdump(f), "\n") } // Fast read. 
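// Skipping large metadata avoids materializing big KV arrays (e.g. tokenizer token lists), which, combined with mmap, is what makes this path fast.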
{ f, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") } } func BenchmarkParseGGUFFileMMap(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } b.ReportAllocs() b.ResetTimer() b.Run("Normal", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("UseMMap", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp, UseMMap()) if err != nil { b.Fatal(err) return } } }) } func BenchmarkParseGGUFFileSkipLargeMetadata(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } b.ReportAllocs() b.ResetTimer() b.Run("Normal", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp, UseMMap()) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("SkipLargeMetadata", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { b.Fatal(err) return } } }) } func TestParseGGUFFileRemote(t *testing.T) { const u = "https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF" + "/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf" ctx := context.Background() // Slow read. { f, err := ParseGGUFFileRemote(ctx, u, UseDebug()) if err != nil { t.Fatal(err) return } s := spew.ConfigState{ Indent: " ", MaxDepth: 5, // Avoid console overflow. } t.Log("\n", s.Sdump(f), "\n") } // Fast read. { f, err := ParseGGUFFileRemote(ctx, u, UseDebug(), SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") } } func BenchmarkParseGGUFFileRemoteWithBufferSize(b *testing.B) { const u = "https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF" + "/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf" ctx := context.Background() b.ReportAllocs() b.ResetTimer() b.Run("256KibBuffer", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(256*1024)) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("1MibBuffer", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(1024*1024)) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("4MibBuffer", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(4*1024*1024)) if err != nil { b.Fatal(err) return } } }) } func TestParseGGUFFileFromHuggingFace(t *testing.T) { ctx := context.Background() cases := [][2]string{ { "TheBloke/Llama-2-13B-chat-GGUF", "llama-2-13b-chat.Q8_0.gguf", }, { "lmstudio-community/Yi-1.5-9B-Chat-GGUF", "Yi-1.5-9B-Chat-Q5_K_M.gguf", }, { "bartowski/gemma-2-9b-it-GGUF", "gemma-2-9b-it-Q3_K_M.gguf", }, } for _, tc := range cases { t.Run(tc[0]+"/"+tc[1], func(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace(ctx, tc[0], tc[1], SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") }) } } func TestParseGGUFFileFromModelScope(t *testing.T) { ctx := context.Background() cases := [][2]string{ { "qwen/Qwen1.5-0.5B-Chat-GGUF", "qwen1_5-0_5b-chat-q5_k_m.gguf", }, { "HIT-SCIR/huozi3-gguf", "huozi3-q2_k.gguf", }, { "shaowenchen/chinese-alpaca-2-13b-16k-gguf", "chinese-alpaca-2-13b-16k.Q5_K.gguf", }, } for _, tc := range cases { t.Run(tc[0]+"/"+tc[1], func(t 
*testing.T) { f, err := ParseGGUFFileFromModelScope(ctx, tc[0], tc[1], SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") }) } } func TestParseGGUFFileFromOllama(t *testing.T) { ctx := context.Background() cases := []string{ "gemma2", "llama3.1", "qwen2:72b-instruct-q3_K_M", } for _, tc := range cases { t.Run(tc, func(t *testing.T) { start := time.Now() f, err := ParseGGUFFileFromOllama(ctx, tc, SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Logf("cost: %v\n", time.Since(start)) t.Log("\n", spew.Sdump(f), "\n") }) } } // FuzzParseGGUFFile writes the fuzz input to a temp file and calls ParseGGUFFile. // Any panic during parsing will be reported by the fuzzing harness. func FuzzParseGGUFFile(f *testing.F) { buf := new(bytes.Buffer) bo := binary.LittleEndian for _, v := range []GGUFMagic{GGUFMagicGGML, GGUFMagicGGMF, GGUFMagicGGJT, GGUFMagicGGUFLe, GGUFMagicGGUFBe} { _ = binary.Write(buf, bo, uint32(v)) f.Add(buf.Bytes()) buf.Reset() } f.Fuzz(func(t *testing.T, data []byte) { tmp, err := os.CreateTemp("", "gguf_fuzz_*.gguf") if err != nil { t.Fatalf("create tmp: %v", err) } defer os.Remove(tmp.Name()) if _, err := tmp.Write(data); err != nil { t.Fatalf("write tmp: %v", err) } if err := tmp.Close(); err != nil { t.Fatalf("close tmp: %v", err) } // Call the public ParseGGUFFile which exercises parseGGUFFile. _, _ = ParseGGUFFile(tmp.Name()) }) } func TestParseGGUFFileWithFuzzInput(t *testing.T) { // Use the fuzz-generated data // data := []byte("GGUF\x00\x00\x00\x030000000000000000") data := []byte("FUGG\x00\x00\x00\x00GG>?\x00\x00\x00\x000000") // Create temp file tmpFile, err := os.CreateTemp("", "fuzz_test_gguf_*.gguf") if err != nil { t.Fatal(err) } defer os.Remove(tmpFile.Name()) _, err = tmpFile.Write(data) if err != nil { t.Fatal(err) } tmpFile.Close() // Parse should return error (since it's invalid or triggers the check) _, err = ParseGGUFFile(tmpFile.Name()) if err == nil { t.Error("expected error for fuzz-generated data") } else { t.Logf("got expected error: %v", err) } } ================================================ FILE: file_tokenizer.go ================================================ package gguf_parser // GGUFTokenizer represents the tokenizer metadata of a GGUF file. type GGUFTokenizer struct { /* Basic */ // Model is the model of the tokenizer. Model string `json:"model"` // TokensLength is the number of tokens. TokensLength uint64 `json:"tokensLength"` // MergesLength is the number of merges. MergesLength uint64 `json:"mergesLength"` // AddedTokensLength is the number of added tokens after training. AddedTokensLength uint64 `json:"addedTokenLength"` // BOSTokenID is the ID of the beginning of sentence token. // // Use -1 if the token is not found. BOSTokenID int64 `json:"bosTokenID"` // EOSTokenID is the ID of the end of sentence token. // // Use -1 if the token is not found. EOSTokenID int64 `json:"eosTokenID"` // EOTTokenID is the ID of the end of text token. // // Use -1 if the token is not found. EOTTokenID int64 `json:"eotTokenID"` // EOMTokenID is the ID of the end of message token. // // Use -1 if the token is not found. EOMTokenID int64 `json:"eomTokenID"` // UnknownTokenID is the ID of the unknown token. // // Use -1 if the token is not found. UnknownTokenID int64 `json:"unknownTokenID"` // SeparatorTokenID is the ID of the separator token. // // Use -1 if the token is not found. SeparatorTokenID int64 `json:"separatorTokenID"` // PaddingTokenID is the ID of the padding token. // // Use -1 if the token is not found.
PaddingTokenID int64 `json:"paddingTokenID"` /* Appendix */ // TokensSize is the size of tokens in bytes. TokensSize int64 `json:"tokensSize"` // MergesSize is the size of merges in bytes. MergesSize int64 `json:"mergesSize"` } // Tokenizer returns the tokenizer metadata of a GGUF file. func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer) { const ( modelKey = "tokenizer.ggml.model" tokensKey = "tokenizer.ggml.tokens" mergesKey = "tokenizer.ggml.merges" addedTokensKey = "tokenizer.ggml.added_tokens" bosTokenIDKey = "tokenizer.ggml.bos_token_id" eosTokenIDKey = "tokenizer.ggml.eos_token_id" eotTokenIDKey = "tokenizer.ggml.eot_token_id" eomTokenIDKey = "tokenizer.ggml.eom_token_id" unknownTokenIDKey = "tokenizer.ggml.unknown_token_id" separatorTokenIDKey = "tokenizer.ggml.separator_token_id" paddingTokenIDKey = "tokenizer.ggml.padding_token_id" ) m, _ := gf.Header.MetadataKV.Index([]string{ modelKey, tokensKey, mergesKey, addedTokensKey, bosTokenIDKey, eosTokenIDKey, eotTokenIDKey, eomTokenIDKey, unknownTokenIDKey, separatorTokenIDKey, paddingTokenIDKey, }) gt.BOSTokenID = -1 gt.EOSTokenID = -1 gt.EOTTokenID = -1 gt.EOMTokenID = -1 gt.UnknownTokenID = -1 gt.SeparatorTokenID = -1 gt.PaddingTokenID = -1 if v, ok := m[modelKey]; ok { gt.Model = v.ValueString() } if v, ok := m[tokensKey]; ok { arr := v.ValueArray() gt.TokensLength = arr.Len gt.TokensSize = arr.Size } if v, ok := m[mergesKey]; ok { arr := v.ValueArray() gt.MergesLength = arr.Len gt.MergesSize = arr.Size } if v, ok := m[addedTokensKey]; ok { gt.AddedTokensLength = v.ValueArray().Len } if v, ok := m[bosTokenIDKey]; ok { gt.BOSTokenID = ValueNumeric[int64](v) } if v, ok := m[eosTokenIDKey]; ok { gt.EOSTokenID = ValueNumeric[int64](v) } if v, ok := m[eotTokenIDKey]; ok { gt.EOTTokenID = ValueNumeric[int64](v) } if v, ok := m[eomTokenIDKey]; ok { gt.EOMTokenID = ValueNumeric[int64](v) } if v, ok := m[unknownTokenIDKey]; ok { gt.UnknownTokenID = ValueNumeric[int64](v) } if v, ok := m[separatorTokenIDKey]; ok { gt.SeparatorTokenID = ValueNumeric[int64](v) } if v, ok := m[paddingTokenIDKey]; ok { gt.PaddingTokenID = ValueNumeric[int64](v) } return gt } ================================================ FILE: file_tokenizer_test.go ================================================ package gguf_parser import ( "context" "os" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_Tokenizer(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f.Tokenizer()), "\n") } func BenchmarkGGUFFile_Tokenizer(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } f, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { b.Fatal(err) return } b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = f.Tokenizer() } } ================================================ FILE: filename.go ================================================ package gguf_parser import ( "fmt" "regexp" "strconv" "strings" "github.com/gpustack/gguf-parser-go/util/funcx" "github.com/gpustack/gguf-parser-go/util/ptr" ) // GGUFFilename represents a GGUF filename, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention.
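As a minimal usage sketch of the Tokenizer accessor above (./model.gguf is a hypothetical path; SkipLargeMetadata suffices here, as the benchmark above shows):

package main

import (
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Hypothetical path; any local GGUF file works.
	f, err := gguf.ParseGGUFFile("./model.gguf", gguf.SkipLargeMetadata())
	if err != nil {
		log.Fatal(err)
	}
	tk := f.Tokenizer()
	fmt.Println("model:", tk.Model)         // e.g. "llama" or "gpt2"
	fmt.Println("tokens:", tk.TokensLength) // vocabulary size
	if tk.BOSTokenID >= 0 {                 // -1 means the token is absent
		fmt.Println("bos:", tk.BOSTokenID)
	}
}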
type GGUFFilename struct { BaseName string `json:"baseName"` SizeLabel string `json:"sizeLabel"` FineTune string `json:"fineTune"` Version string `json:"version"` Encoding string `json:"encoding"` Type string `json:"type"` Shard *int `json:"shard,omitempty"` ShardTotal *int `json:"shardTotal,omitempty"` } var GGUFFilenameRegex = regexp.MustCompile(`^(?P<BaseName>[A-Za-z\s][A-Za-z0-9._\s]*(?:(?:-(?:(?:[A-Za-z\s][A-Za-z0-9._\s]*)|(?:[0-9._\s]*)))*))-(?:(?P<SizeLabel>(?:\d+x)?(?:\d+\.)?\d+[A-Za-z](?:-[A-Za-z]+(\d+\.)?\d+[A-Za-z]+)?)(?:-(?P<FineTune>[A-Za-z][A-Za-z0-9\s_-]+[A-Za-z](?i:[^BFKIQ])))?)?(?:-(?P<Version>[vV]\d+(?:\.\d+)*))?(?i:-(?P<Encoding>(BF16|F32|F16|([KI]?Q[0-9][A-Z0-9_]*))))?(?:-(?P<Type>LoRA|vocab))?(?:-(?P<Shard>\d{5})-of-(?P<ShardTotal>\d{5}))?\.gguf$`) // nolint:lll // ParseGGUFFilename parses the given GGUF filename string, // and returns the GGUFFilename, or nil if the filename is invalid. func ParseGGUFFilename(name string) *GGUFFilename { n := name if !strings.HasSuffix(n, ".gguf") { n += ".gguf" } m := make(map[string]string) { r := GGUFFilenameRegex.FindStringSubmatch(n) for i, ne := range GGUFFilenameRegex.SubexpNames() { if i != 0 && i <= len(r) { m[ne] = r[i] } } } if m["BaseName"] == "" { return nil } var gn GGUFFilename gn.BaseName = strings.ReplaceAll(m["BaseName"], "-", " ") gn.SizeLabel = m["SizeLabel"] gn.FineTune = m["FineTune"] gn.Version = m["Version"] gn.Encoding = m["Encoding"] gn.Type = m["Type"] if v := m["Shard"]; v != "" { gn.Shard = ptr.To(parseInt(v)) } if v := m["ShardTotal"]; v != "" { gn.ShardTotal = ptr.To(parseInt(v)) } return &gn } func (gn GGUFFilename) String() string { if gn.BaseName == "" { return "" } var sb strings.Builder sb.WriteString(strings.ReplaceAll(gn.BaseName, " ", "-")) if gn.SizeLabel != "" { sb.WriteString("-") sb.WriteString(gn.SizeLabel) } if gn.FineTune != "" { sb.WriteString("-") sb.WriteString(gn.FineTune) } if gn.Version != "" { sb.WriteString("-") sb.WriteString(gn.Version) } if gn.Encoding != "" { sb.WriteString("-") sb.WriteString(gn.Encoding) } if gn.Type != "" { sb.WriteString("-") sb.WriteString(gn.Type) } if m, n := ptr.Deref(gn.Shard, 0), ptr.Deref(gn.ShardTotal, 0); m > 0 && n > 0 { sb.WriteString("-") sb.WriteString(fmt.Sprintf("%05d", m)) sb.WriteString("-of-") sb.WriteString(fmt.Sprintf("%05d", n)) } sb.WriteString(".gguf") return sb.String() } // IsShard returns true if the GGUF filename is a shard. func (gn GGUFFilename) IsShard() bool { return ptr.Deref(gn.Shard, 0) > 0 && ptr.Deref(gn.ShardTotal, 0) > 0 } var ShardGGUFFilenameRegex = regexp.MustCompile(`^(?P<Prefix>.*)-(?:(?P<Shard>\d{5})-of-(?P<ShardTotal>\d{5}))\.gguf$`) // IsShardGGUFFilename returns true if the given filename is a shard GGUF filename. func IsShardGGUFFilename(name string) bool { n := name if !strings.HasSuffix(n, ".gguf") { n += ".gguf" } m := make(map[string]string) { r := ShardGGUFFilenameRegex.FindStringSubmatch(n) for i, ne := range ShardGGUFFilenameRegex.SubexpNames() { if i != 0 && i <= len(r) { m[ne] = r[i] } } } var shard, shardTotal int if v := m["Shard"]; v != "" { shard = parseInt(v) } if v := m["ShardTotal"]; v != "" { shardTotal = parseInt(v) } return shard > 0 && shardTotal > 0 } // CompleteShardGGUFFilename returns the list of shard GGUF filenames that are related to the given shard GGUF filename. // // Only available if the given filename is a shard GGUF filename.
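A quick sketch of the naming-convention helpers above, using vectors mirrored from filename_test.go below:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Base name, size label, version, encoding, and shard info are split out of the name.
	n := gguf.ParseGGUFFilename("Grok-100B-v1.0-Q4_0-00003-of-00009.gguf")
	fmt.Println(n.BaseName, n.SizeLabel, n.Version, n.Encoding) // Grok 100B v1.0 Q4_0
	fmt.Println(*n.Shard, *n.ShardTotal)                        // 3 9

	// Expanding a shard name lists every sibling shard.
	for _, s := range gguf.CompleteShardGGUFFilename("qwen2-72b-instruct-q6_k-00001-of-00002.gguf") {
		fmt.Println(s)
	}
}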
func CompleteShardGGUFFilename(name string) []string { n := name if !strings.HasSuffix(n, ".gguf") { n += ".gguf" } m := make(map[string]string) { r := ShardGGUFFilenameRegex.FindStringSubmatch(n) for i, ne := range ShardGGUFFilenameRegex.SubexpNames() { if i != 0 && i <= len(r) { m[ne] = r[i] } } } var shard, shardTotal int if v := m["Shard"]; v != "" { shard = parseInt(v) } if v := m["ShardTotal"]; v != "" { shardTotal = parseInt(v) } if shard <= 0 || shardTotal <= 0 { return nil } names := make([]string, 0, shardTotal) for i := 1; i <= shardTotal; i++ { names = append(names, fmt.Sprintf("%s-%05d-of-%05d.gguf", m["Prefix"], i, shardTotal)) } return names } func parseInt(v string) int { return int(funcx.MustNoError(strconv.ParseInt(v, 10, 64))) } ================================================ FILE: filename_test.go ================================================ package gguf_parser import ( "testing" "github.com/stretchr/testify/assert" "github.com/gpustack/gguf-parser-go/util/ptr" ) func TestParseGGUFFilename(t *testing.T) { cases := []struct { given string expected *GGUFFilename }{ { given: "Mixtral-8x7B-V0.1-KQ2.gguf", expected: &GGUFFilename{ BaseName: "Mixtral", SizeLabel: "8x7B", Version: "V0.1", Encoding: "KQ2", }, }, { given: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", expected: &GGUFFilename{ BaseName: "Grok", SizeLabel: "100B", Version: "v1.0", Encoding: "Q4_0", Shard: ptr.To(3), ShardTotal: ptr.To(9), }, }, { given: "Hermes-2-Pro-Llama-3-8B-F16.gguf", expected: &GGUFFilename{ BaseName: "Hermes 2 Pro Llama 3", SizeLabel: "8B", Encoding: "F16", }, }, { given: "Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf", expected: &GGUFFilename{ BaseName: "Phi 3 mini", SizeLabel: "3.8B-ContextLength4k", FineTune: "instruct", Version: "v1.0", }, }, { given: "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", expected: &GGUFFilename{ BaseName: "Meta Llama 3.1", SizeLabel: "405B", FineTune: "Instruct-XelotX", Encoding: "BF16", Shard: ptr.To(1), ShardTotal: ptr.To(18), }, }, { given: "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", expected: &GGUFFilename{ BaseName: "qwen2", SizeLabel: "72b", FineTune: "instruct", Encoding: "q6_k", Shard: ptr.To(1), ShardTotal: ptr.To(2), }, }, { given: "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", expected: nil, }, { given: "not-a-known-arrangement.gguf", expected: nil, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := ParseGGUFFilename(tc.given) assert.Equal(t, tc.expected, actual) }) } } func TestGGUFFilenameString(t *testing.T) { cases := []struct { given GGUFFilename expected string }{ { given: GGUFFilename{ BaseName: "Mixtral", SizeLabel: "8x7B", Version: "v0.1", Encoding: "KQ2", }, expected: "Mixtral-8x7B-v0.1-KQ2.gguf", }, { given: GGUFFilename{ BaseName: "Grok", SizeLabel: "100B", Version: "v1.0", Encoding: "Q4_0", Shard: ptr.To(3), ShardTotal: ptr.To(9), }, expected: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", }, { given: GGUFFilename{ BaseName: "Hermes 2 Pro Llama 3", SizeLabel: "8B", Encoding: "F16", }, expected: "Hermes-2-Pro-Llama-3-8B-F16.gguf", }, { given: GGUFFilename{ BaseName: "Phi 3 mini", SizeLabel: "3.8B-ContextLength4k", FineTune: "instruct", Version: "v1.0", }, expected: "Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf", }, { given: GGUFFilename{}, expected: "", }, } for _, tc := range cases { t.Run(tc.expected, func(t *testing.T) { actual := tc.given.String() assert.Equal(t, tc.expected, actual) }) } } func TestIsShardGGUFFilename(t *testing.T) { cases := []struct { given 
string expected bool }{ { given: "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", expected: true, }, { given: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", expected: true, }, { given: "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", expected: true, }, { given: "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", expected: true, }, { given: "not-a-known-arrangement.gguf", expected: false, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := IsShardGGUFFilename(tc.given) assert.Equal(t, tc.expected, actual) }) } } func TestCompleteShardGGUFFilename(t *testing.T) { cases := []struct { given string expected []string }{ { given: "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", expected: []string{ "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", "qwen2-72b-instruct-q6_k-00002-of-00002.gguf", }, }, { given: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", expected: []string{ "Grok-100B-v1.0-Q4_0-00001-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00002-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00004-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00005-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00006-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00007-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00008-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00009-of-00009.gguf", }, }, { given: "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", expected: []string{ "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00002-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00003-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00004-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00005-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00006-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00007-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00008-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00009-of-00009.gguf", }, }, { given: "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", expected: []string{ "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00002-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00003-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00004-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00005-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00006-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00007-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00008-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00009-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00010-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00011-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00012-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00013-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00014-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00015-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00016-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00017-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00018-of-00018.gguf", }, }, { given: "not-a-known-arrangement.gguf", expected: nil, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := CompleteShardGGUFFilename(tc.given) assert.Equal(t, tc.expected, actual) }) } } ================================================ FILE: gen.go ================================================ //go:generate 
go generate -tags stringer gen.stringer.go //go:generate go generate -tags regression gen.regression.go package gguf_parser ================================================ FILE: gen.regression.go ================================================ //go:build regression //go:generate go run -tags regression gen.regression.go package main import ( "fmt" "strconv" "math" "os" "text/template" "bytes" "go/format" "gonum.org/v1/gonum/mat" "golang.org/x/exp/maps" "sort" ) type LinearRegression struct { Intercept float64 Slope float64 } func (lr *LinearRegression) Fit(xs, ys []float64) { if len(xs) != len(ys) { panic("length of xs and ys must be the same") } var sX, sY, sXY, sXX float64 for i := 0; i < len(xs); i++ { sX += xs[i] sY += ys[i] sXY += xs[i] * ys[i] sXX += xs[i] * xs[i] } n := float64(len(xs)) d := n*sXX - sX*sX if d == 0 { d = 1 } lr.Slope = (n*sXY - sX*sY) / d lr.Intercept = (sY*sXX - sX*sXY) / d } func (lr *LinearRegression) Predict(x float64) (y float64) { return lr.Intercept + lr.Slope*x } type PolynomialRegression struct { Degree int Coefficients []float64 } func (pr *PolynomialRegression) Fit(xs, ys []float64) { samples := len(xs) feats := pr.Degree + 1 feat := mat.NewDense(samples, feats, nil) { for i := 0; i < samples; i++ { for j := 0; j < feats; j++ { feat.Set(i, j, math.Pow(xs[i], float64(j))) } } var qr mat.QR qr.Factorize(feat) } yVec := mat.NewVecDense(samples, ys) var coef mat.VecDense if err := coef.SolveVec(feat, yVec); err != nil { panic("failed to solve") } pr.Coefficients = coef.RawVector().Data } func (pr *PolynomialRegression) Predict(x float64) (y float64) { y = 0 for i := 0; i < pr.Degree+1; i++ { y += pr.Coefficients[i] * math.Pow(x, float64(i)) } return } func DiffusionModelMemoryUsageRegression(output string) { type Regression struct { Name string LinearRegression *LinearRegression PolynomialRegression *PolynomialRegression } const tmplStr = ` package gguf_parser import "math" {{ range . -}} // {{ .Name }} returns the memory usage in bytes for the given width and height, // which is calculated by linear regression or polynomial regression. 
func {{ .Name }}(width, height uint32, flashAttention bool) uint64 { coefficients := []float64{ {{ range $i, $c := .PolynomialRegression.Coefficients }}{{ if eq $i 0 }}{{ printf "%.4f" $c }}{{ else }}{{ printf "%.10f" $c }}{{ end }}, {{ end }} } degree := {{ .PolynomialRegression.Degree }} x := float64(width * height) {{ if .LinearRegression -}} if flashAttention { coefficients = []float64{ {{ printf "%.5f" .LinearRegression.Intercept }}, {{ printf "%.10f" .LinearRegression.Slope }} } degree = 1 } {{- end }} y := float64(0) for i := 0; i <= degree; i++ { y += coefficients[i] * math.Pow(x, float64(i)) } return uint64(y) } {{ end }} ` ts := []struct { n string x2y map[float64]float64 c map[float64]float64 fax2y map[float64]float64 fac map[float64]float64 }{ { n: "GuessSD1DiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 49.57 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 559.90 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8360.93 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 18681.62 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 25377.96 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 41842.65 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 77333.77 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 49.57, 512 * 512: 559.90, 1024 * 1024: 8360.93, 1024 * 1536: 18681.62, 1024 * 1792: 25377.96, 1536 * 1536: 41842.65, 1792 * 1792: 77333.77, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 56879.17 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 100924.37 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 56879.17, 1792 * 2048: 100924.37, }, }, { n: "GuessSD2DiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 37.65 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 367.98 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 11769.69 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 15970.04 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 26290.73 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 48521.84 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 37.65, 512 * 512: 367.98, 1024 * 1024: 830.86, 1024 * 1536: 11769.69, 1024 * 1792: 15970.04, 1536 * 1536: 26290.73, 1792 * 1792: 48521.84, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 35711.24 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 63292.44 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 35711.24, 1792 * 2048: 63292.44, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 34.52 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 130.48 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 519.01 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 774.69 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 902.54 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1158.23 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute 
buffer size: 1573.72 MB(VRAM) // 1792*1792 fax2y: map[float64]float64{ 256 * 256: 34.52, 512 * 512: 130.48, 1024 * 1024: 519.01, 1024 * 1536: 774.69, 1024 * 1792: 902.54, 1536 * 1536: 1158.23, 1792 * 1792: 1573.72, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1349.99 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1797.44 MB(VRAM) // 1792*2048 fac: map[float64]float64{ 1536 * 1792: 1349.99, 1792 * 2048: 1797.44, }, }, { n: "GuessSDXLDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.76 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1701.55 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2256.90 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 3607.58 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 6484.95 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 60.76, 512 * 512: 132.05, 1024 * 1024: 830.86, 1024 * 1536: 1701.55, 1024 * 1792: 2256.90, 1536 * 1536: 3607.58, 1792 * 1792: 6484.95, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4830.60 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8384.30 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 4830.60, 1792 * 2048: 8384.30, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.13 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 440.86 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 726.55 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 874.40 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1110.08 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1554.33 MB(VRAM) // 1792*1792 fax2y: map[float64]float64{ 256 * 256: 60.13, 512 * 512: 132.05, 1024 * 1024: 440.86, 1024 * 1536: 726.55, 1024 * 1792: 874.40, 1536 * 1536: 1110.08, 1792 * 1792: 1554.33, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1339.35 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1769.30 MB(VRAM) // 1792*2048 fac: map[float64]float64{ 1536 * 1792: 1339.35, 1792 * 2048: 1769.30, }, }, { // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 968.43 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2013.12 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2679.46 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4300.15 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 7752.77 MB(VRAM) // 1792*1792 n: "GuessSDXLRefinerDiffusionModelMemoryUsage", x2y: map[float64]float64{ 256 * 256: 44.57, 512 * 512: 154.40, 1024 * 1024: 968.43, 1024 * 1536: 2013.12, 1024 * 1792: 2679.46, 1536 * 1536: 4300.15, 1792 * 1792: 7752.77, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 5767.67 MB(VRAM) // 1536*1792 
// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 10031.87 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 5767.67, 1792 * 2048: 10031.87, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 596.43 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 915.12 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1062.46 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1357.15 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1836.02 MB(VRAM) // 1792*1792 fax2y: map[float64]float64{ 256 * 256: 44.57, 512 * 512: 154.40, 1024 * 1024: 596.43, 1024 * 1536: 915.12, 1024 * 1792: 1062.46, 1536 * 1536: 1357.15, 1792 * 1792: 1836.02, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1578.17 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2014.02 MB(VRAM) // 1792*2048 fac: map[float64]float64{ 1536 * 1792: 1578.17, 1792 * 2048: 2014.02, }, }, { n: "GuessSD3MediumDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 37.09 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 169.64 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 1786.11 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 3824.36 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 5131.48 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 8319.03 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 15141.18 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 37.09, 512 * 512: 169.64, 1024 * 1024: 1786.11, 1024 * 1536: 3824.36, 1024 * 1792: 5131.48, 1536 * 1536: 8319.03, 1792 * 1792: 15141.18, }, // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 11215.71 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19654.65 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 11215.71, 1792 * 2048: 19654.65, }, }, { n: "GuessSD35MediumDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 41.48 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 181.64 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 1834.11 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 3896.36 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 5215.48 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8427.03 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 15288.18 MiB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 41.48, 512 * 512: 181.64, 1024 * 1024: 1834.11, 1024 * 1536: 3896.36, 1024 * 1792: 5215.48, 1536 * 1536: 8427.03, 1792 * 1792: 15288.18, }, // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 11341.71 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19822.65 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 11341.71, 1792 * 2048: 19822.65, }, }, { n: "GuessSD35LargeDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer 
size: 57.27 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 276.54 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 2865.44 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 6109.95 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8188.92 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 13258.86 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 24091.01 MiB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 57.27, 512 * 512: 276.54, 1024 * 1024: 2865.44, 1024 * 1536: 6109.95, 1024 * 1792: 8188.92, 1536 * 1536: 13258.86, 1792 * 1792: 24091.01, }, // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 17859.31 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 31253.70 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 17859.31, 1792 * 2048: 31253.70, }, }, { n: "GuessFLUXDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 103.35 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 398.05 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 2576.18 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 4978.31 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 6467.37 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 10021.49 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 17434.95 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 103.35, 512 * 512: 398.05, 1024 * 1024: 2576.18, 1024 * 1536: 4978.31, 1024 * 1792: 6467.37, 1536 * 1536: 10021.49, 1792 * 1792: 17434.95, }, // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 13191.09 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 22266.81 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 13191.09, 1792 * 2048: 22266.81, }, }, } rs := make([]Regression, len(ts)) for i, t := range ts { rs[i].Name = t.n } fmt.Println("Polynomial Regression For None Flash Attention") for i, t := range ts { pr := PolynomialRegression{ Degree: 2, } xs := maps.Keys(t.x2y) sort.Float64s(xs) ys := make([]float64, len(xs)) for j, x := range xs { ys[j] = t.x2y[x] * 1024 * 1024 // MB to B } pr.Fit(xs, ys) for x, y := range t.c { y_ := pr.Predict(x) / 1024 / 1024 // B to MB d := (y_ - y) / y * 100 s := "+" if d < 0 { s = "" } c := "" if d > 10 { c = "?" } fmt.Printf("%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\n", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c) } rs[i].PolynomialRegression = &pr } fmt.Println("Linear Regression For Flash Attention") for i, t := range ts { if len(t.fax2y) == 0 { continue } lr := LinearRegression{} xs := maps.Keys(t.fax2y) sort.Float64s(xs) ys := make([]float64, len(xs)) for j, x := range xs { ys[j] = t.fax2y[x] * 1024 * 1024 // MB to B } lr.Fit(xs, ys) for x, y := range t.fac { y_ := lr.Predict(x) / 1024 / 1024 // B to MB d := (y_ - y) / y * 100 s := "+" if d < 0 { s = "" } c := "" if d > 10 { c = "?" 
} fmt.Printf("%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\n", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c) } rs[i].LinearRegression = &lr } var code []byte { var ( buff bytes.Buffer err error ) tmpl := template.Must(template.New("tmpl").Parse(tmplStr)) if err = tmpl.Execute(&buff, rs); err != nil { panic(fmt.Errorf("failed to execute template: %w", err)) } code, err = format.Source(buff.Bytes()) if err != nil { panic(fmt.Errorf("failed to format source: %w", err)) } } if err := os.WriteFile(output, code, 0644); err != nil { panic(fmt.Errorf("failed to write file: %w", err)) } } func main() { DiffusionModelMemoryUsageRegression("zz_generated.diffusion_model_memory_usage.regression.go") } ================================================ FILE: gen.stringer.go ================================================ //go:build stringer //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.stringer.go -trimprefix GGUFMetadataValueType //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix GGUFFileType //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLType package gguf_parser import _ "golang.org/x/tools/cmd/stringer" ================================================ FILE: ggml.go ================================================ package gguf_parser import ( "errors" "fmt" "slices" ) // Types for GGMLType. type ( // GGMLType is a type of GGML tensor, // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L368-L410. GGMLType uint32 // GGMLTypeTrait holds the trait of a GGMLType, // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/src/ggml.c#L586-L876. GGMLTypeTrait struct { BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. Quantized bool } ) // GGMLType constants. // // GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. // GGMLTypeQ4_0_4_4, GGMLTypeQ4_0_4_8, GGMLTypeQ4_0_8_8 are deprecated. // GGMLTypeIQ4_NL_4_4, GGMLTypeIQ4_NL_4_8, GGMLTypeIQ4_NL_8_8 are deprecated. const ( GGMLTypeF32 GGMLType = iota GGMLTypeF16 GGMLTypeQ4_0 GGMLTypeQ4_1 GGMLTypeQ4_2 GGMLTypeQ4_3 GGMLTypeQ5_0 GGMLTypeQ5_1 GGMLTypeQ8_0 GGMLTypeQ8_1 GGMLTypeQ2_K GGMLTypeQ3_K GGMLTypeQ4_K GGMLTypeQ5_K GGMLTypeQ6_K GGMLTypeQ8_K GGMLTypeIQ2_XXS GGMLTypeIQ2_XS GGMLTypeIQ3_XXS GGMLTypeIQ1_S GGMLTypeIQ4_NL GGMLTypeIQ3_S GGMLTypeIQ2_S GGMLTypeIQ4_XS GGMLTypeI8 GGMLTypeI16 GGMLTypeI32 GGMLTypeI64 GGMLTypeF64 GGMLTypeIQ1_M GGMLTypeBF16 GGMLTypeQ4_0_4_4 GGMLTypeQ4_0_4_8 GGMLTypeQ4_0_8_8 GGMLTypeTQ1_0 GGMLTypeTQ2_0 GGMLTypeIQ4_NL_4_4 GGMLTypeIQ4_NL_4_8 GGMLTypeIQ4_NL_8_8 GGMLTypeMXFP4 _GGMLTypeCount // Unknown ) // _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType. 
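Stepping back to the regression generator above: LinearRegression.Fit is ordinary least squares in closed form, with slope = (n·Σxy − Σx·Σy) / (n·Σxx − (Σx)²) and intercept = (Σy·Σxx − Σx·Σxy) / (n·Σxx − (Σx)²). A self-contained sketch of the same formula, checked against points that lie exactly on a line (assuming a non-degenerate denominator, which the generator guards against separately):

package main

import "fmt"

// fit mirrors LinearRegression.Fit from gen.regression.go above.
func fit(xs, ys []float64) (intercept, slope float64) {
	var sX, sY, sXY, sXX float64
	for i := range xs {
		sX += xs[i]
		sY += ys[i]
		sXY += xs[i] * ys[i]
		sXX += xs[i] * xs[i]
	}
	n := float64(len(xs))
	d := n*sXX - sX*sX
	return (sY*sXX - sX*sXY) / d, (n*sXY - sX*sY) / d
}

func main() {
	// Points on y = 2x + 1 recover intercept 1 and slope 2 exactly.
	i, s := fit([]float64{1, 2, 3}, []float64{3, 5, 7})
	fmt.Println(i, s) // 1 2
}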
var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ GGMLTypeF32: {BlockSize: 1, TypeSize: 4}, GGMLTypeF16: {BlockSize: 1, TypeSize: 2}, GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true}, GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true}, GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true}, GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true}, GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true}, GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true}, GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true}, GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true}, GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true}, GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true}, GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true}, GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true}, GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true}, GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true}, GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true}, GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true}, GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true}, GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true}, GGMLTypeI8: {BlockSize: 1, TypeSize: 1}, GGMLTypeI16: {BlockSize: 1, TypeSize: 2}, GGMLTypeI32: {BlockSize: 1, TypeSize: 4}, GGMLTypeI64: {BlockSize: 1, TypeSize: 8}, GGMLTypeF64: {BlockSize: 1, TypeSize: 8}, GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true}, GGMLTypeBF16: {BlockSize: 1, TypeSize: 2}, GGMLTypeQ4_0_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeQ4_0_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeQ4_0_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeTQ1_0: {BlockSize: 256, TypeSize: 54, Quantized: true}, GGMLTypeTQ2_0: {BlockSize: 256, TypeSize: 66, Quantized: true}, GGMLTypeIQ4_NL_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeMXFP4: {BlockSize: 32, TypeSize: 17, Quantized: true}, } // Trait returns the GGMLTypeTrait of the GGMLType. func (t GGMLType) Trait() (GGMLTypeTrait, bool) { tt, ok := _GGMLTypeTraits[t] return tt, ok } // IsQuantized returns whether the GGMLType is quantized. func (t GGMLType) IsQuantized() bool { tt, ok := t.Trait() if !ok { return false } return tt.Quantized } // RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, // which is inspired by // https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145. // // The index of the given dimensions means the number of dimension, // i.e. 0 is the first dimension, 1 is the second dimension, and so on. // // The value of the item is the number of elements in the corresponding dimension. 
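To make the row-size rule below concrete, a small sketch using the trait table above: the row cost is TypeSize * dimensions[0] / BlockSize, multiplied by each remaining dimension. Q4_K packs 256 elements into 144 bytes, so a 4096-wide row costs 4096 / 256 * 144 = 2304 bytes.

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGMLTypeQ4_K.RowSizeOf([]uint64{4096}))     // 2304
	fmt.Println(gguf.GGMLTypeQ4_K.RowSizeOf([]uint64{4096, 32})) // 73728 (2304 * 32)
	fmt.Println(gguf.GGMLTypeF16.IsQuantized())                  // false (plain 2-byte floats)
}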
func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 { if len(dimensions) == 0 { panic(errors.New("no dimensions")) } tt, ok := t.Trait() if !ok { panic(fmt.Errorf("invalid type: %v", t)) } // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643 ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size for i := 1; i < len(dimensions); i++ { ds *= dimensions[i] } return ds } // GGMLMemoryPadding returns the padded size of the given size according to GGML memory padding, // see https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243. func GGMLMemoryPadding(size uint64) uint64 { const align = 16 return GGMLPadding(size, align) } // GGMLPadding returns the padded size of the given size according to given align, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255. func GGMLPadding(size, align uint64) uint64 { return (size + align - 1) &^ (align - 1) } // GGML tensor constants. const ( // GGMLTensorSize is the size of GGML tensor in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606. GGMLTensorSize = 368 // GGMLObjectSize is the size of GGML object in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563. GGMLObjectSize = 32 ) // GGMLTensorOverhead is the overhead of GGML tensor in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767. func GGMLTensorOverhead() uint64 { return GGMLObjectSize + GGMLTensorSize } // GGML computation graph constants. const ( // GGMLComputationGraphSize is the size of GGML computation graph in bytes. GGMLComputationGraphSize = 80 // GGMLComputationBitsetSize is the size of GGML computation bitset in bytes, // see https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-impl.h#L165. GGMLComputationBitsetSize = 4 ) // GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, // see https://github.com/ggml-org/ggml/blob/5592ffda9c417c3c12232c828247c23d17004c88/src/ggml.c#L5941-L5956. func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 { const ps = 8 // c++ pointer size hs := GGMLHashSize(nodes * 2) var g uint64 = GGMLComputationGraphSize // graph g += GGMLPadding(nodes*ps, ps) // nodes g += GGMLPadding(nodes*ps, ps) // leafs g += GGMLPadding(nodes*ps, ps) // parents g += GGMLPadding(hs*ps, ps) // hash keys if grads { g += GGMLPadding(hs*ps, ps) // grads g += GGMLPadding(hs*ps, ps) // grad_accs } g += GGMLPadding(GGMLBitsetSize(hs)*GGMLComputationBitsetSize, GGMLComputationBitsetSize) // bitset return GGMLObjectSize + GGMLMemoryPadding(g) } // GGMLHashSize returns the size of the hash table for the given base, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722. 
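The padding helpers above are plain bit arithmetic: (size + align - 1) &^ (align - 1) rounds up to the next multiple of align. A sketch with worked values:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGMLPadding(100, 16))   // 112, the next multiple of 16
	fmt.Println(gguf.GGMLMemoryPadding(100)) // 112, align is fixed at 16
	// Per-tensor bookkeeping is the object size plus the tensor struct size.
	fmt.Println(gguf.GGMLTensorOverhead()) // 400 (32 + 368)
}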
func GGMLHashSize(base uint64) uint64 { primes := []uint64{ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, 2053, 4099, 8209, 16411, 32771, 65537, 131101, 262147, 524309, 1048583, 2097169, 4194319, 8388617, 16777259, 33554467, 67108879, 134217757, 268435459, 536870923, 1073741827, 2147483659, } i, ok := slices.BinarySearchFunc(primes, base, func(e, t uint64) int { if t >= e { return 0 } return -1 }) if !ok { return base | 1 } return primes[i] } // GGMLBitsetSize returns the size of the bitset for the given number of bits, // see https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/ggml/src/ggml-impl.h#L166-L171. func GGMLBitsetSize(n uint64) uint64 { return (n + (GGMLComputationBitsetSize*8 - 1)) >> 5 } ================================================ FILE: go.mod ================================================ module github.com/gpustack/gguf-parser-go go 1.22.0 toolchain go1.22.9 require ( github.com/davecgh/go-spew v1.1.1 github.com/henvic/httpretty v0.1.4 github.com/json-iterator/go v1.1.12 github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d github.com/stretchr/testify v1.9.0 golang.org/x/crypto v0.29.0 golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f golang.org/x/sync v0.9.0 golang.org/x/sys v0.27.0 golang.org/x/tools v0.27.0 gonum.org/v1/gonum v0.15.1 ) require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect golang.org/x/mod v0.22.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) ================================================ FILE: go.sum ================================================ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU= github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify 
v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o= golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: ollama_model.go ================================================ package gguf_parser import ( "context" "fmt" "net/http" "net/url" "regexp" "strings" "golang.org/x/sync/errgroup" "github.com/gpustack/gguf-parser-go/util/httpx" "github.com/gpustack/gguf-parser-go/util/json" "github.com/gpustack/gguf-parser-go/util/stringx" ) // Inspired by https://github.com/ollama/ollama/blob/380e06e5bea06ae8ded37f47c37bd5d604194d3e/types/model/name.go, // and https://github.com/ollama/ollama/blob/380e06e5bea06ae8ded37f47c37bd5d604194d3e/server/modelpath.go. const ( OllamaDefaultScheme = "https" OllamaDefaultRegistry = "registry.ollama.ai" OllamaDefaultNamespace = "library" OllamaDefaultTag = "latest" ) type ( // OllamaModel represents an Ollama model, // its manifest(including MediaType, Config and Layers) can be completed further by calling the Complete method. OllamaModel struct { Schema string `json:"schema"` Registry string `json:"registry"` Namespace string `json:"namespace"` Repository string `json:"repository"` Tag string `json:"tag"` SchemaVersion uint32 `json:"schemaVersion"` MediaType string `json:"mediaType"` Config OllamaModelLayer `json:"config"` Layers []OllamaModelLayer `json:"layers"` // Client is the http client used to complete the OllamaModel's network operations. // // When this field is nil, // it will be set to the client used by OllamaModel.Complete. // // When this field is offered, // the network operations will be done with this client. Client *http.Client `json:"-"` } // OllamaModelLayer represents an Ollama model layer, // its digest can be used to download the artifact. 
OllamaModelLayer struct { MediaType string `json:"mediaType"` Size uint64 `json:"size"` Digest string `json:"digest"` // Root points to the root OllamaModel, // which is never serialized or deserialized. // // When called OllamaModel.Complete, // this field will be set to the OllamaModel itself. // If not, this field will be nil, // and must be set manually to the root OllamaModel before calling the method of OllamaModelLayer. Root *OllamaModel `json:"-"` } ) // ParseOllamaModel parses the given Ollama model string, // and returns the OllamaModel, or nil if the model is invalid. func ParseOllamaModel(model string, opts ...OllamaModelOption) *OllamaModel { if model == "" { return nil } var o _OllamaModelOptions for _, opt := range opts { opt(&o) } om := OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Tag: OllamaDefaultTag, } { if o.DefaultScheme != "" { om.Schema = o.DefaultScheme } if o.DefaultRegistry != "" { om.Registry = o.DefaultRegistry } if o.DefaultNamespace != "" { om.Namespace = o.DefaultNamespace } if o.DefaultTag != "" { om.Tag = o.DefaultTag } } m := model // Drop digest. m, _, _ = stringx.CutFromRight(m, "@") // Get tag. m, s, ok := stringx.CutFromRight(m, ":") if ok && s != "" { om.Tag = s } // Get repository. m, s, ok = stringx.CutFromRight(m, "/") if ok && s != "" { om.Repository = s } else if m != "" { om.Repository = m m = "" } // Get namespace. m, s, ok = stringx.CutFromRight(m, "/") if ok && s != "" { om.Namespace = s } else if m != "" { om.Namespace = m m = "" } // Get registry. m, s, ok = stringx.CutFromLeft(m, "://") if ok && s != "" { om.Schema = m om.Registry = s } else if m != "" { om.Registry = m } if om.Repository == "" { return nil } return &om } func (om *OllamaModel) String() string { var b strings.Builder if om.Registry != "" { b.WriteString(om.Registry) b.WriteByte('/') } if om.Namespace != "" { b.WriteString(om.Namespace) b.WriteByte('/') } b.WriteString(om.Repository) if om.Tag != "" { b.WriteByte(':') b.WriteString(om.Tag) } return b.String() } // GetLayer returns the OllamaModelLayer with the given media type, // and true if found, and false otherwise. func (om *OllamaModel) GetLayer(mediaType string) (OllamaModelLayer, bool) { for i := range om.Layers { if om.Layers[i].MediaType == mediaType { return om.Layers[i], true } } return OllamaModelLayer{}, false } // SearchLayers returns a list of OllamaModelLayer with the media type that matches the given regex. func (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []OllamaModelLayer { var ls []OllamaModelLayer for i := range om.Layers { if mediaTypeRegex.MatchString(om.Layers[i].MediaType) { ls = append(ls, om.Layers[i]) } } return ls } // WebPageURL returns the Ollama web page URL of the OllamaModel. func (om *OllamaModel) WebPageURL() *url.URL { u := &url.URL{ Scheme: om.Schema, Host: om.Registry, } return u.JoinPath(om.Namespace, om.Repository+":"+om.Tag) } // Complete completes the OllamaModel with the given context and http client. 
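A short sketch of ParseOllamaModel's reference splitting, mirroring the cases in ollama_model_test.go further down; defaults fill in the scheme, registry, namespace, and tag, and a trailing digest is dropped:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	om := gguf.ParseOllamaModel("gemma2:awesome@sha256:1234567890abcdef")
	fmt.Println(om.Registry)   // registry.ollama.ai
	fmt.Println(om.Namespace)  // library
	fmt.Println(om.Repository) // gemma2
	fmt.Println(om.Tag)        // awesome
	fmt.Println(om)            // registry.ollama.ai/library/gemma2:awesome
}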
func (om *OllamaModel) Complete(ctx context.Context, cli *http.Client) error { if om.Client == nil { om.Client = cli } u := &url.URL{ Scheme: om.Schema, Host: om.Registry, } u = u.JoinPath("v2", om.Namespace, om.Repository, "manifests", om.Tag) req, err := httpx.NewGetRequestWithContext(ctx, u.String()) if err != nil { return fmt.Errorf("new request: %w", err) } req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json") err = httpx.Do(om.Client, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("status code %d", resp.StatusCode) } return json.NewDecoder(resp.Body).Decode(om) }) if err != nil { return fmt.Errorf("do request %s: %w", u, err) } // Connect. om.Config.Root = om for i := range om.Layers { om.Layers[i].Root = om } return nil } // Params returns the parameters of the OllamaModel. func (om *OllamaModel) Params(ctx context.Context, cli *http.Client) (map[string]any, error) { if cli == nil { cli = om.Client } if cli == nil { return nil, fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.params$`)) if len(mls) == 0 { return nil, nil } rs := make([]map[string]any, len(mls)) eg, ctx := errgroup.WithContext(ctx) for i := range mls { x := i eg.Go(func() error { bs, err := mls[x].FetchBlob(ctx, cli) if err == nil { p := make(map[string]any) if err = json.Unmarshal(bs, &p); err == nil { rs[x] = p } } return err }) } if err := eg.Wait(); err != nil { return nil, fmt.Errorf("fetch blob: %w", err) } r := make(map[string]any) for i := range rs { for k, v := range rs[i] { r[k] = v } } return r, nil } // Template returns the template of the OllamaModel. func (om *OllamaModel) Template(ctx context.Context, cli *http.Client) (string, error) { if cli == nil { cli = om.Client } if cli == nil { return "", fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.(prompt|template)$`)) if len(mls) == 0 { return "", nil } ml := mls[len(mls)-1] bs, err := ml.FetchBlob(ctx, cli) if err != nil { return "", fmt.Errorf("fetch blob: %w", err) } return stringx.FromBytes(&bs), nil } // System returns the system message of the OllamaModel. func (om *OllamaModel) System(ctx context.Context, cli *http.Client) (string, error) { if cli == nil { cli = om.Client } if cli == nil { return "", fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.system$`)) if len(mls) == 0 { return "", nil } ml := mls[len(mls)-1] bs, err := ml.FetchBlob(ctx, cli) if err != nil { return "", fmt.Errorf("fetch blob: %w", err) } return stringx.FromBytes(&bs), nil } // License returns the license of the OllamaModel. func (om *OllamaModel) License(ctx context.Context, cli *http.Client) ([]string, error) { if cli == nil { cli = om.Client } if cli == nil { return nil, fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.license$`)) if len(mls) == 0 { return nil, nil } rs := make([]string, len(mls)) eg, ctx := errgroup.WithContext(ctx) for i := range mls { x := i eg.Go(func() error { bs, err := mls[x].FetchBlob(ctx, cli) if err == nil { rs[x] = stringx.FromBytes(&bs) } return err }) } if err := eg.Wait(); err != nil { return nil, fmt.Errorf("fetch blob: %w", err) } return rs, nil } // Messages returns the messages of the OllamaModel. 
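A network sketch of the manifest and blob helpers above. It assumes the public registry answers anonymously; newer models may also need the user-agent and token handling from ollama_registry_authenticate.go below, and the weights media type used here, application/vnd.ollama.image.model, is an assumption based on Ollama's image layout rather than something this file defines.

package main

import (
	"context"
	"fmt"
	"log"
	"net/http"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	ctx := context.Background()
	om := gguf.ParseOllamaModel("llama3.1")
	// Complete fetches the v2 manifest and wires each layer's Root pointer.
	if err := om.Complete(ctx, http.DefaultClient); err != nil {
		log.Fatal(err)
	}
	// Params resolves its layers by media type and fetches the blobs;
	// a nil client falls back to the client stored by Complete.
	params, err := om.Params(ctx, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(params)
	// Assumed media type for the GGUF weights layer, see the lead-in above.
	if l, ok := om.GetLayer("application/vnd.ollama.image.model"); ok {
		fmt.Println("weights blob:", l.BlobURL())
	}
}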
func (om *OllamaModel) Messages(ctx context.Context, cli *http.Client) ([]json.RawMessage, error) { if cli == nil { cli = om.Client } if cli == nil { return nil, fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.messages$`)) if len(mls) == 0 { return nil, nil } rs := make([]json.RawMessage, len(mls)) eg, ctx := errgroup.WithContext(ctx) for i := range mls { x := i eg.Go(func() error { bs, err := mls[x].FetchBlob(ctx, cli) if err == nil { rs[x] = bs } return err }) } if err := eg.Wait(); err != nil { return nil, fmt.Errorf("fetch blob: %w", err) } return rs, nil } // BlobURL returns the blob URL of the OllamaModelLayer. func (ol *OllamaModelLayer) BlobURL() *url.URL { if ol.Root == nil { return nil } u := &url.URL{ Scheme: ol.Root.Schema, Host: ol.Root.Registry, } return u.JoinPath("v2", ol.Root.Namespace, ol.Root.Repository, "blobs", ol.Digest) } // FetchBlob fetches the blob of the OllamaModelLayer with the given context and http client, // and returns the response body as bytes. func (ol *OllamaModelLayer) FetchBlob(ctx context.Context, cli *http.Client) ([]byte, error) { var b []byte err := ol.FetchBlobFunc(ctx, cli, func(resp *http.Response) error { b = httpx.BodyBytes(resp) return nil }) return b, err } // FetchBlobFunc fetches the blob of the OllamaModelLayer with the given context and http client, // and processes the response with the given function. func (ol *OllamaModelLayer) FetchBlobFunc(ctx context.Context, cli *http.Client, process func(*http.Response) error) error { if cli == nil { cli = ol.Root.Client } if cli == nil { return fmt.Errorf("no client") } u := ol.BlobURL() if u == nil { return fmt.Errorf("no blob URL") } req, err := httpx.NewGetRequestWithContext(ctx, u.String()) if err != nil { return fmt.Errorf("new request: %w", err) } err = httpx.Do(cli, req, process) if err != nil { return fmt.Errorf("do request %s: %w", u, err) } return nil } ================================================ FILE: ollama_model_option.go ================================================ package gguf_parser import ( "net/url" "strings" ) type ( _OllamaModelOptions struct { DefaultScheme string DefaultRegistry string DefaultNamespace string DefaultTag string } OllamaModelOption func(*_OllamaModelOptions) ) // SetOllamaModelBaseURL parses the given base URL, // and sets default schema/registry for OllamaModel. func SetOllamaModelBaseURL(baseURL string) OllamaModelOption { baseURL = strings.TrimSpace(baseURL) return func(o *_OllamaModelOptions) { if baseURL == "" { return } if !strings.Contains(baseURL, "://") { baseURL = "https://" + baseURL } u, err := url.Parse(baseURL) if err != nil { return } o.DefaultScheme = u.Scheme o.DefaultRegistry = u.Host } } // SetOllamaModelDefaultScheme sets the default scheme for OllamaModel. func SetOllamaModelDefaultScheme(scheme string) OllamaModelOption { return func(o *_OllamaModelOptions) { if scheme == "" { return } o.DefaultScheme = scheme } } // SetOllamaModelDefaultRegistry sets the default registry for OllamaModel. func SetOllamaModelDefaultRegistry(registry string) OllamaModelOption { return func(o *_OllamaModelOptions) { if registry == "" { return } o.DefaultRegistry = registry } } // SetOllamaModelDefaultNamespace sets the default namespace for OllamaModel. func SetOllamaModelDefaultNamespace(namespace string) OllamaModelOption { return func(o *_OllamaModelOptions) { if namespace == "" { return } o.DefaultNamespace = namespace } } // SetOllamaModelDefaultTag sets the default tag for OllamaModel. 
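The options above combine with ParseOllamaModel like this; registry.example.com is a hypothetical self-hosted endpoint, and the model string itself still overrides any defaulted part:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	om := gguf.ParseOllamaModel("team/gemma2:latest",
		gguf.SetOllamaModelBaseURL("http://registry.example.com"),
		gguf.SetOllamaModelDefaultNamespace("library"), // overridden by "team" in the model string
	)
	fmt.Println(om) // registry.example.com/team/gemma2:latest
}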
func SetOllamaModelDefaultTag(tag string) OllamaModelOption { return func(o *_OllamaModelOptions) { if tag == "" { return } o.DefaultTag = tag } } ================================================ FILE: ollama_model_test.go ================================================ package gguf_parser import ( "testing" "github.com/stretchr/testify/assert" ) func TestParseOllamaModel(t *testing.T) { cases := []struct { given string expected *OllamaModel }{ { given: "gemma2", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Repository: "gemma2", Tag: OllamaDefaultTag, }, }, { given: "gemma2:awesome", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Repository: "gemma2", Tag: "awesome", }, }, { given: "gemma2:awesome@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Repository: "gemma2", Tag: "awesome", }, }, { given: "awesome/gemma2:latest@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: "awesome", Repository: "gemma2", Tag: "latest", }, }, { given: "mysite.com/library/gemma2:latest@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: "mysite.com", Namespace: "library", Repository: "gemma2", Tag: "latest", }, }, { given: "http://mysite.com/library/gemma2:latest@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: "http", Registry: "mysite.com", Namespace: "library", Repository: "gemma2", Tag: "latest", }, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := ParseOllamaModel(tc.given) assert.Equal(t, tc.expected, actual) }) } } ================================================ FILE: ollama_registry_authenticate.go ================================================ package gguf_parser import ( "bytes" "context" "crypto/ed25519" "crypto/rand" "encoding/base64" "encoding/json" "encoding/pem" "errors" "fmt" "net/http" "net/url" "os" "path/filepath" "runtime" "strconv" "strings" "time" "golang.org/x/crypto/ssh" "github.com/gpustack/gguf-parser-go/util/funcx" "github.com/gpustack/gguf-parser-go/util/httpx" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/stringx" ) const ( httpHeaderWWWAuthenticate = "WWW-Authenticate" httpHeaderAuthorization = "Authorization" ) // OllamaUserAgent returns the user agent string for Ollama; // since llama3.1, the user agent is required to be set, // otherwise the request will be rejected with a 412. func OllamaUserAgent() string { return fmt.Sprintf("ollama/9.9.9 (%s %s) Go/%s", runtime.GOARCH, runtime.GOOS, runtime.Version()) } // OllamaRegistryAuthorizeRetry returns true if the request should be retried with authorization. // // OllamaRegistryAuthorizeRetry leverages OllamaRegistryAuthorize to obtain an authorization token, // and configures the request with the token. func OllamaRegistryAuthorizeRetry(resp *http.Response, cli *http.Client) bool { if resp == nil || cli == nil { return false } if resp.StatusCode != http.StatusUnauthorized || resp.Request == nil { // Not unauthorized, or no request to retry with, return. return false } req := resp.Request if req.Header.Get(httpHeaderAuthorization) != "" { // Already authorized, return.
return false } const tokenPrefix = "Bearer " authnToken := strings.TrimPrefix(resp.Header.Get(httpHeaderWWWAuthenticate), tokenPrefix) if authnToken == "" { // No authentication token, return. return false } authzToken := funcx.MustNoError(OllamaRegistryAuthorize(req.Context(), cli, authnToken)) req.Header.Set(httpHeaderAuthorization, tokenPrefix+authzToken) return true } // OllamaRegistryAuthorize authorizes the request with the given authentication token, // and returns the authorization token. func OllamaRegistryAuthorize(ctx context.Context, cli *http.Client, authnToken string) (string, error) { priKey, err := OllamaSingKeyLoad() if err != nil { return "", fmt.Errorf("load sign key: %w", err) } var authzUrl string { ss := strings.Split(authnToken, ",") if len(ss) < 3 { return "", errors.New("invalid authn token") } var realm, service, scope string for _, s := range ss { sp := strings.SplitN(s, "=", 2) if len(sp) < 2 { continue } sp[1] = strings.TrimFunc(sp[1], func(r rune) bool { return r == '"' || r == '\'' }) switch sp[0] { case "realm": realm = sp[1] case "service": service = sp[1] case "scope": scope = sp[1] } } u, err := url.Parse(realm) if err != nil { return "", fmt.Errorf("parse realm: %w", err) } qs := u.Query() qs.Add("service", service) for _, s := range strings.Split(scope, " ") { qs.Add("scope", s) } qs.Add("ts", strconv.FormatInt(time.Now().Unix(), 10)) qs.Add("nonce", stringx.RandomBase64(16)) u.RawQuery = qs.Encode() authzUrl = u.String() } var authnData string { pubKey := ssh.MarshalAuthorizedKey(priKey.PublicKey()) pubKeyp := bytes.Split(pubKey, []byte(" ")) if len(pubKeyp) < 2 { return "", errors.New("malformed public key") } nc := base64.StdEncoding.EncodeToString([]byte(stringx.SumBytesBySHA256(nil))) py := []byte(fmt.Sprintf("%s,%s,%s", http.MethodGet, authzUrl, nc)) sd, err := priKey.Sign(rand.Reader, py) if err != nil { return "", fmt.Errorf("signing data: %w", err) } authnData = fmt.Sprintf("%s:%s", bytes.TrimSpace(pubKeyp[1]), base64.StdEncoding.EncodeToString(sd.Blob)) } req, err := httpx.NewGetRequestWithContext(ctx, authzUrl) if err != nil { return "", fmt.Errorf("new request: %w", err) } req.Header.Add(httpHeaderAuthorization, authnData) var authzToken string err = httpx.Do(cli, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("status code %d", resp.StatusCode) } var tok struct { Token string `json:"token"` } if err = json.NewDecoder(resp.Body).Decode(&tok); err != nil { return err } if tok.Token == "" { return errors.New("empty token") } authzToken = tok.Token return nil }) if err != nil { return "", fmt.Errorf("do request %s: %w", authzUrl, err) } return authzToken, nil } // OllamaSingKeyLoad loads the signing key for Ollama, // and generates a new key if not exists. func OllamaSingKeyLoad() (ssh.Signer, error) { hd := filepath.Join(osx.UserHomeDir(), ".ollama") priKeyPath := filepath.Join(hd, "id_ed25519") if !osx.ExistsFile(priKeyPath) { // Generate key if not exists. 
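// Note: this mirrors the key layout the Ollama CLI itself maintains, i.e.
// an ed25519 private key at ~/.ollama/id_ed25519 with the authorized-keys
// style public half alongside it, so both tools can share one identity.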
pubKey, priKey, err := ed25519.GenerateKey(rand.Reader) if err != nil { return nil, fmt.Errorf("generate key: %w", err) } priKeyPem, err := ssh.MarshalPrivateKey(priKey, "") if err != nil { return nil, fmt.Errorf("marshal private key: %w", err) } priKeyBs := pem.EncodeToMemory(priKeyPem) sshPubKey, err := ssh.NewPublicKey(pubKey) if err != nil { return nil, fmt.Errorf("new public key: %w", err) } pubKeyBs := ssh.MarshalAuthorizedKey(sshPubKey) if err = osx.WriteFile(priKeyPath, priKeyBs, 0o600); err != nil { return nil, fmt.Errorf("write private key: %w", err) } if err = osx.WriteFile(priKeyPath+".pub", pubKeyBs, 0o644); err != nil { _ = os.Remove(priKeyPath) return nil, fmt.Errorf("write public key: %w", err) } } priKeyBs, err := os.ReadFile(priKeyPath) if err != nil { return nil, fmt.Errorf("read private key: %w", err) } priKey, err := ssh.ParsePrivateKey(priKeyBs) if err != nil { return nil, fmt.Errorf("parse private key: %w", err) } return priKey, nil } ================================================ FILE: scalar.go ================================================ package gguf_parser import ( "errors" "strconv" "strings" ) const ( _Ki = 1 << ((iota + 1) * 10) _Mi _Gi _Ti _Pi ) const ( _K = 1e3 _M = 1e6 _G = 1e9 _T = 1e12 _P = 1e15 ) const ( _Thousand = 1e3 _Million = 1e6 _Billion = 1e9 _Trillion = 1e12 _Quadrillion = 1e15 ) type ( // SizeScalar is the scalar for size. SizeScalar uint64 // FLOPSScalar is the scalar for FLOPS. FLOPSScalar uint64 // BytesPerSecondScalar is the scalar for bytes per second (Bps). BytesPerSecondScalar uint64 ) var ( // _GeneralBaseUnitMatrix is the base unit matrix for bytes. _GeneralBaseUnitMatrix = []struct { Base float64 Unit string }{ {_Pi, "Pi"}, {_P, "P"}, {_Ti, "Ti"}, {_T, "T"}, {_Gi, "Gi"}, {_G, "G"}, {_Mi, "Mi"}, {_M, "M"}, {_Ki, "Ki"}, {_K, "K"}, } // _SizeBaseUnitMatrix is the base unit matrix for size. _SizeBaseUnitMatrix = []struct { Base float64 Unit string }{ {_Pi, "P"}, {_Ti, "T"}, {_Gi, "G"}, {_Mi, "M"}, {_Ki, "K"}, } // _NumberBaseUnitMatrix is the base unit matrix for numbers. _NumberBaseUnitMatrix = []struct { Base float64 Unit string }{ {_Quadrillion, "Q"}, {_Trillion, "T"}, {_Billion, "B"}, {_Million, "M"}, {_Thousand, "K"}, } ) // ParseSizeScalar parses the SizeScalar from the string. func ParseSizeScalar(s string) (_ SizeScalar, err error) { if s == "" { return 0, errors.New("invalid SizeScalar") } b := float64(1) for i := range _SizeBaseUnitMatrix { if strings.HasSuffix(s, _SizeBaseUnitMatrix[i].Unit) { b = _SizeBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _SizeBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return SizeScalar(f * b), nil } func (s SizeScalar) String() string { if s == 0 { return "0" } b, u := float64(1), "" for i := range _SizeBaseUnitMatrix { if float64(s) >= _SizeBaseUnitMatrix[i].Base { b = _SizeBaseUnitMatrix[i].Base u = _SizeBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u } // ParseFLOPSScalar parses the FLOPSScalar from the string. 
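//
// For example (consistent with the unit matrices above):
//
//	ParseFLOPSScalar("1TFLOPS")   // 1e12
//	ParseFLOPSScalar("2.5GFLOPS") // 2.5e9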
func ParseFLOPSScalar(s string) (_ FLOPSScalar, err error) { if s == "" { return 0, errors.New("invalid FLOPSScalar") } s = strings.TrimSuffix(s, "FLOPS") b := float64(1) for i := range _GeneralBaseUnitMatrix { if strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) { b = _GeneralBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return FLOPSScalar(f * b), nil } func (s FLOPSScalar) String() string { if s == 0 { return "0 FLOPS" } b, u := float64(1), "" for i := range _GeneralBaseUnitMatrix { if float64(s) >= _GeneralBaseUnitMatrix[i].Base { b = _GeneralBaseUnitMatrix[i].Base u = _GeneralBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u + "FLOPS" } // ParseBytesPerSecondScalar parses the BytesPerSecondScalar from the string. func ParseBytesPerSecondScalar(s string) (_ BytesPerSecondScalar, err error) { if s == "" { return 0, errors.New("invalid BytesPerSecondScalar") } b := float64(1) o := float64(1) switch { case strings.HasSuffix(s, "Bps") || strings.HasSuffix(s, "B/s"): s = strings.TrimSuffix(strings.TrimSuffix(s, "Bps"), "B/s") case strings.HasSuffix(s, "bps") || strings.HasSuffix(s, "b/s"): s = strings.TrimSuffix(strings.TrimSuffix(s, "bps"), "b/s") o = 8 } for i := range _GeneralBaseUnitMatrix { if strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) { b = _GeneralBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return BytesPerSecondScalar(f * b / o), nil } func (s BytesPerSecondScalar) String() string { if s == 0 { return "0 Bps" } b, u := float64(1), "" for i := range _GeneralBaseUnitMatrix { if float64(s) >= _GeneralBaseUnitMatrix[i].Base { b = _GeneralBaseUnitMatrix[i].Base u = _GeneralBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u + "Bps" } type ( // GGUFBytesScalar is the scalar for bytes. GGUFBytesScalar uint64 // GGUFParametersScalar is the scalar for parameters. GGUFParametersScalar uint64 // GGUFBitsPerWeightScalar is the scalar for bits per weight. GGUFBitsPerWeightScalar float64 // GGUFTokensPerSecondScalar is the scalar for tokens per second. GGUFTokensPerSecondScalar float64 ) // ParseGGUFBytesScalar parses the GGUFBytesScalar from the string. func ParseGGUFBytesScalar(s string) (_ GGUFBytesScalar, err error) { if s == "" { return 0, errors.New("invalid GGUFBytesScalar") } s = strings.TrimSuffix(s, "B") b := float64(1) for i := range _GeneralBaseUnitMatrix { if strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) { b = _GeneralBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return GGUFBytesScalar(f * b), nil } // GGUFBytesScalarStringInMiBytes is the flag to show the GGUFBytesScalar string in MiB. 
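//
// For example, a 3 GiB value renders as "3072 MiB" when the flag is set,
// which keeps reports diffable under a single fixed unit.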
var GGUFBytesScalarStringInMiBytes bool func (s GGUFBytesScalar) String() string { if s == 0 { return "0 B" } b, u := float64(1), "" if GGUFBytesScalarStringInMiBytes { b = _Mi u = "Mi" } else { for i := range _GeneralBaseUnitMatrix { if float64(s) >= _GeneralBaseUnitMatrix[i].Base { b = _GeneralBaseUnitMatrix[i].Base u = _GeneralBaseUnitMatrix[i].Unit break } } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u + "B" } func (s GGUFParametersScalar) String() string { if s == 0 { return "0" } b, u := float64(1), "" for i := range _NumberBaseUnitMatrix { if float64(s) >= _NumberBaseUnitMatrix[i].Base { b = _NumberBaseUnitMatrix[i].Base u = _NumberBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u } func (s GGUFBitsPerWeightScalar) String() string { if s <= 0 { return "0 bpw" } return strconv.FormatFloat(float64(s), 'f', 2, 64) + " bpw" } func (s GGUFTokensPerSecondScalar) String() string { if s <= 0 { return "0 tps" } return strconv.FormatFloat(float64(s), 'f', 2, 64) + " tps" } ================================================ FILE: scalar_test.go ================================================ package gguf_parser import ( "testing" "github.com/stretchr/testify/assert" ) func TestParseSizeScalar(t *testing.T) { testCases := []struct { given string expected SizeScalar }{ {"1", 1}, {"1K", 1 * _Ki}, {"1M", 1 * _Mi}, {"1G", 1 * _Gi}, {"1T", 1 * _Ti}, {"1P", 1 * _Pi}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseSizeScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } func TestParseFLOPSScalar(t *testing.T) { testCases := []struct { given string expected FLOPSScalar }{ {"1FLOPS", 1}, {"1KFLOPS", 1 * _K}, {"1MFLOPS", 1 * _M}, {"1GFLOPS", 1 * _G}, {"1TFLOPS", 1 * _T}, {"1PFLOPS", 1 * _P}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseFLOPSScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } func TestParseBytesPerSecondScalar(t *testing.T) { testCases := []struct { given string expected BytesPerSecondScalar }{ {"1B/s", 1}, {"1KB/s", 1 * _K}, {"1MB/s", 1 * _M}, {"1GB/s", 1 * _G}, {"1TB/s", 1 * _T}, {"1PB/s", 1 * _P}, {"1KiBps", 1 * _Ki}, {"1MiBps", 1 * _Mi}, {"1GiBps", 1 * _Gi}, {"1TiBps", 1 * _Ti}, {"1PiBps", 1 * _Pi}, {"8b/s", 1}, {"1Kbps", 1 * _K >> 3}, {"1Mbps", 1 * _M >> 3}, {"1Gbps", 1 * _G >> 3}, {"1Tbps", 1 * _T >> 3}, {"1Pbps", 1 * _P >> 3}, {"1Kibps", 1 * _Ki >> 3}, {"1Mibps", 1 * _Mi >> 3}, {"1Gibps", 1 * _Gi >> 3}, {"1Tibps", 1 * _Ti >> 3}, {"1Pibps", 1 * _Pi >> 3}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseBytesPerSecondScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } func TestParseGGUFBytesScalar(t *testing.T) { testCases := []struct { given string expected GGUFBytesScalar }{ {"1B", 1}, {"1KB", 1 * _K}, {"1MB", 1 * _M}, {"1GB", 1 * _G}, {"1TB", 1 * _T}, {"1PB", 1 * _P}, {"1KiB", 1 * _Ki}, {"1MiB", 1 * _Mi}, {"1GiB", 1 * _Gi}, {"1TiB", 1 * _Ti}, {"1PiB", 1 * _Pi}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseGGUFBytesScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } ================================================ FILE: util/anyx/any.go ================================================ package 
anyx import ( "encoding/json" "fmt" "strconv" "golang.org/x/exp/constraints" ) // Number converts any type to the specified number type. func Number[T constraints.Integer | constraints.Float](v any) T { switch vv := v.(type) { case int: return T(vv) case int8: return T(vv) case int16: return T(vv) case int32: return T(vv) case int64: return T(vv) case uint: return T(vv) case uint8: return T(vv) case uint16: return T(vv) case uint32: return T(vv) case uint64: return T(vv) case float32: return T(vv) case float64: return T(vv) case bool: if vv { return T(1) } return T(0) case string: x, err := strconv.ParseInt(vv, 10, 64) if err != nil { y, err := strconv.ParseFloat(vv, 64) if err != nil { return T(0) } else { return T(y) } } return T(x) case json.Number: x, err := vv.Int64() if err != nil { y, err := vv.Float64() if err != nil { return T(0) } else { return T(y) } } return T(x) default: return T(0) } } // Bool converts any type to a bool. func Bool(v any) bool { switch vv := v.(type) { case bool: return vv case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64, uintptr: return vv != 0 case float32, float64: return vv != 0 case string: return vv != "0" case fmt.Stringer: return vv.String() != "0" default: return false } } // String converts any type to a string. func String(v any) string { switch vv := v.(type) { case string: return vv case []byte: return string(vv) case int: return strconv.FormatInt(int64(vv), 10) case int8: return strconv.FormatInt(int64(vv), 10) case int16: return strconv.FormatInt(int64(vv), 10) case int32: return strconv.FormatInt(int64(vv), 10) case int64: return strconv.FormatInt(vv, 10) case uint: return strconv.FormatUint(uint64(vv), 10) case uint8: return strconv.FormatUint(uint64(vv), 10) case uint16: return strconv.FormatUint(uint64(vv), 10) case uint32: return strconv.FormatUint(uint64(vv), 10) case uint64: return strconv.FormatUint(vv, 10) case float32: return strconv.FormatFloat(float64(vv), 'f', -1, 32) case float64: return strconv.FormatFloat(vv, 'f', -1, 64) case bool: return strconv.FormatBool(vv) case fmt.Stringer: return vv.String() case json.RawMessage: return string(vv) default: return fmt.Sprintf("%v", v) } } ================================================ FILE: util/bytex/pool.go ================================================ package bytex import ( "bytes" "sync" ) const defaultSize = 32 * 1024 type ( Bytes = []byte BytesBuffer = *bytes.Buffer ) var gp = sync.Pool{ New: func() any { buf := make(Bytes, defaultSize) return &buf }, } // GetBytes gets a bytes buffer from the pool, // which can specify with a size, // default is 32k. func GetBytes(size ...uint64) Bytes { buf := *(gp.Get().(*Bytes)) s := defaultSize if len(size) != 0 { s = int(size[0]) if s == 0 { s = defaultSize } } if cap(buf) >= s { return buf[:s] } gp.Put(&buf) ns := s if ns < defaultSize { ns = defaultSize } buf = make(Bytes, ns) return buf[:s] } // WithBytes relies on GetBytes to get a buffer, // calls the function with the buffer, // finally, puts it back to the pool after the function returns. func WithBytes(fn func(Bytes) error, size ...uint64) error { if fn == nil { return nil } buf := GetBytes(size...) defer Put(buf) return fn(buf) } // GetBuffer is similar to GetBytes, // but it returns the bytes buffer wrapped by bytes.Buffer. 
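//
// A minimal usage sketch, via the WithBuffer helper below:
//
//	_ = WithBuffer(func(buf BytesBuffer) error {
//		buf.WriteString("pooled")
//		return nil
//	})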
func GetBuffer(size ...uint64) BytesBuffer { return bytes.NewBuffer(GetBytes(size...)[:0]) } // WithBuffer relies on GetBuffer to get a buffer, // calls the function with the buffer, // finally, puts it back to the pool after the function returns. func WithBuffer(fn func(BytesBuffer) error, size ...uint64) error { if fn == nil { return nil } buf := GetBuffer(size...) defer Put(buf) return fn(buf) } // Put puts the buffer(either Bytes or BytesBuffer) back to the pool. func Put[T Bytes | BytesBuffer](buf T) { switch v := any(buf).(type) { case Bytes: gp.Put(&v) case BytesBuffer: bs := v.Bytes() gp.Put(&bs) v.Reset() } } ================================================ FILE: util/funcx/error.go ================================================ package funcx // NoError ignores the given error, // it is usually a nice helper for chain function calling. func NoError[T any](t T, _ error) T { return t } // NoError2 ignores the given error, // it is usually a nice helper for chain function calling. func NoError2[T, U any](t T, u U, _ error) (T, U) { return t, u } // NoError3 ignores the given error, // it is usually a nice helper for chain function calling. func NoError3[T, U, V any](t T, u U, v V, _ error) (T, U, V) { return t, u, v } // NoError4 ignores the given error, // it is usually a nice helper for chain function calling. func NoError4[T, U, V, W any](t T, u U, v V, w W, _ error) (T, U, V, W) { return t, u, v, w } // MustNoError is similar to NoError, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError[T any](t T, e error) T { if e != nil { panic(e) } return t } // MustNoError2 is similar to NoError2, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError2[T, U any](t T, u U, e error) (T, U) { if e != nil { panic(e) } return t, u } // MustNoError3 is similar to NoError3, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError3[T, U, V any](t T, u U, v V, e error) (T, U, V) { if e != nil { panic(e) } return t, u, v } // MustNoError4 is similar to NoError4, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError4[T, U, V, W any](t T, u U, v V, w W, e error) (T, U, V, W) { if e != nil { panic(e) } return t, u, v, w } ================================================ FILE: util/httpx/client.go ================================================ package httpx import ( "context" "fmt" "io" "net/http" "time" "github.com/henvic/httpretty" "github.com/gpustack/gguf-parser-go/util/bytex" ) // DefaultClient is similar to the default http.Client used by the package. // // It is used for requests pooling. var DefaultClient = &http.Client{ Transport: DefaultTransport, } // DefaultInsecureClient is the default http.Client used by the package, // with TLS insecure skip verify. // // It is used for requests pooling. var DefaultInsecureClient = &http.Client{ Transport: DefaultInsecureTransport, } // Client returns a new http.Client with the given options, // the result http.Client is used for fast-consuming requests. // // If you want a requests pool management, use DefaultClient instead. 
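//
// A typical construction, as a sketch (the timeout and user agent values
// here are illustrative):
//
//	cli := Client(ClientOptions().
//		WithTimeout(10 * time.Second).
//		WithUserAgent("my-tool/0.1"))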
func Client(opts ...*ClientOption) *http.Client {
	var o *ClientOption
	if len(opts) > 0 {
		o = opts[0]
	} else {
		o = ClientOptions()
	}

	root := DefaultTransport
	if o.transport != nil {
		root = o.transport
	}
	if o.debug {
		pretty := &httpretty.Logger{
			Time:            true,
			TLS:             true,
			RequestHeader:   true,
			RequestBody:     true,
			MaxRequestBody:  1024,
			ResponseHeader:  true,
			ResponseBody:    true,
			MaxResponseBody: 1024,
			Formatters:      []httpretty.Formatter{&JSONFormatter{}},
		}
		root = pretty.RoundTripper(root)
	}

	rtc := RoundTripperChain{
		Next: root,
	}
	for i := range o.roundTrippers {
		rtc = RoundTripperChain{
			Do:   o.roundTrippers[i],
			Next: rtc,
		}
	}

	var rt http.RoundTripper = rtc
	if o.retryIf != nil {
		rt = RoundTripperFunc(func(req *http.Request) (*http.Response, error) {
			for i := 0; ; i++ {
				resp, err := rtc.RoundTrip(req)
				if !o.retryIf(resp, err) {
					return resp, err
				}
				w, ok := o.retryBackoff(i+1, resp)
				if !ok {
					return resp, err
				}
				wt := time.NewTimer(w)
				select {
				case <-req.Context().Done():
					wt.Stop()
					return resp, req.Context().Err()
				case <-wt.C:
				}
				// Discard the failed attempt before retrying,
				// so its body and connection are released.
				Close(resp)
			}
		})
	}

	return &http.Client{
		Transport: rt,
		Timeout:   o.timeout,
	}
}

// NewGetRequestWithContext returns a new http.MethodGet request,
// which is saving your life from http.NewRequestWithContext.
func NewGetRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodGet, uri, nil)
}

// NewGetRequest returns a new http.MethodGet request,
// which is saving your life from http.NewRequest.
func NewGetRequest(uri string) (*http.Request, error) {
	return http.NewRequest(http.MethodGet, uri, nil)
}

// NewHeadRequestWithContext returns a new http.MethodHead request,
// which is saving your life from http.NewRequestWithContext.
func NewHeadRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodHead, uri, nil)
}

// NewHeadRequest returns a new http.MethodHead request,
// which is saving your life from http.NewRequest.
func NewHeadRequest(uri string) (*http.Request, error) {
	return http.NewRequest(http.MethodHead, uri, nil)
}

// NewPostRequestWithContext returns a new http.MethodPost request with the given context,
// which is saving your life from http.NewRequestWithContext.
func NewPostRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodPost, uri, body)
}

// NewPostRequest returns a new http.MethodPost request,
// which is saving your life from http.NewRequest.
func NewPostRequest(uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequest(http.MethodPost, uri, body)
}

// NewPutRequestWithContext returns a new http.MethodPut request with the given context,
// which is saving your life from http.NewRequestWithContext.
func NewPutRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodPut, uri, body)
}

// NewPutRequest returns a new http.MethodPut request,
// which is saving your life from http.NewRequest.
func NewPutRequest(uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequest(http.MethodPut, uri, body)
}

// NewPatchRequestWithContext returns a new http.MethodPatch request with the given context,
// which is saving your life from http.NewRequestWithContext.
func NewPatchRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodPatch, uri, body) } // NewPatchRequest returns a new http.MethodPatch request, // which is saving your life from http.NewRequest. func NewPatchRequest(uri string, body io.Reader) (*http.Request, error) { return http.NewRequest(http.MethodPatch, uri, body) } // NewDeleteRequestWithContext returns a new http.MethodDelete request with the given context, // which is saving your life from http.NewRequestWithContext. func NewDeleteRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodDelete, uri, nil) } // NewDeleteRequest returns a new http.MethodDelete request, // which is saving your life from http.NewRequest. func NewDeleteRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodDelete, uri, nil) } // NewConnectRequestWithContext returns a new http.MethodConnect request with the given context, // which is saving your life from http.NewRequestWithContext. func NewConnectRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodConnect, uri, nil) } // NewConnectRequest returns a new http.MethodConnect request, // which is saving your life from http.NewRequest. func NewConnectRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodConnect, uri, nil) } // NewOptionsRequestWithContext returns a new http.MethodOptions request with the given context, // which is saving your life from http.NewRequestWithContext. func NewOptionsRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodOptions, uri, nil) } // NewOptionsRequest returns a new http.MethodOptions request, // which is saving your life from http.NewRequest. func NewOptionsRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodOptions, uri, nil) } // NewTraceRequestWithContext returns a new http.MethodTrace request with the given context, // which is saving your life from http.NewRequestWithContext. func NewTraceRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodTrace, uri, nil) } // NewTraceRequest returns a new http.MethodTrace request, // which is saving your life from http.NewRequest. func NewTraceRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodTrace, uri, nil) } // Error is similar to http.Error, // but it can get the error message by the given code. func Error(rw http.ResponseWriter, code int) { http.Error(rw, http.StatusText(code), code) } // Close closes the http response body without error. func Close(resp *http.Response) { if resp != nil && resp.Body != nil { _ = resp.Body.Close() } } // BodyBytes returns the body of the http response as a byte slice. func BodyBytes(resp *http.Response) []byte { buf := bytex.GetBytes() defer bytex.Put(buf) w := bytex.GetBuffer() _, _ = io.CopyBuffer(w, resp.Body, buf) return w.Bytes() } // BodyString returns the body of the http response as a string. func BodyString(resp *http.Response) string { return string(BodyBytes(resp)) } // Do is a helper function to execute the given http request with the given http client, // and execute the given function with the http response. // // It is useful to avoid forgetting to close the http response body. 
// // Do will return the error if failed to execute the http request or the given function. func Do(cli *http.Client, req *http.Request, respFunc func(*http.Response) error) error { resp, err := cli.Do(req) if err != nil { return fmt.Errorf("do request: %w", err) } defer Close(resp) if respFunc == nil { return nil } return respFunc(resp) } ================================================ FILE: util/httpx/client_helper.go ================================================ package httpx import ( "bytes" "errors" "io" "net/http" "regexp" "github.com/henvic/httpretty" "github.com/gpustack/gguf-parser-go/util/json" ) var _ httpretty.Formatter = (*JSONFormatter)(nil) // JSONFormatter is copied from httpretty.JSONFormatter, // but use our own json package. type JSONFormatter struct{} var jsonTypeRE = regexp.MustCompile(`[/+]json($|;)`) // Match JSON media type. func (j *JSONFormatter) Match(mediatype string) bool { return jsonTypeRE.MatchString(mediatype) } // Format JSON content. func (j *JSONFormatter) Format(w io.Writer, src []byte) error { if !json.Valid(src) { // We want to get the error of json.checkValid, not unmarshal it. // The happy path has been optimized, maybe prematurely. if err := json.Unmarshal(src, &json.RawMessage{}); err != nil { return err } } // Avoiding allocation as we use *bytes.Buffer to store the formatted body before printing dst, ok := w.(*bytes.Buffer) if !ok { // Mitigating panic to avoid upsetting anyone who uses this directly return errors.New("underlying writer for JSONFormatter must be *bytes.Buffer") } return json.Indent(dst, src, "", " ") } type RoundTripperChain struct { Do func(req *http.Request) error Next http.RoundTripper } func (c RoundTripperChain) RoundTrip(req *http.Request) (*http.Response, error) { if c.Do != nil { if err := c.Do(req); err != nil { return nil, err } } if c.Next != nil { return c.Next.RoundTrip(req) } return nil, nil } type RoundTripperFunc func(*http.Request) (*http.Response, error) func (fn RoundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) { return fn(req) } ================================================ FILE: util/httpx/client_options.go ================================================ package httpx import ( "math" "net/http" "strconv" "strings" "time" ) type ClientOption struct { *TransportOption timeout time.Duration debug bool retryIf RetryFunc retryBackoff func(attemptNum int, resp *http.Response) (wait time.Duration, ok bool) roundTrippers []func(req *http.Request) error } func ClientOptions() *ClientOption { return &ClientOption{ TransportOption: TransportOptions().WithoutKeepalive(), timeout: 30 * time.Second, retryIf: DefaultRetry, retryBackoff: createRetryBackoff(100*time.Millisecond, 5*time.Second, 5), } } // WithTransport sets the TransportOption. func (o *ClientOption) WithTransport(opt *TransportOption) *ClientOption { if o == nil || opt == nil { return o } o.TransportOption = opt return o } // WithTimeout sets the request timeout. // // This timeout controls the sum of [network dial], [tls handshake], [request], [response header reading] and [response body reading]. // // Use 0 to disable timeout. func (o *ClientOption) WithTimeout(timeout time.Duration) *ClientOption { if o == nil || timeout < 0 { return o } o.timeout = timeout return o } // WithDebug sets the debug mode. 
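//
// When enabled, Client wires an httpretty logger around the transport,
// pretty-printing request and response headers and bodies (bodies truncated
// to 1024 bytes; see Client above).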
func (o *ClientOption) WithDebug() *ClientOption { if o == nil { return o } o.debug = true return o } type RetryFunc func(resp *http.Response, err error) (retry bool) // WithRetryIf specifies the if-condition of retry operation for request, // or stops retrying if setting with `nil`. func (o *ClientOption) WithRetryIf(retryIf RetryFunc) *ClientOption { if o == nil { return o } o.retryIf = retryIf return o } // WithRetryBackoff specifies the retry-backoff mechanism for request. func (o *ClientOption) WithRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) *ClientOption { if o == nil || waitMin < 0 || waitMax < 0 || waitMax < waitMin || attemptMax <= 0 { return o } o.retryBackoff = createRetryBackoff(waitMin, waitMax, attemptMax) return o } // WithUserAgent sets the user agent. func (o *ClientOption) WithUserAgent(ua string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.Header.Set("User-Agent", ua) return nil }) } // WithBearerAuth sets the bearer token. func (o *ClientOption) WithBearerAuth(token string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.Header.Set("Authorization", "Bearer "+token) return nil }) } // WithBasicAuth sets the basic authentication. func (o *ClientOption) WithBasicAuth(username, password string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.SetBasicAuth(username, password) return nil }) } // WithHeader sets the header. func (o *ClientOption) WithHeader(key, value string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.Header.Set(key, value) return nil }) } // WithHeaders sets the headers. func (o *ClientOption) WithHeaders(headers map[string]string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { for k, v := range headers { req.Header.Set(k, v) } return nil }) } // WithRoundTripper sets the round tripper. func (o *ClientOption) WithRoundTripper(rt func(req *http.Request) error) *ClientOption { if o == nil || rt == nil { return o } o.roundTrippers = append(o.roundTrippers, rt) return o } // If is a conditional option, // which receives a boolean condition to trigger the given function or not. func (o *ClientOption) If(condition bool, then func(*ClientOption) *ClientOption) *ClientOption { if condition { return then(o) } return o } // DefaultRetry is the default retry condition, // inspired by https://github.com/hashicorp/go-retryablehttp/blob/40b0cad1633fd521cee5884724fcf03d039aaf3f/client.go#L68-L86. func DefaultRetry(resp *http.Response, respErr error) bool { if respErr != nil { switch errMsg := respErr.Error(); { case strings.Contains(errMsg, `redirects`): return false case strings.Contains(errMsg, `unsupported protocol scheme`): return false case strings.Contains(errMsg, `certificate is not trusted`): return false case strings.Contains(errMsg, `invalid header`): return false case strings.Contains(errMsg, `failed to verify certificate`): return false } // Retry if receiving connection closed. return true } // Retry if receiving rate-limited of server. if resp.StatusCode == http.StatusTooManyRequests { return true } // Retry if receiving unexpected responses. if resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) { return true } return false } // createRetryBackoff creates a backoff function for retry operation. 
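//
// The wait doubles per attempt and is clamped to waitMax, honoring a numeric
// Retry-After header on 429/503 responses when the server sends one. With
// the ClientOptions defaults (waitMin=100ms, waitMax=5s, attemptMax=5):
//
//	attempt 1: min(2^1*100ms, 5s) = 200ms
//	attempt 2: min(2^2*100ms, 5s) = 400ms
//	attempt 5: min(2^5*100ms, 5s) = 3.2s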
func createRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) func(int, *http.Response) (time.Duration, bool) { return func(attemptNum int, resp *http.Response) (wait time.Duration, ok bool) { if attemptNum > attemptMax { return 0, false } if resp != nil && (resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode == http.StatusServiceUnavailable) { if retryAfter := resp.Header.Get("Retry-After"); retryAfter != "" { if seconds, err := strconv.Atoi(retryAfter); err == nil { return time.Duration(seconds) * time.Second, true } } } wait = time.Duration(math.Pow(2, float64(attemptNum)) * float64(waitMin)) return min(wait, waitMax), true } } ================================================ FILE: util/httpx/file.go ================================================ package httpx import ( "errors" "fmt" "io" "net/http" "strings" "syscall" "github.com/smallnest/ringbuffer" "github.com/gpustack/gguf-parser-go/util/bytex" ) type SeekerFile struct { cli *http.Client req *http.Request b *ringbuffer.RingBuffer c int64 l int64 } // OpenSeekerFile tries the GET http.Request as a SeekerFile, // and returns a SeekerFile, or an error if any. func OpenSeekerFile(cli *http.Client, req *http.Request, opts ...*SeekerFileOption) (*SeekerFile, error) { if cli == nil { return nil, errors.New("client is nil") } if req == nil { return nil, errors.New("request is nil") } if req.Method != http.MethodGet { return nil, errors.New("request method is not GET") } var o *SeekerFileOption if len(opts) > 0 { o = opts[0] } else { o = SeekerFileOptions() } if o.bufSize <= 0 { o.bufSize = 4 * 1024 * 1024 // 4mb } var l int64 { if !o.skipRangeDownloadDetect { req := req.Clone(req.Context()) req.Method = http.MethodHead err := Do(cli, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("stat: status code %d", resp.StatusCode) } if !strings.EqualFold(resp.Header.Get("Accept-Ranges"), "bytes") { return fmt.Errorf("stat: not support range download") } l = resp.ContentLength return nil }) if err != nil { return nil, fmt.Errorf("stat: do head request: %w", err) } } else { req := req.Clone(req.Context()) err := Do(cli, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("stat: status code %d", resp.StatusCode) } l = resp.ContentLength return nil }) if err != nil { return nil, fmt.Errorf("stat: do get request: %w", err) } } switch sz := int64(o.size); { case sz > l: return nil, fmt.Errorf("size %d is greater than limit %d", o.size, l) case sz <= 0: default: l = sz } } b := ringbuffer.New(o.bufSize).WithCancel(req.Context()) return &SeekerFile{cli: cli, req: req, b: b, c: 1<<63 - 1, l: l}, nil } func (f *SeekerFile) Close() error { if f.b != nil { f.b.CloseWriter() } return nil } func (f *SeekerFile) Len() int64 { return f.l } func (f *SeekerFile) ReadAt(p []byte, off int64) (int, error) { if off < 0 { return 0, syscall.EINVAL } if off > f.Len() { return 0, io.EOF } // Sync and move to new offset, if backward or empty buffer. if f.c > off || f.b.IsEmpty() { if err := f.sync(off, true); err != nil { return 0, err } } var ( remain = int64(f.b.Length()) capacity = int64(f.b.Capacity()) need = int64(len(p)) ) switch { case f.c+remain >= off+need: // Skip and move to new offset, if enough to forward. if err := f.skip(off - f.c); err != nil { return 0, err } return f.Read(p) case f.c+capacity >= off+need: // Sync and move to new offset, if enough to forward after synced. 
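// The ring buffer currently holds the absolute range [f.c, f.c+remain);
// since the requested window still fits within one refill of the ring,
// top it up from the tail and skip forward, instead of issuing a fresh
// ranged GET.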
if err := f.sync(f.c+remain, false); err != nil { return 0, err } if err := f.skip(off - f.c); err != nil { return 0, err } return f.Read(p) default: } // Otherwise, read directly. f.b.Reset() f.c = off // Request remain needing. lim := off + int64(len(p)) - 1 if lim > f.Len() { lim = f.Len() } req := f.req.Clone(f.req.Context()) req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", off, lim)) resp, err := f.cli.Do(req) if err != nil { return 0, err } defer Close(resp) if resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK { return 0, errors.New(resp.Status) } n, err := resp.Body.Read(p) f.c += int64(n) return n, err } func (f *SeekerFile) Read(p []byte) (int, error) { n, err := f.b.Read(p) f.c += int64(n) return n, err } func (f *SeekerFile) sync(off int64, reset bool) error { lim := off + int64(f.b.Free()) - 1 if lim > f.Len() { lim = f.Len() } req := f.req.Clone(f.req.Context()) req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", off, lim)) resp, err := f.cli.Do(req) if err != nil { return err } defer Close(resp) if resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK { return errors.New(resp.Status) } buf := bytex.GetBytes() defer bytex.Put(buf) if reset { f.b.Reset() f.c = off } _, err = io.CopyBuffer(_WriterOnly{w: f.b}, resp.Body, buf) if err != nil { return err } return nil } func (f *SeekerFile) skip(dif int64) error { if dif <= 0 { return nil } buf := bytex.GetBytes(uint64(dif)) defer bytex.Put(buf) n, err := f.b.Read(buf) f.c += int64(n) if err != nil { return err } return nil } // _WriterOnly is a wrapper to expose the io.Writer method only, // which to avoid calling the io.ReaderFrom method. type _WriterOnly struct { w io.Writer } func (w _WriterOnly) Write(p []byte) (int, error) { return w.w.Write(p) } ================================================ FILE: util/httpx/file_options.go ================================================ package httpx type SeekerFileOption struct { bufSize int size int skipRangeDownloadDetect bool } func SeekerFileOptions() *SeekerFileOption { return &SeekerFileOption{ bufSize: 4 * 1024 * 1024, // 4mb } } // WithBufferSize sets the size of the buffer to read the file, // // Default is 4mb. func (o *SeekerFileOption) WithBufferSize(bufSize int) *SeekerFileOption { if o == nil || bufSize <= 0 { return o } o.bufSize = bufSize return o } // WithSize sets the size of the file to read, // // If the size is greater than the content size of the file, it will return an error. func (o *SeekerFileOption) WithSize(size int) *SeekerFileOption { if o == nil || size <= 0 { return o } o.size = size return o } // WithoutRangeDownloadDetect disables range download detection. // // Usually, OpenSeekerFile sends a "HEAD" HTTP request to destination to get the content size from the "Content-Length" header, // and confirms whether supports range download via the "Accept-Ranges" header. // However, some servers may not support the "HEAD" method, or the "Accept-Ranges" header is not set correctly. // // With this option, OpenSeekerFile sends "GET" HTTP request to get the content size as usual, // and does not confirm whether supports range download. But during the seeking read, // it still uses the "Range" header to read the file. func (o *SeekerFileOption) WithoutRangeDownloadDetect() *SeekerFileOption { if o == nil { return o } o.skipRangeDownloadDetect = true return o } // If is a conditional option, // which receives a boolean condition to trigger the given function or not. 
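//
// A sketch of conditional configuration (noHead is a hypothetical flag of
// the caller's):
//
//	o := SeekerFileOptions().
//		If(noHead, func(o *SeekerFileOption) *SeekerFileOption {
//			return o.WithoutRangeDownloadDetect()
//		})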
func (o *SeekerFileOption) If(condition bool, then func(*SeekerFileOption) *SeekerFileOption) *SeekerFileOption { if condition { return then(o) } return o } ================================================ FILE: util/httpx/proxy.go ================================================ package httpx import ( "net" "net/http" "net/url" "strings" "github.com/gpustack/gguf-parser-go/util/osx" ) var noProxies []*net.IPNet func init() { noProxyEnv := osx.Getenv("NO_PROXY", osx.Getenv("no_proxy")) noProxyRules := strings.Split(noProxyEnv, ",") for i := range noProxyRules { _, cidr, _ := net.ParseCIDR(noProxyRules[i]) if cidr != nil { noProxies = append(noProxies, cidr) } } } // ProxyFromEnvironment is similar to http.ProxyFromEnvironment, // but it also respects the NO_PROXY environment variable. func ProxyFromEnvironment(r *http.Request) (*url.URL, error) { if ip := net.ParseIP(r.URL.Hostname()); ip != nil { for i := range noProxies { if noProxies[i].Contains(ip) { return nil, nil } } } return http.ProxyFromEnvironment(r) } ================================================ FILE: util/httpx/resolver.go ================================================ package httpx import ( "context" "net" ) func DNSCacheDialContext(dialer *net.Dialer) func(context.Context, string, string) (net.Conn, error) { cs := map[string][]net.IP{} return func(ctx context.Context, nw, addr string) (conn net.Conn, err error) { h, p, err := net.SplitHostPort(addr) if err != nil { return nil, err } ips, ok := cs[h] if !ok { ips, err = net.DefaultResolver.LookupIP(ctx, "ip4", h) if len(ips) == 0 { ips, err = net.DefaultResolver.LookupIP(ctx, "ip", h) } if err != nil { return nil, err } cs[h] = ips } // Try to connect to each IP address in order. for _, ip := range ips { conn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip.String(), p)) if err == nil { break } } return conn, err } } ================================================ FILE: util/httpx/transport.go ================================================ package httpx import ( "net/http" ) // DefaultTransport is similar to the default http.DefaultTransport used by the package. var DefaultTransport http.RoundTripper = Transport() // DefaultInsecureTransport is the default http.DefaultTransport used by the package, // with TLS insecure skip verify. var DefaultInsecureTransport http.RoundTripper = Transport(TransportOptions().WithoutInsecureVerify()) // Transport returns a new http.Transport with the given options, // the result http.Transport is used for constructing http.Client. func Transport(opts ...*TransportOption) *http.Transport { var o *TransportOption if len(opts) > 0 { o = opts[0] } else { o = TransportOptions() } return o.transport } ================================================ FILE: util/httpx/transport_options.go ================================================ package httpx import ( "crypto/tls" "net" "net/http" "net/url" "time" ) type TransportOption struct { dialer *net.Dialer transport *http.Transport } func TransportOptions() *TransportOption { dialer := &net.Dialer{ Timeout: 30 * time.Second, KeepAlive: 30 * time.Second, } transport := &http.Transport{ Proxy: ProxyFromEnvironment, TLSClientConfig: &tls.Config{ MinVersion: tls.VersionTLS12, }, DialContext: DNSCacheDialContext(dialer), ForceAttemptHTTP2: true, MaxIdleConns: 100, IdleConnTimeout: 90 * time.Second, TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, } return &TransportOption{ dialer: dialer, transport: transport, } } // WithProxy sets the proxy. 
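//
// For example, to pin a single proxy regardless of the environment
// (proxyURL being a *url.URL you supply):
//
//	TransportOptions().WithProxy(http.ProxyURL(proxyURL))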
func (o *TransportOption) WithProxy(proxy func(*http.Request) (*url.URL, error)) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.Proxy = proxy return o } // WithoutProxy disables the proxy. func (o *TransportOption) WithoutProxy() *TransportOption { if o == nil || o.transport == nil { return o } o.transport.Proxy = nil return o } // WithKeepalive sets the keepalive. func (o *TransportOption) WithKeepalive(timeoutAndKeepalive ...time.Duration) *TransportOption { if o == nil || o.transport == nil || o.dialer == nil { return o } tak := [2]time.Duration{30 * time.Second, 30 * time.Second} if len(timeoutAndKeepalive) > 0 { tak[0] = timeoutAndKeepalive[0] if len(timeoutAndKeepalive) > 1 { tak[1] = timeoutAndKeepalive[1] } } o.dialer.Timeout, o.dialer.KeepAlive = tak[0], tak[1] o.transport.MaxIdleConns = 100 o.transport.IdleConnTimeout = 90 * time.Second return o } // WithoutKeepalive disables the keepalive. func (o *TransportOption) WithoutKeepalive() *TransportOption { if o == nil || o.transport == nil { return o } o.dialer.KeepAlive = -1 o.transport.MaxIdleConns = 0 o.transport.IdleConnTimeout = 0 return o } // WithInsecureVerify verifies the insecure connection. func (o *TransportOption) WithInsecureVerify() *TransportOption { if o == nil || o.transport == nil || o.transport.TLSClientConfig == nil { return o } o.transport.TLSClientConfig.InsecureSkipVerify = false return o } // WithoutInsecureVerify skips the insecure connection verify. func (o *TransportOption) WithoutInsecureVerify() *TransportOption { if o == nil || o.transport == nil || o.transport.TLSClientConfig == nil { return o } o.transport.TLSClientConfig.InsecureSkipVerify = true return o } // TimeoutForDial sets the timeout for network dial. // // This timeout controls the [network dial] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForDial(timeout time.Duration) *TransportOption { if o == nil || o.dialer == nil { return o } o.dialer.Timeout = timeout return o } // TimeoutForResponseHeader sets the timeout for response header. // // This timeout controls the [response header reading] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForResponseHeader(timeout time.Duration) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.ResponseHeaderTimeout = timeout return o } // TimeoutForTLSHandshake sets the timeout for tls handshake. // // This timeout controls the [tls handshake] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForTLSHandshake(timeout time.Duration) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.TLSHandshakeTimeout = timeout return o } // TimeoutForIdleConn sets the timeout for idle connection. // // This timeout controls the [idle connection lifetime] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForIdleConn(timeout time.Duration) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.IdleConnTimeout = timeout return o } // WithTLSClientConfig sets the tls.Config. func (o *TransportOption) WithTLSClientConfig(config *tls.Config) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.TLSClientConfig = config return o } // WithoutDNSCache disables the dns cache. func (o *TransportOption) WithoutDNSCache() *TransportOption { if o == nil || o.transport == nil || o.dialer == nil { return o } o.transport.DialContext = o.dialer.DialContext return o } // WithDialer sets the dialer. 
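//
// Note that the supplied dialer is re-wrapped by DNSCacheDialContext, so
// per-host lookups remain cached; combine with WithoutDNSCache to opt out.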
func (o *TransportOption) WithDialer(dialer *net.Dialer) *TransportOption { if o == nil || o.transport == nil || dialer == nil { return o } o.dialer = dialer o.transport.DialContext = DNSCacheDialContext(o.dialer) return o } // Customize sets the transport. func (o *TransportOption) Customize(fn func(*http.Transport)) *TransportOption { if o == nil || o.transport == nil { return o } o.dialer = nil fn(o.transport) return o } // If is a conditional option, // which receives a boolean condition to trigger the given function or not. func (o *TransportOption) If(condition bool, then func(*TransportOption) *TransportOption) *TransportOption { if condition { return then(o) } return o } ================================================ FILE: util/json/common.go ================================================ package json import ( stdjson "encoding/json" "fmt" ) type RawMessage = stdjson.RawMessage var ( MarshalIndent = stdjson.MarshalIndent Indent = stdjson.Indent NewEncoder = stdjson.NewEncoder Valid = stdjson.Valid ) // MustMarshal is similar to Marshal, // but panics if found error. func MustMarshal(v any) []byte { bs, err := Marshal(v) if err != nil { panic(fmt.Errorf("error marshaling json: %w", err)) } return bs } // MustUnmarshal is similar to Unmarshal, // but panics if found error. func MustUnmarshal(data []byte, v any) { err := Unmarshal(data, v) if err != nil { panic(fmt.Errorf("error unmarshaling json: %w", err)) } } // MustMarshalIndent is similar to MarshalIndent, // but panics if found error. func MustMarshalIndent(v any, prefix, indent string) []byte { bs, err := MarshalIndent(v, prefix, indent) if err != nil { panic(fmt.Errorf("error marshaling indent json: %w", err)) } return bs } // ShouldMarshal is similar to Marshal, // but never return error. func ShouldMarshal(v any) []byte { bs, _ := Marshal(v) return bs } // ShouldUnmarshal is similar to Unmarshal, // but never return error. func ShouldUnmarshal(data []byte, v any) { _ = Unmarshal(data, v) } // ShouldMarshalIndent is similar to MarshalIndent, // but never return error. 
func ShouldMarshalIndent(v any, prefix, indent string) []byte { bs, _ := MarshalIndent(v, prefix, indent) return bs } ================================================ FILE: util/json/jsoniter.go ================================================ //go:build !stdjson package json import ( stdjson "encoding/json" "strconv" "unsafe" jsoniter "github.com/json-iterator/go" ) var json = jsoniter.ConfigCompatibleWithStandardLibrary func init() { // borrowed from https://github.com/json-iterator/go/issues/145#issuecomment-323483602 decodeNumberAsInt64IfPossible := func(ptr unsafe.Pointer, iter *jsoniter.Iterator) { switch iter.WhatIsNext() { case jsoniter.NumberValue: var number stdjson.Number iter.ReadVal(&number) i, err := strconv.ParseInt(string(number), 10, 64) if err == nil { *(*any)(ptr) = i return } f, err := strconv.ParseFloat(string(number), 64) if err == nil { *(*any)(ptr) = f return } default: *(*any)(ptr) = iter.Read() } } jsoniter.RegisterTypeDecoderFunc("interface {}", decodeNumberAsInt64IfPossible) jsoniter.RegisterTypeDecoderFunc("any", decodeNumberAsInt64IfPossible) } var ( Marshal = json.Marshal Unmarshal = json.Unmarshal NewDecoder = json.NewDecoder ) ================================================ FILE: util/json/stdjson.go ================================================ //go:build stdjson package json import ( "encoding/json" ) var ( Marshal = json.Marshal Unmarshal = json.Unmarshal NewDecoder = json.NewDecoder ) ================================================ FILE: util/osx/env.go ================================================ package osx import ( "os" ) // ExistEnv checks if the environment variable named by the key exists. func ExistEnv(key string) bool { _, ok := os.LookupEnv(key) return ok } // Getenv retrieves the value of the environment variable named by the key. // It returns the default, which will be empty if the variable is not present. // To distinguish between an empty value and an unset value, use LookupEnv. func Getenv(key string, def ...string) string { e, ok := os.LookupEnv(key) if !ok && len(def) != 0 { return def[0] } return e } // ExpandEnv is similar to Getenv, // but replaces ${var} or $var in the result. func ExpandEnv(key string, def ...string) string { return os.ExpandEnv(Getenv(key, def...)) } ================================================ FILE: util/osx/file.go ================================================ package osx import ( "io" "os" "path/filepath" "strings" ) // InlineTilde replaces the leading ~ with the home directory. func InlineTilde(path string) string { if path == "" { return path } if strings.HasPrefix(path, "~"+string(filepath.Separator)) { hd, err := os.UserHomeDir() if err == nil { path = filepath.Join(hd, path[2:]) } } return path } // Open is similar to os.Open but supports ~ as the home directory. func Open(path string) (*os.File, error) { p := filepath.Clean(path) p = InlineTilde(p) return os.Open(p) } // Exists checks if the given path exists. func Exists(path string, checks ...func(os.FileInfo) bool) bool { p := filepath.Clean(path) p = InlineTilde(p) stat, err := os.Lstat(p) if err != nil { return false } for i := range checks { if checks[i] == nil { continue } if !checks[i](stat) { return false } } return true } // ExistsDir checks if the given path exists and is a directory. func ExistsDir(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode().IsDir() }) } // ExistsLink checks if the given path exists and is a symbolic link. 
func ExistsLink(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode()&os.ModeSymlink != 0 }) } // ExistsFile checks if the given path exists and is a regular file. func ExistsFile(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode().IsRegular() }) } // ExistsSocket checks if the given path exists and is a socket. func ExistsSocket(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode()&os.ModeSocket != 0 }) } // ExistsDevice checks if the given path exists and is a device. func ExistsDevice(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode()&os.ModeDevice != 0 }) } // Close closes the given io.Closer without error. func Close(c io.Closer) { if c == nil { return } _ = c.Close() } // WriteFile is similar to os.WriteFile but supports ~ as the home directory, // and also supports the parent directory creation. func WriteFile(name string, data []byte, perm os.FileMode) error { p := filepath.Clean(name) p = InlineTilde(p) if err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil { return err } return os.WriteFile(p, data, perm) } // CreateFile is similar to os.Create but supports ~ as the home directory, // and also supports the parent directory creation. func CreateFile(name string, perm os.FileMode) (*os.File, error) { p := filepath.Clean(name) p = InlineTilde(p) if err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil { return nil, err } return os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC, perm) } // OpenFile is similar to os.OpenFile but supports ~ as the home directory, // and also supports the parent directory creation. func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) { p := filepath.Clean(name) p = InlineTilde(p) if err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil { return nil, err } return os.OpenFile(p, flag, perm) } ================================================ FILE: util/osx/file_mmap.go ================================================ // Copyright 2018 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package osx import ( "errors" "fmt" "io" "os" "path/filepath" "runtime/debug" "syscall" ) type MmapFile struct { f *os.File b []byte } func OpenMmapFile(path string) (*MmapFile, error) { return OpenMmapFileWithSize(path, 0) } func OpenMmapFileWithSize(path string, size int) (*MmapFile, error) { p := filepath.Clean(path) p = InlineTilde(p) f, err := os.Open(p) if err != nil { return nil, fmt.Errorf("try lock file: %w", err) } if size <= 0 { info, err := f.Stat() if err != nil { Close(f) return nil, fmt.Errorf("stat: %w", err) } size = int(info.Size()) } b, err := mmap(f, size) if err != nil { Close(f) return nil, fmt.Errorf("mmap, size %d: %w", size, err) } return &MmapFile{f: f, b: b}, nil } func (f *MmapFile) Close() error { err0 := munmap(f.b) err1 := f.f.Close() if err0 != nil { return err0 } return err1 } func (f *MmapFile) Bytes() []byte { return f.b } func (f *MmapFile) Len() int64 { return int64(len(f.b)) } var ErrPageFault = errors.New("page fault occurred while reading from memory map") func (f *MmapFile) ReadAt(p []byte, off int64) (_ int, err error) { if off < 0 { return 0, syscall.EINVAL } if off > f.Len() { return 0, io.EOF } old := debug.SetPanicOnFault(true) defer func() { debug.SetPanicOnFault(old) if recover() != nil { err = ErrPageFault } }() n := copy(p, f.b[off:]) if n < len(p) { err = io.EOF } return n, err } ================================================ FILE: util/osx/file_mmap_js.go ================================================ // Copyright 2022 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package osx import ( "errors" "os" ) func mmap(f *os.File, length int) ([]byte, error) { return nil, errors.New("unsupported") } func munmap(b []byte) (err error) { return errors.New("unsupported") } ================================================ FILE: util/osx/file_mmap_unix.go ================================================ // Copyright 2017 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
================================================
FILE: util/osx/file_mmap_js.go
================================================
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package osx

import (
	"errors"
	"os"
)

func mmap(f *os.File, length int) ([]byte, error) {
	return nil, errors.New("unsupported")
}

func munmap(b []byte) (err error) {
	return errors.New("unsupported")
}

================================================
FILE: util/osx/file_mmap_unix.go
================================================
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris

package osx

import (
	"os"

	"golang.org/x/sys/unix"
)

func mmap(f *os.File, length int) ([]byte, error) {
	return unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ, unix.MAP_SHARED)
}

func munmap(b []byte) (err error) {
	return unix.Munmap(b)
}

================================================
FILE: util/osx/file_mmap_windows.go
================================================
package osx

import (
	"os"
	"syscall"
	"unsafe"
)

func mmap(f *os.File, size int) ([]byte, error) {
	low, high := uint32(size), uint32(size>>32)
	h, errno := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil)
	if h == 0 {
		return nil, os.NewSyscallError("CreateFileMapping", errno)
	}

	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(size))
	if addr == 0 {
		return nil, os.NewSyscallError("MapViewOfFile", errno)
	}

	if err := syscall.CloseHandle(h); err != nil {
		return nil, os.NewSyscallError("CloseHandle", err)
	}

	return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil
}

func munmap(b []byte) error {
	if err := syscall.UnmapViewOfFile((uintptr)(unsafe.Pointer(&b[0]))); err != nil {
		return os.NewSyscallError("UnmapViewOfFile", err)
	}
	return nil
}

================================================
FILE: util/osx/file_mmap_windows_386.go
================================================
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package osx

const maxMapSize = 0x7FFFFFFF // 2GB

================================================
FILE: util/osx/file_mmap_windows_non386.go
================================================
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build windows && !386

package osx

const maxMapSize = 0xFFFFFFFFFFFF // 256TB
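On Windows, the mapping size is handed to CreateFileMapping as two 32-bit halves, as in `low, high := uint32(size), uint32(size>>32)` above. A self-contained sketch of that split and its round-trip (illustrative only; assumes a 64-bit int):

package main

import "fmt"

func main() {
	size := 0x1_2345_6789 // > 4 GiB, so both halves are non-zero
	low, high := uint32(size), uint32(size>>32)

	// CreateFileMapping receives (high, low); recombining the two
	// words restores the original 64-bit size.
	back := int(uint64(high)<<32 | uint64(low))
	fmt.Println(low, high, back == size) // 591751049 1 true
}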
func UserHomeDir() string {
	hd, err := os.UserHomeDir()
	if err != nil {
		hd = filepath.Join(os.TempDir(), time.Now().Format(time.DateOnly))
	}
	return hd
}

================================================
FILE: util/ptr/pointer.go
================================================
package ptr

import (
	"time"

	"golang.org/x/exp/constraints"
)

func Int(v int) *int { return Ref(v) }
func IntDeref(v *int, def int) int { return Deref(v, def) }

func Int8(v int8) *int8 { return Ref(v) }
func Int8Deref(v *int8, def int8) int8 { return Deref(v, def) }

func Int16(v int16) *int16 { return Ref(v) }
func Int16Deref(v *int16, def int16) int16 { return Deref(v, def) }

func Int32(v int32) *int32 { return Ref(v) }
func Int32Deref(v *int32, def int32) int32 { return Deref(v, def) }

func Int64(v int64) *int64 { return Ref(v) }
func Int64Deref(v *int64, def int64) int64 { return Deref(v, def) }

func Uint(v uint) *uint { return Ref(v) }
func UintDeref(v *uint, def uint) uint { return Deref(v, def) }

func Uint8(v uint8) *uint8 { return Ref(v) }
func Uint8Deref(v *uint8, def uint8) uint8 { return Deref(v, def) }

func Uint16(v uint16) *uint16 { return Ref(v) }
func Uint16Deref(v *uint16, def uint16) uint16 { return Deref(v, def) }

func Uint32(v uint32) *uint32 { return Ref(v) }
func Uint32Deref(v *uint32, def uint32) uint32 { return Deref(v, def) }

func Uint64(v uint64) *uint64 { return Ref(v) }
func Uint64Deref(v *uint64, def uint64) uint64 { return Deref(v, def) }

func Float32(v float32) *float32 { return Ref(v) }
func Float32Deref(v *float32, def float32) float32 { return Deref(v, def) }

func Float64(v float64) *float64 { return Ref(v) }
func Float64Deref(v *float64, def float64) float64 { return Deref(v, def) }

func String(v string) *string { return Ref(v) }
func StringDeref(v *string, def string) string { return Deref(v, def) }

func Bool(v bool) *bool { return Ref(v) }
func BoolDeref(v *bool, def bool) bool { return Deref(v, def) }

func Duration(v time.Duration) *time.Duration { return Ref(v) }
func DurationDeref(v *time.Duration, def time.Duration) time.Duration { return Deref(v, def) }

func Time(v time.Time) *time.Time { return Ref(v) }
func TimeDeref(v *time.Time, def time.Time) time.Time { return Deref(v, def) }

type Pointerable interface {
	constraints.Ordered | ~bool | time.Time
}

// Ref returns a pointer to the given value.
func Ref[T Pointerable](v T) *T {
	return &v
}

// To is an alias of Ref.
func To[T Pointerable](v T) *T {
	return Ref(v)
}

// Deref returns the value pointed to by ptr, or def if ptr is nil.
func Deref[T Pointerable](ptr *T, def T) T {
	if ptr != nil {
		return *ptr
	}
	return def
}

// Equal reports whether a and b are both non-nil and point to equal values.
func Equal[T Pointerable](a, b *T) bool {
	if a != nil && b != nil {
		return *a == *b
	}
	return false
}
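A small sketch of the Ref/To/Deref pattern these helpers enable; the Options struct here is hypothetical, not a type from this repository:

package main

import (
	"fmt"

	"github.com/gpustack/gguf-parser-go/util/ptr"
)

// Options is a hypothetical struct with an optional field:
// nil means "unset", which a plain int cannot express.
type Options struct {
	CtxSize *int
}

func main() {
	a := Options{}                      // CtxSize unset
	b := Options{CtxSize: ptr.To(4096)} // CtxSize explicitly 4096

	fmt.Println(ptr.Deref(a.CtxSize, 2048)) // 2048 (the default)
	fmt.Println(ptr.Deref(b.CtxSize, 2048)) // 4096
}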
================================================
FILE: util/signalx/handler.go
================================================
package signalx

import (
	"context"
	"os"
	"os/signal"
)

var registered = make(chan struct{})

// Handler registers for signals and returns a context,
// which is canceled when the first signal arrives.
// A second signal forces the process to exit immediately.
func Handler() context.Context {
	close(registered) // Panics when called twice.

	sigChan := make(chan os.Signal, len(sigs))
	ctx, cancel := context.WithCancel(context.Background())

	// Register for signals.
	signal.Notify(sigChan, sigs...)

	// Process signals.
	go func() {
		var exited bool
		for range sigChan {
			if exited {
				os.Exit(1)
			}
			cancel()
			exited = true
		}
	}()

	return ctx
}

================================================
FILE: util/signalx/handler_unix.go
================================================
//go:build !windows

package signalx

import (
	"os"
	"syscall"
)

var sigs = []os.Signal{syscall.SIGINT, syscall.SIGTERM}

================================================
FILE: util/signalx/handler_windows.go
================================================
package signalx

import (
	"os"
	"syscall"
)

var sigs = []os.Signal{syscall.SIGINT}

================================================
FILE: util/slicex/search.go
================================================
package slicex

import "golang.org/x/exp/constraints"

// UpperBound returns the index of the first element in s that is greater than e,
// or len(s) if there is no such element. s must be sorted in ascending order.
func UpperBound[T constraints.Integer | constraints.Float](s []T, e T) int {
	l, r := 0, len(s)
	for l < r {
		m := l + (r-l)/2
		if s[m] <= e {
			l = m + 1
		} else {
			r = m
		}
	}
	return l
}

================================================
FILE: util/stringx/bytes.go
================================================
package stringx

import "unsafe"

// FromBytes converts a byte slice to a string without copying.
// The input slice must not be modified afterwards.
func FromBytes(b *[]byte) string {
	return unsafe.String(unsafe.SliceData(*b), len(*b))
}

// ToBytes converts a string to a byte slice without copying.
// The returned slice must not be modified.
func ToBytes(s *string) (bs []byte) {
	return unsafe.Slice(unsafe.StringData(*s), len(*s))
}
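The unsafe conversions above are zero-copy, which is why both directions carry a no-modification contract. A short illustrative round trip:

package main

import (
	"fmt"

	"github.com/gpustack/gguf-parser-go/util/stringx"
)

func main() {
	s := "gguf"
	bs := stringx.ToBytes(&s) // zero-copy view of s's bytes
	fmt.Println(bs)           // [103 103 117 102]

	// bs[0] = 'G' // would fault: string data is read-only

	b := []byte{0x47, 0x47, 0x55, 0x46}
	fmt.Println(stringx.FromBytes(&b)) // "GGUF"
}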
================================================
FILE: util/stringx/random.go
================================================
package stringx

// Borrowed from github.com/thanhpk/randstr.

import (
	"bytes"
	"crypto/rand"
	"encoding/binary"
	"encoding/hex"
)

// defLetters is the default alphabet used by RandomString
// when no letters are provided.
var defLetters = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

// RandomBytes generates n random bytes.
func RandomBytes(n int) []byte {
	b := make([]byte, n)
	_, err := rand.Read(b)
	if err != nil {
		panic(err)
	}
	return b
}

// RandomHex hex-encodes n random bytes, so the returned string has length 2*n,
// e.g. 67aab2d956bd7cc621af22cfb169cba8 for n = 16.
func RandomHex(n int) string {
	return hex.EncodeToString(RandomBytes(n))
}

// RandomString generates a random string of length n using only the letters provided;
// if the letters parameter is omitted, defLetters is used instead.
func RandomString(n int, letters ...string) string {
	var (
		letterRunes []rune
		bb          bytes.Buffer
	)
	if len(letters) == 0 {
		letterRunes = defLetters
	} else {
		letterRunes = []rune(letters[0])
	}

	bb.Grow(n)
	l := uint32(len(letterRunes))
	// On each loop, generate one random rune and append to output.
	for i := 0; i < n; i++ {
		bb.WriteRune(letterRunes[binary.BigEndian.Uint32(RandomBytes(4))%l])
	}
	return bb.String()
}

// RandomBase64 generates a random base64 string with length of n,
// safe for URL.
func RandomBase64(n int) string {
	return RandomString(n, "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_")
}

================================================
FILE: util/stringx/strings.go
================================================
package stringx

import "strings"

// CutFromLeft is the same as strings.Cut:
// starting from the left, it slices s around the first instance of sep,
// returning the text before and after sep.
// The found result reports whether sep appears in s.
// If sep does not appear in s, it returns s, "", false.
func CutFromLeft(s, sep string) (before, after string, found bool) {
	return strings.Cut(s, sep)
}

// CutFromRight takes the same arguments as CutFromLeft,
// but starts from the right: it slices s around the last instance of sep,
// returning the text before and after sep.
// The found result reports whether sep appears in s.
// If sep does not appear in s, it returns s, "", false.
func CutFromRight(s, sep string) (before, after string, found bool) {
	if i := strings.LastIndex(s, sep); i >= 0 {
		return s[:i], s[i+len(sep):], true
	}
	return s, "", false
}

// ReplaceAllFunc replaces each rune in s with the result of f(r),
// similar to strings.Map but without dropping any runes.
func ReplaceAllFunc(s string, f func(rune) rune) string {
	var b strings.Builder
	for _, r := range s {
		b.WriteRune(f(r))
	}
	return b.String()
}

// HasSuffixes checks if s has any of the given suffixes.
func HasSuffixes(s string, suffixes ...string) bool {
	for _, suffix := range suffixes {
		if strings.HasSuffix(s, suffix) {
			return true
		}
	}
	return false
}

================================================
FILE: util/stringx/sum.go
================================================
package stringx

import (
	"crypto/sha256"
	"encoding/hex"
	"hash/fnv"
)

// SumByFNV64a sums up the string(s) by the FNV-64a hash algorithm.
func SumByFNV64a(s string, ss ...string) string {
	h := fnv.New64a()
	_, _ = h.Write(ToBytes(&s))
	for i := range ss {
		_, _ = h.Write(ToBytes(&ss[i]))
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBytesByFNV64a sums up the byte slice(s) by the FNV-64a hash algorithm.
func SumBytesByFNV64a(bs []byte, bss ...[]byte) string {
	h := fnv.New64a()
	_, _ = h.Write(bs)
	for i := range bss {
		_, _ = h.Write(bss[i])
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBySHA256 sums up the string(s) by the SHA256 hash algorithm.
func SumBySHA256(s string, ss ...string) string {
	h := sha256.New()
	_, _ = h.Write(ToBytes(&s))
	for i := range ss {
		_, _ = h.Write(ToBytes(&ss[i]))
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBytesBySHA256 sums up the byte slice(s) by the SHA256 hash algorithm.
func SumBytesBySHA256(bs []byte, bss ...[]byte) string {
	h := sha256.New()
	_, _ = h.Write(bs)
	for i := range bss {
		_, _ = h.Write(bss[i])
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBySHA224 sums up the string(s) by the SHA224 hash algorithm.
func SumBySHA224(s string, ss ...string) string {
	h := sha256.New224()
	_, _ = h.Write(ToBytes(&s))
	for i := range ss {
		_, _ = h.Write(ToBytes(&ss[i]))
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBytesBySHA224 sums up the byte slice(s) by the SHA224 hash algorithm.
func SumBytesBySHA224(bs []byte, bss ...[]byte) string {
	h := sha256.New224()
	_, _ = h.Write(bs)
	for i := range bss {
		_, _ = h.Write(bss[i])
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}
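A brief sketch contrasting CutFromLeft and CutFromRight on a GGUF-style filename (illustrative values only):

package main

import (
	"fmt"

	"github.com/gpustack/gguf-parser-go/util/stringx"
)

func main() {
	name := "llama-2-7b.Q4_K_M.gguf"

	// The left cut splits at the first "."; the right cut at the last.
	l1, l2, _ := stringx.CutFromLeft(name, ".")
	r1, r2, _ := stringx.CutFromRight(name, ".")
	fmt.Println(l1, "|", l2) // llama-2-7b | Q4_K_M.gguf
	fmt.Println(r1, "|", r2) // llama-2-7b.Q4_K_M | gguf
}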
================================================
FILE: zz_generated.diffusion_model_memory_usage.regression.go
================================================
package gguf_parser

import "math"

// GuessSD1DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD1DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{7876368.5672, 161.4230198633, 0.0078124893}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSD2DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD2DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{-355043979.0562, -1193.3271458642, 0.0054023818}
	degree := 2
	x := float64(width * height)

	if flashAttention {
		coefficients = []float64{3780681.28078, 513.2102510935}
		degree = 1
	}

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSDXLDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSDXLDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{55541290.3893, 138.3196116655, 0.0006109455}
	degree := 2
	x := float64(width * height)

	if flashAttention {
		coefficients = []float64{-5958802.78052, 500.0687898915}
		degree = 1
	}

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSDXLRefinerDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSDXLRefinerDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{49395992.3449, 155.2477810191, 0.0007351736}
	degree := 2
	x := float64(width * height)

	if flashAttention {
		coefficients = []float64{7031343.31998, 599.4137437227}
		degree = 1
	}

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSD3MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD3MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{16529921.3700, 234.6656247718, 0.0014648995}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSD35MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD35MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{17441103.4726, 281.6956819806, 0.0014651233}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}
// GuessSD35LargeDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD35LargeDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{23204369.2029, 410.3731196298, 0.0023195947}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessFLUXDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessFLUXDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{46511668.6742, 997.7758807792, 0.0014573393}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}
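Each of these generated estimators evaluates the fitted polynomial y = Σ c_i·x^i at x = width·height and returns y in bytes. A usage sketch with the SDXL coefficients above (the result is a regression estimate, not a measurement; for x = 1,048,576 the terms sum to roughly 8.7e8 bytes):

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// 1024x1024 SDXL without flash attention:
	// x = 1048576, so y ≈ 5.55e7 + 138.32*x + 6.11e-4*x^2 ≈ 872 MB.
	b := gguf.GuessSDXLDiffusionModelMemoryUsage(1024, 1024, false)
	fmt.Printf("%.2f GiB\n", float64(b)/(1<<30)) // ≈ 0.81 GiB
}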
================================================
FILE: zz_generated.ggmltype.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLType"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGMLTypeF32-0]
	_ = x[GGMLTypeF16-1]
	_ = x[GGMLTypeQ4_0-2]
	_ = x[GGMLTypeQ4_1-3]
	_ = x[GGMLTypeQ4_2-4]
	_ = x[GGMLTypeQ4_3-5]
	_ = x[GGMLTypeQ5_0-6]
	_ = x[GGMLTypeQ5_1-7]
	_ = x[GGMLTypeQ8_0-8]
	_ = x[GGMLTypeQ8_1-9]
	_ = x[GGMLTypeQ2_K-10]
	_ = x[GGMLTypeQ3_K-11]
	_ = x[GGMLTypeQ4_K-12]
	_ = x[GGMLTypeQ5_K-13]
	_ = x[GGMLTypeQ6_K-14]
	_ = x[GGMLTypeQ8_K-15]
	_ = x[GGMLTypeIQ2_XXS-16]
	_ = x[GGMLTypeIQ2_XS-17]
	_ = x[GGMLTypeIQ3_XXS-18]
	_ = x[GGMLTypeIQ1_S-19]
	_ = x[GGMLTypeIQ4_NL-20]
	_ = x[GGMLTypeIQ3_S-21]
	_ = x[GGMLTypeIQ2_S-22]
	_ = x[GGMLTypeIQ4_XS-23]
	_ = x[GGMLTypeI8-24]
	_ = x[GGMLTypeI16-25]
	_ = x[GGMLTypeI32-26]
	_ = x[GGMLTypeI64-27]
	_ = x[GGMLTypeF64-28]
	_ = x[GGMLTypeIQ1_M-29]
	_ = x[GGMLTypeBF16-30]
	_ = x[GGMLTypeQ4_0_4_4-31]
	_ = x[GGMLTypeQ4_0_4_8-32]
	_ = x[GGMLTypeQ4_0_8_8-33]
	_ = x[GGMLTypeTQ1_0-34]
	_ = x[GGMLTypeTQ2_0-35]
	_ = x[GGMLTypeIQ4_NL_4_4-36]
	_ = x[GGMLTypeIQ4_NL_4_8-37]
	_ = x[GGMLTypeIQ4_NL_8_8-38]
	_ = x[GGMLTypeMXFP4-39]
	_ = x[_GGMLTypeCount-40]
}

const _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8MXFP4Unknown"

var _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 201, 208}

func (i GGMLType) String() string {
	if i >= GGMLType(len(_GGMLType_index)-1) {
		return "GGMLType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _GGMLType_name[_GGMLType_index[i]:_GGMLType_index[i+1]]
}

================================================
FILE: zz_generated.gguffiletype.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix GGUFFileType"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFFileTypeMostlyF32-0]
	_ = x[GGUFFileTypeMostlyF16-1]
	_ = x[GGUFFileTypeMostlyQ4_0-2]
	_ = x[GGUFFileTypeMostlyQ4_1-3]
	_ = x[GGUFFileTypeMostlyQ4_1_SOME_F16-4]
	_ = x[GGUFFileTypeMostlyQ4_2-5]
	_ = x[GGUFFileTypeMostlyQ4_3-6]
	_ = x[GGUFFileTypeMostlyQ8_0-7]
	_ = x[GGUFFileTypeMostlyQ5_0-8]
	_ = x[GGUFFileTypeMostlyQ5_1-9]
	_ = x[GGUFFileTypeMostlyQ2_K-10]
	_ = x[GGUFFileTypeMostlyQ3_K_S-11]
	_ = x[GGUFFileTypeMostlyQ3_K_M-12]
	_ = x[GGUFFileTypeMostlyQ3_K_L-13]
	_ = x[GGUFFileTypeMostlyQ4_K_S-14]
	_ = x[GGUFFileTypeMostlyQ4_K_M-15]
	_ = x[GGUFFileTypeMostlyQ5_K_S-16]
	_ = x[GGUFFileTypeMostlyQ5_K_M-17]
	_ = x[GGUFFileTypeMostlyQ6_K-18]
	_ = x[GGUFFileTypeMostlyIQ2_XXS-19]
	_ = x[GGUFFileTypeMostlyIQ2_XS-20]
	_ = x[GGUFFileTypeMostlyQ2_K_S-21]
	_ = x[GGUFFileTypeMostlyIQ3_XS-22]
	_ = x[GGUFFileTypeMostlyIQ3_XXS-23]
	_ = x[GGUFFileTypeMostlyIQ1_S-24]
	_ = x[GGUFFileTypeMostlyIQ4_NL-25]
	_ = x[GGUFFileTypeMostlyIQ3_S-26]
	_ = x[GGUFFileTypeMostlyIQ3_M-27]
	_ = x[GGUFFileTypeMostlyIQ2_S-28]
	_ = x[GGUFFileTypeMostlyIQ2_M-29]
	_ = x[GGUFFileTypeMostlyIQ4_XS-30]
	_ = x[GGUFFileTypeMostlyIQ1_M-31]
	_ = x[GGUFFileTypeMostlyBF16-32]
	_ = x[GGUFFileTypeMostlyQ4_0_4_4-33]
	_ = x[GGUFFileTypeMostlyQ4_0_4_8-34]
	_ = x[GGUFFileTypeMostlyQ4_0_8_8-35]
	_ = x[GGUFFileTypeMostlyTQ1_0-36]
	_ = x[GGUFFileTypeMostlyTQ2_0-37]
	_ = x[GGUFFileTypeMostlyMXFP4-38]
	_ = x[_GGUFFileTypeCount-39]
}

const _GGUFFileType_name = "MOSTLY_F32MOSTLY_F16MOSTLY_Q4_0MOSTLY_Q4_1MOSTLY_Q4_1_SOME_F16MOSTLY_Q4_2MOSTLY_Q4_3MOSTLY_Q8_0MOSTLY_Q5_0MOSTLY_Q5_1MOSTLY_Q2_KMOSTLY_Q3_K_SMOSTLY_Q3_K_MMOSTLY_Q3_K_LMOSTLY_Q4_K_SMOSTLY_Q4_K_MMOSTLY_Q5_K_SMOSTLY_Q5_K_MMOSTLY_Q6_KMOSTLY_IQ2_XXSMOSTLY_IQ2_XSMOSTLY_Q2_K_SMOSTLY_IQ3_XSMOSTLY_IQ3_XXSMOSTLY_IQ1_SMOSTLY_IQ4_NLMOSTLY_IQ3_SMOSTLY_IQ3_MMOSTLY_IQ2_SMOSTLY_IQ2_MMOSTLY_IQ4_XSMOSTLY_IQ1_MMOSTLY_BF16MOSTLY_Q4_0_4_4MOSTLY_Q4_0_4_8MOSTLY_Q4_0_8_8MOSTLY_TQ1_0MOSTLY_TQ2_0MOSTLY_MXFP4Unknown"

var _GGUFFileType_index = [...]uint16{0, 10, 20, 31, 42, 62, 73, 84, 95, 106, 117, 128, 141, 154, 167, 180, 193, 206, 219, 230, 244, 257, 270, 283, 297, 309, 322, 334, 346, 358, 370, 383, 395, 406, 421, 436, 451, 463, 475, 487, 494}

func (i GGUFFileType) String() string {
	if i >= GGUFFileType(len(_GGUFFileType_index)-1) {
		return "GGUFFileType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _GGUFFileType_name[_GGUFFileType_index[i]:_GGUFFileType_index[i+1]]
}
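These generated name tables back the String() methods used when the parser prints tensor and file types; the trailing "Unknown" entry covers the _Count sentinel, and out-of-range values fall back to the numeric form. A tiny sketch of the lookup behavior (outputs follow from the tables above):

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGMLTypeQ4_K)             // Q4_K
	fmt.Println(gguf.GGUFFileTypeMostlyQ4_K_M) // MOSTLY_Q4_K_M
	fmt.Println(gguf.GGMLType(99))             // GGMLType(99)
}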
================================================
FILE: zz_generated.ggufmagic.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFMagicGGML-1734831468]
	_ = x[GGUFMagicGGMF-1734831462]
	_ = x[GGUFMagicGGJT-1734830708]
	_ = x[GGUFMagicGGUFLe-1179993927]
	_ = x[GGUFMagicGGUFBe-1195857222]
}

const (
	_GGUFMagic_name_0 = "GGUF"
	_GGUFMagic_name_1 = "GGUF"
	_GGUFMagic_name_2 = "GGJT"
	_GGUFMagic_name_3 = "GGMF"
	_GGUFMagic_name_4 = "GGML"
)

func (i GGUFMagic) String() string {
	switch {
	case i == 1179993927:
		return _GGUFMagic_name_0
	case i == 1195857222:
		return _GGUFMagic_name_1
	case i == 1734830708:
		return _GGUFMagic_name_2
	case i == 1734831462:
		return _GGUFMagic_name_3
	case i == 1734831468:
		return _GGUFMagic_name_4
	default:
		return "GGUFMagic(" + strconv.FormatInt(int64(i), 10) + ")"
	}
}

================================================
FILE: zz_generated.ggufmetadatavaluetype.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.stringer.go -trimprefix GGUFMetadataValueType"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFMetadataValueTypeUint8-0]
	_ = x[GGUFMetadataValueTypeInt8-1]
	_ = x[GGUFMetadataValueTypeUint16-2]
	_ = x[GGUFMetadataValueTypeInt16-3]
	_ = x[GGUFMetadataValueTypeUint32-4]
	_ = x[GGUFMetadataValueTypeInt32-5]
	_ = x[GGUFMetadataValueTypeFloat32-6]
	_ = x[GGUFMetadataValueTypeBool-7]
	_ = x[GGUFMetadataValueTypeString-8]
	_ = x[GGUFMetadataValueTypeArray-9]
	_ = x[GGUFMetadataValueTypeUint64-10]
	_ = x[GGUFMetadataValueTypeInt64-11]
	_ = x[GGUFMetadataValueTypeFloat64-12]
	_ = x[_GGUFMetadataValueTypeCount-13]
}

const _GGUFMetadataValueType_name = "Uint8Int8Uint16Int16Uint32Int32Float32BoolStringArrayUint64Int64Float64Unknown"

var _GGUFMetadataValueType_index = [...]uint8{0, 5, 9, 15, 20, 26, 31, 38, 42, 48, 53, 59, 64, 71, 78}

func (i GGUFMetadataValueType) String() string {
	if i >= GGUFMetadataValueType(len(_GGUFMetadataValueType_index)-1) {
		return "GGUFMetadataValueType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _GGUFMetadataValueType_name[_GGUFMetadataValueType_index[i]:_GGUFMetadataValueType_index[i+1]]
}

================================================
FILE: zz_generated.ggufversion.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFVersionV1-1]
	_ = x[GGUFVersionV2-2]
	_ = x[GGUFVersionV3-3]
}

const _GGUFVersion_name = "V1V2V3"

var _GGUFVersion_index = [...]uint8{0, 2, 4, 6}

func (i GGUFVersion) String() string {
	i -= 1
	if i >= GGUFVersion(len(_GGUFVersion_index)-1) {
		return "GGUFVersion(" + strconv.FormatInt(int64(i+1), 10) + ")"
	}
	return _GGUFVersion_name[_GGUFVersion_index[i]:_GGUFVersion_index[i+1]]
}
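Finally, note that the GGUFVersion stringer shifts i down by one because the version constants start at 1, while GGUFMagic switches on the raw magic values (1179993927 is 0x46554747, the little-endian reading of the bytes "GGUF"). A closing sketch of both behaviors:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGUFVersionV3)         // V3
	fmt.Println(gguf.GGUFMagic(1179993927)) // GGUF
	fmt.Println(gguf.GGUFVersion(9))        // GGUFVersion(9)
}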