Repository: gpustack/gguf-parser-go Branch: main Commit: 35c4501b75f7 Files: 85 Total size: 593.2 KB Directory structure: gitextract_sklq_6lp/ ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── ci.yml │ ├── cmd.yml │ ├── prune.yml │ └── sync.yml ├── .gitignore ├── .golangci.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cache.go ├── cmd/ │ └── gguf-parser/ │ ├── README.md │ ├── go.mod │ ├── go.sum │ └── main.go ├── file.go ├── file_architecture.go ├── file_architecture_test.go ├── file_estimate__llamacpp.go ├── file_estimate__llamacpp_test.go ├── file_estimate__stablediffusioncpp.go ├── file_estimate__stablediffusioncpp_test.go ├── file_estimate_option.go ├── file_from_distro.go ├── file_from_remote.go ├── file_metadata.go ├── file_metadata_test.go ├── file_option.go ├── file_test.go ├── file_tokenizer.go ├── file_tokenizer_test.go ├── filename.go ├── filename_test.go ├── gen.go ├── gen.regression.go ├── gen.stringer.go ├── ggml.go ├── go.mod ├── go.sum ├── ollama_model.go ├── ollama_model_option.go ├── ollama_model_test.go ├── ollama_registry_authenticate.go ├── scalar.go ├── scalar_test.go ├── util/ │ ├── anyx/ │ │ └── any.go │ ├── bytex/ │ │ └── pool.go │ ├── funcx/ │ │ └── error.go │ ├── httpx/ │ │ ├── client.go │ │ ├── client_helper.go │ │ ├── client_options.go │ │ ├── file.go │ │ ├── file_options.go │ │ ├── proxy.go │ │ ├── resolver.go │ │ ├── transport.go │ │ └── transport_options.go │ ├── json/ │ │ ├── common.go │ │ ├── jsoniter.go │ │ └── stdjson.go │ ├── osx/ │ │ ├── env.go │ │ ├── file.go │ │ ├── file_mmap.go │ │ ├── file_mmap_js.go │ │ ├── file_mmap_unix.go │ │ ├── file_mmap_windows.go │ │ ├── file_mmap_windows_386.go │ │ ├── file_mmap_windows_non386.go │ │ └── homedir.go │ ├── ptr/ │ │ └── pointer.go │ ├── signalx/ │ │ ├── handler.go │ │ ├── handler_unix.go │ │ └── handler_windows.go │ ├── slicex/ │ │ └── search.go │ └── stringx/ │ ├── bytes.go │ ├── random.go │ ├── strings.go │ └── sum.go ├── zz_generated.diffusion_model_memory_usage.regression.go ├── zz_generated.ggmltype.stringer.go ├── zz_generated.gguffiletype.stringer.go ├── zz_generated.ggufmagic.stringer.go ├── zz_generated.ggufmetadatavaluetype.stringer.go └── zz_generated.ggufversion.stringer.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ * text=auto eol=lf **/go.sum linguist-generated=true **/zz_generated.*.go linguist-generated=true ================================================ FILE: .github/workflows/ci.yml ================================================ name: ci permissions: contents: read pull-requests: read actions: read defaults: run: shell: bash on: push: branches: - 'main' - 'branch-v*.*' paths-ignore: - "docs/**" - "**.md" - "**.mdx" - "**.png" - "**.jpg" - ".github/workflows/cmd.yml" - ".github/workflows/prune.yml" - ".github/workflows/sync.yml" pull_request: branches: - 'main' paths-ignore: - "docs/**" - "**.md" - "**.mdx" - "**.png" - "**.jpg" - ".github/workflows/cmd.yml" - ".github/workflows/prune.yml" - ".github/workflows/sync.yml" jobs: ci: timeout-minutes: 15 runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 persist-credentials: false - name: Setup Go timeout-minutes: 15 uses: actions/setup-go@v5 with: go-version: "1.22.9" cache-dependency-path: | **/go.sum - name: Setup Toolbox timeout-minutes: 5 uses: actions/cache@v4 with: key: toolbox-${{ 
runner.os }} path: | ${{ github.workspace }}/.sbin - name: Make run: make ci env: LINT_DIRTY: "true" ================================================ FILE: .github/workflows/cmd.yml ================================================ name: cmd permissions: contents: write actions: read id-token: write defaults: run: shell: bash on: push: branches: - 'main' - 'branch-v*.*' paths-ignore: - "docs/**" - "**.md" - "**.mdx" - "**.png" - "**.jpg" - ".github/workflows/ci.yml" - ".github/workflows/prune.yml" - ".github/workflows/sync.yml" tags: - "v*.*.*" jobs: build: timeout-minutes: 15 runs-on: ubuntu-22.04 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 persist-credentials: false - name: Setup Go timeout-minutes: 15 uses: actions/setup-go@v5 with: go-version: "1.22.9" cache-dependency-path: | cmd/**/go.sum - name: Make run: make build env: VERSION: "${{ github.ref_name }}" - name: Upload Artifact uses: actions/upload-artifact@v4 with: include-hidden-files: true path: ${{ github.workspace }}/.dist/* - name: Release if: ${{ startsWith(github.ref, 'refs/tags/') }} uses: softprops/action-gh-release@v2 with: fail_on_unmatched_files: true tag_name: "${{ github.ref_name }}" prerelease: ${{ contains(github.ref, 'rc') }} files: ${{ github.workspace }}/.dist/* publish: needs: - build permissions: contents: write actions: read id-token: write timeout-minutes: 15 runs-on: ubuntu-22.04 env: PACKAGE_REGISTRY: "gpustack" PACKAGE_IMAGE: "gguf-parser" steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 1 persist-credentials: false - name: Setup QEMU uses: docker/setup-qemu-action@v3 with: image: tonistiigi/binfmt:qemu-v9.2.2 platforms: "arm64" - name: Setup Buildx uses: docker/setup-buildx-action@v3 - name: Login DockerHub uses: docker/login-action@v3 with: username: ${{ secrets.CI_DOCKERHUB_USERNAME }} password: ${{ secrets.CI_DOCKERHUB_PASSWORD }} - name: Download Artifact uses: actions/download-artifact@v4 with: path: ${{ github.workspace }}/.dist merge-multiple: true - name: Get Metadata id: metadata uses: docker/metadata-action@v5 with: images: "${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}" - name: Package uses: docker/build-push-action@v6 with: push: true file: ${{ github.workspace }}/Dockerfile context: ${{ github.workspace }} platforms: "linux/amd64,linux/arm64" tags: ${{ steps.metadata.outputs.tags }} labels: ${{ steps.metadata.outputs.labels }} cache-from: | type=registry,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache cache-to: | type=registry,mode=max,compression=gzip,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache,ignore-error=true provenance: true sbom: true ================================================ FILE: .github/workflows/prune.yml ================================================ name: prune permissions: contents: write pull-requests: write actions: write issues: write defaults: run: shell: bash on: workflow_dispatch: inputs: prune: description: 'Prune all caches' required: false type: boolean default: false schedule: - cron: "0 0 * * *" # every day at 00:00 UTC jobs: close-stale-issues-and-prs: uses: gpustack/.github/.github/workflows/close-stale-issues-and-prs.yml@main clean-stale-caches: uses: gpustack/.github/.github/workflows/clean-stale-caches.yml@main with: # allow to prune all caches on demand prune: ${{ github.event_name != 'schedule' && inputs.prune || false }} ================================================ FILE: .github/workflows/sync.yml ================================================ name: 
sync permissions: contents: read pull-requests: read actions: read defaults: run: shell: bash on: workflow_dispatch: inputs: max_releases: description: "Maximum number of latest releases to sync" required: false default: 1 type: number specific_release_tag: description: "Specific release tag to sync" required: false default: "" type: string dry_run: description: "Skip the actual sync" required: false default: false type: boolean schedule: - cron: "0 */12 * * *" # every 12 hours jobs: gitcode: runs-on: ubuntu-22.04 timeout-minutes: 240 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: false - name: Sync uses: gpustack/.github/.github/actions/mirror-release-gitcode@main with: gitcode-username: "${{ secrets.CI_GITCODE_USERNAME }}" gitcode-password: "${{ secrets.CI_GITCODE_PASSWORD }}" gitcode-token: "${{ secrets.CI_GITCODE_TOKEN }}" max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}" specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}" code-only: true dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}" gitee: runs-on: ubuntu-22.04 timeout-minutes: 120 steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 persist-credentials: false - name: Sync uses: gpustack/.github/.github/actions/mirror-release-gitee@main with: gitee-username: "${{ secrets.CI_GITEE_USERNAME }}" gitee-token: "${{ secrets.CI_GITEE_TOKEN }}" max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}" specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}" code-only: true dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}" tencent-cos: runs-on: ubuntu-22.04 timeout-minutes: 120 steps: - name: Sync uses: gpustack/.github/.github/actions/mirror-release-tencent-cos@main with: tencent-secret-id: "${{ secrets.CI_TECENTCOS_SECRET_ID }}" tencent-secret-key: "${{ secrets.CI_TECENTCOS_SECRET_KEY }}" tencent-cos-region: "ap-guangzhou" tencent-cos-bucket: "gpustack-1303613262" max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}" specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}" dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}" ================================================ FILE: .gitignore ================================================ # Files .DS_Store *.lock *.test *.out *.swp *.swo *.db *.exe *.exe~ *.dll *.so *.dylib *.log go.work go.work.* # Dirs /.idea /.vscode /.kube /.terraform /.vagrant /.bundle /.cache /.docker /.entc /.sbin /.dist /log /certs ================================================ FILE: .golangci.yaml ================================================ version: "1" run: timeout: 10m tests: true modules-download-mode: readonly go: "1.22" # output configuration options output: print-issued-lines: true print-linter-name: true path-prefix: "" sort-results: true linters: disable-all: true enable: - asciicheck - bidichk - decorder - durationcheck - errcheck - errname - errorlint - copyloopvar - godot - goconst - gocritic - gosimple - gosec - govet - gofumpt - gofmt - ineffassign - importas - lll - makezero - misspell - nakedret - nilerr - prealloc - predeclared - revive - staticcheck - stylecheck - typecheck - unconvert - unparam - unused - usestdlibvars - whitespace linters-settings: decorder: dec-order: - const - var - func disable-init-func-first-check: false disable-dec-order-check: true errorlint: errorf: true asserts: true comparison: true godot: 
scope: all exclude: - "(?i)^ FIXME:" - "(?i)^ TODO:" - "(?i)^ SPDX\\-License\\-Identifier:" - "(?i)^ +" period: true capital: false goconst: min-len: 3 min-occurrences: 10 gosimple: checks: [ "all" ] gosec: severity: "low" confidence: "low" excludes: - G101 - G107 - G112 - G115 - G404 gofumpt: extra-rules: true gofmt: simplify: true rewrite-rules: - pattern: 'interface{}' replacement: 'any' - pattern: 'a[b:len(a)]' replacement: 'a[b:]' importas: no-unaliased: true lll: line-length: 150 tab-width: 1 makezero: always: false misspell: locale: US nakedret: max-func-lines: 60 revive: rules: - name: var-naming disabled: true arguments: - [ "HTTP", "ID", "TLS", "TCP", "UDP", "API", "CA", "URL", "DNS" ] staticcheck: checks: [ "all", "-SA1019", "-SA2002", "-SA5008" ] stylecheck: checks: [ "all", "-ST1003" ] unparam: check-exported: false unused: field-writes-are-uses: true post-statements-are-reads: true exported-fields-are-used: true parameters-are-used: true local-variables-are-used: true generated-is-used: true usestdlibvars: http-method: true http-status-code: true time-weekday: true time-month: true time-layout: true crypto-hash: true issues: uniq-by-line: true exclude-files: - "doc.go" - "zz_generated.*.go" - "gen.*.go" exclude-rules: - path: _test\.go linters: - errcheck - gosec - makezero - lll ================================================ FILE: Dockerfile ================================================ FROM scratch ARG TARGETOS ARG TARGETARCH COPY --chmod=755 .dist/gguf-parser-${TARGETOS}-${TARGETARCH} /bin/gguf-parser ENTRYPOINT ["/bin/gguf-parser"] ================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2024 gguf-parser-go authors Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
================================================ FILE: Makefile ================================================ .SILENT: .DEFAULT_GOAL := ci SHELL := /bin/bash SRCDIR := $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))))) GOOS := $(shell go env GOOS) GOARCH := $(shell go env GOARCH) LINT_DIRTY ?= false VERSION ?= $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '[:upper:]' '[:lower:]' || echo "unknown") DEPS_UPDATE ?= false deps: @echo "+++ $@ +++" cd $(SRCDIR) && go mod tidy && go mod download cd $(SRCDIR)/cmd/gguf-parser && go mod tidy && go mod download if [[ "$(DEPS_UPDATE)" == "true" ]]; then \ cd $(SRCDIR) && go get -u -v ./...; \ cd $(SRCDIR)/cmd/gguf-parser && go get -u -v ./...; \ fi @echo "--- $@ ---" generate: @echo "+++ $@ +++" cd $(SRCDIR) && go generate ./... cd $(SRCDIR)/cmd/gguf-parser && go generate ./... @echo "--- $@ ---" lint: @echo "+++ $@ +++" [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin" [[ -f "$(SRCDIR)/.sbin/goimports-reviser" ]] || \ curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.8.2/goimports-reviser_3.8.2_$(GOOS)_$(GOARCH).tar.gz" \ | tar -zxvf - --directory "$(SRCDIR)/.sbin" --no-same-owner --exclude ./LICENSE --exclude ./README.md && chmod +x "$(SRCDIR)/.sbin/goimports-reviser" cd $(SRCDIR) && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \ | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 cd $(SRCDIR)/cmd/gguf-parser && \ go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \ | xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1 [[ -f "$(SRCDIR)/.sbin/golangci-lint" ]] || \ curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \ | sh -s -- -b "$(SRCDIR)/.sbin" "v1.63.4" cd $(SRCDIR) && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... cd $(SRCDIR)/cmd/gguf-parser && \ "$(SRCDIR)/.sbin/golangci-lint" run --fix ./... if [[ "$(LINT_DIRTY)" == "true" ]]; then \ if [[ -n $$(git status --porcelain) ]]; then \ echo "Code tree is dirty."; \ git diff --exit-code; \ fi; \ fi @echo "--- $@ ---" test: @echo "+++ $@ +++" go test -v -failfast -race -cover -timeout=30m $(SRCDIR)/... @echo "--- $@ ---" benchmark: @echo "+++ $@ +++" go test -v -failfast -run="^Benchmark[A-Z]+" -bench=. -benchmem -timeout=30m $(SRCDIR)/... 
@echo "--- $@ ---" gguf-parser: [[ -d "$(SRCDIR)/.dist" ]] || mkdir -p "$(SRCDIR)/.dist" cd "$(SRCDIR)/cmd/gguf-parser" && for os in darwin linux windows; do \ tags="netgo"; \ if [[ $$os == "windows" ]]; then \ suffix=".exe"; \ tags="netcgo"; \ else \ suffix=""; \ fi; \ for arch in amd64 arm64; do \ echo "Building gguf-parser for $$os-$$arch $(VERSION)"; \ GOOS="$$os" GOARCH="$$arch" CGO_ENABLED=1 go build \ -trimpath \ -ldflags="-w -s -X main.Version=$(VERSION)" \ -tags="urfave_cli_no_docs $$tags" \ -o $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix; \ done; \ if [[ $$os == "darwin" ]]; then \ [[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin"; \ [[ -f "$(SRCDIR)/.sbin/lipo" ]] || \ GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.2; \ "$(SRCDIR)/.sbin/lipo" -create -output $(SRCDIR)/.dist/gguf-parser-darwin-universal $(SRCDIR)/.dist/gguf-parser-darwin-amd64 $(SRCDIR)/.dist/gguf-parser-darwin-arm64; \ fi;\ if [[ $$os == "$(GOOS)" ]] && [[ $$arch == "$(GOARCH)" ]]; then \ cp -rf $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix $(SRCDIR)/.dist/gguf-parser$$suffix; \ fi; \ done build: gguf-parser PACKAGE_PUBLISH ?= false PACKAGE_REGISTRY ?= "gpustack" PACKAGE_IMAGE ?= "gguf-parser" package: build @echo "+++ $@ +++" if [[ -z $$(command -v docker) ]]; then \ echo "Docker is not installed."; \ exit 1; \ fi; \ platform="linux/amd64,linux/arm64"; \ image="$(PACKAGE_IMAGE):$(VERSION)"; \ if [[ -n "$(PACKAGE_REGISTRY)" ]]; then \ image="$(PACKAGE_REGISTRY)/$$image"; \ fi; \ if [[ "$(PACKAGE_PUBLISH)" == "true" ]]; then \ if [[ -z $$(docker buildx inspect --builder "gguf-parser") ]]; then \ docker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2 --install $$platform; \ docker buildx create --name "gguf-parser" --driver "docker-container" --buildkitd-flags "--allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host" --bootstrap; \ fi; \ docker buildx build --progress=plain --platform=$$platform --builder="gguf-parser" --output="type=image,name=$$image,push=true" "$(SRCDIR)"; \ else \ platform="linux/$(GOARCH)"; \ docker buildx build --progress=plain --platform=$$platform --output="type=docker,name=$$image" "$(SRCDIR)"; \ fi @echo "--- $@ ---" ci: deps generate lint test build ================================================ FILE: README.md ================================================ # GGUF Parser > tl;dr, Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files and estimate the memory > usage. [![Go Report Card](https://goreportcard.com/badge/github.com/gpustack/gguf-parser-go)](https://goreportcard.com/report/github.com/gpustack/gguf-parser-go) [![CI](https://img.shields.io/github/actions/workflow/status/gpustack/gguf-parser-go/cmd.yml?label=ci)](https://github.com/gpustack/gguf-parser-go/actions) [![License](https://img.shields.io/github/license/gpustack/gguf-parser-go?label=license)](https://github.com/gpustack/gguf-parser-go#license) [![Download](https://img.shields.io/github/downloads/gpustack/gguf-parser-go/total)](https://github.com/gpustack/gguf-parser-go/releases) [![Docker Pulls](https://img.shields.io/docker/pulls/gpustack/gguf-parser)](https://hub.docker.com/r/gpustack/gguf-parser) [![Release](https://img.shields.io/github/v/release/gpustack/gguf-parser-go)](https://github.com/gpustack/gguf-parser-go/releases/latest) [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a file format for storing models for inference with GGML and executors based on GGML. 
GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML.

GGUF Parser helps in reviewing and estimating the usage and maximum tokens per second of a GGUF format model without downloading it.

## Key Features

- **No File Required**: GGUF Parser uses chunked reading to parse the metadata of a remote GGUF file, which means you don't need to download and load the entire file.
- **Accurate Prediction**: The evaluation results of GGUF Parser usually deviate from the actual usage by only about 100 MiB.
- **Quick Verification**: You can provide device metrics to calculate the maximum tokens per second (TPS) without running the model.
- **Type Screening**: GGUF Parser can distinguish what the GGUF file is used for, such as Embedding, Reranking, LoRA, etc.
- **Fast**: GGUF Parser is written in Go, which is fast and efficient.

## Agenda

- [Notes](#notes)
- [Installation](#installation)
- [Overview](#overview)
    + [Parse](#parse)
        * [Local File](#parse-local-file)
        * [Remote File](#parse-remote-file)
        * [From HuggingFace](#parse-from-huggingface)
        * [From ModelScope](#parse-from-modelscope)
        * [From Ollama Library](#parse-from-ollama-library)
        * [Others](#others)
            * [Image Model](#parse-image-model)
            * [None Model](#parse-none-model)
    + [Estimate](#estimate)
        * [Across Multiple GPU devices](#across-multiple-gpu-devices)
        * [Maximum Tokens Per Second](#maximum-tokens-per-second)
        * [Full Layers Offload (default)](#full-layers-offload-default)
        * [Zero Layers Offload](#zero-layers-offload)
        * [Specific Layers Offload](#specific-layers-offload)
        * [Specific Context Size](#specific-context-size)
        * [Enable Flash Attention](#enable-flash-attention)
        * [Disable MMap](#disable-mmap)
        * [With Adapter](#with-adapter)
        * [Get Proper Offload Layers](#get-proper-offload-layers)

## Notes

- **Since v0.20.0**, GGUF Parser supports leveraging `--override-tensor` to indicate how to place the model tensors.
- **Since v0.19.0**, GGUF Parser supports estimating audio projector model files, like the Ultravox series, Qwen2 Audio series, etc.
- **Since v0.18.0**, GGUF Parser supports estimating SWA-supported (sliding window attention) model files, like the LLaMA 4 series, Gemma2/3 series, etc.
- **Since v0.17.0**, GGUF Parser aligns the `QUANTIZATION` (aka [`general.file_type`](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#general-metadata)) with [HuggingFace processing](https://github.com/huggingface/huggingface.js/blob/2475d6d316135c0a4fceff6b3fe2aed0dde36ac1/packages/gguf/src/types.ts#L11-L48), but there are still many model files whose naming does not fully follow `general.file_type`.
- **Since v0.16.0**, GGUF Parser supports estimating MLA-supported model files, like the DeepSeek series.
- **Since v0.14.0 (BREAKING CHANGE)**, GGUF Parser parses `*.feed_forward_length` metadata as `[]uint64`, which means the architecture `feedForwardLength` is a list of integers.
- **Since v0.13.0 (BREAKING CHANGE)**, GGUF Parser can parse files for [StableDiffusion.Cpp](https://github.com/leejet/stable-diffusion.cpp) or StableDiffusion.Cpp-like applications.
    + [LLaMA Box](https://github.com/gpustack/llama-box) is able to offload different components of the all-in-one model to different devices, e.g. with `-ts 1,1,1`, GGUF Parser returns the usage of the Text Encoder Models on the 1st device, the VAE Model on the 2nd device, and the Diffusion Model on the 3rd device.
- Experimentally, GGUF Parser can estimate the maximum tokens per second (`MAX TPS`) for a (V)LM model according to the `--device-metric` options.
- GGUF Parser distinguishes the remote devices from `--tensor-split` via `--rpc`.
    + For one host with multiple GPU devices, you can use `--tensor-split` to get the estimated memory usage of each GPU.
    + For multiple hosts with multiple GPU devices, you can use `--tensor-split` and `--rpc` to get the estimated memory usage of each GPU. Since v0.11.0, the `--rpc` flag masks the devices specified at the front of `--tensor-split`.
- Table result usage:
    + `DISTRIBUTABLE` indicates whether the GGUF file supports distributed inference; if the file doesn't support it, you cannot offload it with [RPC servers](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc).
    + `RAM` indicates the system memory usage.
    + `VRAM *` indicates the local GPU memory usage.
    + `RPC * (V)RAM` indicates the remote memory usage. The kind of memory is determined by which backend the RPC server uses; check the running logs for more details.
    + `UMA` indicates the memory usage on Apple macOS only. `NONUMA` covers the other cases, including non-GPU devices.
    + `LAYERS` (`I`/`T`/`O`) indicates the count of input layers, transformer layers, and output layers. Input layers are not offloaded at present.
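Besides the CLI, the parser can be embedded as a Go library. The sketch below is illustrative only: it assumes the exported names suggested by this repository's file layout (`ParseGGUFFile` in `file.go`, the `Metadata()` and `Architecture()` views in `file_metadata.go` and `file_architecture.go`), so verify them against the package documentation before relying on it.

```go
package main

import (
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Assumed API: ParseGGUFFile reads a local GGUF file (mmap'd when possible)
	// and returns a parsed view of its header, metadata, and tensor infos.
	f, err := gguf.ParseGGUFFile("DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf")
	if err != nil {
		log.Fatal(err)
	}

	// Assumed API: Metadata() and Architecture() derive the same summaries that
	// the CLI renders in the METADATA and ARCHITECTURE tables below.
	md := f.Metadata()
	arch := f.Architecture()
	fmt.Printf("name=%s arch=%s\n", md.Name, md.Architecture)
	fmt.Printf("layers=%d max-context=%d\n", arch.BlockCount, arch.MaximumContextLength)
}
```

The `file_estimate__*.go` files suggest that the memory estimates shown in the ESTIMATE tables are exposed programmatically as well.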
## Installation

Install from [releases](https://github.com/gpustack/gguf-parser-go/releases).

## Overview

### Parse

#### Parse Local File

```shell
$ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf
+-----------------------------------------------------------------------------------------------------------+
|                                                 METADATA                                                  |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
| TYPE  |          NAME           | ARCH  | QUANTIZATION | LITTLE ENDIAN |   SIZE   | PARAMETERS |   BPW    |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
| model | DeepSeek R1 Distill ... | qwen2 |    Q4_K_M    |     true      | 4.36 GiB |   7.62 B   | 4.91 bpw |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
|                                                            ARCHITECTURE                                                            |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
|     131072      |     3584      |       true       |         28         |   28   |      18944       |     0      |     152064     |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                        TOKENIZER                                                                       |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2  |  2.47 MiB   |   152064   |       N/A        |  151646   |  151643   |    N/A    |    N/A    |      N/A      |       N/A       |    151654     |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                    ESTIMATE                                                                                                                     |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+
| ARCH  | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY |  RERANKING  | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED |                     RAM                      |                VRAM 0                 |
|       |              |                    |                 |           |                |             |               |                |                +--------------------+------------+------------+----------------+----------+-----------+
|       |              |                    |                 |           |                |             |               |                |                | LAYERS (I + T + O) |    UMA     |   NONUMA   | LAYERS (T + O) |   UMA    |  NONUMA   |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
| qwen2 |    131072    |     2048 / 512     |    Disabled     |  Enabled  |       No       | Unsupported |   Supported   |  29 (28 + 1)   |      Yes       |     1 + 0 + 0      | 677.44 MiB | 827.44 MiB |     28 + 1     | 7.30 GiB | 18.89 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+

$ # Retrieve the model's metadata via split file,
$ # which requires that all split files have been downloaded.
$ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q8_0-00001-of-00003.gguf +-------------------------------------------------------------------------------------------------------+ | METADATA | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ | model | qwen2.5-7b-instruct | qwen2 | Q8_0 | true | 7.54 GiB | 7.62 B | 8.50 bpw | +-------+---------------------+-------+--------------+---------------+----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ | qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 
1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 21.82 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse Remote File ```shell $ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/resolve/main/Qwen2.5-72B-Instruct-Q4_K_M.gguf" +---------------------------------------------------------------------------------------------------------+ | METADATA | +-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Qwen2.5 72B Instruct | qwen2 | Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw | +-------+----------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 32768 | 8192 | true | 64 | 80 | 29568 | 0 | 152064 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+-----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T 
+ O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ | qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 426.57 MiB | 576.57 MiB | 80 + 1 | 10.31 GiB | 58.18 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+ $ # Retrieve the model's metadata via split file $ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/main/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf" +----------------------------------------------------------------------------------------------------------+ | METADATA | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ | model | DeepSeek R1 BF16 | deepseek2 | IQ1_S | true | 130.60 GiB | 671.03 B | 1.67 bpw | +-------+------------------+-----------+--------------+---------------+------------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 163840 | 7168 | true | N/A | 61 | 18432 | 256 | 129280 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.21 MiB | 129280 | N/A | 0 | 1 | N/A | N/A | N/A | N/A | 128815 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | 
+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------------------------------+--------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+-----------+-----------+----------------+------------+--------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ | deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.03 GiB | 13.18 GiB | 61 + 1 | 762.76 GiB | 1 TB | +-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+ ``` #### Parse From HuggingFace > [!NOTE] > > Allow using `HF_ENDPOINT` to override the default HuggingFace endpoint: `https://huggingface.co`. ```shell $ gguf-parser --hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-VL-2B-Instruct-f16.gguf" --hf-mmproj-file="mmproj-Qwen2-VL-2B-Instruct-f32.gguf" --visual-max-image-size 1344 +-----------------------------------------------------------------------------------------------------------+ | METADATA | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ | model | Qwen2 VL 2B Instruct | qwen2vl | F16 | true | 2.88 GiB | 1.54 B | 16.00 bpw | +-------+----------------------+---------+--------------+---------------+----------+------------+-----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 32768 | 1536 | true | 12 | 28 | 8960 | 0 | 151936 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | 
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 151936 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ | qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 236.87 MiB | 386.87 MiB | 28 + 1 | 3.65 GiB | 12.86 GiB | +---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ $ # Retrieve the model's metadata via split file $ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf-file="openbuddy-llama3.3-70b-v24.1-131k-Q4_0.gguf" +------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Openbuddy Llama3.3 7... 
| llama | Q4_0 | true | 37.35 GiB | 70.55 B | 4.55 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128048 | N/A | N/A | N/A | N/A | 128044 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ | llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 93.62 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ ``` #### Parse From ModelScope > [!NOTE] > > Allow using `MS_ENDPOINT` to override the default ModelScope endpoint: `https://modelscope.cn`. 
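Both hosted-repo flows, HuggingFace above and ModelScope below, can also be driven from Go. A minimal sketch follows; the helper names are assumptions read off `file_from_remote.go` and may differ from the exported API, so double-check them before depending on this.

```go
package main

import (
	"context"
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	// Assumed API: fetches only the metadata chunks over HTTP range requests,
	// mirroring the CLI's --hf-repo/--hf-file flags (HF_ENDPOINT is honored).
	hf, err := gguf.ParseGGUFFileFromHuggingFace(ctx,
		"bartowski/Qwen2-VL-2B-Instruct-GGUF", "Qwen2-VL-2B-Instruct-f16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("huggingface:", hf.Metadata().Name)

	// Assumed API: the ModelScope counterpart of the call above,
	// mirroring --ms-repo/--ms-file (MS_ENDPOINT is honored).
	ms, err := gguf.ParseGGUFFileFromModelScope(ctx,
		"unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF", "DeepSeek-R1-Distill-Qwen-7B-F16.gguf")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println("modelscope:", ms.Metadata().Name)
}
```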
```shell $ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="DeepSeek-R1-Distill-Qwen-7B-F16.gguf" +-------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ | model | DeepSeek R1 Distill ... | qwen2 | F16 | true | 14.19 GiB | 7.62 B | 16.00 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ | qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | 
Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 27.99 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+ ``` #### Parse From Ollama Library > [!NOTE] > > Allow using `--ol-base-url` to override the default Ollama registry endpoint: `https://registry.ollama.ai`. ```shell $ gguf-parser --ol-model="llama3.3" +------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+ | 
| | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ | llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 95.86 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+ $ # Ollama Model includes the preset params and other artifacts, like multimodal projectors or LoRA adapters, $ # you can get the usage of Ollama running by using `--ol-usage` option. +------------------------------------------------------------------------------------------------------------+ | METADATA | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ | model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw | +-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+ +-----------------------------------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ | 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 | +-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+ +-------------------------------------------------------------------------------------------------------------------------------------------------------+ | TOKENIZER | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ | gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A | +-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+ +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | 
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-----------------------------------------+ | ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 | | | | | | | | | | | +--------------------+------------+------------+----------------+------------+-----------+ | | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ | llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 255.27 MiB | 405.27 MiB | 80 + 1 | 906.50 MiB | 40.49 GiB | +-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+ ``` #### Others ##### Parse Image Model ```shell $ # Parse FLUX.1-dev Model $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 41.15 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model without offload Conditioner and Autoencoder $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" 
--hf-file="FLUX.1-dev-FP16.gguf" --clip-on-cpu --vae-on-cpu +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +-------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +--------+-----------------+-------------+---------------+----------------+-----------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | | | | | | +-----------+-----------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 16.44 GiB | 16.59 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ $ # Parse FLUX.1-dev Model with Autoencoder tiling $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --vae-tiling +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +---------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | 
+--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | | | | | | +------------+------------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 36.28 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model with multiple devices offloading $ # Support by LLaMA Box v0.0.106+, https://github.com/gpustack/llama-box. $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --tensor-split="1,1,1" +----------------------------------------------------------------------------------------------+ | METADATA | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +----------------------------------------------------------------------------------------------------------+ | ARCHITECTURE | +----------------+---------------------------------------------------------------+-------------------------+ | DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | +----------------+---------------------------------------------------------------+-------------------------+ | FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) | +----------------+---------------------------------------------------------------+-------------------------+ +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +--------+-----------------+-------------+---------------+----------------+-------------------------+---------------------+---------------------+-----------------------+ | ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 | | | | | | +------------+------------+----------+----------+------------+--------+-----------+-----------+ | | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ | flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB | +--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+ ``` ##### Parse None Model ```shell $ # Parse Multi-Modal Projector $ gguf-parser --hf-repo="unsloth/Qwen2.5-Omni-3B-GGUF" --hf-file="mmproj-F32.gguf" +-------------------------------------------------------------------------------------------------------+ | METADATA | 
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
| projector | Qwen2.5-Omni-3B | clip | F32 | true | 4.86 GiB | 1.31 B | 31.93 bpw |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
+-------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+-------------------------------+-----------------+-------------------------------------+----------------+
| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER |
| +---------------+---------------+--------+--------+------------------+------------------+ |
| | VISION | AUDIO | VISION | AUDIO | VISION | AUDIO | |
+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+
| qwen2.5o | 1280 | 1280 | 32 | 32 | 1280 | 5120 | Vision & Audio |
+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+

$ # Parse LoRA Adapter
$ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"
+---------------------------------------------------------------------------------------------+
| METADATA |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
| adapter | N/A | llama | F16 | true | 168.08 MiB | 88.12 M | 16.00 bpw |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
+---------------------------+
| ARCHITECTURE |
+--------------+------------+
| ADAPTER TYPE | LORA ALPHA |
+--------------+------------+
| lora | 32 |
+--------------+------------+
```

### Estimate

#### Across Multiple GPU Devices

Imagine you're preparing to run the [hierholzer/Llama-3.1-70B-Instruct-GGUF](https://huggingface.co/hierholzer/Llama-3.1-70B-Instruct-GGUF) model file across several hosts in your local network. Some of these hosts are equipped with GPU devices, while others do not have any GPU capabilities.

```mermaid
flowchart TD
    subgraph host4["Windows 11 (host4)"]
        ram40(["11GiB RAM remaining"])
    end
    subgraph host3["Apple macOS (host3)"]
        gpu10["Apple M1 Max (6GiB VRAM remaining)"]
    end
    subgraph host2["Windows 11 (host2)"]
        gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
    end
    subgraph host1["Ubuntu (host1)"]
        gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"]
        gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
    end
```

##### Single Host Multiple GPU Devices

Let's assume you plan to run the model on `host1` only.
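`host1` exposes two GPUs with 8 GiB and 10 GiB of free VRAM, so the command below passes `--tensor-split="8,10"`. As a rough mental model (a simplification, not the parser's exact placement algorithm), the offloadable layers are divided in proportion to the split fractions:

$$
Layers_i \approx Total\ Layers \times \frac{Split_i}{\sum_j Split_j}
$$

For the 81 offloadable layers here (80 transformer layers plus the output layer), that gives roughly $81 \times 8/18 = 36$ layers on the first GPU and the remaining $45$ on the second, which matches the `36 + 0` and `44 + 1` layer columns in the output below.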
```mermaid flowchart TD subgraph host1["Ubuntu (host1)"] gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"] gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] end ``` ```shell $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+----------------------------------------+ | RAM | VRAM 0 | VRAM 1 | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ | 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.27 GiB | 22.83 GiB | +--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+ ``` Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| | host1 | ENOUGH | 399.27 MiB | | | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 17.83 GiB | | | host1 (NVIDIA 4080 1) | | | 10 GiB | 22.83 GiB | | It appears that running the model on `host1` alone is not feasible. ##### Multiple Hosts Multiple GPU Devices Next, let's consider the scenario where you plan to run the model on `host4`, while offloading all layers to `host1`, `host2`, and `host3`. 
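When mixing local GPUs and RPC servers, GGUF Parser still infers the device count from `--tensor-split`: the leading split fractions map to the `--rpc` servers in the order they are listed, and any remaining fractions map to the local GPU devices. A minimal sketch of the pairing (all other flags elided, host names and ports as in the scenarios of this section):

```shell
$ # 5 split fractions but only 4 RPC servers: the first 4 fractions map to the
$ # RPC servers in order, and the trailing fraction maps to the local GPU.
$ gguf-parser ... \
    --tensor-split="11,12,8,10,6" \
    --rpc="host4:50052,host2:50052,host1:50052,host1:50053"
```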
```mermaid flowchart TD host4 -->|TCP| gpu10 host4 -->|TCP| gpu20 host4 -->|TCP| gpu30 host4 -->|TCP| gpu31 subgraph host4["Windows 11 (host4)"] ram40(["11GiB RAM remaining"]) end subgraph host3["Apple macOS (host3)"] gpu10["Apple M1 Max (6GiB VRAM remaining)"] end subgraph host2["Windows 11 (host2)"] gpu20["NVIDIA 4090 (12GiB VRAM remaining)"] end subgraph host1["Ubuntu (host1)"] gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"] gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] end ``` ```shell $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --estimate --in-short +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+ | RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ | 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 7.13 GiB | 7.64 GiB | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+ ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| | host4 | 11 GiB | 399.27 MiB | | | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 9.28 GiB | | | host1 (NVIDIA 4080 1) | | | 10 GiB | 11.32 GiB | | | host2 (NVIDIA 4090) | | | 12 GiB | 13.19 GiB | | | host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 7.13 GiB | | It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`. We should consider a different approach: running the model on `host3` while offloading all layers to `host1`, `host2`, and `host4`. 
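Before moving on, a note on reading these tables: the UMA columns apply to devices with unified memory (for example, Apple silicon, where CPU and GPU share one memory pool), while the NONUMA columns apply to discrete GPUs with dedicated VRAM. That is why the feasibility checks compare the NVIDIA cards against the NONUMA values (e.g., 9.28 GiB for RPC 0) but the Apple M1 Max against the UMA value (7.13 GiB for RPC 3).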
```mermaid flowchart TD host3 -->|TCP| ram40 host3 -->|TCP| gpu20 host3 -->|TCP| gpu30 host3 -->|TCP| gpu31 subgraph host4["Windows 11 (host4)"] ram40(["11GiB RAM remaining"]) end subgraph host3["Apple macOS (host3)"] gpu10["Apple M1 Max (6GiB VRAM remaining)"] end subgraph host2["Windows 11 (host2)"] gpu20["NVIDIA 4090 (12GiB VRAM remaining)"] end subgraph host1["Ubuntu (host1)"] gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"] gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"] end ``` ```shell $ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --estimate --in-short +-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------+ | RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ | 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 302.50 MiB | 6.16 GiB | +--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+ ``` According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the following resource consumption: | Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result | |-----------------------|---------------|-------------|----------------|--------------|------------| | host3 (Apple M1 Max) | ENOUGH | 249.27 MiB | | | :thumbsup: | | host4 | 11 GiB | 9.79 GiB | | | :thumbsup: | | host2 (NVIDIA 4090) | | | 12 GiB | 10.35 GiB | :thumbsup: | | host1 (NVIDIA 4080 0) | | | 8 GiB | 7.01 GiB | :thumbsup: | | host1 (NVIDIA 4080 1) | | | 10 GiB | 8.54 GiB | :thumbsup: | | host3 (Apple M1 Max) | | | 6 GiB | 302.50 MiB | :thumbsup: | Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`. 
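The same estimation workflow can also be scripted against the Go package directly, which is handy when automating placement decisions like the ones above. A minimal sketch: `ParseGGUFFileFromHuggingFace` and the `TensorInfos` field come from this repository, error handling is kept deliberately simple, and the memory-estimate entry points live in `file_estimate__llamacpp.go`/`file_estimate_option.go` (not shown here).

```go
package main

import (
	"context"
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Fetch the GGUF header and metadata straight from Hugging Face;
	// the tensor data itself is never downloaded.
	gf, err := gguf.ParseGGUFFileFromHuggingFace(
		context.Background(),
		"hierholzer/Llama-3.1-70B-Instruct-GGUF",
		"Llama-3.1-70B-Instruct-Q4_K_M.gguf",
	)
	if err != nil {
		panic(err)
	}

	// TensorInfos holds one entry per tensor in the file.
	fmt.Printf("parsed %d tensors\n", len(gf.TensorInfos))

	// Memory estimation (context size, tensor split, RPC servers, ...) is
	// exposed via the Estimate* methods; see file_estimate__llamacpp.go.
}
```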
#### Maximum Tokens Per Second

The maximum TPS estimation of GGUF Parser is determined by the model's parameter size, context size, number of offloaded layers, and the devices on which the model runs. Among these factors, the devices' specifications are particularly important.

Inspired by [LLM inference speed of light](https://zeux.io/2024/03/15/llm-inference-sol/), GGUF Parser uses the **FLOPS** and **bandwidth** of a device as evaluation metrics:

- When the device is a CPU, FLOPS refers to the performance of that CPU, while bandwidth corresponds to the DRAM bandwidth.
- When the device is an (i)GPU, FLOPS indicates the performance of that (i)GPU, and bandwidth corresponds to the VRAM bandwidth.
- When the device is a specific host, FLOPS depends on whether the CPU or (i)GPU of that host is being used, while bandwidth corresponds to the bandwidth connecting the main node to that host. **After all, a chain is only as strong as its weakest link.** If the connection bandwidth between the main node and the host is equal to or greater than the host's own (V)RAM bandwidth, the (V)RAM bandwidth value is used instead.

##### CPU FLOPS Calculation

The floating-point performance of a CPU can be calculated using the following formula:

$$
CPU\ FLOPS = Number\ of\ Cores \times Core\ Frequency \times Floating\ Point\ Operations\ per\ Cycle
$$

The Apple M1 Max CPU features a total of 10 cores, consisting of 8 performance cores and 2 efficiency cores. The performance cores operate at a clock speed of 3.2 GHz, while the efficiency cores run at 2.2 GHz. All cores support the [ARM NEON instruction set](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)), which enables 128-bit SIMD operations, allowing multiple floating-point numbers to be processed simultaneously within a single CPU cycle. Specifically, using single-precision (32-bit) floating-point numbers, each cycle can handle 4 floating-point operations.

The peak floating-point performance of a single performance core is calculated as follows:

$$
Peak\ Performance = 3.2\ GHz \times 4\ FLOPs/Cycle = 12.8\ GFLOPS
$$

For a single efficiency core, the calculation is:

$$
Peak\ Performance = 2.2\ GHz \times 4\ FLOPs/Cycle = 8.8\ GFLOPS
$$

Thus, the overall peak floating-point performance of the entire CPU can be determined by combining the contributions of both core types:

$$
Peak\ Performance = 8\ Cores \times 12.8\ GFLOPS + 2\ Cores \times 8.8\ GFLOPS = 120\ GFLOPS
$$

> This results in an average performance of 12 GFLOPS per core. It is evident that the average performance achieved by
> utilizing both performance and efficiency cores is lower than that obtained by exclusively using performance cores.

##### Run LLaMA2-7B-Chat with Apple Silicon M-series

Take [TheBloke/Llama-2-7B-Chat-GGUF](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) as an example and estimate the maximum tokens per second for the Apple Silicon M-series using GGUF Parser.
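Conceptually, the estimates below follow the "speed of light" argument: for every generated token, the device must both read (roughly) all active weights once and execute a matching number of floating-point operations, so the token rate can exceed neither limit. As a first-order bound (a simplification; GGUF Parser's actual estimate also accounts for the context size, the offload split, and the inter-device links):

$$
Max\ TPS \le \min\left(\frac{FLOPS}{FLOPs\ per\ Token}, \frac{Bandwidth}{Bytes\ per\ Token}\right)
$$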
```shell
$ # Estimate full offloaded Q8_0 model
$ # Fill the <...> placeholders with the values of the target variant from the table below.
$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --estimate --in-short \
  -c 512 \
  --device-metric "<CPU FLOPS>;<RAM Bandwidth>" \
  --device-metric "<iGPU FLOPS>;<VRAM Bandwidth>"

$ # Estimate full offloaded Q4_0 model
$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --estimate --in-short \
  -c 512 \
  --device-metric "<CPU FLOPS>;<RAM Bandwidth>" \
  --device-metric "<iGPU FLOPS>;<VRAM Bandwidth>"
```

| Variant | CPU FLOPS (Performance Core) | iGPU FLOPS | (V)RAM Bandwidth | Q8_0 Max TPS | Q4_0 Max TPS |
|----------|------------------------------|------------------------|------------------|--------------|--------------|
| M1 | 51.2 GFLOPS (4 cores) | 2.6 TFLOPS (8 cores) | 68.3 GBps | 8.68 | 14.56 |
| M1 Pro | 102.4 GFLOPS (8 cores) | 5.2 TFLOPS (16 cores) | 204.8 GBps | 26.04 | 43.66 |
| M1 Max | 102.4 GFLOPS (8 cores) | 10.4 TFLOPS (32 cores) | 409.6 GBps | 52.08 | 87.31 |
| M1 Ultra | 204.8 GFLOPS (16 cores) | 21 TFLOPS (64 cores) | 819.2 GBps | 104.16 | 174.62 |
| M2 | 56 GFLOPS (4 cores) | 3.6 TFLOPS (10 cores) | 102.4 GBps | 13.02 | 21.83 |
| M2 Pro | 112 GFLOPS (8 cores) | 6.8 TFLOPS (19 cores) | 204.8 GBps | 26.04 | 43.66 |
| M2 Max | 112 GFLOPS (8 cores) | 13.6 TFLOPS (38 cores) | 409.6 GBps | 52.08 | 87.31 |
| M2 Ultra | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 104.16 | 174.62 |
| M3 | 64.96 GFLOPS (4 cores) | 4.1 TFLOPS (10 cores) | 102.4 GBps | 13.02 | 21.83 |
| M3 Pro | 97.44 GFLOPS (6 cores) | 7.4 TFLOPS (18 cores) | 153.6 GBps | 19.53 | 32.74 |
| M3 Max | 194.88 GFLOPS (12 cores) | 16.4 TFLOPS (40 cores) | 409.6 GBps | 52.08 | 87.31 |
| M4 | 70.56 GFLOPS (4 cores) | 4.1 TFLOPS | 120 GBps | 15.26 | 25.58 |

> References:
> - https://www.cpu-monkey.com/en/cpu_family-apple_m_series
> - https://nanoreview.net/
> - https://en.wikipedia.org/wiki/Apple_M1#Variants
> - https://en.wikipedia.org/wiki/Apple_M2#Variants
> - https://en.wikipedia.org/wiki/Apple_M3#Variants
> - https://en.wikipedia.org/wiki/Apple_M4#Variants

You can further verify the above results in [Performance of llama.cpp on Apple Silicon M-series](https://github.com/ggerganov/llama.cpp/discussions/4167#user-content-fn-1-e9a4caf2848534167e450e18fc4ede7f).

##### Run LLaMA3.1-405B-Instruct with Apple Mac Studio devices combined with Thunderbolt

This example uses [leafspark/Meta-Llama-3.1-405B-Instruct-GGUF](https://huggingface.co/leafspark/Meta-Llama-3.1-405B-Instruct-GGUF) to estimate the maximum tokens per second for three Apple Mac Studio devices connected via Thunderbolt.
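Note the mixed bandwidth units in the table below: `--device-metric` accepts both byte-based (GBps) and bit-based (Gbps) units, and once converted, the Thunderbolt link is by far the weakest link of this cluster:

$$
40\ Gbps = \frac{40}{8}\ GBps = 5\ GBps \ll 819.2\ GBps
$$

This is exactly the weakest-link situation described above, which is why adding more hosts over Thunderbolt does not scale the maximum TPS linearly in the result table further down.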
| Device | CPU FLOPS (Performance Core) | iGPU FLOPS | (V)RAM Bandwidth | Thunderbolt Bandwidth | Role |
|-------------------------------|------------------------------|------------------------|------------------|-----------------------|------------|
| Apple Mac Studio (M2 Ultra) 0 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | Main |
| Apple Mac Studio (M2 Ultra) 1 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | RPC Server |
| Apple Mac Studio (M2 Ultra) 2 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | RPC Server |

Get the maximum tokens per second with the following command:

```shell
$ # Explain the command:
$ # --tensor-split "<Tensor Split>"               <-- one of the splits from the result table below, e.g. "1,1,1"
$ # --device-metric "224GFLOPS;819.2GBps"         <-- Apple Mac Studio 0 CPU FLOPS and RAM Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 1 (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 2 (RPC 1) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps"        <-- Apple Mac Studio 0 iGPU FLOPS and VRAM Bandwidth
$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --estimate --in-short \
  --no-mmap \
  -c 512 \
  --rpc host1:port,host2:port \
  --tensor-split "<Tensor Split>" \
  --device-metric "224GFLOPS;819.2GBps" \
  --device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
  --device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
  --device-metric "27.2TFLOPS;819.2GBps"
```

| Tensor Split | Apple Mac Studio 0 RAM | Apple Mac Studio 1 VRAM (RPC 0) | Apple Mac Studio 2 VRAM (RPC 1) | Apple Mac Studio 0 VRAM | Q4_0 Max TPS |
|--------------|------------------------|---------------------------------|---------------------------------|-------------------------|--------------|
| 1,1,1 | 1.99 GiB | 72.74 GiB | 71.04 GiB | 70.96 GiB | 10.71 |
| 2,1,1 | 1.99 GiB | 108.26 GiB | 54.13 GiB | 52.35 GiB | 11.96 |
| 3,1,1 | 1.99 GiB | 130.25 GiB | 42.29 GiB | 42.20 GiB | 9.10 |
| 4,1,1 | 1.99 GiB | 143.78 GiB | 35.52 GiB | 35.44 GiB | 7.60 |

##### Run Qwen2.5-72B-Instruct with NVIDIA RTX 4080 and remote RPC by Apple Mac Studio (M2)

This example uses [Qwen/Qwen2.5-72B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF) to estimate the maximum tokens per second for NVIDIA RTX 4080 cards, with an Apple Mac Studio (M2) attached as a remote RPC server.
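The two RTX 4080 cards in this setup sit on different PCIe links, so their interconnect bandwidths differ even though the GPUs themselves are identical. With roughly 2 GBps per lane for PCIe 4.0 and 4 GBps per lane for PCIe 5.0, the per-slot figures used in the command below work out to:

$$
PCIe\ 5.0 \times 16 \approx 16 \times 4\ GBps = 64\ GBps, \qquad PCIe\ 4.0 \times 4 \approx 4 \times 2\ GBps = 8\ GBps
$$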
| Hardware | FLOPS | Bandwidth | |---------------------------------------------|--------------|------------| | Intel i5-14600k | 510.4 GFLOPS | | | 2 x Corsair Vengeance RGB DDR5-6000 (32GiB) | | 96 GBps | | 2 x NVIDIA GeForce RTX 4080 | 48.74 TFLOPS | 736.3 GBps | | Apple Mac Studio (M2) | 27.2 TFLOPS | 819.2 GBps | ```shell $ # Explain the command: $ # --tensor-split 20369,12935,13325 <-- Available Memory in MiB for each device $ # --device-metric "510.4GFLOPS;96GBps" <-- Intel i5-14600k CPU FLOPS and RAM Bandwidth $ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio (M2) (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;64GBps" <-- NVIDIA GeForce RTX 0 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 5.0 x16 Bandwidth $ # --device-metric "48.74TFLOPS;736.3GBps;8GBps" <-- NVIDIA GeForce RTX 1 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 4.0 x4 Bandwidth $ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --estimate --in-short \ --no-mmap \ -c 8192 \ --rpc host:port \ --tensor-split 20369,12935,13325 \ --device-metric "510.4GFLOPS;96GBps" \ --device-metric "27.2TFLOPS;819.2GBps;40Gbps" \ --device-metric "48.74TFLOPS;736.3GBps;64GBps" \ --device-metric "48.74TFLOPS;736.3GBps;8GBps" +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ESTIMATE | +-----------+------------------------------------------+----------------------------------------------+----------------------------------------+----------------------------------------+ | MAX TPS | RAM | RPC 0 (V)RAM | VRAM 0 | VRAM 1 | | +--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+ | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+ | 51.82 tps | 1 + 0 + 0 | 1.19 GiB | 1.34 GiB | 36 + 0 | 18.85 GiB | 20.17 GiB | 22 + 0 | 11.34 GiB | 12.66 GiB | 22 + 1 | 12.65 GiB | 13.97 GiB | +-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+ ``` #### Full Layers Offload (default) ```shell $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +------------------------------------------+------------------------------------------+ | RAM | VRAM 0 | +--------------------+----------+----------+----------------+------------+------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+----------+----------+----------------+------------+------------+ | 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | +--------------------+----------+----------+----------------+------------+------------+ ``` #### Zero Layers Offload ```shell $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=0 
--estimate --in-short
+------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+-------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------+-----------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 33.62 GiB |
+--------------------+------------+------------+----------------+--------+-----------+
```

#### Specific Layers Offload

```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=10 --estimate --in-short
+----------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+-----------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------+---------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------+---------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB |
+--------------------+------------+------------+----------------+--------+---------+
```

#### Specific Context Size

By default, the context size is retrieved from the model's metadata. Use `--ctx-size` to specify a different context size.

```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --ctx-size=4096 --estimate --in-short
+--------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+---------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+-----------+
| 1 + 0 + 0 | 404.53 MiB | 554.53 MiB | 126 + 1 | 3.94 GiB | 93.28 GiB |
+--------------------+------------+------------+----------------+----------+-----------+
```

#### Enable Flash Attention

By default, LLaMA.cpp disables Flash Attention. Enabling Flash Attention reduces VRAM usage but increases GPU/CPU usage. Use `--flash-attention` to enable Flash Attention. Note that not all models support Flash Attention; if the model does not support it, "FLASH ATTENTION" shows "Disabled" even if you enable it.
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --flash-attention --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 215.98 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```

#### Disable MMap

By default, LLaMA.cpp loads the model weights via memory-mapping (mmap). On Apple macOS, memory-mapping is an efficient way to load the model and results in lower VRAM usage; on other platforms, it only affects the first-time model loading speed. Use `--no-mmap` to disable loading the model via memory-mapping. Note that some models require loading the whole weights into memory; if the model does not support mmap, "MMAP LOAD" shows "Unsupported".

```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --no-mmap --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 2.97 GiB | 3.12 GiB | 126 + 1 | 214.24 GiB | 246.86 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```

#### With Adapter

Use `--lora`/`--control-vector` to estimate the usage when loading a model with adapters.

```shell
$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+--------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+----------+
| 1 + 0 + 0 | 210.80 MiB | 360.80 MiB | 32 + 1 | 1.25 GiB | 7.04 GiB |
+--------------------+------------+------------+----------------+----------+----------+

$ # With a LoRA adapter.
$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --estimate --in-short +-------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+--------------------------------------+ | RAM | VRAM 0 | +--------------------+------------+------------+----------------+----------+----------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+----------+----------+ | 1 + 0 + 0 | 223.91 MiB | 373.91 MiB | 32 + 1 | 1.42 GiB | 7.20 GiB | +--------------------+------------+------------+----------------+----------+----------+ ``` #### Get Proper Offload Layers Use `--gpu-layers-step` to get the proper offload layers number when the model is too large to fit into the GPUs memory. ```shell $ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers-step=6 --estimate --in-short +-----------------------------------------------------------------------------------------+ | ESTIMATE | +----------------------------------------------+------------------------------------------+ | RAM | VRAM 0 | +--------------------+------------+------------+----------------+------------+------------+ | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 120 + 1 | 121.90 GiB | 122.05 GiB | 6 + 0 | 6 GiB | 44.68 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 114 + 1 | 115.90 GiB | 116.05 GiB | 12 + 0 | 12 GiB | 54.74 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 108 + 1 | 109.90 GiB | 110.05 GiB | 18 + 0 | 18 GiB | 64.80 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 102 + 1 | 103.90 GiB | 104.05 GiB | 24 + 0 | 24 GiB | 74.86 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 96 + 1 | 97.90 GiB | 98.05 GiB | 30 + 0 | 30 GiB | 84.93 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 90 + 1 | 91.90 GiB | 92.05 GiB | 36 + 0 | 36 GiB | 94.99 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 84 + 1 | 85.90 GiB | 86.05 GiB | 42 + 0 | 42 GiB | 105.05 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 78 + 1 | 79.90 GiB | 80.05 GiB | 48 + 0 | 48 GiB | 115.11 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 72 + 1 | 73.90 GiB | 74.05 GiB | 54 + 0 | 54 GiB | 125.17 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 66 + 1 | 67.90 GiB | 68.05 GiB | 60 + 0 | 60 GiB | 135.23 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 60 + 1 | 61.90 GiB | 62.05 GiB | 66 + 0 | 66 GiB | 145.29 GiB | 
+--------------------+------------+------------+----------------+------------+------------+ | 1 + 54 + 1 | 55.90 GiB | 56.05 GiB | 72 + 0 | 72 GiB | 155.35 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 48 + 1 | 49.90 GiB | 50.05 GiB | 78 + 0 | 78 GiB | 165.42 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 42 + 1 | 43.90 GiB | 44.05 GiB | 84 + 0 | 84 GiB | 175.48 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 36 + 1 | 37.90 GiB | 38.05 GiB | 90 + 0 | 90 GiB | 185.54 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 30 + 1 | 31.90 GiB | 32.05 GiB | 96 + 0 | 96 GiB | 195.60 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 24 + 1 | 25.90 GiB | 26.05 GiB | 102 + 0 | 102 GiB | 205.66 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 18 + 1 | 19.90 GiB | 20.05 GiB | 108 + 0 | 108 GiB | 215.72 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 12 + 1 | 13.90 GiB | 14.05 GiB | 114 + 0 | 114 GiB | 226.05 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 6 + 1 | 7.90 GiB | 8.05 GiB | 120 + 0 | 120 GiB | 236.64 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 0 + 1 | 1.90 GiB | 2.05 GiB | 126 + 0 | 126 GiB | 246.24 GiB | +--------------------+------------+------------+----------------+------------+------------+ | 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB | +--------------------+------------+------------+----------------+------------+------------+ ``` ## License MIT ================================================ FILE: cache.go ================================================ package gguf_parser import ( "errors" "fmt" "os" "path/filepath" "time" "github.com/gpustack/gguf-parser-go/util/json" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/stringx" ) var ( ErrGGUFFileCacheDisabled = errors.New("GGUF file cache disabled") ErrGGUFFileCacheMissed = errors.New("GGUF file cache missed") ErrGGUFFileCacheCorrupted = errors.New("GGUF file cache corrupted") ) type GGUFFileCache string func (c GGUFFileCache) getKeyPath(key string) string { k := stringx.SumByFNV64a(key) p := filepath.Join(string(c), k[:1], k) return p } func (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, error) { if c == "" { return nil, ErrGGUFFileCacheDisabled } if key == "" { return nil, ErrGGUFFileCacheMissed } p := c.getKeyPath(key) if !osx.Exists(p, func(stat os.FileInfo) bool { if !stat.Mode().IsRegular() { return false } return exp == 0 || time.Since(stat.ModTime()) < exp }) { return nil, ErrGGUFFileCacheMissed } var gf GGUFFile { bs, err := os.ReadFile(p) if err != nil { return nil, fmt.Errorf("GGUF file cache get: %w", err) } if err = json.Unmarshal(bs, &gf); err != nil { return nil, fmt.Errorf("GGUF file cache get: %w", err) } } if len(gf.TensorInfos) == 0 { _ = os.Remove(p) return nil, ErrGGUFFileCacheCorrupted } return &gf, nil } func (c GGUFFileCache) Put(key string, gf *GGUFFile) error { if c == "" { return ErrGGUFFileCacheDisabled } if key == "" || gf == nil { return nil } bs, err := json.Marshal(gf) if err != nil { return fmt.Errorf("GGUF file cache put: %w", 
err) } p := c.getKeyPath(key) if err = osx.WriteFile(p, bs, 0o600); err != nil { return fmt.Errorf("GGUF file cache put: %w", err) } return nil } func (c GGUFFileCache) Delete(key string) error { if c == "" { return ErrGGUFFileCacheDisabled } if key == "" { return ErrGGUFFileCacheMissed } p := c.getKeyPath(key) if !osx.ExistsFile(p) { return ErrGGUFFileCacheMissed } if err := os.Remove(p); err != nil { return fmt.Errorf("GGUF file cache delete: %w", err) } return nil } ================================================ FILE: cmd/gguf-parser/README.md ================================================ # GGUF Parser Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files, estimate the memory usage for [llama.cpp](https://github.com/ggerganov/llama.cpp), [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) and [llama-box](https://github.com/gpustack/llama-box). See [GGUF Parser detail introduction](https://github.com/gpustack/gguf-parser-go) for more information. ## Usage ```shell $ gguf-parser --help NAME: gguf-parser - Review/Check GGUF files and estimate the memory usage. USAGE: gguf-parser [GLOBAL OPTIONS] VERSION: ... GLOBAL OPTIONS: --debug Enable debugging, verbosity. (default: false) --help, -h Print the usage. --version, -v Print the version. Estimate --device-metric value [ --device-metric value ] Specify the device metrics, which is used to estimate the throughput, in form of "FLOPS;Up Bandwidth[;Down Bandwidth]". The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. For example, "--device-metric 10TFLOPS;400GBps" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, "--device-metric 10TFLOPS;400GBps;5000MBps" means the device has 5000MBps Down bandwidth. If the quantity specified by "--device-metric" is less than the number of estimation devices(determined by "--tensor-split" and "--rpc" to infer the device count), then replicate the last "--device-metric" to meet the required number of evaluation devices. --flash-attention, --flash-attn, --fa, --diffusion-fa Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false) --gpu-layers value, --ngl value, --n-gpu-layers value Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default: -1) --main-gpu value, --mg value Specify the GPU to use for the model (with "--split-mode=none") or for intermediate results and KV (with "--split-mode=row"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, "--main-gpu" only works when "--tensor-split" is set. (default: 0) --no-flash-attention, --no-flash-attn Specify disabling Flash Attention. (default: false) --override-tensor value, --ot value [ --override-tensor value, --ot value ] Override tensor buffer type, for example, use --override-tensor "[2-9][0-9]\.ffn_.*_exps\.=CPU" to keep experts of layers 20-99 in the CPU --parallel-size value, --parallel value, --np value, --threads-http value Specify the number of parallel sequences to decode, which is used to estimate the usage. 
(default: 1) --platform-footprint value Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is "150,250". Different platform always gets different RAM and VRAM footprints, for example, within CUDA, "cudaMemGetInfo" or "cudaSetDevice" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250") --rpc value Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Woks with "--tensor-split". --tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set "--tensor-split" to indicate how many devices are used. To declare the devices belong to RPC servers, set "--rpc" please. Estimate/LLaMACpp --batch-size value, -b value Specify the logical batch size, which is used to estimate the usage. (default: 2048) --cache-type-k value, --ctk value Specify the type of Key cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16") --cache-type-v value, --ctv value Specify the type of Value cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16") --ctx-size value, -c value Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default: 0) --gpu-layers-draft value, --ngld value, --n-gpu-layers-draft value Specify how many layers of the draft model to offload, which is used to estimate the usage, default is full offloaded. (default: -1) --gpu-layers-step value Specify the step of layers to offload, works with "--gpu-layers". (default: 0) --in-max-ctx-size Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size. (default: false) --max-projected-cache value, --visual-max-image-cache value Specify how many projected embedding to be cached. (default: 0) --mmap Specify enabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false) --no-kv-offload, --nkvo Specify disabling Key-Value offloading, which is used to estimate the usage. Disable Key-Value offloading can reduce the usage of VRAM. (default: false) --no-mmap Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false) --rope-freq-base value RoPE base frequency, used by NTK-aware scaling. (default: 0) --rope-freq-scale value RoPE frequency scaling factor, expands context by a factor of 1/N. (default: 0) --rope-scale value RoPE context scaling factor, expands context by a factor of N. (default: 0) --rope-scaling value RoPE frequency scaling method, defaults to linear unless specified by the model, select from [none, linear, yarn]. --split-mode value, --sm value Specify how to split the model across multiple devices, which is used to estimate the usage, select from [layer, row, none]. Since gguf-parser always estimates the usage of VRAM, "none" is meaningless here, keep for compatibility. (default: "layer") --swa-full Specify using full-size SWA cache. 
(default: false) --ubatch-size value, --ub value Specify the physical maximum batch size, which is used to estimate the usage. (default: 512) --visual-max-image-size value Specify maximum image size when completion with vision model. (default: 0) --yarn-orig-ctx value YaRN original context size of model, defaults to model training context size. (default: 0) Estimate/StableDiffusionCpp --image-autoencoder-tiling, --vae-tiling, --image-vae-tiling Specify to enable tiling for the vae model. (default: false) --image-batch-count value, --batch-count value, --image-max-batch value Specify the batch(generation) count of the image. (default: 1) --image-free-compute-memory-immediately Specify to free the compute memory immediately after the generation, which burst using VRAM. (default: false) --image-height value, --height value, --image-max-height value Specify the (maximum) height of the image. (default: 1024) --image-no-autoencoder-offload, --vae-on-cpu, --image-no-vae-model-offload Specify to offload the vae model to CPU. (default: false) --image-no-autoencoder-tiling, --image-no-vae-tiling Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling. (default: false) --image-no-conditioner-offload, --clip-on-cpu, --image-no-text-encoder-model-offload Specify to offload the text encoder model to CPU. (default: false) --image-no-control-net-offload, --control-net-cpu, --image-no-control-net-model-offload Specify to offload the control net model to CPU. (default: false) --image-width value, --width value, --image-max-width value Specify the (maximum) width of the image. (default: 1024) Load --cache-expiration value Specify the expiration of cache, works with "--url/--hf-*/--ms-*/--ol-*". (default: 24h0m0s) --cache-path value Cache the read result to the path, works with "--url/--hf-*/--ms-*/--ol-*". (default: "/Users/thxcode/.cache/gguf-parser") --skip-cache Skip cache, works with "--url/--hf-*/--ms-*/--ol-*", default is caching the read result. (default: false) [$SKIP_CACHE] --skip-dns-cache Skip DNS cache, works with "--url/--hf-*/--ms-*/--ol-*", default is caching the DNS lookup result. (default: false) [$SKIP_DNS_CACHE] --skip-proxy Skip proxy settings, works with "--url/--hf-*/--ms-*/--ol-*", default is respecting the environment variables "HTTP_PROXY/HTTPS_PROXY/NO_PROXY". (default: false) [$SKIP_PROXY] --skip-range-download-detect Skip range download detect, works with "--url/--hf-*/--ms-*/--ol-*", default is detecting the range download support. (default: false) [$SKIP_RANGE_DOWNLOAD_DETECT] --skip-tls-verify Skip TLS verification, works with "--url/--hf-*/--ms-*/--ol-*", default is verifying the TLS certificate on HTTPs request. (default: false) [$SKIP_TLS_VERIFY] Model/Local --control-net-path value, --control-net value, --image-control-net-model value Path where the GGUF file to load for the Control Net model, optional. --control-vector-path value, --control-vector value [ --control-vector-path value, --control-vector value ] Path where the GGUF file to load for the Control Vector adapter, optional. --draft-path value, --model-draft value, --md value Path where the GGUF file to load for the draft model, optional, e.g. "~/.cache/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF/Qwen2-1.5B-Instruct.Q5_K_M.gguf". --lora-path value, --lora value [ --lora-path value, --lora value ] Path where the GGUF file to load for the LoRA adapter, optional. 
--mmproj-path value, --mmproj value Path where the GGUF file to load for the multimodal projector, optional. --path value, --model value, -m value Path where the GGUF file to load for the main model, e.g. "~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf". --upscale-path value, --upscale-model value, --image-upscale-model value Path where the GGUF file to load for the Upscale model, optional. Model/Remote --control-net-url value Url where the GGUF file to load for the Control Net model, optional. --control-vector-url value [ --control-vector-url value ] Url where the GGUF file to load for the Control Vector adapter, optional. --draft-url value Url where the GGUF file to load for the draft model, optional, e.g. "https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf". Note that gguf-parser does not need to download the entire GGUF file. --header value [ --header value ] Custom HTTP header in "Key: Value" format, works with "--url/--draft-url". --lora-url value [ --lora-url value ] Url where the GGUF file to load for the LoRA adapter, optional. --mmproj-url value Url where the GGUF file to load for the multimodal projector, optional. --token value Bearer auth token to load GGUF file, optional, works with "--url/--draft-url". [$TOKEN] --upscale-url value Url where the GGUF file to load for the Upscale model, optional. --url value, --model-url value, --mu value Url where the GGUF file to load for the main model, e.g. "https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf". Note that gguf-parser does not need to download the entire GGUF file. Model/Remote/HuggingFace --hf-control-net-file value Model file below the "--hf-control-net-repo", optional. --hf-control-net-repo value Repository of HuggingFace which the GGUF file store for the Control Net model, optional, works with "--hf-control-net-file". --hf-control-vector-file value [ --hf-control-vector-file value ] Control Vector adapter file below the "--hf-repo". --hf-draft-file value Model file below the "--hf-draft-repo", optional, e.g. "Qwen2-1.5B-Instruct.Q5_K_M.gguf". --hf-draft-repo value Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. "QuantFactory/Qwen2-1.5B-Instruct-GGUF", works with "--hf-draft-file". --hf-file value, --hff value Model file below the "--hf-repo", e.g. "Qwen2-7B-Instruct.Q5_K_M.gguf". --hf-lora-file value [ --hf-lora-file value ] LoRA adapter file below the "--hf-repo". --hf-mmproj-file value Multimodal projector file below the "--hf-repo". --hf-repo value, --hfr value Repository of HuggingFace which the GGUF file store for the main model, e.g. "QuantFactory/Qwen2-7B-Instruct-GGUF", works with "--hf-file". --hf-token value, --hft value User access token of HuggingFace, optional, works with "--hf-repo/--hf-file pair" or "--hf-draft-repo/--hf-draft-file" pair. See https://huggingface.co/settings/tokens. [$HF_TOKEN] --hf-upscale-file value Model file below the "--hf-upscale-repo", optional. --hf-upscale-repo value Repository of HuggingFace which the GGUF file store for the Upscale model, optional, works with "--hf-upscale-file". Model/Remote/ModelScope --ms-control-net-file value Model file below the "--ms-control-net-repo", optional. --ms-control-net-repo value Repository of ModelScope which the GGUF file store for the Control Net model, optional, works with "--ms-control-net-file". 
--ms-control-vector-file value [ --ms-control-vector-file value ] Control Vector adapter file below the "--ms-repo". --ms-draft-file value Model file below the "--ms-draft-repo", optional, e.g. "qwen1_5-1_8b-chat-q5_k_m.gguf". --ms-draft-repo value Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. "qwen/Qwen1.5-1.8B-Chat-GGUF", works with "--ms-draft-file". --ms-file value Model file below the "--ms-repo", e.g. "qwen1_5-7b-chat-q5_k_m.gguf". --ms-lora-file value [ --ms-lora-file value ] LoRA adapter file below the "--ms-repo". --ms-mmproj-file value Multimodal projector file below the "--ms-repo". --ms-repo value Repository of ModelScope which the GGUF file store for the main model, e.g. "qwen/Qwen1.5-7B-Chat-GGUF", works with "--ms-file". --ms-token value Git access token of ModelScope, optional, works with "--ms-repo/--ms-file" pair or "--ms-draft-repo/--ms-draft-file" pair. See https://modelscope.cn/my/myaccesstoken. [$HF_TOKEN, $MS_TOKEN] --ms-upscale-file value Model file below the "--ms-upscale-repo", optional. --ms-upscale-repo value Repository of ModelScope which the GGUF file store for the Upscale model, optional, works with "--ms-upscale-file". Model/Remote/Ollama --ol-base-url value Model base URL of Ollama, e.g. https://registry.ollama.ai. (default: "https://registry.ollama.ai") --ol-model value Model name of Ollama, e.g. "gemma2". --ol-usage Specify respecting the extending layers introduced by Ollama, works with "--ol-model", which affects the usage estimation. (default: false) Output --estimate Skip all the information except the estimate result. (default: false) --in-mib Display the estimated result in table with MiB. (default: false) --in-short Display the estimated result in table in short form. (default: false) --json Output as JSON. (default: false) --json-pretty Works with "--json", to output pretty format JSON. (default: true) --raw Output the GGUF file information as JSON only, skip anything. (default: false) --raw-output value Works with "--raw", to save the result to the file --skip-architecture Skip to display architecture. (default: false) --skip-estimate Skip to estimate. By default, gguf-parser always estimates the file which types with "model". (default: false) --skip-metadata Skip to display metadata. (default: false) --skip-tokenizer Skip to display tokenizer. By default, gguf-parser always displays the tokenizer of the file which types with "model". (default: false) ``` ### Environment Variables Support - `TOKEN`: The bearer auth token to load GGUF file, works with `--url/--draft-url`. - `HF_ENDPOINT`: The HuggingFace endpoint, default is `https://huggingface.co`. - `HF_TOKEN`: The HuggingFace token, see [HuggingFace token](https://huggingface.co/settings/tokens). - `MS_ENDPOINT`: The ModelScope endpoint, default is `https://modelscope.cn`. - `MS_TOKEN`: The ModelScope token, see [ModelScope token](https://modelscope.cn/my/myaccesstoken). 
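### Library Usage

The CLI is a thin wrapper over the parent `github.com/gpustack/gguf-parser-go` module, so the same parse-then-estimate flow can be scripted directly in Go. Below is a minimal, illustrative sketch: it reuses the HuggingFace repository and file from the flag examples above, the 8192-token context size is an arbitrary choice, and the option names are those this CLI itself calls.

```go
package main

import (
	"context"
	"fmt"

	parser "github.com/gpustack/gguf-parser-go"
)

func main() {
	ctx := context.Background()

	// Parse a remote GGUF file; only the header and metadata are read over
	// ranged requests, the weights themselves are never downloaded.
	gf, err := parser.ParseGGUFFileFromHuggingFace(ctx,
		"QuantFactory/Qwen2-7B-Instruct-GGUF", "Qwen2-7B-Instruct.Q5_K_M.gguf",
		parser.SkipLargeMetadata(), parser.UseCache())
	if err != nil {
		panic(err)
	}

	// Estimate the memory usage as llama.cpp would run the model.
	e := gf.EstimateLLaMACppRun(parser.WithLLaMACppContextSize(8192))

	// Summarize with mmap enabled and no extra platform footprint.
	s := e.Summarize(true, 0, 0)
	fmt.Println("RAM (UMA):", s.Items[0].RAM.UMA)
	for _, v := range s.Items[0].VRAMs {
		fmt.Printf("VRAM %d (NonUMA): %v\n", v.Position, v.NonUMA)
	}
}
```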
## License MIT ================================================ FILE: cmd/gguf-parser/go.mod ================================================ module github.com/gpustack/gguf-parser-go/cmd/gguf-parser go 1.22.0 toolchain go1.22.9 replace github.com/gpustack/gguf-parser-go => ../../ require ( github.com/gpustack/gguf-parser-go v0.6.0 github.com/jedib0t/go-pretty/v6 v6.6.1 github.com/urfave/cli/v2 v2.27.5 ) require ( github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect github.com/henvic/httpretty v0.1.4 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/mattn/go-runewidth v0.0.16 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/rivo/uniseg v0.4.7 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect golang.org/x/crypto v0.29.0 // indirect golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect golang.org/x/mod v0.22.0 // indirect golang.org/x/sync v0.9.0 // indirect golang.org/x/sys v0.27.0 // indirect golang.org/x/tools v0.27.0 // indirect gonum.org/v1/gonum v0.15.1 // indirect ) ================================================ FILE: cmd/gguf-parser/go.sum ================================================ github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc= github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU= github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM= github.com/jedib0t/go-pretty/v6 v6.6.1 h1:iJ65Xjb680rHcikRj6DSIbzCex2huitmc7bDtxYVWyc= github.com/jedib0t/go-pretty/v6 v6.6.1/go.mod h1:zbn98qrYlh95FIhwwsbIip0LYpwSG8SUOScs+v9/t0E= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc= github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg 
v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w= github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4= github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM= golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o= golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: cmd/gguf-parser/main.go ================================================ package main import ( "errors" "fmt" "net" "os" "path/filepath" "regexp" "strconv" "strings" "sync" "time" "github.com/gpustack/gguf-parser-go/util/anyx" "github.com/gpustack/gguf-parser-go/util/json" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/signalx" "github.com/jedib0t/go-pretty/v6/table" "github.com/jedib0t/go-pretty/v6/text" "github.com/urfave/cli/v2" . 
"github.com/gpustack/gguf-parser-go" // nolint: stylecheck ) var Version = "v0.0.0" func init() { cli.VersionFlag = &cli.BoolFlag{ Name: "version", Aliases: []string{"v"}, Usage: "Print the version.", DisableDefaultText: true, } cli.HelpFlag = &cli.BoolFlag{ Name: "help", Aliases: []string{"h"}, Usage: "Print the usage.", DisableDefaultText: true, } } func main() { name := filepath.Base(os.Args[0]) app := &cli.App{ Name: name, Usage: "Review/Check GGUF files and estimate the memory usage and provide optimization suggestions.", UsageText: name + " [GLOBAL OPTIONS]", Version: Version, Reader: os.Stdin, Writer: os.Stdout, ErrWriter: os.Stderr, HideHelpCommand: true, OnUsageError: func(c *cli.Context, _ error, _ bool) error { return cli.ShowAppHelp(c) }, Flags: []cli.Flag{ &cli.BoolFlag{ Destination: &debug, Value: debug, Name: "debug", Usage: "Enable debugging, verbosity.", }, &cli.StringFlag{ Destination: &path, Value: path, Category: "Model/Local", Name: "path", Aliases: []string{ // LLaMACpp compatibility "model", "m", }, Usage: "Path where the GGUF file to load for the main model, e.g. \"~/.cache" + "/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF" + "/Qwen2-7B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &draftPath, Value: draftPath, Category: "Model/Local", Name: "draft-path", Aliases: []string{ // LLaMACpp compatibility "model-draft", "md", }, Usage: "Path where the GGUF file to load for the draft model, optional, e.g. \"~/.cache" + "/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF" + "/Qwen2-1.5B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &mmprojPath, Value: mmprojPath, Category: "Model/Local", Name: "mmproj-path", Aliases: []string{ // LLaMACpp compatibility "mmproj", }, Usage: "Path where the GGUF file to load for the multimodal projector, optional.", }, &cli.StringSliceFlag{ Destination: &loraPaths, Category: "Model/Local", Name: "lora-path", Aliases: []string{ // LLaMACpp compatibility "lora", }, Usage: "Path where the GGUF file to load for the LoRA adapter, optional.", }, &cli.StringSliceFlag{ Destination: &controlVectorPaths, Category: "Model/Local", Name: "control-vector-path", Aliases: []string{ // LLaMACpp compatibility "control-vector", }, Usage: "Path where the GGUF file to load for the Control Vector adapter, optional.", }, &cli.StringFlag{ Destination: &upscalePath, Value: upscalePath, Category: "Model/Local", Name: "upscale-path", Aliases: []string{ "upscale-model", // StableDiffusionCpp compatibility "image-upscale-model", // LLaMABox compatibility }, Usage: "Path where the GGUF file to load for the Upscale model, optional.", }, &cli.StringFlag{ Destination: &controlNetPath, Value: controlNetPath, Category: "Model/Local", Name: "control-net-path", Aliases: []string{ "control-net", // StableDiffusionCpp compatibility "image-control-net-model", // LLaMABox compatibility }, Usage: "Path where the GGUF file to load for the Control Net model, optional.", }, &cli.StringFlag{ Destination: &url, Value: url, Category: "Model/Remote", Name: "url", Aliases: []string{ "model-url", "mu", }, Usage: "Url where the GGUF file to load for the main model, e.g. " + "\"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF" + "/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf\". " + "Note that gguf-parser does not need to download the entire GGUF file.", }, &cli.StringFlag{ Destination: &draftUrl, Value: draftUrl, Category: "Model/Remote", Name: "draft-url", Usage: "Url where the GGUF file to load for the draft model, optional, e.g. 
" + "\"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF" + "/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf\". " + "Note that gguf-parser does not need to download the entire GGUF file.", }, &cli.StringFlag{ Destination: &mmprojUrl, Value: mmprojUrl, Category: "Model/Remote", Name: "mmproj-url", Usage: "Url where the GGUF file to load for the multimodal projector, optional.", }, &cli.StringSliceFlag{ Destination: &loraUrls, Category: "Model/Remote", Name: "lora-url", Usage: "Url where the GGUF file to load for the LoRA adapter, optional.", }, &cli.StringSliceFlag{ Destination: &controlVectorUrls, Category: "Model/Remote", Name: "control-vector-url", Usage: "Url where the GGUF file to load for the Control Vector adapter, optional.", }, &cli.StringFlag{ Destination: &upscaleUrl, Value: upscaleUrl, Category: "Model/Remote", Name: "upscale-url", Usage: "Url where the GGUF file to load for the Upscale model, optional.", }, &cli.StringFlag{ Destination: &controlNetUrl, Value: controlNetUrl, Category: "Model/Remote", Name: "control-net-url", Usage: "Url where the GGUF file to load for the Control Net model, optional.", }, &cli.StringFlag{ Destination: &token, Value: token, Category: "Model/Remote", Name: "token", EnvVars: []string{ "TOKEN", }, Usage: "Bearer auth token to load GGUF file, optional, " + "works with \"--url/--draft-url\".", }, &cli.StringSliceFlag{ Destination: &headers, Category: "Model/Remote", Name: "header", Usage: "Custom HTTP header in \"Key: Value\" format, " + "works with \"--url/--draft-url\".", }, &cli.StringFlag{ Destination: &hfRepo, Value: hfRepo, Category: "Model/Remote/HuggingFace", Name: "hf-repo", Aliases: []string{ // LLaMACpp compatibility "hfr", }, Usage: "Repository of HuggingFace which the GGUF file store for the main model, e.g. " + "\"QuantFactory/Qwen2-7B-Instruct-GGUF\", works with \"--hf-file\".", }, &cli.StringFlag{ Destination: &hfFile, Value: hfFile, Category: "Model/Remote/HuggingFace", Name: "hf-file", Aliases: []string{ // LLaMACpp compatibility "hff", }, Usage: "Model file below the \"--hf-repo\", e.g. " + "\"Qwen2-7B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &hfDraftRepo, Value: hfDraftRepo, Category: "Model/Remote/HuggingFace", Name: "hf-draft-repo", Usage: "Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. " + "\"QuantFactory/Qwen2-1.5B-Instruct-GGUF\", works with \"--hf-draft-file\".", }, &cli.StringFlag{ Destination: &hfDraftFile, Value: hfDraftFile, Category: "Model/Remote/HuggingFace", Name: "hf-draft-file", Usage: "Model file below the \"--hf-draft-repo\", optional, e.g. 
" + "\"Qwen2-1.5B-Instruct.Q5_K_M.gguf\".", }, &cli.StringFlag{ Destination: &hfMMProjFile, Value: hfMMProjFile, Category: "Model/Remote/HuggingFace", Name: "hf-mmproj-file", Usage: "Multimodal projector file below the \"--hf-repo\".", }, &cli.StringSliceFlag{ Destination: &hfLoRAFiles, Category: "Model/Remote/HuggingFace", Name: "hf-lora-file", Usage: "LoRA adapter file below the \"--hf-repo\".", }, &cli.StringSliceFlag{ Destination: &hfControlVectorFiles, Category: "Model/Remote/HuggingFace", Name: "hf-control-vector-file", Usage: "Control Vector adapter file below the \"--hf-repo\".", }, &cli.StringFlag{ Destination: &hfUpscaleRepo, Value: hfUpscaleRepo, Category: "Model/Remote/HuggingFace", Name: "hf-upscale-repo", Usage: "Repository of HuggingFace which the GGUF file store for the Upscale model, optional, " + "works with \"--hf-upscale-file\".", }, &cli.StringFlag{ Destination: &hfUpscaleFile, Value: hfUpscaleFile, Category: "Model/Remote/HuggingFace", Name: "hf-upscale-file", Usage: "Model file below the \"--hf-upscale-repo\", optional.", }, &cli.StringFlag{ Destination: &hfControlNetRepo, Value: hfControlNetRepo, Category: "Model/Remote/HuggingFace", Name: "hf-control-net-repo", Usage: "Repository of HuggingFace which the GGUF file store for the Control Net model, optional, " + "works with \"--hf-control-net-file\".", }, &cli.StringFlag{ Destination: &hfControlNetFile, Value: hfControlNetFile, Category: "Model/Remote/HuggingFace", Name: "hf-control-net-file", Usage: "Model file below the \"--hf-control-net-repo\", optional.", }, &cli.StringFlag{ Destination: &hfToken, Value: hfToken, Category: "Model/Remote/HuggingFace", Name: "hf-token", Aliases: []string{ // LLaMACpp compatibility "hft", }, EnvVars: []string{ "HF_TOKEN", }, Usage: "User access token of HuggingFace, optional, " + "works with \"--hf-repo/--hf-file pair\" or \"--hf-draft-repo/--hf-draft-file\" pair. " + "See https://huggingface.co/settings/tokens.", }, &cli.StringFlag{ Destination: &msRepo, Value: msRepo, Category: "Model/Remote/ModelScope", Name: "ms-repo", Usage: "Repository of ModelScope which the GGUF file store for the main model, e.g. " + "\"qwen/Qwen1.5-7B-Chat-GGUF\", works with \"--ms-file\".", }, &cli.StringFlag{ Destination: &msFile, Value: msFile, Category: "Model/Remote/ModelScope", Name: "ms-file", Usage: "Model file below the \"--ms-repo\", e.g. " + "\"qwen1_5-7b-chat-q5_k_m.gguf\".", }, &cli.StringFlag{ Destination: &msDraftRepo, Value: msDraftRepo, Category: "Model/Remote/ModelScope", Name: "ms-draft-repo", Usage: "Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. " + "\"qwen/Qwen1.5-1.8B-Chat-GGUF\", works with \"--ms-draft-file\".", }, &cli.StringFlag{ Destination: &msDraftFile, Value: msDraftFile, Category: "Model/Remote/ModelScope", Name: "ms-draft-file", Usage: "Model file below the \"--ms-draft-repo\", optional, e.g. 
" + "\"qwen1_5-1_8b-chat-q5_k_m.gguf\".", }, &cli.StringFlag{ Destination: &msMMProjFile, Value: msMMProjFile, Category: "Model/Remote/ModelScope", Name: "ms-mmproj-file", Usage: "Multimodal projector file below the \"--ms-repo\".", }, &cli.StringSliceFlag{ Destination: &msLoRAFiles, Category: "Model/Remote/ModelScope", Name: "ms-lora-file", Usage: "LoRA adapter file below the \"--ms-repo\".", }, &cli.StringSliceFlag{ Destination: &msControlVectorFiles, Category: "Model/Remote/ModelScope", Name: "ms-control-vector-file", Usage: "Control Vector adapter file below the \"--ms-repo\".", }, &cli.StringFlag{ Destination: &msUpscaleRepo, Value: msUpscaleRepo, Category: "Model/Remote/ModelScope", Name: "ms-upscale-repo", Usage: "Repository of ModelScope which the GGUF file store for the Upscale model, optional, " + "works with \"--ms-upscale-file\".", }, &cli.StringFlag{ Destination: &msUpscaleFile, Value: msUpscaleFile, Category: "Model/Remote/ModelScope", Name: "ms-upscale-file", Usage: "Model file below the \"--ms-upscale-repo\", optional.", }, &cli.StringFlag{ Destination: &msControlNetRepo, Value: msControlNetRepo, Category: "Model/Remote/ModelScope", Name: "ms-control-net-repo", Usage: "Repository of ModelScope which the GGUF file store for the Control Net model, optional, " + "works with \"--ms-control-net-file\".", }, &cli.StringFlag{ Destination: &msControlNetFile, Value: msControlNetFile, Category: "Model/Remote/ModelScope", Name: "ms-control-net-file", Usage: "Model file below the \"--ms-control-net-repo\", optional.", }, &cli.StringFlag{ Destination: &msToken, Value: msToken, Category: "Model/Remote/ModelScope", Name: "ms-token", EnvVars: []string{ "HF_TOKEN", // Compatible with HuggingFace "MS_TOKEN", }, Usage: "Git access token of ModelScope, optional, " + "works with \"--ms-repo/--ms-file\" pair or \"--ms-draft-repo/--ms-draft-file\" pair. " + "See https://modelscope.cn/my/myaccesstoken.", }, &cli.StringFlag{ Destination: &olBaseURL, Value: olBaseURL, Category: "Model/Remote/Ollama", Name: "ol-base-url", Usage: "Model base URL of Ollama, e.g. " + "https://registry.ollama.ai.", }, &cli.StringFlag{ Destination: &olModel, Value: olModel, Category: "Model/Remote/Ollama", Name: "ol-model", Usage: "Model name of Ollama, e.g. 
" + "\"gemma2\".", }, &cli.BoolFlag{ Destination: &olUsage, Value: olUsage, Category: "Model/Remote/Ollama", Name: "ol-usage", Usage: "Specify respecting the extending layers introduced by Ollama, " + "works with \"--ol-model\", which affects the usage estimation.", }, &cli.BoolFlag{ Destination: &skipProxy, Value: skipProxy, Category: "Load", Name: "skip-proxy", EnvVars: []string{ "SKIP_PROXY", }, Usage: "Skip proxy settings, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is respecting the environment variables \"HTTP_PROXY/HTTPS_PROXY/NO_PROXY\".", }, &cli.BoolFlag{ Destination: &skipTLSVerify, Value: skipTLSVerify, Category: "Load", Name: "skip-tls-verify", EnvVars: []string{ "SKIP_TLS_VERIFY", }, Usage: "Skip TLS verification, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is verifying the TLS certificate on HTTPs request.", }, &cli.BoolFlag{ Destination: &skipDNSCache, Value: skipDNSCache, Category: "Load", Name: "skip-dns-cache", EnvVars: []string{ "SKIP_DNS_CACHE", }, Usage: "Skip DNS cache, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is caching the DNS lookup result.", }, &cli.BoolFlag{ Destination: &skipRangDownloadDetect, Value: skipRangDownloadDetect, Category: "Load", Name: "skip-range-download-detect", EnvVars: []string{ "SKIP_RANGE_DOWNLOAD_DETECT", }, Usage: "Skip range download detect, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is detecting the range download support.", }, &cli.DurationFlag{ Destination: &cacheExpiration, Value: cacheExpiration, Category: "Load", Name: "cache-expiration", Usage: "Specify the expiration of cache, " + "works with \"--url/--hf-*/--ms-*/--ol-*\".", }, &cli.StringFlag{ Destination: &cachePath, Value: cachePath, Category: "Load", Name: "cache-path", Usage: "Cache the read result to the path, " + "works with \"--url/--hf-*/--ms-*/--ol-*\".", }, &cli.BoolFlag{ Destination: &skipCache, Value: skipCache, Category: "Load", Name: "skip-cache", EnvVars: []string{ "SKIP_CACHE", }, Usage: "Skip cache, " + "works with \"--url/--hf-*/--ms-*/--ol-*\", " + "default is caching the read result.", }, &cli.IntFlag{ Destination: ¶llelSize, Value: parallelSize, Category: "Estimate", Name: "parallel-size", Aliases: []string{ // LLaMACpp compatibility "parallel", "np", "threads-http", // LLaMABox v0.0.140+ compatibility }, Usage: "Specify the number of parallel sequences to decode, " + "which is used to estimate the usage.", }, &cli.BoolFlag{ Destination: &flashAttention, Value: flashAttention, Category: "Estimate", Name: "flash-attention", Aliases: []string{ "flash-attn", "fa", "diffusion-fa", // StableDiffusionCpp compatibility }, Usage: "Specify enabling Flash Attention, " + "which is used to estimate the usage. " + "Flash Attention can reduce the usage of RAM/VRAM.", }, &cli.BoolFlag{ // LLaMABox compatibility Category: "Estimate", Name: "no-flash-attention", Aliases: []string{ "no-flash-attn", }, Usage: "Specify disabling Flash Attention.", Action: func(context *cli.Context, b bool) error { flashAttention = !b return nil }, }, &cli.UintFlag{ Destination: &mainGPU, Value: mainGPU, Category: "Estimate", Name: "main-gpu", Aliases: []string{ // LLaMACpp compatibility "mg", }, Usage: "Specify the GPU to use for the model (with \"--split-mode=none\") " + "or for intermediate results and KV (with \"--split-mode=row\"), " + "which is used to estimate the usage. 
" + "Since gguf-parser cannot recognize the host GPU devices or RPC servers, " + "\"--main-gpu\" only works when \"--tensor-split\" is set.", }, &cli.StringFlag{ Destination: &rpcServers, Value: rpcServers, Category: "Estimate", Name: "rpc", Usage: "Specify the RPC servers, " + "which is used to estimate the usage, " + "it is a comma-separated list of host:port. " + "Woks with \"--tensor-split\".", }, &cli.StringFlag{ Destination: &tensorSplit, Value: tensorSplit, Category: "Estimate", Name: "tensor-split", Aliases: []string{ // LLaMACpp compatibility "ts", }, Usage: "Specify the fraction of the model to offload to each device, " + "which is used to estimate the usage, " + "it is a comma-separated list of integer. " + "Since gguf-parser cannot recognize the host GPU devices or RPC servers, " + "must explicitly set \"--tensor-split\" to indicate how many devices are used. " + "To declare the devices belong to RPC servers, set \"--rpc\" please.", }, &cli.IntFlag{ Destination: &offloadLayers, Value: offloadLayers, Category: "Estimate", Name: "gpu-layers", Aliases: []string{ // LLaMACpp compatibility "ngl", "n-gpu-layers", }, Usage: "Specify how many layers of the main model to offload, " + "which is used to estimate the usage, " + "default is full offloaded.", }, &cli.StringSliceFlag{ Destination: &overrideTensors, Category: "Estimate", Name: "override-tensor", Aliases: []string{ // LLaMACpp compatibility "ot", }, Usage: "Override tensor buffer type, " + "for example, use --override-tensor \"[2-9][0-9]\\.ffn_.*_exps\\.=CPU\" to keep experts of layers 20-99 in the CPU", }, &cli.StringSliceFlag{ Destination: &deviceMetrics, Category: "Estimate", Name: "device-metric", Usage: "Specify the device metrics, " + "which is used to estimate the throughput, in form of \"FLOPS;Up Bandwidth[;Down Bandwidth]\". " + "The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. " + "The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. " + "Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, " + "and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. " + "For example, \"--device-metric 10TFLOPS;400GBps\" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, " + "\"--device-metric 10TFLOPS;400GBps;5000MBps\" means the device has 5000MBps Down bandwidth. " + "If the quantity specified by \"--device-metric\" is less than the number of estimation devices(" + "determined by \"--tensor-split\" and \"--rpc\" to infer the device count), " + "then replicate the last \"--device-metric\" to meet the required number of evaluation devices.", }, &cli.StringFlag{ Destination: &platformFootprint, Value: platformFootprint, Category: "Estimate", Name: "platform-footprint", Usage: "Specify the platform footprint(RAM,VRAM) of running host in MiB, " + "which is used to estimate the NonUMA usage, " + "default is \"150,250\". 
" + "Different platform always gets different RAM and VRAM footprints, " + "for example, within CUDA, \"cudaMemGetInfo\" or \"cudaSetDevice\" would occupy some RAM and VRAM, " + "see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.", }, &cli.IntFlag{ Destination: &lmcCtxSize, Value: lmcCtxSize, Category: "Estimate/LLaMACpp", Name: "ctx-size", Aliases: []string{ // LLaMACpp compatibility "c", }, Usage: "Specify the size of prompt context, " + "which is used to estimate the usage, " + "default is equal to the model's maximum context size.", }, &cli.StringFlag{ Destination: &lmcRoPEScalingType, Category: "Estimate/LLaMACpp", Name: "rope-scaling", Usage: "RoPE frequency scaling method, " + "defaults to linear unless specified by the model, select from [none, linear, yarn].", }, &cli.Float64Flag{ Category: "Estimate/LLaMACpp", Name: "rope-scale", Usage: "RoPE context scaling factor, " + "expands context by a factor of N.", Action: func(context *cli.Context, f float64) error { if f != 0 { lmcRoPEFreqScale = 1 / f } return nil }, }, &cli.Float64Flag{ Destination: &lmcRoPEFreqBase, Category: "Estimate/LLaMACpp", Name: "rope-freq-base", Usage: "RoPE base frequency, " + "used by NTK-aware scaling.", }, &cli.Float64Flag{ Destination: &lmcRoPEFreqScale, Category: "Estimate/LLaMACpp", Name: "rope-freq-scale", Usage: "RoPE frequency scaling factor, " + "expands context by a factor of 1/N.", }, &cli.IntFlag{ Destination: &lmcRoPEScalingOrigCtxSize, Category: "Estimate/LLaMACpp", Name: "yarn-orig-ctx", Usage: "YaRN original context size of model, " + "defaults to model training context size.", }, &cli.BoolFlag{ Destination: &lmcInMaxCtxSize, Value: lmcInMaxCtxSize, Category: "Estimate/LLaMACpp", Name: "in-max-ctx-size", Usage: "Limit the context size to the maximum context size of the model, " + "if the context size is larger than the maximum context size.", }, &cli.IntFlag{ Destination: &lmcLogicalBatchSize, Value: lmcLogicalBatchSize, Category: "Estimate/LLaMACpp", Name: "batch-size", Aliases: []string{ // LLaMACpp compatibility "b", }, Usage: "Specify the logical batch size, " + "which is used to estimate the usage.", }, &cli.IntFlag{ Destination: &lmcPhysicalBatchSize, Value: lmcPhysicalBatchSize, Category: "Estimate/LLaMACpp", Name: "ubatch-size", Aliases: []string{ // LLaMACpp compatibility "ub", }, Usage: "Specify the physical maximum batch size, " + "which is used to estimate the usage.", }, &cli.StringFlag{ Destination: &lmcCacheKeyType, Value: lmcCacheKeyType, Category: "Estimate/LLaMACpp", Name: "cache-type-k", Aliases: []string{ // LLaMACpp compatibility "ctk", }, Usage: "Specify the type of Key cache, " + "which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].", }, &cli.StringFlag{ Destination: &lmcCacheValueType, Value: lmcCacheValueType, Category: "Estimate/LLaMACpp", Name: "cache-type-v", Aliases: []string{ // LLaMACpp compatibility "ctv", }, Usage: "Specify the type of Value cache, " + "which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].", }, &cli.BoolFlag{ Destination: &lmcNoKVOffload, Value: lmcNoKVOffload, Category: "Estimate/LLaMACpp", Name: "no-kv-offload", Aliases: []string{ // LLaMACpp compatibility "nkvo", }, Usage: "Specify disabling Key-Value offloading, " + "which is used to estimate the usage. 
" + "Disable Key-Value offloading can reduce the usage of VRAM.", }, &cli.StringFlag{ Destination: &lmcSplitMode, Value: lmcSplitMode, Category: "Estimate/LLaMACpp", Name: "split-mode", Aliases: []string{ // LLaMACpp compatibility "sm", }, Usage: "Specify how to split the model across multiple devices, " + "which is used to estimate the usage, select from [layer, row, none]. " + "Since gguf-parser always estimates the usage of VRAM, " + "\"none\" is meaningless here, keep for compatibility.", }, &cli.BoolFlag{ Destination: &lmcSWAFull, Value: lmcSWAFull, Category: "Estimate/LLaMACpp", Name: "swa-full", Usage: "Specify using full-size SWA cache.", }, &cli.BoolFlag{ Destination: &lmcNoMMap, Value: lmcNoMMap, Category: "Estimate/LLaMACpp", Name: "no-mmap", Usage: "Specify disabling Memory-Mapped using, " + "which is used to estimate the usage. " + "Memory-Mapped can avoid loading the entire model weights into RAM.", }, &cli.BoolFlag{ // LLaMABox compatibility Category: "Estimate/LLaMACpp", Name: "mmap", Usage: "Specify enabling Memory-Mapped using, " + "which is used to estimate the usage. " + "Memory-Mapped can avoid loading the entire model weights into RAM.", Action: func(context *cli.Context, b bool) error { lmcNoMMap = !b return nil }, }, &cli.UintFlag{ // LLaMABox compatibility Destination: &lmcVisualMaxImageSize, Value: lmcVisualMaxImageSize, Category: "Estimate/LLaMACpp", Name: "visual-max-image-size", Usage: "Specify maximum image size when completion with vision model.", }, &cli.UintFlag{ // LLaMABox compatibility Destination: &lmcMaxProjectedCache, Value: lmcMaxProjectedCache, Category: "Estimate/LLaMACpp", Name: "max-projected-cache", Aliases: []string{ "visual-max-image-cache", // Deprecated argument name }, Usage: "Specify how many projected embedding to be cached.", }, &cli.IntFlag{ Destination: &lmcOffloadLayersDraft, Value: lmcOffloadLayersDraft, Category: "Estimate/LLaMACpp", Name: "gpu-layers-draft", Aliases: []string{ // LLaMACpp compatibility "ngld", "n-gpu-layers-draft", }, Usage: "Specify how many layers of the draft model to offload, " + "which is used to estimate the usage, " + "default is full offloaded.", }, &cli.Uint64Flag{ Destination: &lmcOffloadLayersStep, Value: lmcOffloadLayersStep, Category: "Estimate/LLaMACpp", Name: "gpu-layers-step", Usage: "Specify the step of layers to offload, " + "works with \"--gpu-layers\".", }, &cli.UintFlag{ Destination: &sdcBatchCount, Value: sdcBatchCount, Category: "Estimate/StableDiffusionCpp", Name: "image-batch-count", Aliases: []string{ "batch-count", // StableDiffusionCpp compatibility "image-max-batch", // LLaMABox compatibility }, Usage: "Specify the batch(generation) count of the image.", }, &cli.UintFlag{ Destination: &sdcHeight, Value: sdcHeight, Category: "Estimate/StableDiffusionCpp", Name: "image-height", Aliases: []string{ "height", // StableDiffusionCpp compatibility "image-max-height", // LLaMABox compatibility }, Usage: "Specify the (maximum) height of the image.", }, &cli.UintFlag{ Destination: &sdcWidth, Value: sdcWidth, Category: "Estimate/StableDiffusionCpp", Name: "image-width", Aliases: []string{ "width", // StableDiffusionCpp compatibility "image-max-width", // LLaMABox compatibility }, Usage: "Specify the (maximum) width of the image.", }, &cli.BoolFlag{ Destination: &sdcNoConditionerOffload, Value: sdcNoConditionerOffload, Category: "Estimate/StableDiffusionCpp", Name: "image-no-conditioner-offload", Aliases: []string{ "clip-on-cpu", // StableDiffusionCpp compatibility 
"image-no-text-encoder-model-offload", // LLaMABox compatibility }, Usage: "Specify to offload the text encoder model to CPU.", }, &cli.BoolFlag{ Destination: &sdcNoAutoencoderOffload, Value: sdcNoAutoencoderOffload, Category: "Estimate/StableDiffusionCpp", Name: "image-no-autoencoder-offload", Aliases: []string{ "vae-on-cpu", // StableDiffusionCpp compatibility "image-no-vae-model-offload", // LLaMABox compatibility }, Usage: "Specify to offload the vae model to CPU.", }, &cli.BoolFlag{ Destination: &sdcNoControlNetOffload, Value: sdcNoControlNetOffload, Category: "Estimate/StableDiffusionCpp", Name: "image-no-control-net-offload", Aliases: []string{ "control-net-cpu", // StableDiffusionCpp compatibility "image-no-control-net-model-offload", // LLaMABox compatibility }, Usage: "Specify to offload the control net model to CPU.", }, &cli.BoolFlag{ Destination: &sdcAutoencoderTiling, Value: sdcAutoencoderTiling, Category: "Estimate/StableDiffusionCpp", Name: "image-autoencoder-tiling", Aliases: []string{ "vae-tiling", // StableDiffusionCpp compatibility "image-vae-tiling", // LLaMABox compatibility }, Usage: "Specify to enable tiling for the vae model.", }, &cli.BoolFlag{ Destination: &sdcNoAutoencoderTiling, Value: sdcNoAutoencoderTiling, Category: "Estimate/StableDiffusionCpp", Name: "image-no-autoencoder-tiling", Aliases: []string{ "image-no-vae-tiling", // LLaMABox compatibility }, Usage: "Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling.", }, &cli.BoolFlag{ Destination: &sdcFreeComputeMemoryImmediately, Value: sdcFreeComputeMemoryImmediately, Category: "Estimate/StableDiffusionCpp", Name: "image-free-compute-memory-immediately", // LLaMABox compatibility Usage: "Specify to free the compute memory immediately after the generation, which burst using VRAM.", }, &cli.BoolFlag{ Destination: &raw, Value: raw, Category: "Output", Name: "raw", Usage: "Output the GGUF file information as JSON only, skip anything.", }, &cli.StringFlag{ Destination: &rawOutput, Value: rawOutput, Category: "Output", Name: "raw-output", Usage: "Works with \"--raw\", to save the result to the file", }, &cli.BoolFlag{ Destination: &skipMetadata, Value: skipMetadata, Category: "Output", Name: "skip-metadata", Usage: "Skip to display metadata.", }, &cli.BoolFlag{ Destination: &skipArchitecture, Value: skipArchitecture, Category: "Output", Name: "skip-architecture", Usage: "Skip to display architecture.", }, &cli.BoolFlag{ Destination: &skipTokenizer, Value: skipTokenizer, Category: "Output", Name: "skip-tokenizer", Usage: "Skip to display tokenizer. " + "By default, gguf-parser always displays the tokenizer of the file which types with \"model\".", }, &cli.BoolFlag{ Destination: &skipEstimate, Value: skipEstimate, Category: "Output", Name: "skip-estimate", Usage: "Skip to estimate. 
" + "By default, gguf-parser always estimates the file which types with \"model\".", }, &cli.BoolFlag{ Category: "Output", Name: "estimate", Usage: "Skip all the information except the estimate result.", Action: func(_ *cli.Context, estimateOnly bool) error { if estimateOnly { skipMetadata = true skipArchitecture = true skipTokenizer = true } return nil }, }, &cli.BoolFlag{ Destination: &inShort, Value: inShort, Category: "Output", Name: "in-short", Usage: "Display the estimated result in table in short form.", }, &cli.BoolFlag{ Destination: &inMib, Value: inMib, Category: "Output", Name: "in-mib", Usage: "Display the estimated result in table with MiB.", }, &cli.BoolFlag{ Destination: &inJson, Value: inJson, Category: "Output", Name: "json", Usage: "Output as JSON.", }, &cli.BoolFlag{ Destination: &inPrettyJson, Value: inPrettyJson, Category: "Output", Name: "json-pretty", Usage: "Works with \"--json\", to output pretty format JSON.", }, }, Action: mainAction, } if err := app.RunContext(signalx.Handler(), os.Args); err != nil { _, _ = fmt.Fprintf(os.Stderr, "%v\n", err) os.Exit(1) } } var ( // model options path string draftPath string // for estimate mmprojPath string // for estimate loraPaths cli.StringSlice // for estimate controlVectorPaths cli.StringSlice // for estimate upscalePath string // for estimate controlNetPath string // for estimate url string draftUrl string // for estimate mmprojUrl string // for estimate loraUrls cli.StringSlice // for estimate controlVectorUrls cli.StringSlice // for estimate upscaleUrl string // for estimate controlNetUrl string // for estimate token string headers cli.StringSlice hfRepo string hfFile string hfDraftRepo string // for estimate hfDraftFile string // for estimate hfMMProjFile string // for estimate hfLoRAFiles cli.StringSlice // for estimate hfControlVectorFiles cli.StringSlice // for estimate hfUpscaleRepo string // for estimate hfUpscaleFile string // for estimate hfControlNetRepo string // for estimate hfControlNetFile string // for estimate hfToken string msRepo string msFile string msDraftRepo string // for estimate msDraftFile string // for estimate msMMProjFile string // for estimate msLoRAFiles cli.StringSlice // for estimate msControlVectorFiles cli.StringSlice // for estimate msUpscaleRepo string // for estimate msUpscaleFile string // for estimate msControlNetRepo string // for estimate msControlNetFile string // for estimate msToken string olBaseURL = "https://registry.ollama.ai" olModel string olUsage bool // load options debug bool skipProxy bool skipTLSVerify bool skipDNSCache bool skipRangDownloadDetect bool cacheExpiration = 24 * time.Hour cachePath = DefaultCachePath() skipCache bool // estimate options parallelSize = 1 flashAttention bool mainGPU uint rpcServers string tensorSplit string offloadLayers = -1 overrideTensors cli.StringSlice deviceMetrics cli.StringSlice platformFootprint = "150,250" // estimate options for llama.cpp lmcCtxSize = 0 lmcRoPEFreqBase float64 lmcRoPEFreqScale float64 lmcRoPEScalingType string lmcRoPEScalingOrigCtxSize int lmcInMaxCtxSize bool lmcLogicalBatchSize = 2048 lmcPhysicalBatchSize = 512 lmcCacheKeyType = "f16" lmcCacheValueType = "f16" lmcNoKVOffload bool lmcSplitMode = "layer" lmcSWAFull = false lmcNoMMap bool lmcVisualMaxImageSize uint lmcMaxProjectedCache uint lmcOffloadLayersDraft = -1 lmcOffloadLayersStep uint64 // estimate options for stable-diffusion.cpp sdcBatchCount uint = 1 sdcHeight uint = 1024 sdcWidth uint = 1024 sdcNoConditionerOffload bool sdcNoAutoencoderOffload bool 
sdcNoControlNetOffload bool sdcAutoencoderTiling bool sdcNoAutoencoderTiling bool sdcFreeComputeMemoryImmediately bool // output options raw bool rawOutput string inShort bool skipMetadata bool skipArchitecture bool skipTokenizer bool skipEstimate bool inMib bool inJson bool inPrettyJson = true ) func mainAction(c *cli.Context) error { ctx := c.Context // Prepare options. ropts := []GGUFReadOption{ SkipLargeMetadata(), UseMMap(), UseCache(), } if hs := headers.Value(); len(hs) > 0 { hm := make(map[string]string, len(hs)) for _, h := range hs { parts := strings.SplitN(h, ":", 2) if len(parts) == 2 { hm[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1]) } } if len(hm) > 0 { ropts = append(ropts, UseHeaders(hm)) } } if token != "" { ropts = append(ropts, UseBearerAuth(token)) } if debug { ropts = append(ropts, UseDebug()) } if skipProxy { ropts = append(ropts, SkipProxy()) } if skipTLSVerify { ropts = append(ropts, SkipTLSVerification()) } if skipDNSCache { ropts = append(ropts, SkipDNSCache()) } if skipRangDownloadDetect { ropts = append(ropts, SkipRangeDownloadDetection()) } if cacheExpiration >= 0 { ropts = append(ropts, UseCacheExpiration(cacheExpiration)) } if cachePath != "" { ropts = append(ropts, UseCachePath(cachePath)) } if skipCache { ropts = append(ropts, SkipCache()) } eopts := []GGUFRunEstimateOption{ WithLLaMACppCacheValueType(GGMLTypeF16), WithLLaMACppCacheKeyType(GGMLTypeF16), } if parallelSize > 0 { eopts = append(eopts, WithParallelSize(int32(parallelSize))) } if flashAttention { eopts = append(eopts, WithFlashAttention()) } if tensorSplit != "" { tss := strings.Split(tensorSplit, ",") if len(tss) > 128 { return errors.New("--tensor-split exceeds the number of devices") } var vs float64 vv := make([]float64, len(tss)) vf := make([]float64, len(tss)) for i, s := range tss { s = strings.TrimSpace(s) v, err := strconv.ParseFloat(s, 64) if err != nil { return errors.New("--tensor-split has invalid integer") } vs += v vv[i] = vs } for i, v := range vv { vf[i] = v / vs } eopts = append(eopts, WithTensorSplitFraction(vf)) if mainGPU < uint(len(vv)) { eopts = append(eopts, WithMainGPUIndex(int(mainGPU))) } else { return errors.New("--main-gpu must be less than item size of --tensor-split") } if rpcServers != "" { rss := strings.Split(rpcServers, ",") if len(rss) > len(tss) { return errors.New("--rpc has more items than --tensor-split") } rpc := make([]string, len(rss)) for i, s := range rss { s = strings.TrimSpace(s) if _, _, err := net.SplitHostPort(s); err != nil { return errors.New("--rpc has invalid host:port") } rpc[i] = s } eopts = append(eopts, WithRPCServers(rpc)) } } if otss := overrideTensors.Value(); len(otss) > 0 { var ots []GGUFRunOverriddenTensor for i := range otss { pots := strings.Split(otss[i], ",") for j := range pots { ss := strings.SplitN(strings.TrimSpace(pots[j]), "=", 2) if len(ss) != 2 { return errors.New("--override-tensor has invalid format") } pr, err := regexp.Compile(strings.TrimSpace(ss[0])) if err != nil { return fmt.Errorf("--override-tensor has invalid pattern: %w", err) } bt := strings.TrimSpace(ss[1]) if bt == "" { return errors.New("--override-tensor has empty buffer type") } ots = append(ots, GGUFRunOverriddenTensor{ PatternRegex: pr, BufferType: bt, }) } } eopts = append(eopts, WithOverriddenTensors(ots)) } if dmss := deviceMetrics.Value(); len(dmss) > 0 { dms := make([]GGUFRunDeviceMetric, len(dmss)) for i := range dmss { ss := strings.Split(dmss[i], ";") if len(ss) < 2 { return errors.New("--device-metric has invalid format") } var 
err error dms[i].FLOPS, err = ParseFLOPSScalar(strings.TrimSpace(ss[0])) if err != nil { return fmt.Errorf("--device-metric has invalid FLOPS: %w", err) } dms[i].UpBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[1])) if err != nil { return fmt.Errorf("--device-metric has invalid Up Bandwidth: %w", err) } if len(ss) > 2 { dms[i].DownBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[2])) if err != nil { return fmt.Errorf("--device-metric has invalid Down Bandwidth: %w", err) } } else { dms[i].DownBandwidth = dms[i].UpBandwidth } } eopts = append(eopts, WithDeviceMetrics(dms)) } if lmcCtxSize > 0 { eopts = append(eopts, WithLLaMACppContextSize(int32(lmcCtxSize))) } if lmcRoPEFreqBase > 0 || lmcRoPEFreqScale > 0 || lmcRoPEScalingType != "" || lmcRoPEScalingOrigCtxSize > 0 { eopts = append(eopts, WithLLaMACppRoPE(lmcRoPEFreqBase, lmcRoPEFreqScale, lmcRoPEScalingType, int32(lmcRoPEScalingOrigCtxSize))) } if lmcInMaxCtxSize { eopts = append(eopts, WithinLLaMACppMaxContextSize()) } if lmcLogicalBatchSize > 0 { eopts = append(eopts, WithLLaMACppLogicalBatchSize(int32(max(32, lmcLogicalBatchSize)))) } if lmcPhysicalBatchSize > 0 { if lmcPhysicalBatchSize > lmcLogicalBatchSize { return errors.New("--ubatch-size must be less than or equal to --batch-size") } eopts = append(eopts, WithLLaMACppPhysicalBatchSize(int32(lmcPhysicalBatchSize))) } if lmcCacheKeyType != "" { eopts = append(eopts, WithLLaMACppCacheKeyType(toGGMLType(lmcCacheKeyType))) } if lmcCacheValueType != "" { eopts = append(eopts, WithLLaMACppCacheValueType(toGGMLType(lmcCacheValueType))) } if lmcNoKVOffload { eopts = append(eopts, WithoutLLaMACppOffloadKVCache()) } switch lmcSplitMode { case "row": eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeRow)) case "none": eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeNone)) default: eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeLayer)) } if lmcSWAFull { eopts = append(eopts, WithLLaMACppFullSizeSWACache()) } if lmcVisualMaxImageSize > 0 { eopts = append(eopts, WithLLaMACppVisualMaxImageSize(uint32(lmcVisualMaxImageSize))) } if lmcMaxProjectedCache > 0 { eopts = append(eopts, WithLLaMACppMaxProjectedCache(uint32(lmcMaxProjectedCache))) } if sdcBatchCount > 1 { eopts = append(eopts, WithStableDiffusionCppBatchCount(int32(sdcBatchCount))) } if sdcHeight > 0 { eopts = append(eopts, WithStableDiffusionCppHeight(uint32(sdcHeight))) } if sdcWidth > 0 { eopts = append(eopts, WithStableDiffusionCppWidth(uint32(sdcWidth))) } if sdcNoConditionerOffload { eopts = append(eopts, WithoutStableDiffusionCppOffloadConditioner()) } if sdcNoAutoencoderOffload { eopts = append(eopts, WithoutStableDiffusionCppOffloadAutoencoder()) } if sdcAutoencoderTiling && !sdcNoAutoencoderTiling { eopts = append(eopts, WithStableDiffusionCppAutoencoderTiling()) } if sdcFreeComputeMemoryImmediately { eopts = append(eopts, WithStableDiffusionCppFreeComputeMemoryImmediately()) } if offloadLayers >= 0 { eopts = append(eopts, WithLLaMACppOffloadLayers(uint64(offloadLayers)), WithStableDiffusionCppOffloadLayers(uint64(offloadLayers))) } // Parse GGUF file. var ( // Common. gf *GGUFFile adapterGfs []*GGUFFile // LLaMACpp specific. lmcProjectGf *GGUFFile lmcDrafterGf *GGUFFile // StableDiffusionCpp specific. sdcControlNetGf *GGUFFile sdcUpscaleGf *GGUFFile ) { var err error ropts := ropts[:len(ropts):len(ropts)] // Main model. switch { default: return errors.New("no model specified") case path != "": gf, err = ParseGGUFFile(path, ropts...) 
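// NOTE: the remote sources below are read with HTTP range requests where the
// server supports them (see "--skip-range-download-detect"), so only the GGUF
// header and metadata are fetched, never the full weights.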
case url != "": gf, err = ParseGGUFFileRemote(ctx, url, ropts...) case hfRepo != "" && hfFile != "": if hfToken != "" { ropts = append(ropts, UseBearerAuth(hfToken)) } gf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...) case msRepo != "" && msFile != "": if msToken != "" { ropts = append(ropts, UseBearerAuth(msToken)) } gf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msFile, ropts...) case olModel != "": om := ParseOllamaModel(olModel, SetOllamaModelBaseURL(olBaseURL)) gf, err = ParseGGUFFileFromOllamaModel(ctx, om, ropts...) if err == nil && om != nil && olUsage { // Parameters override. { ps, _ := om.Params(ctx, nil) if v, ok := ps["num_ctx"]; ok { eopts = append(eopts, WithLLaMACppContextSize(anyx.Number[int32](v))) } else if lmcCtxSize <= 0 { eopts = append(eopts, WithLLaMACppContextSize(2048)) } if v, ok := ps["use_mmap"]; ok && !anyx.Bool(v) { lmcNoMMap = true } if v, ok := ps["num_gpu"]; ok { offloadLayers = anyx.Number[int](v) } } // Multimodal projector overlap. { mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.projector$`)) if len(mls) > 0 { lmcProjectGf, err = ParseGGUFFileRemote(ctx, mls[len(mls)-1].BlobURL().String(), ropts...) if err != nil { return fmt.Errorf("failed to parse GGUF file: %w", err) } } } // Adapter overlap. { als := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.adapter$`)) if len(als) > 0 { var adpgf *GGUFFile for i := range als { adpgf, err = ParseGGUFFileRemote(ctx, als[i].BlobURL().String(), ropts...) if err != nil { return fmt.Errorf("failed to parse GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } } } } if err != nil { return fmt.Errorf("failed to parse GGUF file: %w", err) } // Adapter. { // LoRA. for _, loraPath := range loraPaths.Value() { adpgf, err := ParseGGUFFile(loraPath, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } for _, loraUrl := range loraUrls.Value() { adpgf, err := ParseGGUFFileRemote(ctx, loraUrl, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } if hfRepo != "" { for _, hfLoRAFile := range hfLoRAFiles.Value() { adpgf, err := ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfLoRAFile, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } if msRepo != "" { for _, msLoRAFile := range msLoRAFiles.Value() { adpgf, err := ParseGGUFFileFromModelScope(ctx, msRepo, msLoRAFile, ropts...) if err != nil { return fmt.Errorf("failed to parse LoRA adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } // Control Vector. for _, cvPath := range controlVectorPaths.Value() { adpgf, err := ParseGGUFFile(cvPath, ropts...) if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } for _, cvUrl := range controlVectorUrls.Value() { adpgf, err := ParseGGUFFileRemote(ctx, cvUrl, ropts...) if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } if hfRepo != "" { for _, hfCvFile := range hfControlVectorFiles.Value() { adpgf, err := ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfCvFile, ropts...) 
if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } if msRepo != "" { for _, msCvFile := range msControlVectorFiles.Value() { adpgf, err := ParseGGUFFileFromModelScope(ctx, msRepo, msCvFile, ropts...) if err != nil { return fmt.Errorf("failed to parse Control Vector adapter GGUF file: %w", err) } adapterGfs = append(adapterGfs, adpgf) } } } // Drafter for LLaMACpp. switch { case draftPath != "": lmcDrafterGf, err = ParseGGUFFile(draftPath, ropts...) case draftUrl != "": lmcDrafterGf, err = ParseGGUFFileRemote(ctx, draftUrl, ropts...) case hfDraftRepo != "" && hfDraftFile != "": lmcDrafterGf, err = ParseGGUFFileFromHuggingFace(ctx, hfDraftRepo, hfDraftFile, ropts...) case msDraftRepo != "" && msDraftFile != "": lmcDrafterGf, err = ParseGGUFFileFromModelScope(ctx, msDraftRepo, msDraftFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse draft GGUF file: %w", err) } // Projector for LLaMACpp. switch { case mmprojPath != "": lmcProjectGf, err = ParseGGUFFile(mmprojPath, ropts...) case mmprojUrl != "": lmcProjectGf, err = ParseGGUFFileRemote(ctx, mmprojUrl, ropts...) case hfRepo != "" && hfMMProjFile != "": lmcProjectGf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfMMProjFile, ropts...) case msRepo != "" && msMMProjFile != "": lmcProjectGf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msMMProjFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse multimodal projector GGUF file: %w", err) } // ControlNet for StableDiffusionCpp. switch { case controlNetPath != "": sdcControlNetGf, err = ParseGGUFFile(controlNetPath, ropts...) case controlNetUrl != "": sdcControlNetGf, err = ParseGGUFFileRemote(ctx, controlNetUrl, ropts...) case hfControlNetRepo != "" && hfControlNetFile != "": sdcControlNetGf, err = ParseGGUFFileFromHuggingFace(ctx, hfControlNetRepo, hfControlNetFile, ropts...) case msControlNetRepo != "" && msControlNetFile != "": sdcControlNetGf, err = ParseGGUFFileFromModelScope(ctx, msControlNetRepo, msControlNetFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse control net GGUF file: %w", err) } // Upscaler for StableDiffusionCpp. switch { case upscalePath != "": sdcUpscaleGf, err = ParseGGUFFile(upscalePath, ropts...) case upscaleUrl != "": sdcUpscaleGf, err = ParseGGUFFileRemote(ctx, upscaleUrl, ropts...) case hfUpscaleRepo != "" && hfUpscaleFile != "": sdcUpscaleGf, err = ParseGGUFFileFromHuggingFace(ctx, hfUpscaleRepo, hfUpscaleFile, ropts...) case msUpscaleRepo != "" && msUpscaleFile != "": sdcUpscaleGf, err = ParseGGUFFileFromModelScope(ctx, msUpscaleRepo, msUpscaleFile, ropts...) } if err != nil { return fmt.Errorf("failed to parse upscaler GGUF file: %w", err) } } // Output raw. if raw { w := os.Stdout if rawOutput != "" { f, err := osx.CreateFile(rawOutput, 0o666) if err != nil { return fmt.Errorf("failed to create file: %w", err) } defer osx.Close(f) w = f } if err := json.NewEncoder(w).Encode(gf); err != nil { return fmt.Errorf("failed to encode JSON: %w", err) } return nil } // Otherwise, display the metadata and estimate the usage. 
var ( m = gf.Metadata() a = gf.Architecture() t = gf.Tokenizer() lme LLaMACppRunEstimate sde StableDiffusionCppRunEstimate ) skipArchitecture = skipArchitecture || m.Type == "imatrix" skipTokenizer = skipTokenizer || t.Model == "" skipEstimate = skipEstimate || m.Type != "model" if !skipEstimate && m.Architecture != "diffusion" { if lmcDrafterGf != nil { dlmceopts := eopts[:len(eopts):len(eopts)] if lmcOffloadLayersDraft >= 0 { dlmceopts = append(dlmceopts, WithLLaMACppOffloadLayers(uint64(lmcOffloadLayersDraft))) } dlmceopts = append(dlmceopts, WithLLaMACppCacheKeyType(GGMLTypeF16), WithLLaMACppCacheValueType(GGMLTypeF16)) de := lmcDrafterGf.EstimateLLaMACppRun(dlmceopts...) eopts = append(eopts, WithLLaMACppDrafter(&de)) } if lmcProjectGf != nil { plmceopts := eopts[:len(eopts):len(eopts)] me := lmcProjectGf.EstimateLLaMACppRun(plmceopts...) eopts = append(eopts, WithLLaMACppProjector(&me)) } if len(adapterGfs) > 0 { adps := make([]LLaMACppRunEstimate, len(adapterGfs)) almceopts := eopts[:len(eopts):len(eopts)] for i, adpgf := range adapterGfs { ae := adpgf.EstimateLLaMACppRun(almceopts...) adps[i] = ae } eopts = append(eopts, WithLLaMACppAdapters(adps)) } lme = gf.EstimateLLaMACppRun(eopts...) } if !skipEstimate && m.Architecture == "diffusion" { if sdcUpscaleGf != nil { sdceopts := eopts[:len(eopts):len(eopts)] ue := sdcUpscaleGf.EstimateStableDiffusionCppRun(sdceopts...) eopts = append(eopts, WithStableDiffusionCppUpscaler(&ue)) } if sdcControlNetGf != nil { sdceopts := eopts[:len(eopts):len(eopts)] if sdcNoControlNetOffload { sdceopts = append(sdceopts, WithStableDiffusionCppOffloadLayers(0)) } ce := sdcControlNetGf.EstimateStableDiffusionCppRun(sdceopts...) eopts = append(eopts, WithStableDiffusionCppControlNet(&ce)) } sde = gf.EstimateStableDiffusionCppRun(eopts...) } // Then, output as JSON or table. 
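// NOTE: "--platform-footprint" is a "RAM,VRAM" pair of MiB values; each part
// is converted to bytes below, and a part that fails to parse simply leaves
// the corresponding footprint at zero.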
var ( mmap = !lmcNoMMap platformRAM, platformVRAM uint64 ) { if platformFootprint != "" { parts := strings.Split(platformFootprint, ",") if len(parts) == 2 { if v, err := strconv.ParseUint(parts[0], 10, 64); err == nil { platformRAM = v * 1024 * 1024 } if v, err := strconv.ParseUint(parts[1], 10, 64); err == nil { platformVRAM = v * 1024 * 1024 } } } } if inJson { o := map[string]any{} if !skipMetadata { o["metadata"] = m } if !skipArchitecture { o["architecture"] = a } if !skipTokenizer { o["tokenizer"] = t } if !skipEstimate && m.Architecture != "diffusion" { lmes := lme.Summarize(mmap, platformRAM, platformVRAM) switch { case lmcOffloadLayersStep > lme.OffloadLayers: lmcOffloadLayersStep = lme.OffloadLayers case lmcOffloadLayersStep <= 0: lmcOffloadLayersStep = lme.OffloadLayers } if lmcOffloadLayersStep < lme.OffloadLayers { cnt := lme.OffloadLayers/lmcOffloadLayersStep + 1 if lme.OffloadLayers%lmcOffloadLayersStep != 0 || lme.FullOffloaded { cnt++ } esis := make([]LLaMACppRunEstimateSummaryItem, cnt) var wg sync.WaitGroup for i := 0; i < cap(esis); i++ { wg.Add(1) go func(i int) { defer wg.Done() lmeopts := eopts[:len(eopts):len(eopts)] lmeopts = append(lmeopts, WithLLaMACppOffloadLayers(uint64(i)*lmcOffloadLayersStep)) esis[i] = gf.EstimateLLaMACppRun(lmeopts...).SummarizeItem(mmap, platformRAM, platformVRAM) }(i) } wg.Wait() esis[cap(esis)-1] = lmes.Items[0] lmes.Items = esis } o["estimate"] = lmes } if !skipEstimate && m.Architecture == "diffusion" { sdes := sde.Summarize(mmap, platformRAM, platformVRAM) o["estimate"] = sdes } enc := json.NewEncoder(os.Stdout) if inPrettyJson { enc.SetIndent("", " ") } if err := enc.Encode(o); err != nil { return fmt.Errorf("failed to encode JSON: %w", err) } return nil } GGUFBytesScalarStringInMiBytes = inMib if !skipMetadata { tprint( "Metadata", [][]any{ { "Type", "Name", "Arch", "Quantization", "Little Endian", "Size", "Parameters", "BPW", }, }, [][]any{ { m.Type, sprintf(tenary(len(m.Name) == 0, "N/A", tenary(len([]rune(m.Name)) <= 20, m.Name, string([]rune(m.Name)[:20])+"..."))), m.Architecture, m.FileTypeDescriptor, sprintf(m.LittleEndian), sprintf(m.Size), sprintf(m.Parameters), sprintf(m.BitsPerWeight), }, }) } if !skipArchitecture { var ( hds [][]any bds [][]any ) switch a.Type { case "projector": hds = [][]any{ { "Projector Type", "Embedding Len", "Layers", "Feed Forward Len", "Encoder", }, } switch { case a.ClipHasVisionEncoder && a.ClipHasAudioEncoder: hds = [][]any{ { "Projector Type", "Embedding Len", "Embedding Len", "Layers", "Layers", "Feed Forward Len", "Feed Forward Len", "Encoder", }, { "Projector Type", "Vision", "Audio", "Vision", "Audio", "Vision", "Audio", "Encoder", }, } bds = [][]any{ { sprintf(a.ClipProjectorType), sprintf(a.ClipVisionEmbeddingLength), sprintf(a.ClipAudioEmbeddingLength), sprintf(a.ClipVisionBlockCount), sprintf(a.ClipAudioBlockCount), sprintf(tenary( a.ClipVisionFeedForwardLength[0] == a.ClipVisionFeedForwardLength[1], a.ClipVisionFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipVisionFeedForwardLength[0], a.ClipVisionFeedForwardLength[1]))), sprintf(tenary( a.ClipAudioFeedForwardLength[0] == a.ClipAudioFeedForwardLength[1], a.ClipAudioFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipAudioFeedForwardLength[0], a.ClipAudioFeedForwardLength[1]))), "Vision & Audio", }, } case a.ClipHasVisionEncoder: bds = [][]any{ { sprintf(a.ClipProjectorType), sprintf(a.ClipVisionEmbeddingLength), sprintf(a.ClipVisionBlockCount), sprintf(tenary( a.ClipVisionFeedForwardLength[0] == 
a.ClipVisionFeedForwardLength[1], a.ClipVisionFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipVisionFeedForwardLength[0], a.ClipVisionFeedForwardLength[1]))), "Vision", }, } default: bds = [][]any{ { sprintf(a.ClipProjectorType), sprintf(a.ClipAudioEmbeddingLength), sprintf(a.ClipAudioBlockCount), sprintf(tenary( a.ClipAudioFeedForwardLength[0] == a.ClipAudioFeedForwardLength[1], a.ClipAudioFeedForwardLength[0], sprintf("[%d, %d, ...]", a.ClipAudioFeedForwardLength[0], a.ClipAudioFeedForwardLength[1]))), "Audio", }, } } case "adapter": hds = [][]any{ { "Adapter Type", }, } bds = [][]any{ { sprintf(a.AdapterType), }, } if a.AdapterType == "lora" { hds[0] = append(hds[0], "LoRA Alpha") bds[0] = append(bds[0], sprintf(a.AdapterLoRAAlpha)) } else { hds[0] = append(hds[0], "ControlVector Layers") bds[0] = append(bds[0], sprintf(a.AdapterControlVectorLayerCount)) } default: if a.Architecture == "diffusion" { hds = [][]any{ { "Diffusion Arch", "Conditioners", "Autoencoder", }, } bds = [][]any{ { sprintf(tenary(a.DiffusionArchitecture != "", a.DiffusionArchitecture, "N/A")), sprintf(tenary(a.DiffusionHasConditioners(), a.DiffusionConditioners, "N/A")), sprintf(tenary(a.DiffusionHasAutoencoder(), a.DiffusionAutoencoder, "N/A")), }, } } else { hds = [][]any{ { "Max Context Len", "Embedding Len", "Attention Causal", "Attention Head Cnt", "Layers", tenary(a.ExpertFeedForwardLength != 0, "Expert Feed Forward Len", "Feed Forward Len"), "Expert Cnt", "Vocabulary Len", }, } bds = [][]any{ { sprintf(a.MaximumContextLength), sprintf(a.EmbeddingLength), sprintf(a.AttentionCausal), sprintf(tenary( a.AttentionHeadCountKV == 0 || a.AttentionHeadCountKV == a.AttentionHeadCount, "N/A", a.AttentionHeadCount)), sprintf(a.BlockCount), sprintf(tenaryFunc( a.ExpertFeedForwardLength != 0, func() any { return a.ExpertFeedForwardLength }, func() any { switch { case len(a.FeedForwardLength) == 0: return "N/A" case len(a.FeedForwardLength) == 1: return a.FeedForwardLength[0] case a.FeedForwardLength[0] == a.FeedForwardLength[1]: return a.FeedForwardLength[0] default: return sprintf("[%d, %d, ...]", a.FeedForwardLength[0], a.FeedForwardLength[1]) } }, )), sprintf(a.ExpertCount), sprintf(a.VocabularyLength), }, } } } tprint( "ARCHITECTURE", hds, bds) } if !skipTokenizer { tprint( "TOKENIZER", [][]any{ { "Model", "Tokens Size", "Tokens Len", "Added Tokens Len", "BOS Token", "EOS Token", "EOT Token", "EOM Token", "Unknown Token", "Separator Token", "Padding Token", }, }, [][]any{ { t.Model, sprintf(tenary(t.TokensSize <= 0, "N/A", GGUFBytesScalar(t.TokensSize))), sprintf(tenary(t.TokensLength <= 0, "N/A", t.TokensLength)), sprintf(tenary(t.AddedTokensLength <= 0, "N/A", t.AddedTokensLength)), sprintf(tenary(t.BOSTokenID < 0, "N/A", t.BOSTokenID)), sprintf(tenary(t.EOSTokenID < 0, "N/A", t.EOSTokenID)), sprintf(tenary(t.EOTTokenID < 0, "N/A", t.EOTTokenID)), sprintf(tenary(t.EOMTokenID < 0, "N/A", t.EOMTokenID)), sprintf(tenary(t.UnknownTokenID < 0, "N/A", t.UnknownTokenID)), sprintf(tenary(t.SeparatorTokenID < 0, "N/A", t.SeparatorTokenID)), sprintf(tenary(t.PaddingTokenID < 0, "N/A", t.PaddingTokenID)), }, }) } if !skipEstimate && m.Architecture != "diffusion" { hds := make([][]any, 2) lmes := lme.Summarize(mmap, platformRAM, platformVRAM) if !inShort { hds[0] = []any{ "Arch", "Context Size", "Batch Size (L / P)", "Flash Attention", "MMap Load", "Embedding Only", "Reranking", "Distributable", "Offload Layers", "Full Offloaded", } hds[1] = []any{ "Arch", "Context Size", "Batch Size (L / P)", "Flash Attention", "MMap 
Load", "Embedding Only", "Reranking", "Distributable", "Offload Layers", "Full Offloaded", } } if lmes.Items[0].MaximumTokensPerSecond != nil { hds[0] = append(hds[0], "Max TPS") hds[1] = append(hds[1], "Max TPS") } hds[0] = append(hds[0], "RAM", "RAM", "RAM") hds[1] = append(hds[1], "Layers (I + T + O)", "UMA", "NonUMA") for _, v := range lmes.Items[0].VRAMs { var hd string if v.Remote { hd = fmt.Sprintf("RPC %d (V)RAM", v.Position) } else { hd = fmt.Sprintf("VRAM %d", v.Position) } hds[0] = append(hds[0], hd, hd, hd) hds[1] = append(hds[1], "Layers (T + O)", "UMA", "NonUMA") } switch { case lmcOffloadLayersStep > lme.OffloadLayers: lmcOffloadLayersStep = lme.OffloadLayers case lmcOffloadLayersStep <= 0: lmcOffloadLayersStep = lme.OffloadLayers } if lmcOffloadLayersStep < lme.OffloadLayers { cnt := lme.OffloadLayers/lmcOffloadLayersStep + 1 if lme.OffloadLayers%lmcOffloadLayersStep != 0 || lme.FullOffloaded { cnt++ } esis := make([]LLaMACppRunEstimateSummaryItem, cnt) var wg sync.WaitGroup for i := 0; i < cap(esis); i++ { wg.Add(1) go func(i int) { defer wg.Done() lmeopts := eopts[:len(eopts):len(eopts)] lmeopts = append(lmeopts, WithLLaMACppOffloadLayers(uint64(i)*lmcOffloadLayersStep)) esis[i] = gf.EstimateLLaMACppRun(lmeopts...).SummarizeItem(mmap, platformRAM, platformVRAM) }(i) } wg.Wait() esis[cap(esis)-1] = lmes.Items[0] lmes.Items = esis } bds := make([][]any, len(lmes.Items)) for i := range lmes.Items { if !inShort { bds[i] = []any{ sprintf(tenary(lmes.Architecture != "", lmes.Architecture, "N/A")), sprintf(lmes.ContextSize), sprintf("%d / %d", lmes.LogicalBatchSize, lmes.PhysicalBatchSize), sprintf(tenary(flashAttention, tenary(lmes.FlashAttention, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(mmap, tenary(!lmes.NoMMap, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(lmes.EmbeddingOnly, "Yes", "No")), sprintf(tenary(lmes.Reranking, "Supported", "Unsupported")), sprintf(tenary(lmes.Architecture != "" && lmes.Distributable, "Supported", "Unsupported")), sprintf(tenary(lmes.Items[i].FullOffloaded, sprintf("%d (%d + 1)", lmes.Items[i].OffloadLayers, lmes.Items[i].OffloadLayers-1), lmes.Items[i].OffloadLayers)), sprintf(tenary(lmes.Items[i].FullOffloaded, "Yes", "No")), } } if lmes.Items[i].MaximumTokensPerSecond != nil { bds[i] = append(bds[i], sprintf(*lmes.Items[i].MaximumTokensPerSecond)) } bds[i] = append(bds[i], sprintf("1 + %d + %d", lmes.Items[i].RAM.HandleLayers, tenary(lmes.Items[i].RAM.HandleOutputLayer, 1, 0)), sprintf(lmes.Items[i].RAM.UMA), sprintf(lmes.Items[i].RAM.NonUMA)) for _, v := range lmes.Items[i].VRAMs { bds[i] = append(bds[i], sprintf("%d + %d", v.HandleLayers, tenary(v.HandleOutputLayer, 1, 0)), sprintf(v.UMA), sprintf(v.NonUMA)) } } tprint( "ESTIMATE", hds, bds) } if !skipEstimate && m.Architecture == "diffusion" { hds := make([][]any, 2) sdes := sde.Summarize(mmap, platformRAM, platformVRAM) if !inShort { hds[0] = []any{ "Arch", "Flash Attention", "MMap Load", "Distributable", "Full Offloaded", } hds[1] = []any{ "Arch", "Flash Attention", "MMap Load", "Distributable", "Full Offloaded", } } hds[0] = append(hds[0], "RAM", "RAM") hds[1] = append(hds[1], "UMA", "NonUMA") for _, v := range sdes.Items[0].VRAMs { var hd string if v.Remote { hd = fmt.Sprintf("RPC %d (V)RAM", v.Position) } else { hd = fmt.Sprintf("VRAM %d", v.Position) } hds[0] = append(hds[0], hd, hd) hds[1] = append(hds[1], "UMA", "NonUMA") } bds := make([][]any, len(sdes.Items)) for i := range sdes.Items { if !inShort { bds[i] = []any{ sprintf(tenary(sdes.Architecture != "", 
sdes.Architecture, "N/A")), sprintf(tenary(flashAttention, tenary(sdes.FlashAttention, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(mmap, tenary(!sdes.NoMMap, "Enabled", "Unsupported"), "Disabled")), sprintf(tenary(sdes.Architecture != "" && sdes.Distributable, "Supported", "Unsupported")), sprintf(tenary(sdes.Items[i].FullOffloaded, "Yes", "No")), } } bds[i] = append(bds[i], sprintf(sdes.Items[i].RAM.UMA), sprintf(sdes.Items[i].RAM.NonUMA)) for _, v := range sdes.Items[i].VRAMs { bds[i] = append(bds[i], sprintf(v.UMA), sprintf(v.NonUMA)) } } tprint( "ESTIMATE", hds, bds) } return nil } func sprintf(f any, a ...any) string { if v, ok := f.(string); ok { if len(a) != 0 { return fmt.Sprintf(v, a...) } return v } return anyx.String(f) } func tprint(title string, headers, bodies [][]any) { tw := table.NewWriter() tw.SetOutputMirror(os.Stdout) tw.SetTitle(strings.ToUpper(title)) for i := range headers { tw.AppendHeader(headers[i], table.RowConfig{AutoMerge: true, AutoMergeAlign: text.AlignCenter}) } for i := range bodies { tw.AppendRow(bodies[i]) } tw.SetColumnConfigs(func() (r []table.ColumnConfig) { r = make([]table.ColumnConfig, len(headers[0])) for i := range r { r[i].Number = i + 1 r[i].AutoMerge = true if len(headers) > 1 && (strings.HasPrefix(headers[1][i].(string), "Layers") || headers[1][i] == "UMA" || headers[1][i] == "NonUMA") { r[i].AutoMerge = false } r[i].Align = text.AlignCenter r[i].AlignHeader = text.AlignCenter } return r }()) tw.Style().Options.SeparateRows = true tw.Render() fmt.Println() } func tenary(c bool, t, f any) any { if c { return t } return f } func tenaryFunc(c bool, t, f func() any) any { if c { return t() } return f() } func toGGMLType(s string) GGMLType { t := GGMLTypeF16 switch s { case "f32": t = GGMLTypeF32 case "f16": t = GGMLTypeF16 case "q8_0": t = GGMLTypeQ8_0 case "q4_0": t = GGMLTypeQ4_0 case "q4_1": t = GGMLTypeQ4_1 case "iq4_nl": t = GGMLTypeIQ4_NL case "q5_0": t = GGMLTypeQ5_0 case "q5_1": t = GGMLTypeQ5_1 } return t } ================================================ FILE: file.go ================================================ package gguf_parser import ( "bytes" "encoding/binary" "errors" "fmt" "io" "regexp" "strings" "golang.org/x/exp/constraints" "github.com/gpustack/gguf-parser-go/util/anyx" "github.com/gpustack/gguf-parser-go/util/bytex" "github.com/gpustack/gguf-parser-go/util/funcx" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/stringx" ) // GGUFFile represents a GGUF file, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. // // Compared with the complete GGUF file, // this structure lacks the tensor data part. type GGUFFile struct { /* Basic */ // Header is the header of the GGUF file. Header GGUFHeader `json:"header"` // TensorInfos are the tensor infos of the GGUF file, // the size of TensorInfos is equal to `Header.TensorCount`. TensorInfos GGUFTensorInfos `json:"tensorInfos"` // Padding is the padding size of the GGUF file, // which is used to split Header and TensorInfos from tensor data. Padding int64 `json:"padding"` // SplitPaddings holds the padding size slice of the GGUF file splits, // each item represents splitting Header and TensorInfos from tensor data. // // The length of SplitPaddings is the number of split files. SplitPaddings []int64 `json:"splitPaddings,omitempty"` // TensorDataStartOffset is the offset in bytes of the tensor data in this file. // // The offset is the start of the file. 
TensorDataStartOffset int64 `json:"tensorDataStartOffset"` // SplitTensorDataStartOffsets holds the offset slice in bytes of the tensor data of the GGUF file splits, // each item represents the offset of the tensor data in the split file. // // The length of SplitTensorDataStartOffsets is the number of split files. SplitTensorDataStartOffsets []int64 `json:"splitTensorDataStartOffsets,omitempty"` /* Appendix */ // Size is the size of the GGUF file, // if the file is split, the size is the sum of all split files. Size GGUFBytesScalar `json:"size"` // SplitSizes holds the size slice of the GGUF file splits, // each item represents the size of the split file. // // The length of SplitSizes is the number of split files. SplitSizes []GGUFBytesScalar `json:"splitSizes,omitempty"` // ModelSize is the size of the model when loading. ModelSize GGUFBytesScalar `json:"modelSize"` // SplitModelSizes holds the size slice of the model, // each item represents a size when loading of the split file. // // The length of SplitModelSizes is the number of split files. SplitModelSizes []GGUFBytesScalar `json:"splitModelSizes,omitempty"` // ModelParameters is the number of the model parameters. ModelParameters GGUFParametersScalar `json:"modelParameters"` // ModelBitsPerWeight is the bits per weight of the model, // which describes how many bits are used to store a weight, // higher is better. ModelBitsPerWeight GGUFBitsPerWeightScalar `json:"modelBitsPerWeight"` } // GGUFMagic is a magic number of GGUF file, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#historical-state-of-affairs. type GGUFMagic uint32 // GGUFMagic constants. const ( GGUFMagicGGML GGUFMagic = 0x67676d6c GGUFMagicGGMF GGUFMagic = 0x67676d66 GGUFMagicGGJT GGUFMagic = 0x67676a74 GGUFMagicGGUFLe GGUFMagic = 0x46554747 // GGUF GGUFMagicGGUFBe GGUFMagic = 0x47475546 // GGUF ) // GGUFVersion is a version of GGUF file format, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#version-history. type GGUFVersion uint32 // GGUFVersion constants. const ( GGUFVersionV1 GGUFVersion = iota + 1 GGUFVersionV2 GGUFVersionV3 ) // GGUFHeader represents the header of a GGUF file. type GGUFHeader struct { // Magic is a magic number that announces that this is a GGUF file. Magic GGUFMagic `json:"magic"` // Version is a version of the GGUF file format. Version GGUFVersion `json:"version"` // TensorCount is the number of tensors in the file. TensorCount uint64 `json:"tensorCount"` // MetadataKVCount is the number of key-value pairs in the metadata. MetadataKVCount uint64 `json:"metadataKVCount"` // MetadataKV are the key-value pairs in the metadata, MetadataKV GGUFMetadataKVs `json:"metadataKV"` } // GGUFMetadataValueType is a type of GGUF metadata value, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#file-structure. type GGUFMetadataValueType uint32 // GGUFMetadataValueType constants. const ( GGUFMetadataValueTypeUint8 GGUFMetadataValueType = iota GGUFMetadataValueTypeInt8 GGUFMetadataValueTypeUint16 GGUFMetadataValueTypeInt16 GGUFMetadataValueTypeUint32 GGUFMetadataValueTypeInt32 GGUFMetadataValueTypeFloat32 GGUFMetadataValueTypeBool GGUFMetadataValueTypeString GGUFMetadataValueTypeArray GGUFMetadataValueTypeUint64 GGUFMetadataValueTypeInt64 GGUFMetadataValueTypeFloat64 _GGUFMetadataValueTypeCount // Unknown ) // Types for GGUFMetadataKV. type ( // GGUFMetadataKV is a key-value pair in the metadata of a GGUF file. 
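// NOTE (editor): the two GGUF magic constants above are simply the ASCII bytes "GGUF" read as a
// uint32 in either byte order, which is how the parser detects endianness. A minimal sketch using
// only encoding/binary:
//
//	b := []byte("GGUF")
//	_ = binary.LittleEndian.Uint32(b) // 0x46554747 == GGUFMagicGGUFLe
//	_ = binary.BigEndian.Uint32(b)    // 0x47475546 == GGUFMagicGGUFBe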
GGUFMetadataKV struct { // Key is the key of the metadata key-value pair, // which is no larger than 64 bytes long. Key string `json:"key"` // ValueType is the type of the metadata value. ValueType GGUFMetadataValueType `json:"valueType"` // Value is the value of the metadata key-value pair. Value any `json:"value"` } // GGUFMetadataKVArrayValue is a value of a GGUFMetadataKV with type GGUFMetadataValueTypeArray. GGUFMetadataKVArrayValue struct { /* Basic */ // Type is the type of the array item. Type GGUFMetadataValueType `json:"type"` // Len is the length of the array. Len uint64 `json:"len"` // Array holds all array items. Array []any `json:"array,omitempty"` /* Appendix */ // StartOffset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file. // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` // Size is the size of the array in bytes. Size int64 `json:"size"` } // GGUFMetadataKVs is a list of GGUFMetadataKV. GGUFMetadataKVs []GGUFMetadataKV ) // Types for GGUFTensorInfo. type ( // GGUFTensorInfo represents a tensor info in a GGUF file. GGUFTensorInfo struct { /* Basic */ // Name is the name of the tensor, // which is no larger than 64 bytes long. Name string `json:"name"` // NDimensions is the number of dimensions of the tensor. NDimensions uint32 `json:"nDimensions"` // Dimensions is the dimensions of the tensor, // the length is NDimensions. Dimensions []uint64 `json:"dimensions"` // Type is the type of the tensor. Type GGMLType `json:"type"` // Offset is the offset in bytes of the tensor's data in this file. // // The offset is relative to tensor data, not to the start of the file. Offset uint64 `json:"offset"` /* Appendix */ // StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file. // // The offset is the start of the file. StartOffset int64 `json:"startOffset"` } // GGUFTensorInfos is a list of GGUFTensorInfo. GGUFTensorInfos []GGUFTensorInfo ) var ErrGGUFFileInvalidFormat = errors.New("invalid GGUF format") // ParseGGUFFile parses a GGUF file from the local given path, // and returns the GGUFFile, or an error if any. 
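// NOTE (editor): a minimal usage sketch of ParseGGUFFile; the path is illustrative, shard siblings
// (e.g. "*-00001-of-00002.gguf") are completed automatically, and error handling is shortened:
//
//	gf, err := ParseGGUFFile("/path/to/model.gguf")
//	if err != nil {
//		panic(err)
//	}
//	fmt.Println(gf.Header.Version, gf.ModelParameters, gf.ModelBitsPerWeight)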
func ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, error) { var o _GGUFReadOptions for _, opt := range opts { opt(&o) } var paths []string { rs := CompleteShardGGUFFilename(path) if rs != nil { paths = rs } else { paths = []string{path} } } fs := make([]_GGUFFileReadSeeker, 0, len(paths)) defer func() { for i := range fs { osx.Close(fs[i]) } }() for i := range paths { if o.MMap { mf, err := osx.OpenMmapFile(paths[i]) if err != nil { return nil, fmt.Errorf("open mmap file: %w", err) } fs = append(fs, _GGUFFileReadSeeker{ Closer: mf, ReadSeeker: io.NewSectionReader(mf, 0, mf.Len()), Size: mf.Len(), }) continue } ff, err := osx.Open(paths[i]) if err != nil { return nil, fmt.Errorf("open file: %w", err) } fs = append(fs, _GGUFFileReadSeeker{ Closer: ff, ReadSeeker: ff, Size: funcx.MustNoError(ff.Stat()).Size(), }) } return parseGGUFFile(fs, o) } type _GGUFFileReadSeeker struct { io.Closer io.ReadSeeker Size int64 } func _validateCountWithRemaining(f _GGUFFileReadSeeker, count uint64, version GGUFVersion, what string) error { if count == 0 { return nil } var minItemSize int64 switch strings.ToLower(what) { case "metadatakvcount": if version <= GGUFVersionV1 { minItemSize = 12 // key length (uint32) + value type (uint32) + min value (string length uint32) } else { minItemSize = 20 // key length (uint64) + value type (uint32) + min value (string length uint64) } case "tensor": if version <= GGUFVersionV1 { minItemSize = 20 // name length (uint32) + n_dims (uint32) + type (uint32) + offset (uint64) } else { minItemSize = 24 // name length (uint64) + n_dims (uint32) + type (uint32) + offset (uint64) } } if minItemSize <= 0 { return fmt.Errorf("invalid min item size for %s: %d", what, minItemSize) } pos, err := f.Seek(0, io.SeekCurrent) if err != nil { return fmt.Errorf("seek %s count position: %w", what, err) } remaining := f.Size - pos if remaining < 0 { return fmt.Errorf("invalid file size: %d", f.Size) } maxCount := uint64(remaining / minItemSize) if maxCount < count { return fmt.Errorf("%s count too large for remaining bytes: %d", what, count) } return nil } func parseGGUFFile(fs []_GGUFFileReadSeeker, o _GGUFReadOptions) (_ *GGUFFile, err error) { var gf GGUFFile for _, f := range fs { var bo binary.ByteOrder = binary.LittleEndian // magic var magic GGUFMagic if err = binary.Read(f, bo, &magic); err != nil { return nil, fmt.Errorf("read magic: %w", err) } switch magic { default: return nil, ErrGGUFFileInvalidFormat case GGUFMagicGGML, GGUFMagicGGMF, GGUFMagicGGJT: return nil, fmt.Errorf("unsupported format: %s", magic) case GGUFMagicGGUFLe: case GGUFMagicGGUFBe: bo = binary.BigEndian } gf.Header.Magic = magic // version var version GGUFVersion if err = binary.Read(f, bo, &version); err != nil { return nil, fmt.Errorf("read version: %w", err) } if version > GGUFVersionV3 { return nil, fmt.Errorf("unsupported GGUF version: %d (supported: %d-%d)", version, GGUFVersionV1, GGUFVersionV3) } gf.Header.Version = version rd := _GGUFReader{v: version, o: o, f: f, bo: bo} // tensor count var tensorCount uint64 if version <= GGUFVersionV1 { tensorCount, err = rd.ReadUint64FromUint32() } else { tensorCount, err = rd.ReadUint64() } if err != nil { return nil, fmt.Errorf("read tensor count: %w", err) } if err := _validateCountWithRemaining(f, tensorCount, version, "tensor"); err != nil { return nil, err } gf.Header.TensorCount += tensorCount // metadata kv count var metadataKVCount uint64 if version <= GGUFVersionV1 { metadataKVCount, err = rd.ReadUint64FromUint32() } else { metadataKVCount, 
err = rd.ReadUint64() } if err != nil { return nil, fmt.Errorf("read metadata kv count: %w", err) } if err := _validateCountWithRemaining(f, metadataKVCount, version, "metadatakvcount"); err != nil { return nil, err } gf.Header.MetadataKVCount += metadataKVCount // metadata kv { rd := _GGUFMetadataReader{_GGUFReader: rd} kvs := make(GGUFMetadataKVs, metadataKVCount) for i := uint64(0); i < metadataKVCount; i++ { kvs[i], err = rd.Read() if err != nil { return nil, fmt.Errorf("read metadata kv %d: %w", i, err) } } for i := range kvs { if kvs[i].Key == "split.no" { gf.Header.MetadataKVCount-- continue } gf.Header.MetadataKV = append(gf.Header.MetadataKV, kvs[i]) } } // tensor infos if gf.TensorInfos == nil { tc, ok := gf.Header.MetadataKV.Get("split.tensors.count") if ok { gf.TensorInfos = make(GGUFTensorInfos, 0, anyx.Number[int](tc.Value)) } else { // avoid preallocating with tensorCount (could be huge); start empty and append gf.TensorInfos = make(GGUFTensorInfos, 0) } } { rd := _GGUFTensorInfoReader{_GGUFReader: rd} tis := make(GGUFTensorInfos, 0) for i := uint64(0); i < tensorCount; i++ { ti, err := rd.Read() if err != nil { return nil, fmt.Errorf("read tensor info %d: %w", i, err) } tis = append(tis, ti) } gf.TensorInfos = append(gf.TensorInfos, tis...) } pds, err := f.Seek(0, io.SeekCurrent) if err != nil { return nil, fmt.Errorf("seek padding start: %w", err) } // padding var padding int64 { // The global alignment to use, as described above. // This can vary to allow for different alignment schemes, but it must be a multiple of 8. // Some writers may not write the alignment. // If the alignment is not specified, assume it is 32. var ag uint32 = 32 if v, ok := gf.Header.MetadataKV.Get("general.alignment"); ok { ag = v.ValueUint32() } padding = int64(ag) - (pds % int64(ag)) } if len(fs) == 1 { gf.Padding = padding } gf.SplitPaddings = append(gf.SplitPaddings, padding) // tensor data offset tensorDataStartOffset := pds + padding if len(fs) == 1 { gf.TensorDataStartOffset = tensorDataStartOffset } gf.SplitTensorDataStartOffsets = append(gf.SplitTensorDataStartOffsets, tensorDataStartOffset) // size size := GGUFBytesScalar(f.Size) gf.Size += size gf.SplitSizes = append(gf.SplitSizes, size) // model size modelSize := GGUFBytesScalar(f.Size - tensorDataStartOffset) gf.ModelSize += modelSize gf.SplitModelSizes = append(gf.SplitModelSizes, modelSize) } // model parameters gf.ModelParameters = GGUFParametersScalar(gf.TensorInfos.Elements()) // bpw if gf.ModelParameters != 0 { gf.ModelBitsPerWeight = GGUFBitsPerWeightScalar(float64(gf.ModelSize) * 8 / float64(gf.ModelParameters)) } return &gf, nil } // Types for GGUF hierarchical tensors. type ( // GGUFTensorInfoFilter is a filter to filter out if the given tensor name matches. // Return true if the name matches, and false otherwise. GGUFTensorInfoFilter func(name string) bool // IGGUFTensorInfos is an interface for GGUF tensor infos, // which includes basic operations. IGGUFTensorInfos interface { // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. Get(name string) (info GGUFTensorInfo, found bool) // GetFileType returns the GGUFFileType. GetFileType() GGUFFileType // Match returns true if the name matches the given regex, and false otherwise. Match(nameRegex *regexp.Regexp) bool // Search returns a list of GGUFTensorInfo with the names that match the given regex. 
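// NOTE (editor): the padding computed in parseGGUFFile above rounds the position after the tensor
// infos up to the next multiple of general.alignment (default 32). A worked sketch of the
// arithmetic as written; note that a position already on the boundary still advances a full stride
// under this formula:
//
//	pds := int64(100)
//	padding := 32 - (pds % 32) // 28
//	start := pds + padding     // 128, a multiple of 32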
Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. Index(names []string) (infos map[string]GGUFTensorInfo, found int) // Elements returns the number of elements (parameters). Elements(filter ...GGUFTensorInfoFilter) uint64 // Bytes returns the number of bytes. Bytes(filter ...GGUFTensorInfoFilter) uint64 // Count returns the number of tensors. Count() uint64 } // GGUFLayerTensorInfos represents hierarchical tensor infos of a GGUF file, // it can hold GGUFNamedTensorInfos, GGUFTensorInfos, and GGUFTensorInfo. GGUFLayerTensorInfos []IGGUFTensorInfos // GGUFNamedTensorInfos is the namespace for relevant tensors, // which must have a name. GGUFNamedTensorInfos struct { // Name is the name of the namespace. Name string `json:"name"` // GGUFLayerTensorInfos can hold GGUFNamedTensorInfos, GGUFTensorInfos, or GGUFTensorInfo. // // If an item is of type GGUFTensorInfo, it must be a leaf node. // // Branch nodes are of type GGUFNamedTensorInfos or GGUFTensorInfos, // which can be nested. // // Branch nodes are stored as pointers. GGUFLayerTensorInfos `json:"items,omitempty"` } ) // Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos. func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos { return gf.TensorInfos.Layers(ignores...) } func (kv GGUFMetadataKV) ValueUint8() uint8 { if kv.ValueType != GGUFMetadataValueTypeUint8 { panic(fmt.Errorf("key %q try to get type Uint8 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint8](kv.Value) } func (kv GGUFMetadataKV) ValueInt8() int8 { if kv.ValueType != GGUFMetadataValueTypeInt8 { panic(fmt.Errorf("key %q try to get type Int8 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int8](kv.Value) } func (kv GGUFMetadataKV) ValueUint16() uint16 { if kv.ValueType != GGUFMetadataValueTypeUint16 { panic(fmt.Errorf("key %q try to get type Uint16 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint16](kv.Value) } func (kv GGUFMetadataKV) ValueInt16() int16 { if kv.ValueType != GGUFMetadataValueTypeInt16 { panic(fmt.Errorf("key %q try to get type Int16 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int16](kv.Value) } func (kv GGUFMetadataKV) ValueUint32() uint32 { if kv.ValueType != GGUFMetadataValueTypeUint32 { panic(fmt.Errorf("key %q try to get type Uint32 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint32](kv.Value) } func (kv GGUFMetadataKV) ValueInt32() int32 { if kv.ValueType != GGUFMetadataValueTypeInt32 { panic(fmt.Errorf("key %q try to get type Int32 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int32](kv.Value) } func (kv GGUFMetadataKV) ValueFloat32() float32 { if kv.ValueType != GGUFMetadataValueTypeFloat32 { panic(fmt.Errorf("key %q try to get type Float32 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[float32](kv.Value) } func (kv GGUFMetadataKV) ValueBool() bool { if kv.ValueType != GGUFMetadataValueTypeBool { panic(fmt.Errorf("key %q try to get type Bool but type %v", kv.Key, kv.ValueType)) } return anyx.Bool(kv.Value) } func (kv GGUFMetadataKV) ValueString() string { if kv.ValueType != GGUFMetadataValueTypeString { panic(fmt.Errorf("key %q try to get type String but type %v", kv.Key, kv.ValueType)) } return anyx.String(kv.Value) } func (kv GGUFMetadataKV) ValueArray() GGUFMetadataKVArrayValue { if kv.ValueType != GGUFMetadataValueTypeArray { panic(fmt.Errorf("key %q try to get type Array but type %v", kv.Key, kv.ValueType)) } switch t :=
kv.Value.(type) { case GGUFMetadataKVArrayValue: return t case map[string]any: return GGUFMetadataKVArrayValue{ Type: anyx.Number[GGUFMetadataValueType](t["type"]), Len: anyx.Number[uint64](t["len"]), Array: func() []any { if vv, ok := t["array"].([]any); ok { return vv } return nil }(), StartOffset: anyx.Number[int64](t["startOffset"]), Size: anyx.Number[int64](t["size"]), } default: panic(fmt.Errorf("key %q try to get type Array but type %T", kv.Key, kv.Value)) } } func (kv GGUFMetadataKV) ValueUint64() uint64 { if kv.ValueType != GGUFMetadataValueTypeUint64 { panic(fmt.Errorf("key %q try to get type Uint64 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[uint64](kv.Value) } func (kv GGUFMetadataKV) ValueInt64() int64 { if kv.ValueType != GGUFMetadataValueTypeInt64 { panic(fmt.Errorf("key %q try to get type Int64 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[int64](kv.Value) } func (kv GGUFMetadataKV) ValueFloat64() float64 { if kv.ValueType != GGUFMetadataValueTypeFloat64 { panic(fmt.Errorf("key %q try to get type Float64 but type %v", kv.Key, kv.ValueType)) } return anyx.Number[float64](kv.Value) } // ValueNumeric returns the numeric value of the GGUFMetadataKV, // and panics if the value type is not numeric. // // ValueNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float. // // Compared to the GGUFMetadataKV's Value* functions, // ValueNumeric will cast the original value to the target type. func ValueNumeric[T constraints.Integer | constraints.Float](kv GGUFMetadataKV) T { switch kv.ValueType { case GGUFMetadataValueTypeUint8: case GGUFMetadataValueTypeInt8: case GGUFMetadataValueTypeUint16: case GGUFMetadataValueTypeInt16: case GGUFMetadataValueTypeUint32: case GGUFMetadataValueTypeInt32: case GGUFMetadataValueTypeFloat32: case GGUFMetadataValueTypeUint64: case GGUFMetadataValueTypeInt64: case GGUFMetadataValueTypeFloat64: default: panic(fmt.Errorf("key %q try to get type Numeric but got type %v", kv.Key, kv.ValueType)) } return anyx.Number[T](kv.Value) } func (av GGUFMetadataKVArrayValue) ValuesUint8() []uint8 { if av.Type != GGUFMetadataValueTypeUint8 { panic(fmt.Errorf("try to get type Uint8 but got type %v", av.Type)) } v := make([]uint8, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint8](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesInt8() []int8 { if av.Type != GGUFMetadataValueTypeInt8 { panic(fmt.Errorf("try to get type Int8 but got type %v", av.Type)) } v := make([]int8, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int8](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesUint16() []uint16 { if av.Type != GGUFMetadataValueTypeUint16 { panic(fmt.Errorf("try to get type Uint16 but got type %v", av.Type)) } v := make([]uint16, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint16](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesInt16() []int16 { if av.Type != GGUFMetadataValueTypeInt16 { panic(fmt.Errorf("try to get type Int16 but got type %v", av.Type)) } v := make([]int16, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int16](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesUint32() []uint32 { if av.Type != GGUFMetadataValueTypeUint32 { panic(fmt.Errorf("try to get type Uint32 but got type %v", av.Type)) } v := make([]uint32, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint32](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue)
ValuesInt32() []int32 { if av.Type != GGUFMetadataValueTypeInt32 { panic(fmt.Errorf("try to get type Int32 but got type %v", av.Type)) } v := make([]int32, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int32](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesFloat32() []float32 { if av.Type != GGUFMetadataValueTypeFloat32 { panic(fmt.Errorf("try to get type Float32 but got type %v", av.Type)) } v := make([]float32, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[float32](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesBool() []bool { if av.Type != GGUFMetadataValueTypeBool { panic(fmt.Errorf("try to get type Bool but got type %v", av.Type)) } v := make([]bool, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Bool(av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesString() []string { if av.Type != GGUFMetadataValueTypeString { panic(fmt.Errorf("try to get type String but got type %v", av.Type)) } v := make([]string, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.String(av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesArray() []GGUFMetadataKVArrayValue { if av.Type != GGUFMetadataValueTypeArray { panic(fmt.Errorf("try to get type Array but got type %v", av.Type)) } v := make([]GGUFMetadataKVArrayValue, av.Len) for i := uint64(0); i < av.Len; i++ { switch t := av.Array[i].(type) { case GGUFMetadataKVArrayValue: v[i] = t case map[string]any: v[i] = GGUFMetadataKVArrayValue{ Type: anyx.Number[GGUFMetadataValueType](t["type"]), Len: anyx.Number[uint64](t["len"]), Array: func() []any { if vv, ok := t["array"].([]any); ok { return vv } return nil }(), StartOffset: anyx.Number[int64](t["startOffset"]), Size: anyx.Number[int64](t["size"]), } default: panic(fmt.Errorf("try to get type Array but got type %T", av.Array[i])) } } return v } func (av GGUFMetadataKVArrayValue) ValuesUint64() []uint64 { if av.Type != GGUFMetadataValueTypeUint64 { panic(fmt.Errorf("try to get type Uint64 but got type %v", av.Type)) } v := make([]uint64, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[uint64](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesInt64() []int64 { if av.Type != GGUFMetadataValueTypeInt64 { panic(fmt.Errorf("try to get type Int64 but got type %v", av.Type)) } v := make([]int64, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[int64](av.Array[i]) } return v } func (av GGUFMetadataKVArrayValue) ValuesFloat64() []float64 { if av.Type != GGUFMetadataValueTypeFloat64 { panic(fmt.Errorf("try to get type Float64 but got type %v", av.Type)) } v := make([]float64, av.Len) for i := uint64(0); i < av.Len; i++ { v[i] = anyx.Number[float64](av.Array[i]) } return v } // ValuesNumeric returns the numeric values of the GGUFMetadataKVArrayValue, // and panics if the value type is not numeric. // // ValuesNumeric is a generic function, and the type T must be constraints.Integer or constraints.Float. // // Compared to the GGUFMetadataKVArrayValue's Values* functions, // ValuesNumeric will cast the original value to the target type.
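// NOTE (editor): a sketch of pulling typed values out of parsed metadata with the accessors above;
// the key names follow common GGUF conventions but are illustrative only:
//
//	if kv, ok := gf.Header.MetadataKV.Get("general.architecture"); ok {
//		_ = kv.ValueString() // panics if the value is not a string
//	}
//	if kv, ok := gf.Header.MetadataKV.Get("llama.attention.head_count"); ok {
//		_ = ValueNumeric[uint64](kv) // casts any numeric value type to uint64
//	}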
func ValuesNumeric[T constraints.Integer | constraints.Float](av GGUFMetadataKVArrayValue) []T { v := make([]T, av.Len) for i := uint64(0); i < av.Len; i++ { switch av.Type { case GGUFMetadataValueTypeUint8: case GGUFMetadataValueTypeInt8: case GGUFMetadataValueTypeUint16: case GGUFMetadataValueTypeInt16: case GGUFMetadataValueTypeUint32: case GGUFMetadataValueTypeInt32: case GGUFMetadataValueTypeFloat32: case GGUFMetadataValueTypeUint64: case GGUFMetadataValueTypeInt64: case GGUFMetadataValueTypeFloat64: default: panic(fmt.Errorf("try to get type Numeric but got type %v", av.Type)) } if av.Array != nil { v[i] = anyx.Number[T](av.Array[i]) } } return v } // Get returns the GGUFMetadataKV with the given key, // and true if found, and false otherwise. func (kvs GGUFMetadataKVs) Get(key string) (value GGUFMetadataKV, found bool) { for i := range kvs { if kvs[i].Key == key { return kvs[i], true } } return GGUFMetadataKV{}, false } // Search returns a list of GGUFMetadataKV with the keys that match the given regex. func (kvs GGUFMetadataKVs) Search(keyRegex *regexp.Regexp) (values []GGUFMetadataKV) { for i := range kvs { if keyRegex.MatchString(kvs[i].Key) { values = append(values, kvs[i]) } } return values } // Index returns a map value to the GGUFMetadataKVs with the given keys, // and the number of keys found. func (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGUFMetadataKV, found int) { ks := make(map[string]struct{}, len(keys)) for i := range keys { ks[keys[i]] = struct{}{} } values = make(map[string]GGUFMetadataKV) for i := range kvs { if _, ok := ks[kvs[i].Key]; ok { values[kvs[i].Key] = kvs[i] found++ } if found == len(ks) { break } } return values, found } // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found bool) { if ti.Name == name { return ti, true } return GGUFTensorInfo{}, false } // GetFileType returns the GGUFFileType. func (ti GGUFTensorInfo) GetFileType() GGUFFileType { return GetFileType(map[GGMLType]int{ti.Type: 1}) } // Match returns true if the name of the GGUFTensorInfo matches the given regex. func (ti GGUFTensorInfo) Match(nameRegex *regexp.Regexp) bool { return nameRegex.MatchString(ti.Name) } // Search returns a list of GGUFTensorInfo with the names that match the given regex. func (ti GGUFTensorInfo) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) { if nameRegex.MatchString(ti.Name) { return []GGUFTensorInfo{ti} } return nil } // Index returns a map value to the GGUFTensorInfo with the given names, // and the number of names found. func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFTensorInfo, found int) { if len(names) == 0 { return nil, 0 } if names[0] == ti.Name { return map[string]GGUFTensorInfo{ti.Name: ti}, 1 } return nil, 0 } // Elements returns the number of elements of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2597-L2601. func (ti GGUFTensorInfo) Elements(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } for i := range filter { if filter[i] != nil && !filter[i](ti.Name) { return 0 } } ret := uint64(1) for i := uint32(0); i < ti.NDimensions; i++ { ret *= ti.Dimensions[i] } return ret } // Bytes returns the number of bytes of the GGUFTensorInfo, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2609-L2626. 
func (ti GGUFTensorInfo) Bytes(filter ...GGUFTensorInfoFilter) uint64 { if ti.NDimensions == 0 { return 0 } tt, ok := ti.Type.Trait() if !ok { panic(fmt.Errorf("invalid type: %v", ti.Type)) } for i := range filter { if filter[i] != nil && !filter[i](ti.Name) { return 0 } } // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L3210-L3214 nb := make([]uint64, 0, ti.NDimensions) { nb = append(nb, tt.TypeSize) nb = append(nb, nb[0]*(ti.Dimensions[0]/tt.BlockSize)) for i := uint32(2); i < ti.NDimensions; i++ { nb = append(nb, nb[i-1]*ti.Dimensions[i-1]) } } var ret uint64 if tt.BlockSize == 1 { ret = tt.TypeSize for i := uint32(0); i < ti.NDimensions; i++ { ret += (ti.Dimensions[i] - 1) * nb[i] } return ret } ret = ti.Dimensions[0] * nb[0] / tt.BlockSize for i := uint32(1); i < ti.NDimensions; i++ { ret += (ti.Dimensions[i] - 1) * nb[i] } return ret } // Count returns the number of GGUF tensors of the GGUFTensorInfo, // which is always 1. func (ti GGUFTensorInfo) Count() uint64 { return 1 } // Get returns the GGUFTensorInfo with the given name, // and true if found, and false otherwise. func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { for i := range tis { if tis[i].Name == name { return tis[i], true } } return GGUFTensorInfo{}, false } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the GGUFTensorInfos. func (tis GGUFTensorInfos) GetFileType() GGUFFileType { if len(tis) == 0 { return _GGUFFileTypeCount } cm := make(map[GGMLType]int) for i := range tis { cm[tis[i].Type]++ } return GetFileType(cm) } // Match returns true if a tensor of GGUFTensorInfos matches the given regex. func (tis GGUFTensorInfos) Match(nameRegex *regexp.Regexp) bool { for i := range tis { if nameRegex.MatchString(tis[i].Name) { return true } } return false } // Search returns a list of GGUFTensorInfo with the names that match the given regex. func (tis GGUFTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) { for i := range tis { if nameRegex.MatchString(tis[i].Name) { infos = append(infos, tis[i]) } } return infos } // Index returns a map value to the GGUFTensorInfos with the given names, // and the number of names found. func (tis GGUFTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int) { ns := make(map[string]struct{}, len(names)) for i := range names { ns[names[i]] = struct{}{} } infos = make(map[string]GGUFTensorInfo) for i := range tis { if _, ok := ns[tis[i].Name]; ok { infos[tis[i].Name] = tis[i] found++ } if found == len(ns) { break } } return infos, found } // Elements returns the number of elements of the GGUFTensorInfos. func (tis GGUFTensorInfos) Elements() uint64 { var ret uint64 for i := range tis { ret += tis[i].Elements() } return ret } // Bytes returns the number of bytes of the GGUFTensorInfos. func (tis GGUFTensorInfos) Bytes() uint64 { var ret uint64 for i := range tis { ret += tis[i].Bytes() } return ret } // Count returns the number of GGUF tensors of the GGUFTensorInfos. func (tis GGUFTensorInfos) Count() uint64 { return uint64(len(tis)) } // Layers converts the GGUFTensorInfos to GGUFLayerTensorInfos. 
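// NOTE (editor): Elements multiplies a tensor's dimensions together, and Bytes scales by the GGML
// type's block layout. A worked sketch for a hypothetical contiguous 4096x4096 Q4_0 tensor, whose
// blocks pack 32 weights into 18 bytes:
//
//	elements := uint64(4096 * 4096)               // 16,777,216 weights
//	bytes := elements / 32 * 18                   // 9,437,184 bytes (9 MiB)
//	bpw := float64(bytes) * 8 / float64(elements) // 4.5 bits per weight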
func (tis GGUFTensorInfos) Layers(ignores ...string) GGUFLayerTensorInfos { if len(tis) == 0 { return nil } ls := tis.layers() if len(ignores) != 0 { _, ls, _ = ls.Cut(ignores) return ls } return ls } var numberRegex = regexp.MustCompile(`^\d+$`) func (tis GGUFTensorInfos) layers() GGUFLayerTensorInfos { var ret GGUFLayerTensorInfos pm := make(map[string]any) for i := range tis { ps := strings.Split(tis[i].Name, ".") if len(ps) < 2 { ret = append(ret, tis[i]) continue } switch { default: ret = append(ret, tis[i]) case ps[0] == "blk" || ps[0] == "block": // LLaMACpp. p := strings.Join([]string{ps[0], ps[1]}, ".") if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) case (ps[0] == "v" || ps[0] == "t" || ps[0] == "a") && ps[1] == "blk": // LLaMACpp CLIP. p := ps[0] if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ((ps[0] == "dec" || ps[0] == "enc") && ps[1] == "blk") || ((ps[0] == "decoder" || ps[0] == "encoder") && ps[1] == "block"): // BERT. p := ps[0] if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ps[0] == "first_stage_model": // StableDiffusionCpp Autoencoder. p := strings.Join([]string{ps[0], ps[1]}, ".") if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ps[0] == "cond_stage_model": // StableDiffusionCpp Conditioner. 
if len(ps) < 3 { ret = append(ret, tis[i]) continue } p := strings.Join([]string{ps[0], ps[1], ps[2]}, ".") if !numberRegex.MatchString(ps[1]) { p = strings.Join([]string{ps[0], ps[1]}, ".") } if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 4 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{ps[0], ps[1], ps[2], ps[3]}, ".") if !numberRegex.MatchString(ps[1]) { p = strings.Join([]string{ps[0], ps[1], ps[2]}, ".") } if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) case ps[0] == "model" && ps[1] == "diffusion_model": // nolint: goconst // StableDiffusionCpp. p := "model.diffusion_model" if _, ok := pm[p]; !ok { l := &GGUFNamedTensorInfos{Name: p} pm[p] = l ret = append(ret, l) } l := pm[p].(*GGUFNamedTensorInfos) if len(ps) < 3 { l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, tis[i]) continue } p = strings.Join([]string{"model.diffusion_model", ps[2]}, ".") if _, ok := pm[p]; !ok { xl := &GGUFNamedTensorInfos{Name: p} pm[p] = xl l.GGUFLayerTensorInfos = append(l.GGUFLayerTensorInfos, xl) } xl := pm[p].(*GGUFNamedTensorInfos) xl.GGUFLayerTensorInfos = append(xl.GGUFLayerTensorInfos, tis[i]) } } return ret } // Get returns the IGGUFTensorInfos with the given name, // and true if found, and false otherwise. func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo, found bool) { for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if v.Name == name { return v, true } case *GGUFNamedTensorInfos: info, found = v.GGUFLayerTensorInfos.Get(name) if found { return info, true } } } return GGUFTensorInfo{}, false } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) GetFileType() GGUFFileType { if len(ltis) == 0 { return _GGUFFileTypeCount } cm := make(map[GGMLType]int) for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: cm[v.Type]++ case *GGUFNamedTensorInfos: cm[v.GetFileType().GGMLType()]++ } } return GetFileType(cm) } // Match returns true if a tensor of GGUFLayerTensorInfos matches the given regex. func (ltis GGUFLayerTensorInfos) Match(nameRegex *regexp.Regexp) bool { for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if nameRegex.MatchString(v.Name) { return true } case *GGUFNamedTensorInfos: if v.Match(nameRegex) { return true } } } return false } // Search returns a list of GGUFTensorInfo with the names that match the given regex. func (ltis GGUFLayerTensorInfos) Search(nameRegex *regexp.Regexp) (infos []GGUFTensorInfo) { for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if nameRegex.MatchString(v.Name) { infos = append(infos, v) } case *GGUFNamedTensorInfos: infos = append(infos, v.Search(nameRegex)...) } } return infos } // Index returns a map value to the GGUFTensorInfos with the given names, // and the number of names found. 
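// NOTE (editor): a sketch of carving up the layered view with Cut, whose trailing-asterisk glob
// matching is documented below; the tensor and namespace names are illustrative of a
// llama.cpp-style model:
//
//	ls := gf.Layers()
//	picked, rest, ok := ls.Cut([]string{"token_embd.weight", "blk.0", "output*"})
//	_ = ok // true if at least one name matched; picked holds the matches, rest the remainder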
func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[string]GGUFTensorInfo, found int) { ns := make(map[string]struct{}, len(names)) for i := range names { ns[names[i]] = struct{}{} } infos = make(map[string]GGUFTensorInfo) for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if _, ok := ns[v.Name]; ok { infos[v.Name] = v found++ } case *GGUFNamedTensorInfos: inf, _ := v.Index(names) for k := range inf { infos[k] = inf[k] found++ } } if found == len(ns) { break } } return infos, found } // Elements returns the number of elements of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) Elements(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { ret += ltis[i].Elements(filter...) } return ret } // Bytes returns the number of bytes of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) Bytes(filter ...GGUFTensorInfoFilter) uint64 { var ret uint64 for i := range ltis { ret += ltis[i].Bytes(filter...) } return ret } // Count returns the number of GGUF tensors of the GGUFLayerTensorInfos. func (ltis GGUFLayerTensorInfos) Count() uint64 { var ret uint64 for i := range ltis { ret += ltis[i].Count() } return ret } // Cut splits the GGUFLayerTensorInfos into two parts, // and returns the GGUFLayerTensorInfos with the names that match the given names at first, // and the GGUFLayerTensorInfos without the names at second, // and true if the GGUFLayerTensorInfos with the names are found, and false otherwise. // // The given names support glob pattern, for example, "a*" matches "a", "ab", "abc", and so on. func (ltis GGUFLayerTensorInfos) Cut(names []string) (before, after GGUFLayerTensorInfos, found bool) { prefixes := make(map[string]struct{}) matches := make(map[string]struct{}) for i := range names { if strings.HasSuffix(names[i], "*") { prefixes[strings.TrimSuffix(names[i], "*")] = struct{}{} } else { matches[names[i]] = struct{}{} } } before = make(GGUFLayerTensorInfos, 0, len(names)) after = make(GGUFLayerTensorInfos, 0, len(ltis)) for i := range ltis { switch v := ltis[i].(type) { case GGUFTensorInfo: if len(matches) != 0 { if _, ok := matches[v.Name]; ok { before = append(before, v) continue } } if len(prefixes) != 0 { var check bool for prefix := range prefixes { if strings.HasPrefix(v.Name, prefix) { before = append(before, v) check = true break } } if check { continue } } after = append(after, v) case *GGUFNamedTensorInfos: if len(matches) != 0 { if _, ok := matches[v.Name]; ok { before = append(before, v) continue } } if len(prefixes) != 0 { var check bool for prefix := range prefixes { if strings.HasPrefix(v.Name, prefix) { before = append(before, v) check = true break } } if check { continue } } after = append(after, v) } } return before, after, len(before) > 0 } type _GGUFReader struct { v GGUFVersion o _GGUFReadOptions f io.ReadSeeker bo binary.ByteOrder } func (rd _GGUFReader) ReadUint8() (v uint8, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint8: %w", err) } return v, nil } func (rd _GGUFReader) ReadInt8() (v int8, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int8: %w", err) } return v, nil } func (rd _GGUFReader) ReadUint16() (v uint16, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint16: %w", err) } return v, nil } func (rd _GGUFReader) ReadInt16() (v int16, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int16: %w", err) } return v, 
nil } func (rd _GGUFReader) ReadUint32() (v uint32, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint32: %w", err) } return v, nil } func (rd _GGUFReader) ReadUint64FromUint32() (uint64, error) { v, err := rd.ReadUint32() return uint64(v), err } func (rd _GGUFReader) ReadInt32() (v int32, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int32: %w", err) } return v, nil } func (rd _GGUFReader) ReadFloat32() (v float32, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read float32: %w", err) } return v, nil } func (rd _GGUFReader) ReadBool() (v bool, err error) { b, err := rd.ReadUint8() if err != nil { return false, fmt.Errorf("read bool: %w", err) } return b != 0, nil } func (rd _GGUFReader) ReadString() (v string, err error) { var l uint64 if rd.v <= GGUFVersionV1 { l, err = rd.ReadUint64FromUint32() } else { l, err = rd.ReadUint64() } if err != nil { return "", fmt.Errorf("read string length: %w", err) } b := bytex.GetBytes(l) defer bytex.Put(b) if _, err = rd.f.Read(b); err != nil { return "", fmt.Errorf("read string: %w", err) } return string(bytes.TrimSpace(b)), nil } func (rd _GGUFReader) SkipReadingString() (err error) { var l uint64 if rd.v <= GGUFVersionV1 { l, err = rd.ReadUint64FromUint32() } else { l, err = rd.ReadUint64() } if err != nil { return fmt.Errorf("read string length: %w", err) } _, err = rd.f.Seek(int64(l), io.SeekCurrent) if err != nil { return fmt.Errorf("seek string: %w", err) } return nil } func (rd _GGUFReader) ReadArray(key string) (v GGUFMetadataKVArrayValue, err error) { v.StartOffset, err = rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("read array start: %w", err) } if err = binary.Read(rd.f, rd.bo, &v.Type); err != nil { return v, fmt.Errorf("read array item type: %w", err) } if rd.v <= GGUFVersionV1 { v.Len, err = rd.ReadUint64FromUint32() } else { v.Len, err = rd.ReadUint64() } if err != nil { return v, fmt.Errorf("read array length: %w", err) } itemStart, err := rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("seek array item start: %w", err) } if !rd.o.SkipLargeMetadata || stringx.HasSuffixes(key, ".feed_forward_length", ".attention.head_count") { v.Array = make([]any, v.Len) for i := uint64(0); i < v.Len; i++ { v.Array[i], err = rd.ReadValue(key, v.Type) if err != nil { return v, fmt.Errorf("read array item %d: %w", i, err) } } itemEnd, err := rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("seek array item end: %w", err) } v.Size = itemEnd - itemStart return v, nil } switch v.Type { case GGUFMetadataValueTypeUint8, GGUFMetadataValueTypeInt8, GGUFMetadataValueTypeBool: _, err = rd.f.Seek(int64(v.Len), io.SeekCurrent) case GGUFMetadataValueTypeUint16, GGUFMetadataValueTypeInt16: _, err = rd.f.Seek(int64(v.Len)*2, io.SeekCurrent) case GGUFMetadataValueTypeUint32, GGUFMetadataValueTypeInt32, GGUFMetadataValueTypeFloat32: _, err = rd.f.Seek(int64(v.Len)*4, io.SeekCurrent) case GGUFMetadataValueTypeUint64, GGUFMetadataValueTypeInt64, GGUFMetadataValueTypeFloat64: _, err = rd.f.Seek(int64(v.Len)*8, io.SeekCurrent) case GGUFMetadataValueTypeString: for i := uint64(0); i < v.Len; i++ { if err = rd.SkipReadingString(); err != nil { return v, fmt.Errorf("seek array[string] %d: %w", i, err) } } default: // Should not happen. 
panic(fmt.Errorf("invalid type: %v", v.Type)) } if err != nil { return v, fmt.Errorf("seek array end: %w", err) } itemEnd, err := rd.f.Seek(0, io.SeekCurrent) if err != nil { return v, fmt.Errorf("seek array item end: %w", err) } v.Size = itemEnd - itemStart return v, nil } func (rd _GGUFReader) ReadUint64() (v uint64, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read uint64: %w", err) } return v, nil } func (rd _GGUFReader) ReadInt64() (v int64, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read int64: %w", err) } return v, nil } func (rd _GGUFReader) ReadFloat64() (v float64, err error) { err = binary.Read(rd.f, rd.bo, &v) if err != nil { return 0, fmt.Errorf("read float64: %w", err) } return v, nil } func (rd _GGUFReader) ReadValue(vk string, vt GGUFMetadataValueType) (v any, err error) { if vt >= _GGUFMetadataValueTypeCount { return nil, fmt.Errorf("invalid type: %v", vt) } switch vt { case GGUFMetadataValueTypeUint8: v, err = rd.ReadUint8() case GGUFMetadataValueTypeInt8: v, err = rd.ReadInt8() case GGUFMetadataValueTypeUint16: v, err = rd.ReadUint16() case GGUFMetadataValueTypeInt16: v, err = rd.ReadInt16() case GGUFMetadataValueTypeUint32: v, err = rd.ReadUint32() case GGUFMetadataValueTypeInt32: v, err = rd.ReadInt32() case GGUFMetadataValueTypeFloat32: v, err = rd.ReadFloat32() case GGUFMetadataValueTypeBool: v, err = rd.ReadBool() case GGUFMetadataValueTypeString: v, err = rd.ReadString() case GGUFMetadataValueTypeArray: v, err = rd.ReadArray(vk) case GGUFMetadataValueTypeUint64: v, err = rd.ReadUint64() case GGUFMetadataValueTypeInt64: v, err = rd.ReadInt64() case GGUFMetadataValueTypeFloat64: v, err = rd.ReadFloat64() default: // Should not happen. panic(fmt.Errorf("invalid type: %v", vt)) } if err != nil { return nil, err } return v, nil } type _GGUFMetadataReader struct { _GGUFReader } func (rd _GGUFMetadataReader) Read() (kv GGUFMetadataKV, err error) { kv.Key, err = rd.ReadString() if err != nil { return kv, fmt.Errorf("read key: %w", err) } { vt, err := rd.ReadUint32() if err != nil { return kv, fmt.Errorf("read value type: %w", err) } kv.ValueType = GGUFMetadataValueType(vt) if kv.ValueType >= _GGUFMetadataValueTypeCount { return kv, fmt.Errorf("invalid value type: %v", kv.ValueType) } } kv.Value, err = rd.ReadValue(kv.Key, kv.ValueType) if err != nil { return kv, fmt.Errorf("read %s value: %w", kv.Key, err) } return kv, nil } type _GGUFTensorInfoReader struct { _GGUFReader } func (rd _GGUFTensorInfoReader) Read() (ti GGUFTensorInfo, err error) { ti.StartOffset, err = rd.f.Seek(0, io.SeekCurrent) if err != nil { return ti, fmt.Errorf("seek tensor info start: %w", err) } ti.Name, err = rd.ReadString() if err != nil { return ti, fmt.Errorf("read name: %w", err) } ti.NDimensions, err = rd.ReadUint32() if err != nil { return ti, fmt.Errorf("read n dimensions: %w", err) } ti.Dimensions = make([]uint64, ti.NDimensions) for i := uint32(0); i < ti.NDimensions; i++ { if rd.v <= GGUFVersionV1 { ti.Dimensions[i], err = rd.ReadUint64FromUint32() } else { ti.Dimensions[i], err = rd.ReadUint64() } if err != nil { return ti, fmt.Errorf("read dimension %d: %w", i, err) } } { v, err := rd.ReadUint32() if err != nil { return ti, fmt.Errorf("read type: %w", err) } ti.Type = GGMLType(v) if ti.Type >= _GGMLTypeCount { return ti, fmt.Errorf("%v: This quantized type is currently unsupported", ti.Type) } } ti.Offset, err = rd.ReadUint64() if err != nil { return ti, fmt.Errorf("read offset: %w", err) } return ti, nil 
} ================================================ FILE: file_architecture.go ================================================ package gguf_parser import ( "regexp" "slices" "strings" ) // Types for the architecture metadata of a GGUF file. type ( // GGUFArchitecture represents the architecture metadata of a GGUF file. GGUFArchitecture struct { /* Basic */ // Type describes the type of the file, // default is "model". Type string `json:"type"` // Architecture describes what architecture this model implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // MaximumContextLength(n_ctx_train) is the maximum context length of the model. // // For most architectures, this is the hard limit on the length of the input. // Architectures, like RWKV, // that are not reliant on transformer-style attention may be able to handle larger inputs, // but this is not guaranteed. MaximumContextLength uint64 `json:"maximumContextLength,omitempty"` // EmbeddingLength(n_embd) is the length of the embedding layer. EmbeddingLength uint64 `json:"embeddingLength,omitempty"` // BlockCount(n_layer) is the number of blocks of attention and feed-forward layers, // i.e. the bulk of the LLM. // This does not include the input or embedding layers. BlockCount uint64 `json:"blockCount,omitempty"` // FeedForwardLength(n_ff) stores the length of each feed-forward layer. FeedForwardLength []uint64 `json:"feedForwardLength,omitempty"` // ExpertFeedForwardLength(expert_feed_forward_length) is the length of the feed-forward layer in the expert model. ExpertFeedForwardLength uint64 `json:"expertFeedForwardLength,omitempty"` // ExpertSharedFeedForwardLength(expert_shared_feed_forward_length) is the length of the shared feed-forward layer in the expert model. ExpertSharedFeedForwardLength uint64 `json:"expertSharedFeedForwardLength,omitempty"` // ExpertCount(n_expert) is the number of experts in MoE models. ExpertCount uint32 `json:"expertCount,omitempty"` // ExpertUsedCount(n_expert_used) is the number of experts used during each token evaluation in MoE models. ExpertUsedCount uint32 `json:"expertUsedCount,omitempty"` // ExpertSharedCount(n_expert_shared) is the number of shared experts in MoE models. ExpertSharedCount uint32 `json:"expertSharedCount,omitempty"` // AttentionHeadCount(n_head) is the number of attention heads. AttentionHeadCount uint64 `json:"attentionHeadCount,omitempty"` // AttentionHeadCountKV(n_head_kv) is the number of attention heads per group used in Grouped-Query-Attention. // // If not provided or equal to AttentionHeadCount, // the model does not use Grouped-Query-Attention. AttentionHeadCountKV uint64 `json:"attentionHeadCountKV,omitempty"` // AttentionSlidingWindowPattern is the pattern used in the sliding window attention. // // 0 means all layers are Sliding Window Attention. // 1 means all layers are none Sliding Window Attention. // N means every Nth layer is none Sliding Window Attention. AttentionSlidingWindowPattern uint32 `json:"attentionSlidingWindowPattern,omitempty"` // AttentionSlidingWindow is the size of the sliding window used in the attention layer. AttentionSlidingWindow uint64 `json:"attentionSlidingWindow,omitempty"` // AttentionMaxALiBIBias is the maximum bias to use for ALiBI. AttentionMaxALiBIBias float32 `json:"attentionMaxALiBIBias,omitempty"` // AttentionClampKQV describes a value `C`, // which is used to clamp the values of the `Q`, `K` and `V` tensors between `[-C, C]`. 
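// NOTE (editor): a sketch of how Grouped-Query-Attention surfaces in the two head-count fields
// above; a model with 32 query heads sharing 8 KV heads has a group size of 4, while an
// AttentionHeadCountKV of zero or equal to AttentionHeadCount means no GQA:
//
//	if a.AttentionHeadCountKV > 0 && a.AttentionHeadCountKV < a.AttentionHeadCount {
//		groupSize := a.AttentionHeadCount / a.AttentionHeadCountKV // e.g. 32/8 = 4
//		_ = groupSize
//	}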
		AttentionClampKQV float32 `json:"attentionClampKQV,omitempty"`
		// AttentionLayerNormEpsilon is the epsilon value used in the LayerNorm(Layer Normalization).
		AttentionLayerNormEpsilon float32 `json:"attentionLayerNormEpsilon,omitempty"`
		// AttentionLayerNormRMSEpsilon is the epsilon value used in the RMSNorm(Root Mean Square Layer Normalization),
		// which is a simplification of the original LayerNorm.
		AttentionLayerNormRMSEpsilon float32 `json:"attentionLayerNormRMSEpsilon,omitempty"`
		// AttentionQueryLORARank is the LORA rank of the query matrix.
		//
		// Zero means no LORA.
		AttentionQueryLORARank uint32 `json:"attentionQueryLORARank,omitempty"`
		// AttentionKeyValueLORARank is the LORA rank of the key/value matrix.
		//
		// Zero means no LORA.
		AttentionKeyValueLORARank uint32 `json:"attentionKeyValueLORARank,omitempty"`
		// AttentionKeyLength(n_embd_head_k) is the size of a key head.
		//
		// Defaults to `EmbeddingLength / AttentionHeadCount`.
		AttentionKeyLength uint32 `json:"attentionKeyLength,omitempty"`
		// AttentionKeyLengthMLA(n_embd_head_k_mla) is the size of a key head in MLA(Multi-head Latent Attention).
		//
		// Zero means no MLA.
		AttentionKeyLengthMLA uint32 `json:"attentionKeyLengthMLA,omitempty"`
		// AttentionValueLength(n_embd_head_v) is the size of a value head.
		//
		// Defaults to `EmbeddingLength / AttentionHeadCount`.
		AttentionValueLength uint32 `json:"attentionValueLength,omitempty"`
		// AttentionValueLengthMLA(n_embd_head_v_mla) is the size of a value head in MLA(Multi-head Latent Attention).
		//
		// Zero means no MLA.
		AttentionValueLengthMLA uint32 `json:"attentionValueLengthMLA,omitempty"`
		// AttentionCausal is true if the attention is causal.
		AttentionCausal bool `json:"attentionCausal,omitempty"`
		// AttentionRecurrent is true if the attention is recurrent.
		//
		// Used in Mamba, RWKV, and similar architectures.
		AttentionRecurrent bool `json:"attentionRecurrent,omitempty"`
		// AttentionHybrid is true if the attention is hybrid (causal (self-attention) + recurrent).
		//
		// Used in Jamba, Falcon-H1, and similar architectures.
		AttentionHybrid bool `json:"attentionHybrid,omitempty"`
		// RoPEDimensionCount is the number of dimensions in the RoPE(Rotary Positional Encoding).
		RoPEDimensionCount uint64 `json:"ropeDimensionCount,omitempty"`
		// RoPEFrequencyBase is the base frequency of the RoPE.
		RoPEFrequencyBase float32 `json:"ropeFrequencyBase,omitempty"`
		// RoPEFrequencyScale is the frequency scale of the RoPE.
		RoPEFrequencyScale float32 `json:"ropeFrequencyScale,omitempty"`
		// RoPEScalingType is the type of the RoPE scaling.
		RoPEScalingType string `json:"ropeScalingType,omitempty"`
		// RoPEScalingFactor is the scaling factor of the RoPE.
		RoPEScalingFactor float32 `json:"ropeScalingFactor,omitempty"`
		// RoPEScalingOriginalContextLength is the original context length of the RoPE scaling.
		RoPEScalingOriginalContextLength uint64 `json:"ropeScalingOriginalContextLength,omitempty"`
		// RoPEScalingFinetuned is true if the RoPE scaling is fine-tuned.
		RoPEScalingFinetuned bool `json:"ropeScalingFinetuned,omitempty"`
		// PoolingType is the type of pooling used in the model.
		PoolingType uint32 `json:"poolingType,omitempty"`
		// SSMConvolutionKernel is the size of the convolution kernel used in the Selective State Space Model (SSM) and similar architectures.
		SSMConvolutionKernel uint32 `json:"ssmConvolutionKernel,omitempty"`
		// SSMInnerSize is the embedding size of the state in SSM and similar architectures.
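		// For Mamba-style models this is typically 2 * EmbeddingLength
		// (i.e. an expand factor of 2).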
		SSMInnerSize uint32 `json:"ssmInnerSize,omitempty"`
		// SSMStateSize is the size of the recurrent state in SSM and similar architectures.
		SSMStateSize uint32 `json:"ssmStateSize,omitempty"`
		// SSMTimeStepRank is the rank of the time steps in SSM and similar architectures.
		SSMTimeStepRank uint32 `json:"ssmTimeStepRank,omitempty"`
		// SSMGroupCount is the number of groups in the SSM and similar architectures.
		SSMGroupCount uint32 `json:"ssmGroupCount,omitempty"`
		// RWKVHeadSize is the size of the head in RWKV and similar architectures.
		RWKVHeadSize uint32 `json:"rwkvHeadSize,omitempty"`
		// RWKVRescaleEveryNLayers is the number of layers after which the rescaling is applied in RWKV and similar architectures.
		RWKVRescaleEveryNLayers uint32 `json:"rwkvRescaleEveryNLayers,omitempty"`
		// RWKVTimeMixExtraDimension is the size of the extra dimension used for time mixing in RWKV and similar architectures.
		RWKVTimeMixExtraDimension uint32 `json:"rwkvTimeMixExtraDimension,omitempty"`
		// RWKVTimeDecayExtraDimension is the size of the extra dimension used for time decay in RWKV and similar architectures.
		RWKVTimeDecayExtraDimension uint32 `json:"rwkvTimeDecayExtraDimension,omitempty"`
		// RWKVTokenShiftCount is the number of token shifts used in RWKV and similar architectures.
		RWKVTokenShiftCount uint32 `json:"rwkvTokenShiftCount,omitempty"`
		// VocabularyLength is the size of the vocabulary.
		//
		// VocabularyLength is the same as the tokenizer's token size.
		VocabularyLength uint64 `json:"vocabularyLength,omitempty"`

		/* Appendix */

		// ClipProjectorType is the type of the projector used in the clip model.
		//
		// Only used when Architecture is "clip".
		ClipProjectorType string `json:"clipProjectorType,omitempty"`
		// ClipHasLLaVAProjector indicates whether the clip model has LLaVA projector or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasLLaVAProjector bool `json:"clipHasLLaVAProjector,omitempty"`
		// ClipHasMiniCPMVProjector indicates whether the clip model has MiniCPMV projector or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasMiniCPMVProjector bool `json:"clipHasMiniCPMVProject,omitempty"`
		// ClipMiniCPMVVersion is the version of the MiniCPMV projector.
		//
		// Only used when Architecture is "clip".
		ClipMiniCPMVVersion int32 `json:"clipMiniCPMVVersion,omitempty"`
		// ClipMiniCPMVQueryNum is the number of queries used in the MiniCPMV projector.
		//
		// Only used when Architecture is "clip".
		ClipMiniCPMVQueryNum int32 `json:"clipMiniCPMVQueryNum,omitempty"`
		// ClipHasGLMProjector indicates whether the clip model has GLM projector or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasGLMProjector bool `json:"clipHasGLMProjector,omitempty"`
		// ClipHasQwen2VLMerger indicates whether the clip model has Qwen2VL merger or not.
		//
		// Only used when Architecture is "clip".
		//
		// Deprecated: use ClipProjectorType instead.
		ClipHasQwen2VLMerger bool `json:"clipHasQwen2VLMerger,omitempty"`
		// ClipHasVisionEncoder indicates whether the clip model has vision encoder or not.
		//
		// Only used when Architecture is "clip".
		ClipHasVisionEncoder bool `json:"clipHasVisionEncoder,omitempty"`
		// ClipVisionEmbeddingLength indicates the embedding length of vision encoder.
		//
		// Only used when Architecture is "clip" and ClipHasVisionEncoder is true.
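		// For example, ViT-L/14-based CLIP encoders commonly report 1024 here.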
ClipVisionEmbeddingLength uint64 `json:"clipVisionEmbeddingLength,omitempty"` // ClipVisionBlockCount indicates the number of blocks in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionBlockCount uint64 `json:"clipVisionBlockCount,omitempty"` // ClipVisionFeedForwardLength indicates the feed-forward length of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionFeedForwardLength []uint64 `json:"clipVisionFeedForwardLength,omitempty"` // ClipVisionAttentionHeadCount indicates the number of attention heads in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionAttentionHeadCount uint64 `json:"clipVisionAttentionHeadCount,omitempty"` // ClipVisionAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionAttentionLayerNormRMSEpsilon float32 `json:"clipVisionAttentionLayerNormRMSEpsilon,omitempty"` // ClipVisionProjectionDim indicates the projection dimension of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectionDim uint32 `json:"clipVisionProjectionDim,omitempty"` // ClipVisionProjectorScaleFactor is the scale factor of the projector. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionProjectorScaleFactor uint32 `json:"clipVisionProjectorScaleFactor,omitempty"` // ClipVisionImageSize indicates the image size of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionImageSize uint32 `json:"clipVisionImageSize,omitempty"` // ClipVisionPatchSize indicates the patch size of vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionPatchSize uint32 `json:"clipVisionPatchSize,omitempty"` // ClipVisionMMPatchMergeType indicates the merge type of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionMMPatchMergeType string `json:"clipVisionMMPatchMergeType,omitempty"` // ClipVisionSpatialMergeSize is the spatial merge size of the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionSpatialMergeSize uint32 `json:"clipVisionSpatialMergeSize,omitempty"` // ClipVisionWindowAttentionPattern is the Window Attention pattern used in the vision encoder. // // Only used when Architecture is "clip" and ClipHasVisionEncoder is true. ClipVisionWindowAttentionPattern uint32 `json:"clipVisionWindowAttentionPattern,omitempty"` // ClipHasAudioEncoder indicates whether the clip model has audio encoder or not. // // Only used when Architecture is "clip". ClipHasAudioEncoder bool `json:"clipHasAudioEncoder,omitempty"` // ClipAudioEmbeddingLength indicates the embedding length of audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioEmbeddingLength uint64 `json:"clipAudioEmbeddingLength,omitempty"` // ClipAudioBlockCount indicates the number of blocks in the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioBlockCount uint64 `json:"clipAudioBlockCount,omitempty"` // ClipAudioFeedForwardLength indicates the feed-forward length of the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. 
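		// Like ClipVisionFeedForwardLength, this is stored per block;
		// a scalar metadata value is broadcast to every block
		// (see clipArchitecture below).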
ClipAudioFeedForwardLength []uint64 `json:"clipAudioFeedForwardLength,omitempty"` // ClipAudioAttentionHeadCount indicates the number of attention heads in the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioAttentionHeadCount uint64 `json:"clipAudioAttentionHeadCount,omitempty"` // ClipAudioAttentionLayerNormRMSEpsilon indicates the epsilon value used in the RMSNorm of the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioAttentionLayerNormRMSEpsilon float32 `json:"clipAudioAttentionLayerNormRMSEpsilon,omitempty"` // ClipAudioProjectionDim indicates the projection dimension of audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioProjectionDim uint32 `json:"clipAudioProjectionDim,omitempty"` // ClipAudioProjectorStackFactor is the scale factor of the projector. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioProjectorStackFactor uint32 `json:"clipAudioProjectorStackFactor,omitempty"` // ClipAudioNumMelBins is the number of mel bins used in the audio encoder. // // Only used when Architecture is "clip" and ClipHasAudioEncoder is true. ClipAudioNumMelBins uint32 `json:"clipAudioNumMelBins,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // AdapterLoRAAlpha is the alpha value of the LoRA adapter. // // Only used when AdapterType is "lora". AdapterLoRAAlpha float32 `json:"adapterLoRAAlpha,omitempty"` // AdapterControlVectorLayerCount is the number of layers in the control vector. // // Only used when Architecture is "control_vector". AdapterControlVectorLayerCount uint32 `json:"adapterControlVectorLayerCount,omitempty"` // DiffusionArchitecture is the actual architecture of the diffusion model. // // Only used when Architecture is "diffusion". DiffusionArchitecture string `json:"diffusionArchitecture,omitempty"` // DiffusionTransformer indicates whether the diffusion model is a diffusion transformer or not. // DiffusionTransformer bool `json:"diffusionTransformer,omitempty"` // DiffusionConditioners is the list of diffusion conditioners. // // Only used when Architecture is "diffusion". DiffusionConditioners GGUFArchitectureDiffusionConditioners `json:"diffusionConditioners,omitempty"` // DiffusionAutoencoder represents the autoencoder of the diffusion model. // // Only used when Architecture is "diffusion". DiffusionAutoencoder *GGUFArchitectureDiffusionAutoencoder `json:"diffusionAutoencoder,omitempty"` } // GGUFArchitectureDiffusionConditioners is the list of GGUFArchitectureDiffusionConditioner. GGUFArchitectureDiffusionConditioners []GGUFArchitectureDiffusionConditioner // GGUFArchitectureDiffusionConditioner represents the conditioner metadata of the diffusion architecture. GGUFArchitectureDiffusionConditioner struct { // Architecture is the architecture of the diffusion conditioner. Architecture string `json:"architecture"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` } // GGUFArchitectureDiffusionAutoencoder represents the autoencoder metadata of the diffusion architecture. GGUFArchitectureDiffusionAutoencoder struct { // Architecture is the architecture of the diffusion autoencoder. // // Currently, only "VAE" is supported. 
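		// For example, "Stable Diffusion XL VAE",
		// as composed by diffuserArchitecture in this file.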
Architecture string `json:"architecture"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` } ) // DiffusionHasConditioners returns true if the diffusion model has conditioners. func (ga GGUFArchitecture) DiffusionHasConditioners() bool { return len(ga.DiffusionConditioners) > 0 } // DiffusionHasAutoencoder returns true if the diffusion model has an autoencoder. func (ga GGUFArchitecture) DiffusionHasAutoencoder() bool { return ga.DiffusionAutoencoder != nil && ga.DiffusionAutoencoder.Architecture != "" } func (gacs GGUFArchitectureDiffusionConditioners) String() string { var sb strings.Builder for i, gac := range gacs { if i > 0 { sb.WriteString(", ") } sb.WriteString(gac.String()) } return sb.String() } func (gac GGUFArchitectureDiffusionConditioner) String() string { return gac.Architecture + " (" + gac.FileType.String() + ")" } func (gaa GGUFArchitectureDiffusionAutoencoder) String() string { return gaa.Architecture + " (" + gaa.FileType.String() + ")" } // Architecture returns the architecture metadata of the GGUF file. func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) { for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { if gf.TensorInfos.Match(re) { return gf.diffuserArchitecture() } } var ( generalTypeKey = "general.type" generalArchitectureKey = "general.architecture" controlVectorModelHintKey = "controlvector.model_hint" ) m, _ := gf.Header.MetadataKV.Index([]string{ generalTypeKey, generalArchitectureKey, controlVectorModelHintKey, }) typ, arch := "model", "llama" // nolint: goconst { if v, ok := m[generalTypeKey]; ok { typ = v.ValueString() } if v, ok := m[generalArchitectureKey]; ok { arch = v.ValueString() } } switch { case arch == "clip": return gf.clipArchitecture() case arch == "controlvector": arch = "llama" if v, ok := m[controlVectorModelHintKey]; ok { arch = v.ValueString() } return gf.adapterArchitecture(arch) case typ == "adapter": return gf.adapterArchitecture(arch) case typ == "imatrix": return gf.imatrixArchitecture(arch) } return gf.transformerArchitecture(arch) } func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { const ( // Diffusion sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x sdKey2 = "output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL sdXlKey2 = "output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner sdXlRefinerKey2 = "output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" sd3Key = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x sd3Key2 = "joint_blocks.23.x_block.attn.proj.weight" sdInPaintFeatureKey = "model.diffusion_model.input_blocks.0.0.weight" // SD in-paint feature sdInPaintFeatureKey2 = "input_blocks.0.0.weight" fluxKey = "model.diffusion_model.double_blocks.0.txt_attn.proj.weight" // FLUX.1 fluxKey2 = "double_blocks.0.txt_attn.proj.weight" fluxFillFeatureKey = "model.diffusion_model.img_in.weight" // FLUX.1 Fill feature fluxFillFeatureKey2 = "img_in.weight" // Conditioner openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14 openAiClipVitL14Key2 = "text_model.encoder.layers.11.self_attn.k_proj.weight" openClipVitH14Key = 
"cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14 openClipVitH14Key2 = "text_model.encoder.layers.22.self_attn.k_proj.weight" openClipVitG14Key = "cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14 openClipVitG14Key2 = "text_model.encoder.layers.31.self_attn.k_proj.weight" t5xxlKey = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl t5xxlKey2 = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" t5xxlKey3 = "encoder.block.23.layer.0.SelfAttention.k.weight" ) tis, _ := gf.TensorInfos.Index([]string{ sdKey, sdKey2, sdXlKey, sdXlKey2, sdXlRefinerKey, sdXlRefinerKey2, sd3Key, sd3Key2, sdInPaintFeatureKey, sdInPaintFeatureKey2, fluxKey, fluxKey2, fluxFillFeatureKey, fluxFillFeatureKey2, openAiClipVitL14Key, openAiClipVitL14Key2, openClipVitH14Key, openClipVitH14Key2, openClipVitG14Key, openClipVitG14Key2, t5xxlKey, t5xxlKey2, t5xxlKey3, }) ga.Type = "model" ga.Architecture = "diffusion" if ti, ok := tis[sdKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion 1.x" if ti.Dimensions[0] == 1024 { ga.DiffusionArchitecture = "Stable Diffusion 2.x" } if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sdKey2]; ok { ga.DiffusionArchitecture = "Stable Diffusion 1.x" if ti.Dimensions[0] == 1024 { ga.DiffusionArchitecture = "Stable Diffusion 2.x" } if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sdXlKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL" if _, ok = tis[sdXlRefinerKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" } if ti, ok := tis[sdInPaintFeatureKey]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sdXlKey2]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL" if _, ok = tis[sdXlRefinerKey2]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" } if ti, ok := tis[sdInPaintFeatureKey2]; ok && ti.Dimensions[2] == 9 { ga.DiffusionArchitecture += " InPaint" } } else if _, ok := tis[sd3Key]; ok { ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true } else if _, ok := tis[sd3Key2]; ok { ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true } if _, ok := tis[fluxKey]; ok { ga.DiffusionArchitecture = "FLUX.1" ga.DiffusionTransformer = true if ti, ok := tis[fluxFillFeatureKey]; ok && ti.Dimensions[0] == 384 { ga.DiffusionArchitecture += " Fill" } } else if _, ok := tis[fluxKey2]; ok { ga.DiffusionArchitecture = "FLUX.1" ga.DiffusionTransformer = true if ti, ok := tis[fluxFillFeatureKey2]; ok && ti.Dimensions[0] == 384 { ga.DiffusionArchitecture += " Fill" } } if ti, ok := tis[openAiClipVitL14Key]; ok { cond := GGUFArchitectureDiffusionConditioner{ Architecture: "OpenAI CLIP ViT-L/14", FileType: ti.GetFileType(), } if ti, ok = tis[openClipVitH14Key]; ok { cond = GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-H/14", FileType: ti.GetFileType(), } } ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) } else if ti, ok := tis[openAiClipVitL14Key2]; ok { cond := GGUFArchitectureDiffusionConditioner{ Architecture: "OpenAI CLIP ViT-L/14", FileType: ti.GetFileType(), } if ti, ok = tis[openClipVitH14Key2]; ok { cond = GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-H/14", 
FileType: ti.GetFileType(), } } ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond) } if ti, ok := tis[openClipVitG14Key]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-G/14", FileType: ti.GetFileType(), }) } else if ti, ok = tis[openClipVitG14Key2]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "OpenCLIP ViT-G/14", FileType: ti.GetFileType(), }) } if ti, ok := tis[t5xxlKey]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) } else if ti, ok = tis[t5xxlKey2]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) } else if ti, ok = tis[t5xxlKey3]; ok { ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{ Architecture: "Google T5-xxl", FileType: ti.GetFileType(), }) } for _, re := range []*regexp.Regexp{ regexp.MustCompile(`^first_stage_model\..*`), regexp.MustCompile(`^decoder\.conv_in\..*`), } { if tis := gf.TensorInfos.Search(re); len(tis) != 0 { ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{ Architecture: ga.DiffusionArchitecture + " VAE", FileType: GGUFTensorInfos(tis).GetFileType(), } break } } return ga } func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) { const ( projectorTypeKey = "clip.projector_type" hasLLaVAProjectorKey = "clip.has_llava_projector" hasMiniCPMVProjector = "clip.has_minicpmv_projector" miniCPMVVersionKey = "clip.minicpmv_version" miniCPMVQueryNumKey = "clip.minicpmv_query_num" hasGLMProjectorKey = "clip.has_glm_projector" hasQwen2VLMergerKey = "clip.has_qwen2vl_merger" hasVisionEncoderKey = "clip.has_vision_encoder" visionEmbeddingLengthKey = "clip.vision.embedding_length" visionBlockCountKey = "clip.vision.block_count" visionFeedForwardLengthKey = "clip.vision.feed_forward_length" visionAttentionHeadCountKey = "clip.vision.attention.head_count" visionAttentionLayerNormRMSEpsilonKey = "clip.vision.attention.layer_norm_epsilon" visionProjectionDimKey = "clip.vision.projection_dim" visionProjectorScaleFactorKey = "clip.vision.projector.scale_factor" visionImageSizeKey = "clip.vision.image_size" visionPatchSizeKey = "clip.vision.patch_size" visionMMPatchMergeTypeKey = "clip.vision.mm_patch_merge_type" visioSpatialMergeSizeKey = "clip.vision.spatial_merge_size" visionWindowAttentionPatternKey = "clip.vision.n_wa_pattern" hasAudioEncoderKey = "clip.has_audio_encoder" audioEmbeddingLengthKey = "clip.audio.embedding_length" audioBlockCountKey = "clip.audio.block_count" audioFeedForwardLengthKey = "clip.audio.feed_forward_length" audioAttentionHeadCountKey = "clip.audio.attention.head_count" audioAttentionLayerNormRMSEpsilonKey = "clip.audio.attention.layer_norm_epsilon" audioProjectionDimKey = "clip.audio.projection_dim" audioProjectorStackFactorKey = "clip.audio.projector.stack_factor" audioNumMelBinsKey = "clip.audio.num_mel_bins" ) ga.Type = "projector" ga.Architecture = "clip" m, _ := gf.Header.MetadataKV.Index([]string{ projectorTypeKey, hasLLaVAProjectorKey, hasMiniCPMVProjector, miniCPMVVersionKey, miniCPMVQueryNumKey, hasGLMProjectorKey, hasQwen2VLMergerKey, // Vision hasVisionEncoderKey, visionEmbeddingLengthKey, visionBlockCountKey, visionFeedForwardLengthKey, visionAttentionHeadCountKey, 
visionAttentionLayerNormRMSEpsilonKey, visionProjectionDimKey, visionProjectorScaleFactorKey, visionImageSizeKey, visionPatchSizeKey, visionMMPatchMergeTypeKey, visioSpatialMergeSizeKey, visionWindowAttentionPatternKey, // Audio hasAudioEncoderKey, audioEmbeddingLengthKey, audioBlockCountKey, audioFeedForwardLengthKey, audioAttentionHeadCountKey, audioAttentionLayerNormRMSEpsilonKey, audioProjectionDimKey, audioProjectorStackFactorKey, audioNumMelBinsKey, }) if v, ok := m[projectorTypeKey]; ok { ga.ClipProjectorType = v.ValueString() } else { ga.ClipProjectorType = "mlp" } if v, ok := m[hasLLaVAProjectorKey]; ok { ga.ClipHasLLaVAProjector = v.ValueBool() } if v, ok := m[hasMiniCPMVProjector]; ok { ga.ClipHasMiniCPMVProjector = v.ValueBool() } if v, ok := m[miniCPMVVersionKey]; ok { ga.ClipMiniCPMVVersion = ValueNumeric[int32](v) } if v, ok := m[miniCPMVQueryNumKey]; ok { ga.ClipMiniCPMVQueryNum = ValueNumeric[int32](v) } if v, ok := m[hasGLMProjectorKey]; ok { ga.ClipHasGLMProjector = v.ValueBool() } if v, ok := m[hasQwen2VLMergerKey]; ok { ga.ClipHasQwen2VLMerger = v.ValueBool() } // Vision if v, ok := m[hasVisionEncoderKey]; ok { ga.ClipHasVisionEncoder = v.ValueBool() } if v, ok := m[visionEmbeddingLengthKey]; ok { ga.ClipVisionEmbeddingLength = ValueNumeric[uint64](v) } if v, ok := m[visionBlockCountKey]; ok { ga.ClipVisionBlockCount = ValueNumeric[uint64](v) } if v, ok := m[visionFeedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.ClipVisionFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) ga.ClipVisionFeedForwardLength = make([]uint64, ga.ClipVisionBlockCount) for i := range ga.ClipVisionFeedForwardLength { ga.ClipVisionFeedForwardLength[i] = vx } } } if v, ok := m[visionAttentionHeadCountKey]; ok { ga.ClipVisionAttentionHeadCount = ValueNumeric[uint64](v) } if v, ok := m[visionAttentionLayerNormRMSEpsilonKey]; ok { ga.ClipVisionAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } if v, ok := m[visionImageSizeKey]; ok { ga.ClipVisionImageSize = ValueNumeric[uint32](v) } if v, ok := m[visionProjectionDimKey]; ok { ga.ClipVisionProjectionDim = ValueNumeric[uint32](v) } ga.ClipVisionProjectorScaleFactor = 1 if ga.ClipProjectorType == "gemma3" { ga.ClipVisionProjectorScaleFactor = 4 } if v, ok := m[visionProjectorScaleFactorKey]; ok { ga.ClipVisionProjectorScaleFactor = ValueNumeric[uint32](v) } ga.ClipVisionPatchSize = 1 if v, ok := m[visionPatchSizeKey]; ok { ga.ClipVisionPatchSize = ValueNumeric[uint32](v) } ga.ClipVisionMMPatchMergeType = "flat" if v, ok := m[visionMMPatchMergeTypeKey]; ok { ga.ClipVisionMMPatchMergeType = v.ValueString() } if v, ok := m[visioSpatialMergeSizeKey]; ok { ga.ClipVisionSpatialMergeSize = ValueNumeric[uint32](v) } if v, ok := m[visionWindowAttentionPatternKey]; ok { ga.ClipVisionWindowAttentionPattern = ValueNumeric[uint32](v) } // Audio if v, ok := m[hasAudioEncoderKey]; ok { ga.ClipHasAudioEncoder = v.ValueBool() } if v, ok := m[audioEmbeddingLengthKey]; ok { ga.ClipAudioEmbeddingLength = ValueNumeric[uint64](v) } if v, ok := m[audioBlockCountKey]; ok { ga.ClipAudioBlockCount = ValueNumeric[uint64](v) } if v, ok := m[audioFeedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.ClipAudioFeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) ga.ClipAudioFeedForwardLength = make([]uint64, ga.ClipAudioBlockCount) for i := range ga.ClipAudioFeedForwardLength { ga.ClipAudioFeedForwardLength[i] = vx } } } if v, ok := 
m[audioAttentionHeadCountKey]; ok { ga.ClipAudioAttentionHeadCount = ValueNumeric[uint64](v) } if v, ok := m[audioAttentionLayerNormRMSEpsilonKey]; ok { ga.ClipAudioAttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } if v, ok := m[audioProjectionDimKey]; ok { ga.ClipAudioProjectionDim = ValueNumeric[uint32](v) } ga.ClipAudioProjectorStackFactor = 1 if v, ok := m[audioProjectorStackFactorKey]; ok { ga.ClipAudioProjectorStackFactor = ValueNumeric[uint32](v) } if v, ok := m[audioNumMelBinsKey]; ok { ga.ClipAudioNumMelBins = ValueNumeric[uint32](v) } ga.AttentionHeadCountKV = ga.AttentionHeadCount return ga } func (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitecture) { var ( typeKey = "adapter.type" loraAlphaKey = "adapter.lora.alpha" controlVectorLayerCountKey = "adapter.control_vector.layer_count" controlVectorLayerCountKey2 = "control_vector.layer_count" ) ga.Type = "adapter" ga.Architecture = arch m, _ := gf.Header.MetadataKV.Index([]string{ typeKey, loraAlphaKey, controlVectorLayerCountKey, controlVectorLayerCountKey2, }) if v, ok := m[typeKey]; ok { ga.AdapterType = v.ValueString() } if v, ok := m[loraAlphaKey]; ok { ga.AdapterLoRAAlpha = ValueNumeric[float32](v) } if v, ok := m[controlVectorLayerCountKey]; ok { ga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v) } else if v, ok := m[controlVectorLayerCountKey2]; ok { ga.AdapterControlVectorLayerCount = ValueNumeric[uint32](v) } return ga } func (gf *GGUFFile) imatrixArchitecture(_ string) (ga GGUFArchitecture) { ga.Type = "imatrix" ga.Architecture = "imatrix" return ga } func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchitecture) { var ( contextLengthKey = arch + ".context_length" embeddingLengthKey = arch + ".embedding_length" blockCountKey = arch + ".block_count" feedForwardLengthKey = arch + ".feed_forward_length" expertFeedForwardLengthKey = arch + ".expert_feed_forward_length" expertSharedFeedForwardLengthKey = arch + ".expert_shared_feed_forward_length" expertCountKey = arch + ".expert_count" expertUsedCountKey = arch + ".expert_used_count" expertSharedCountKey = arch + ".expert_shared_count" attentionHeadCountKey = arch + ".attention.head_count" attentionHeadCountKVKey = arch + ".attention.head_count_kv" attentionSlidingWindowKey = arch + ".attention.sliding_window" attentionMaxALiBIBiasKey = arch + ".attention.max_alibi_bias" attentionMaxALiBIBiasKey2 = arch + ".attention.alibi_bias_max" attentionClampKQVKey = arch + ".attention.clamp_kqv" attentionClampKQVKey2 = arch + ".attention.clip_kqv" attentionLayerNormEpsilonKey = arch + ".attention.layer_norm_epsilon" attentionLayerNormRMSEpsilonKey = arch + ".attention.layer_norm_rms_epsilon" attentionQueryLORARankKey = arch + ".attention.q_lora_rank" attentionKeyValueLORARankKey = arch + ".attention.kv_lora_rank" attentionKeyLengthKey = arch + ".attention.key_length" attentionKeyLengthMLAKey = arch + ".attention.key_length_mla" attentionValueLengthKey = arch + ".attention.value_length" attentionValueLengthMLAKey = arch + ".attention.value_length_mla" attentionCausalKey = arch + ".attention.causal" ropeDimensionCountKey = arch + ".rope.dimension_count" ropeFrequencyBaseKey = arch + ".rope.freq_base" ropeFrequencyScaleKey = arch + ".rope.freq_scale" ropeScaleLinearKey = arch + ".rope.scale_linear" ropeScalingTypeKey = arch + ".rope.scaling.type" ropeScalingFactorKey = arch + ".rope.scaling.factor" ropeScalingOriginalContextKey = arch + ".rope.scaling.original_context_length" // uint32 maybe ropeScalingFinetunedKey = arch + 
".rope.scaling.finetuned" poolingTypeKey = arch + ".pooling_type" ssmConvolutionKernelKey = arch + ".ssm.conv_kernel" ssmInnerSizeKey = arch + ".ssm.inner_size" ssmStateSizeKey = arch + ".ssm.state_size" ssmTimeStepRankKey = arch + ".ssm.time_step_rank" ssmGroupCountKey = arch + ".ssm.group_count" rwkvHeadSizeKey = arch + ".wkv.head_size" rwkvRescaleEveryNLayersKey = arch + ".rescale_every_n_layers" rwkvTimeMixExtraDimensionKey = arch + ".time_mix_extra_dim" rwkvTimeDecayExtraDimensionKey = arch + ".time_decay_extra_dim" rwkvTokenShiftCountKey = arch + ".token_shift_count" vocabularyLengthKey = arch + ".vocab_size" tokenizerGGMLTokensKey = "tokenizer.ggml.tokens" ) ga.Type = "model" ga.Architecture = arch m, _ := gf.Header.MetadataKV.Index([]string{ contextLengthKey, embeddingLengthKey, blockCountKey, feedForwardLengthKey, expertFeedForwardLengthKey, expertSharedFeedForwardLengthKey, expertCountKey, expertUsedCountKey, expertSharedCountKey, attentionHeadCountKey, attentionHeadCountKVKey, attentionSlidingWindowKey, attentionMaxALiBIBiasKey, attentionMaxALiBIBiasKey2, attentionClampKQVKey, attentionClampKQVKey2, attentionLayerNormEpsilonKey, attentionLayerNormRMSEpsilonKey, attentionQueryLORARankKey, attentionKeyValueLORARankKey, attentionKeyLengthKey, attentionKeyLengthMLAKey, attentionValueLengthKey, attentionValueLengthMLAKey, attentionCausalKey, ropeDimensionCountKey, ropeFrequencyBaseKey, ropeFrequencyScaleKey, ropeScaleLinearKey, ropeScalingTypeKey, ropeScalingFactorKey, ropeScalingOriginalContextKey, ropeScalingFinetunedKey, poolingTypeKey, ssmConvolutionKernelKey, ssmInnerSizeKey, ssmStateSizeKey, ssmTimeStepRankKey, ssmGroupCountKey, rwkvHeadSizeKey, rwkvRescaleEveryNLayersKey, rwkvTimeMixExtraDimensionKey, rwkvTimeDecayExtraDimensionKey, rwkvTokenShiftCountKey, vocabularyLengthKey, tokenizerGGMLTokensKey, }) if v, ok := m[contextLengthKey]; ok { ga.MaximumContextLength = ValueNumeric[uint64](v) } if v, ok := m[embeddingLengthKey]; ok { ga.EmbeddingLength = ValueNumeric[uint64](v) } if v, ok := m[blockCountKey]; ok { ga.BlockCount = ValueNumeric[uint64](v) } if v, ok := m[feedForwardLengthKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.FeedForwardLength = ValuesNumeric[uint64](v.ValueArray()) } else { vx := ValueNumeric[uint64](v) ga.FeedForwardLength = make([]uint64, ga.BlockCount) for i := range ga.FeedForwardLength { ga.FeedForwardLength[i] = vx } } } if v, ok := m[expertCountKey]; ok { ga.ExpertCount = ValueNumeric[uint32](v) } if v, ok := m[expertUsedCountKey]; ok { ga.ExpertUsedCount = ValueNumeric[uint32](v) } if v, ok := m[expertSharedCountKey]; ok { ga.ExpertSharedCount = ValueNumeric[uint32](v) } if v, ok := m[expertFeedForwardLengthKey]; ok { ga.ExpertFeedForwardLength = ValueNumeric[uint64](v) } if v, ok := m[expertSharedFeedForwardLengthKey]; ok { ga.ExpertSharedFeedForwardLength = ValueNumeric[uint64](v) } if v, ok := m[attentionHeadCountKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.AttentionHeadCount = ValuesNumeric[uint64](v.ValueArray())[0] } else { ga.AttentionHeadCount = ValueNumeric[uint64](v) } } if v, ok := m[attentionHeadCountKVKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.AttentionHeadCountKV = ValuesNumeric[uint64](v.ValueArray())[0] } else { ga.AttentionHeadCountKV = ValueNumeric[uint64](v) } } else { ga.AttentionHeadCountKV = ga.AttentionHeadCount } ga.AttentionSlidingWindowPattern = 1 if v, ok := m[attentionSlidingWindowKey]; ok { if v.ValueType == GGUFMetadataValueTypeArray { ga.AttentionSlidingWindow = 
ValuesNumeric[uint64](v.ValueArray())[0] } else { ga.AttentionSlidingWindow = ValueNumeric[uint64](v) } } switch arch { case "llama4": if ga.AttentionSlidingWindow == 0 { ga.AttentionSlidingWindow = 8192 } ga.AttentionSlidingWindowPattern = 4 case "phi3": // See https://github.com/ggml-org/llama.cpp/pull/13676 ga.AttentionSlidingWindow = 0 case "gemma2": if ga.AttentionSlidingWindow == 0 { ga.AttentionSlidingWindow = 4096 } ga.AttentionSlidingWindowPattern = 2 case "gemma3": ga.AttentionSlidingWindowPattern = 6 case "cohere2": ga.AttentionSlidingWindowPattern = 4 } if v, ok := m[attentionMaxALiBIBiasKey]; ok { ga.AttentionMaxALiBIBias = ValueNumeric[float32](v) } else if v, ok := m[attentionMaxALiBIBiasKey2]; ok { ga.AttentionMaxALiBIBias = ValueNumeric[float32](v) } if v, ok := m[attentionClampKQVKey]; ok { ga.AttentionClampKQV = ValueNumeric[float32](v) } else if v, ok := m[attentionClampKQVKey2]; ok { ga.AttentionClampKQV = ValueNumeric[float32](v) } if v, ok := m[attentionLayerNormEpsilonKey]; ok { ga.AttentionLayerNormEpsilon = ValueNumeric[float32](v) } if v, ok := m[attentionLayerNormRMSEpsilonKey]; ok { ga.AttentionLayerNormRMSEpsilon = ValueNumeric[float32](v) } if v, ok := m[attentionQueryLORARankKey]; ok { ga.AttentionQueryLORARank = ValueNumeric[uint32](v) } if v, ok := m[attentionKeyValueLORARankKey]; ok { ga.AttentionKeyValueLORARank = ValueNumeric[uint32](v) } if v, ok := m[attentionKeyLengthKey]; ok { ga.AttentionKeyLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionKeyLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } if v, ok := m[attentionKeyLengthMLAKey]; ok { ga.AttentionKeyLengthMLA = ValueNumeric[uint32](v) } if v, ok := m[attentionValueLengthKey]; ok { ga.AttentionValueLength = ValueNumeric[uint32](v) } else if ga.AttentionHeadCount != 0 { ga.AttentionValueLength = uint32(ga.EmbeddingLength / ga.AttentionHeadCount) } if v, ok := m[attentionValueLengthMLAKey]; ok { ga.AttentionValueLengthMLA = ValueNumeric[uint32](v) } if v, ok := m[attentionCausalKey]; ok { ga.AttentionCausal = v.ValueBool() } else { ga.AttentionCausal = true } // See https://github.com/ggml-org/llama.cpp/blob/6491d6e4f1caf0ad2221865b4249ae6938a6308c/src/llama-arch.cpp#L1913-L1924. ga.AttentionRecurrent = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. "mamba", "mamba2", "rwkv6", "rwkv6qwen2", "rwkv7", "arwkv7", }, ga.Architecture) // See https://github.com/ggml-org/llama.cpp/blob/a57d1bcb3c0165ac87b1f0dbb429839b0da69689/src/llama-arch.cpp#L2029-L2038. ga.AttentionHybrid = slices.Contains([]string{ // TODO(thxCode): calculate this from the metadata. 
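		// Hybrid architectures keep both a causal KV cache and a recurrent state;
		// the llama.cpp estimator (file_estimate__llamacpp.go) accounts for this
		// by falling through from the recurrent KV-cache case to the causal one.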
"jamba", "falcon-h1", "granitehybrid", }, ga.Architecture) ga.AttentionRecurrent = ga.AttentionHybrid || ga.AttentionRecurrent if v, ok := m[ropeDimensionCountKey]; ok { ga.RoPEDimensionCount = ValueNumeric[uint64](v) } ga.RoPEFrequencyBase = 10000.0 if v, ok := m[ropeFrequencyBaseKey]; ok { ga.RoPEFrequencyBase = ValueNumeric[float32](v) } ga.RoPEFrequencyScale = 1.0 if v, ok := m[ropeFrequencyScaleKey]; ok { ga.RoPEFrequencyScale = ValueNumeric[float32](v) } if v, ok := m[ropeScalingTypeKey]; ok { ga.RoPEScalingType = v.ValueString() } if v, ok := m[ropeScaleLinearKey]; ok { ga.RoPEScalingType = "linear" ga.RoPEScalingFactor = ValueNumeric[float32](v) if ga.RoPEScalingFactor != 0 { ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor } } if v, ok := m[ropeScalingFactorKey]; ok { ga.RoPEScalingFactor = ValueNumeric[float32](v) if ga.RoPEScalingFactor != 0 { ga.RoPEFrequencyScale = 1.0 / ga.RoPEScalingFactor } } if v, ok := m[ropeScalingOriginalContextKey]; ok { ga.RoPEScalingOriginalContextLength = ValueNumeric[uint64](v) } if v, ok := m[ropeScalingFinetunedKey]; ok { ga.RoPEScalingFinetuned = v.ValueBool() } if v, ok := m[poolingTypeKey]; ok { ga.PoolingType = v.ValueUint32() if ga.AttentionCausal && ga.PoolingType > 2 { ga.AttentionCausal = false } } if v, ok := m[ssmConvolutionKernelKey]; ok { ga.SSMConvolutionKernel = ValueNumeric[uint32](v) } if v, ok := m[ssmInnerSizeKey]; ok { ga.SSMInnerSize = ValueNumeric[uint32](v) } if v, ok := m[ssmStateSizeKey]; ok { ga.SSMStateSize = ValueNumeric[uint32](v) } if v, ok := m[ssmTimeStepRankKey]; ok { ga.SSMTimeStepRank = ValueNumeric[uint32](v) } if v, ok := m[ssmGroupCountKey]; ok { ga.SSMGroupCount = ValueNumeric[uint32](v) } if v, ok := m[rwkvHeadSizeKey]; ok { ga.RWKVHeadSize = ValueNumeric[uint32](v) } if v, ok := m[rwkvRescaleEveryNLayersKey]; ok { ga.RWKVRescaleEveryNLayers = ValueNumeric[uint32](v) } if v, ok := m[rwkvTimeMixExtraDimensionKey]; ok { ga.RWKVTimeMixExtraDimension = ValueNumeric[uint32](v) } if v, ok := m[rwkvTimeDecayExtraDimensionKey]; ok { ga.RWKVTimeDecayExtraDimension = ValueNumeric[uint32](v) } if v, ok := m[rwkvTokenShiftCountKey]; ok { ga.RWKVTokenShiftCount = ValueNumeric[uint32](v) } else if ga.AttentionRecurrent { ga.RWKVTokenShiftCount = 2 } if v, ok := m[vocabularyLengthKey]; ok { ga.VocabularyLength = ValueNumeric[uint64](v) } else if v, ok := m[tokenizerGGMLTokensKey]; ok { ga.VocabularyLength = v.ValueArray().Len } return ga } ================================================ FILE: file_architecture_test.go ================================================ package gguf_parser import ( "context" "os" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_Architecture(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f.Architecture()), "\n") } func BenchmarkGGUFFile_Architecture(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } f, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { b.Fatal(err) return } b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = f.Architecture() } } ================================================ FILE: file_estimate__llamacpp.go ================================================ package gguf_parser import ( "math" "regexp" "slices" "strings" 
"github.com/gpustack/gguf-parser-go/util/anyx" "github.com/gpustack/gguf-parser-go/util/ptr" "github.com/gpustack/gguf-parser-go/util/slicex" ) // Types for LLaMACpp estimation. type ( // LLaMACppRunEstimate represents the estimated result of loading the GGUF file in llama.cpp. LLaMACppRunEstimate struct { // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether support the mmap, // true for support. NoMMap bool `json:"noMMap"` // EmbeddingOnly is the flag to indicate whether the model is used for embedding only, // true for embedding only. EmbeddingOnly bool `json:"embeddingOnly"` // Reranking is the flag to indicate whether the model is used for reranking, // true for reranking. // // Only available when EmbeddingOnly is true. Reranking bool `json:"reranking"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // LogicalBatchSize is the logical batch size. LogicalBatchSize int32 `json:"logicalBatchSize"` // PhysicalBatchSize is the physical batch size. PhysicalBatchSize int32 `json:"physicalBatchSize"` // Devices represents the usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. Devices []LLaMACppRunDeviceUsage `json:"devices"` // Drafter is the estimated result of drafter. Drafter *LLaMACppRunEstimate `json:"drafter,omitempty"` // Projector is the estimated result of multimodal projector. Projector *LLaMACppRunEstimate `json:"projector,omitempty"` // Adapters is the estimated result of adapters. Adapters []LLaMACppRunEstimate `json:"adapters,omitempty"` // MaximumTokensPerSecond represents the maximum tokens per second for running the GGUF file. MaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:"maximumTokensPerSecond,omitempty"` } // LLaMACppRunDeviceUsage represents the usage for running the GGUF file in llama.cpp. LLaMACppRunDeviceUsage struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` // HandleSWALayers is the number of layers that the device can handle in sliding window attention (SWA), // the non SWA layers is `HandleLayers - HandleSWALayers`. HandleSWALayers uint64 `json:"handleSWALayers"` // HandleLastLayer is the index of the last layer the device can handle, // -1 means the device does not handle the last layer. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. 
HandleOutputLayer bool `json:"handleOutputLayer"` // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // Endpoint is the endpoint of the remote device, empty for local devices. Endpoint string `json:"endpoint,omitempty"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes. Parameter LLaMACppParameterUsage `json:"parameter"` // Weight is the memory usage of weights that the device loads. Weight LLaMACppWeightMemoryUsage `json:"weight"` // KVCache is the memory usage of kv that the device caches. KVCache LLaMACppKVCacheMemoryUsage `json:"kvCache"` // Computation is the memory usage of computation that the device processes. Computation LLaMACppComputationMemoryUsage `json:"computation"` } // LLaMACppParameterUsage represents the parameter usage for running the GGUF file in llama.cpp. LLaMACppParameterUsage struct { // KVCache is the parameter usage for caching previous KV. KVCache GGUFParametersScalar `json:"kvCache"` // Input is the parameter usage for input tensors. Input GGUFParametersScalar `json:"input"` // Compute is the parameter usage for compute tensors. Compute GGUFParametersScalar `json:"compute"` // ComputeOverridden is the parameter usage for overridden compute tensors. ComputeOverridden GGUFParametersScalar `json:"computeOverridden"` // Output is the parameter usage for output tensors. Output GGUFParametersScalar `json:"output"` } // LLaMACppWeightMemoryUsage represents the memory usage of loading weights in llama.cpp. LLaMACppWeightMemoryUsage struct { // Input is the memory usage for loading input tensors. Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for loading compute tensors. Compute GGUFBytesScalar `json:"compute"` // ComputeOverridden is the memory usage for loading overridden compute tensors. ComputeOverridden GGUFBytesScalar `json:"computeOverridden"` // Output is the memory usage for loading output tensors. Output GGUFBytesScalar `json:"output"` } // LLaMACppKVCacheMemoryUsage represents the memory usage of caching previous KV in llama.cpp. LLaMACppKVCacheMemoryUsage struct { // Key is the memory usage for caching previous keys. Key GGUFBytesScalar `json:"key"` // Value is the memory usage for caching previous values. Value GGUFBytesScalar `json:"value"` } // LLaMACppComputationMemoryUsage represents the memory usage of computation in llama.cpp. LLaMACppComputationMemoryUsage struct { // Footprint is the memory footprint for computation. Footprint GGUFBytesScalar `json:"footprint"` // Input is the memory usage for input. Input GGUFBytesScalar `json:"input"` // Compute is the memory usage for computation. Compute GGUFBytesScalar `json:"graph"` // Output is the memory usage for output. Output GGUFBytesScalar `json:"output"` } ) // EstimateLLaMACppRun estimates the usages of the GGUF file in llama.cpp. 
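// A minimal usage sketch (illustrative only; the option constructors live in
// file_estimate_option.go and are elided here rather than guessed):
//
//	gf, _ := ParseGGUFFile("/path/to/model.gguf")
//	e := gf.EstimateLLaMACppRun( /* e.g. context-size or offload-layers options */ )
//	for i, d := range e.Devices {
//		fmt.Printf("device %d: weight=%v kv=%v\n", i, d.Weight, d.KVCache)
//	}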
func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (e LLaMACppRunEstimate) { // Options var o _GGUFRunEstimateOptions for _, opt := range opts { opt(&o) } switch { case o.TensorSplitFraction == nil: o.TensorSplitFraction = []float64{1} o.MainGPUIndex = 0 case o.MainGPUIndex < 0 || o.MainGPUIndex >= len(o.TensorSplitFraction): panic("main device index must be range of 0 to the length of tensor split fraction") } if len(o.DeviceMetrics) > 0 { for i, j := 0, len(o.DeviceMetrics)-1; i < len(o.TensorSplitFraction)-j; i++ { o.DeviceMetrics = append(o.DeviceMetrics, o.DeviceMetrics[j]) } o.DeviceMetrics = o.DeviceMetrics[:len(o.TensorSplitFraction)+1] } if o.LMCCacheKeyType == nil { o.LMCCacheKeyType = ptr.To(GGMLTypeF16) } if o.LMCCacheValueType == nil { o.LMCCacheValueType = ptr.To(GGMLTypeF16) } if o.LMCOffloadKVCache == nil { o.LMCOffloadKVCache = ptr.To(true) } if o.LMCLogicalBatchSize == nil { o.LMCLogicalBatchSize = ptr.To(int32(2048)) } else { // See https://github.com/ggerganov/llama.cpp/blob/0bf16de07b0692e7df26b9a633e232bbd66e0360/src/llama.cpp#L16519-L16525. o.LMCLogicalBatchSize = ptr.To(max(32, *o.LMCLogicalBatchSize)) } if o.LMCPhysicalBatchSize == nil { o.LMCPhysicalBatchSize = ptr.To(int32(512)) } if *o.LMCPhysicalBatchSize > *o.LMCLogicalBatchSize { panic("physical batch size must be less than or equal to logical batch size") } if o.LMCSplitMode >= _LLAMACppSplitModeMax { panic("split mode must be less than max") } // Devices. e.Devices = make([]LLaMACppRunDeviceUsage, len(o.TensorSplitFraction)+1) for i := range e.Devices { e.Devices[i].HandleLastLayer = -1 } for j := range e.Devices[1:] { e.Devices[j+1].Remote = j < len(o.RPCServers) if e.Devices[j+1].Remote { e.Devices[j+1].Position = j e.Devices[j+1].Endpoint = o.RPCServers[j] } else { e.Devices[j+1].Position = j - len(o.RPCServers) } } // Metadata. a := gf.Architecture() e.Type = a.Type e.Architecture = a.Architecture e.ClipProjectorType = a.ClipProjectorType e.AdapterType = a.AdapterType switch a.Type { case "model": t := gf.Tokenizer() gf.estimateLLaMACppRunInModel(&o, &a, &t, &e) case "projector": // For projector model, // see https://github.com/ggerganov/llama.cpp/blob/148ec970b62c3c5ae0a8bfdaad2fc237aaae350d/examples/llava/clip.cpp#L994-L1008. if ptr.Deref(o.LMCOffloadLayers, math.MaxUint64) != 0 { // Full offload. o.LMCOffloadLayers = ptr.To[uint64](math.MaxUint64) } else { // Zero offload. o.LMCOffloadLayers = ptr.To[uint64](0) } gf.estimateLLaMACppRunInProjector(&o, &a, &e) case "adapter": gf.estimateLLaMACppRunInAdapter(&o, &a, &e) case "imatrix": gf.estimateLLaMACppRunInIMatrix(&o, &a, &e) } return e } // estimateLLaMACppRunInModel estimates the usages of the GGUF file for model, // including the usages of footprint, weight, KV cache, and computation. func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, t *GGUFTokenizer, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "position_*", "token_*", "cls.*", "output.*", "output_*", "rope_factors_*", }) ipLs, opLs, _ := ioLs.Cut([]string{ "position_*", "token_*", }) if a.BlockCount == 0 { a.BlockCount = uint64(len(tfLs)) } // Using sliding window attention. 
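	// (A pattern of 1 means every layer is full attention, so no SWA-specific
	// cache sizing applies; LMCFullSizeSWACache likewise forces full-size caches
	// even when SWA layers exist.)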
usingSWA := a.AttentionSlidingWindowPattern != 1 && !o.LMCFullSizeSWACache // Full offload: nLoadLayers == 0 && isOffloadOutputLayer // Zero offload: nOffloadLayers == 0 // Partial offload: !Full offload && !Zero offload var ( nOffloadLayers uint64 nActualOffloadLayers uint64 nLoadLayers = a.BlockCount idxOutputDevice int fullOffload, zeroOffload bool nSWALoadLayers, nSWAOffloadLayers uint64 ) { var isOffloadOutputLayer bool switch v := o.LMCOffloadLayers; { case v == nil: o.LMCOffloadLayers = ptr.To(a.BlockCount) nOffloadLayers = a.BlockCount isOffloadOutputLayer = true case *v != 0: nOffloadLayers = *v if nOffloadLayers > a.BlockCount { isOffloadOutputLayer = true nOffloadLayers = a.BlockCount } } nActualOffloadLayers = nOffloadLayers if isOffloadOutputLayer { nActualOffloadLayers += 1 } nLoadLayers -= nOffloadLayers fullOffload = nLoadLayers == 0 && isOffloadOutputLayer zeroOffload = nOffloadLayers == 0 e.FullOffloaded = fullOffload e.OffloadLayers = nOffloadLayers for i, j, offloadStart := uint64(0), 0, a.BlockCount-nOffloadLayers; i < a.BlockCount; i++ { switch { case i < nLoadLayers: e.Devices[0].HandleLayers += 1 e.Devices[0].HandleLastLayer = int(i) if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { e.Devices[0].HandleSWALayers += 1 nSWALoadLayers += 1 } case i >= offloadStart: x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) e.Devices[j+1].HandleLayers += 1 e.Devices[j+1].HandleLastLayer = int(i) if usingSWA && (a.AttentionSlidingWindowPattern == 0 || i%uint64(a.AttentionSlidingWindowPattern) != 0) { e.Devices[j+1].HandleSWALayers += 1 nSWAOffloadLayers += 1 } if fullOffload && i == a.BlockCount-1 { idxOutputDevice = j + 1 } } } e.Devices[idxOutputDevice].HandleOutputLayer = true } // Flash attention. { // Grok is not compatible with flash attention, // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9566-L9569. if a.Architecture == "grok" { o.FlashAttention = false } // Fallback to FP16 if the value type is quantized when disabling flash attention, // see https://github.com/ggerganov/llama.cpp/blob/19d3c8293b1f61acbe2dab1d49a17950fd788a4a/src/llama.cpp#L9576-L9579. if o.LMCCacheValueType.IsQuantized() && !o.FlashAttention { o.LMCCacheValueType = ptr.To(GGMLTypeF16) } e.FlashAttention = o.FlashAttention } // Embedding. if !a.AttentionCausal { ropeFrequencyBase := ptr.Deref(o.LMCRoPEFrequencyBase, a.RoPEFrequencyBase) ropeFrequencyScale := ptr.Deref(o.LMCRoPEFrequencyScale, a.RoPEFrequencyScale) ropeScalingType := ptr.Deref(o.LMCRoPEScalingType, a.RoPEScalingType) ropeScalingOriginalContextSize := ptr.Deref(o.LMCRoPEScalingOriginalContextSize, int32(a.RoPEScalingOriginalContextLength)) isRoPECustomized := ropeFrequencyBase != a.RoPEFrequencyBase || ropeFrequencyScale != a.RoPEFrequencyScale || ropeScalingType != a.RoPEScalingType || (ropeScalingType == "yarn" && ropeScalingOriginalContextSize != int32(a.RoPEScalingOriginalContextLength)) e.EmbeddingOnly = true o.LMCContextSize = ptr.To(ptr.Deref(o.LMCContextSize, int32(a.MaximumContextLength))) // Set context size/physical batch size/logical batch size to the training context size. if !isRoPECustomized { o.LMCContextSize = ptr.To(min(int32(a.MaximumContextLength), *o.LMCContextSize)) } o.LMCLogicalBatchSize = o.LMCContextSize o.LMCPhysicalBatchSize = o.LMCLogicalBatchSize // Reranking. 
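		// (A model is treated as a reranker when it ships rank-head tensors,
		// i.e. cls.bias/cls.weight, or declares pooling type 4, Rank.)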
if _, found := gf.TensorInfos.Index([]string{"cls.bias", "cls.weight"}); found > 0 { e.Reranking = true } if !e.Reranking && a.PoolingType == 4 { // 0: None, 1: Mean, 2: Cls, 3: Last, 4: Rank e.Reranking = true } } // Distributable, // fix by https://github.com/ggerganov/llama.cpp/pull/11047. e.Distributable = true // Batch size. e.LogicalBatchSize = *o.LMCLogicalBatchSize e.PhysicalBatchSize = *o.LMCPhysicalBatchSize // Padding alignment. paddingAlign := uint64(32) if o.FlashAttention { paddingAlign = 256 } // Init hyperparameters, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6957-L7000. var ( nContext uint64 nTokens uint64 nBatch uint64 nOutputs uint64 nSeq uint64 nKV uint64 ) { nContext = a.MaximumContextLength if o.LMCContextSize != nil { nContext = uint64(*o.LMCContextSize) } if o.LMCInMaxContextSize { nContext = min(nContext, a.MaximumContextLength) } // Padding context size, // see https://github.com/ggerganov/llama.cpp/blob/278d0e18469aacf505be18ce790a63c7cc31be26/src/llama.cpp#L19001-L19002. nContext = GGMLPadding(nContext, paddingAlign) // Correct token size, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L12221-L12224. nTokens = min(nContext, uint64(*o.LMCPhysicalBatchSize)) nBatch = nTokens nOutputs = nTokens nSeq = uint64(ptr.Deref(o.ParallelSize, 1)) nKV = nContext e.ContextSize = nContext } // Footprint. { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ // Tokens, // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L6380-L6384. fp := t.TokensLength * (4 /* token type */ + 4 /* token score*/) if t.Model == "gpt2" { fp += t.MergesLength * (48 /* key type */ + 56 /* value type */) } fp += t.TokensLength * (32 /* id to token vector */ + (24 + 32) /* token to id map*/) e.Devices[0].Footprint += GGUFBytesScalar(fp) // Output buffer, // see https://github.com/ggerganov/llama.cpp/blob/7672adeec7a79ea271058c63106c142ba84f951a/llama.cpp#L11940-L12003. ob := a.EmbeddingLength * nOutputs * 4 /* float32 size */ if a.AttentionCausal { ob += a.VocabularyLength * nOutputs * 4 /* float32 size */ } if fullOffload { e.Devices[idxOutputDevice].Footprint += GGUFBytesScalar(ob) } else { e.Devices[0].Footprint += GGUFBytesScalar(ob) } } // Weight & Parameter. { filter := func(idx int) GGUFTensorInfoFilter { if len(o.OverriddenTensors) == 0 { return nil } return func(name string) bool { for _, ot := range o.OverriddenTensors { bt, bi := ot.ParseBufferType() switch { case bt == GGUFRunOverriddenTensorBufferTypeUnknown: continue case bt == GGUFRunOverriddenTensorBufferTypeCPU && idx == 0: continue case bt == GGUFRunOverriddenTensorBufferTypeGPU && (e.Devices[idx].Remote || anyx.Number[int](bi)+1 != idx): continue case bt == GGUFRunOverriddenTensorBufferTypeRPC && (!e.Devices[idx].Remote || e.Devices[idx].Endpoint != bi): continue } if ot.PatternRegex.MatchString(name) { return false } } return true } } // If overridden tensors are provided, // we need to search the tensors of the overridden pattern, // and place them in the correct device. 
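		// For example (hypothetical pattern), an entry equivalent to llama.cpp's
		// --override-tensor "blk\..*\.ffn_.*_exps\.weight=CPU" pins MoE expert
		// weights to host memory, so their bytes and elements are charged to
		// Devices[0].Weight.ComputeOverridden instead of a GPU device.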
if len(o.OverriddenTensors) != 0 { for _, ot := range o.OverriddenTensors { bt, bi := ot.ParseBufferType() if bt == GGUFRunOverriddenTensorBufferTypeUnknown { continue } var sls GGUFTensorInfos = ls.Search(ot.PatternRegex) if len(sls) == 0 { continue } switch bt { case GGUFRunOverriddenTensorBufferTypeCPU: e.Devices[0].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) e.Devices[0].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) case GGUFRunOverriddenTensorBufferTypeGPU: idx := anyx.Number[int](bi) + 1 e.Devices[idx].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) e.Devices[idx].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) default: for i, d := range e.Devices[1:] { if d.Endpoint == bi { e.Devices[i+1].Weight.ComputeOverridden += GGUFBytesScalar(sls.Bytes()) e.Devices[i+1].Parameter.ComputeOverridden += GGUFParametersScalar(sls.Elements()) break } } } } } // Compute. for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { idx := 0 if i >= offloadStart { x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) idx = j + 1 } f := filter(idx) e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes(f)) e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements(f)) } // IO, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. e.Devices[0].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) e.Devices[0].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) var ( wg GGUFBytesScalar ps GGUFParametersScalar ) if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } e.Devices[0].Weight.Output = wg if fullOffload { e.Devices[idxOutputDevice].Weight.Output = wg e.Devices[idxOutputDevice].Parameter.Output = ps } else { e.Devices[0].Parameter.Output = ps } } // KV cache. if a.AttentionCausal { switch { // Recurrent, // see https://github.com/ggml-org/llama.cpp/blob/704bb7a71c01dc07c1478b85f6322bf5dfde1eaf/src/llama-hparams.cpp#L68-L88. 
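// For recurrent models the "KV cache" is per-sequence state rather than per-token history: r bytes of convolution/token-shift state and s bytes of SSM/head state per layer, each scaled by the sequence count nSeq instead of the context length.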
case a.AttentionRecurrent: var r, s uint64 if a.RWKVHeadSize > 0 { r = uint64(a.RWKVTokenShiftCount) * a.EmbeddingLength s = uint64(a.RWKVHeadSize) * a.EmbeddingLength } else { r = uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) s = uint64(a.SSMStateSize * a.SSMInnerSize) } rps, sps := r*nSeq, s*nSeq rrs, srs := GGMLTypeF32.RowSizeOf([]uint64{rps}), GGMLTypeF32.RowSizeOf([]uint64{sps}) e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nLoadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nLoadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nLoadLayers) if !*o.LMCOffloadKVCache { e.Devices[0].KVCache.Key += GGUFBytesScalar(rrs * nOffloadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(srs * nOffloadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * nOffloadLayers) } else if !zeroOffload { for i, d := range e.Devices[1:] { e.Devices[i+1].KVCache.Key += GGUFBytesScalar(rrs * d.HandleLayers) e.Devices[i+1].KVCache.Value += GGUFBytesScalar(srs * d.HandleLayers) e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((rrs + srs) * d.HandleLayers) } } if !a.AttentionHybrid { break } fallthrough // Causal, // see https://github.com/ggml-org/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2479-L2501. default: akl, avl := uint64(a.AttentionKeyLength), uint64(a.AttentionValueLength) if a.AttentionKeyLengthMLA > 0 && a.AttentionValueLengthMLA > 0 { akl, avl = uint64(a.AttentionKeyLengthMLA), uint64(a.AttentionValueLengthMLA) } kGQA := akl * a.AttentionHeadCountKV vGQA := avl * a.AttentionHeadCountKV kps, vps := kGQA*nKV, vGQA*nKV krs, vrs := o.LMCCacheKeyType.RowSizeOf([]uint64{kps}), o.LMCCacheValueType.RowSizeOf([]uint64{vps}) if !usingSWA { e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nLoadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nLoadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nLoadLayers) if !*o.LMCOffloadKVCache { e.Devices[0].KVCache.Key += GGUFBytesScalar(krs * nOffloadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(vrs * nOffloadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((kps + vps) * nOffloadLayers) } else if !zeroOffload { for i, d := range e.Devices[1:] { e.Devices[i+1].KVCache.Key += GGUFBytesScalar(krs * d.HandleLayers) e.Devices[i+1].KVCache.Value += GGUFBytesScalar(vrs * d.HandleLayers) e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((kps + vps) * d.HandleLayers) } } } else { // Sliding window attention size, // see https://github.com/ggml-org/llama.cpp/blob/3079e9ac8e04ef6eddeb0c164d72edb6b6fd2df5/src/llama-kv-cache.cpp#L1640-L1642. 
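// SWA layers retain only a window of the cache: roughly sliding-window*nSeq plus one logical batch, padded to the cache alignment and capped at nKV, which is what the swas term below computes.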
swas := min(nKV, GGMLPadding(a.AttentionSlidingWindow*nSeq+uint64(*o.LMCLogicalBatchSize), paddingAlign)) swaKps, swaVps := kGQA*swas, vGQA*swas swaKrs, swaVrs := o.LMCCacheKeyType.RowSizeOf([]uint64{swaKps}), o.LMCCacheValueType.RowSizeOf([]uint64{swaVps}) nNonSWALoadLayers, nNonSWAOffloadLayers := nLoadLayers-nSWALoadLayers, nOffloadLayers-nSWAOffloadLayers e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWALoadLayers + krs*nNonSWALoadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWALoadLayers + vrs*nNonSWALoadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWALoadLayers + (kps+vps)*nNonSWALoadLayers) if !*o.LMCOffloadKVCache { e.Devices[0].KVCache.Key += GGUFBytesScalar(swaKrs*nSWAOffloadLayers + krs*nNonSWAOffloadLayers) e.Devices[0].KVCache.Value += GGUFBytesScalar(swaVrs*nSWAOffloadLayers + vrs*nNonSWAOffloadLayers) e.Devices[0].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*nSWAOffloadLayers + (kps+vps)*nNonSWAOffloadLayers) } else if !zeroOffload { for i, d := range e.Devices[1:] { e.Devices[i+1].KVCache.Key += GGUFBytesScalar(swaKrs*d.HandleSWALayers + krs*(d.HandleLayers-d.HandleSWALayers)) e.Devices[i+1].KVCache.Value += GGUFBytesScalar(swaVrs*d.HandleSWALayers + vrs*(d.HandleLayers-d.HandleSWALayers)) e.Devices[i+1].Parameter.KVCache += GGUFParametersScalar((swaKps+swaVps)*d.HandleSWALayers + (kps+vps)*(d.HandleLayers-d.HandleSWALayers)) } } } } } // Computation. { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243. maxNodes := max(1024, uint64(8*len(gf.TensorInfos))) // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint = GGUFBytesScalar(cm) // Scheduler overhead, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.BlockCount*3) e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) // Tensor usage, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. // // First, get the usage of input layer, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L2279-L2290. 
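// Every graph input below is sized via GGMLType.RowSizeOf over its logical shape, e.g. GGMLTypeI32.RowSizeOf([]uint64{nBatch}) for the token ids; recurrent models swap the position/KQ-mask inputs for state-mask and state-sequence inputs.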
var ( inpTokens = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch}) // F32 [n_embd, n_batch] inpPos = GGMLTypeI32.RowSizeOf([]uint64{nBatch}) // I32 [n_batch] inpOutIds = GGMLTypeI32.RowSizeOf([]uint64{nOutputs}) // I32 [n_outputs], inpKQMask = GGMLTypeF32.RowSizeOf([]uint64{nKV, nBatch}) // F32 [n_kv, n_batch] inpSMask = GGMLTypeF32.RowSizeOf([]uint64{1, nSeq}) // F32 [1, n_seq] inpSSeq = GGMLTypeI32.RowSizeOf([]uint64{nSeq, nBatch}) // I32 [n_seq, n_batch] ) if a.AttentionRecurrent { e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + 2*inpSMask + inpSSeq + inpOutIds) } else { e.Devices[0].Computation.Input = GGUFBytesScalar(inpTokens + inpEmbd + inpPos + inpKQMask + inpOutIds) } { var v GGUFBytesScalar if a.AttentionRecurrent { v = GGUFBytesScalar(inpEmbd + inpSMask + inpSSeq) } else { v = GGUFBytesScalar(inpEmbd + inpPos + inpKQMask) } if len(o.RPCServers) == 0 && len(o.TensorSplitFraction) > 1 { if a.ExpertCount > 0 { v *= 2 } else { v *= 4 } } for i := range e.Devices[1:] { e.Devices[i+1].Computation.Input += v } } // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of the largest layer, // which is the last layer by default. if a.AttentionRecurrent && !a.AttentionHybrid { if a.RWKVHeadSize > 0 { attnInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|attn_norm_2)\.weight`)) { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) attnInc += rs } ffnInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.time_mix_(lerp_x|receptance|decay_w2|key|value|gate|w2|output)\.weight`)) { // nolint: lll switch { case strings.HasSuffix(l.Name, ".time_mix_w2.weight"): rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, 1, nTokens, l.Dimensions[l.NDimensions-1]}) ffnInc += rs case strings.HasSuffix(l.Name, ".time_mix_output.weight"): rs := GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch + uint64(a.RWKVHeadSize)*nSeq}) ffnInc += rs default: rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nBatch}) ffnInc += rs } } cp := GGUFBytesScalar(attnInc + ffnInc) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp } } else { r := uint64((a.SSMConvolutionKernel - 1) * (a.SSMInnerSize + 2*a.SSMGroupCount*a.SSMStateSize)) convInc := GGMLTypeF32.RowSizeOf([]uint64{r, nSeq}) // F32 [n_embd_key_gqa, nSeq] reshape for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ssm_in|ssm_conv1d)\.weight`)) { if !strings.HasSuffix(l.Name, ".ssm_conv1d.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) convInc += rs continue } // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10379. rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMConvolutionKernel)*uint64(a.SSMInnerSize)*nSeq}) convInc += rs } ssmInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.ssm_(dt\.weight|a)`)) { if !strings.HasSuffix(l.Name, ".ssm_a") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) ssmInc += rs continue } // https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L10413. 
rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.SSMInnerSize)*nTokens + uint64(a.SSMStateSize)*uint64(a.SSMInnerSize)*nSeq}) ssmInc += rs } cp := GGUFBytesScalar(convInc + ssmInc) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp } } } else { loadAttnInc, offloadAttnInc := uint64(0), uint64(0) { rs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) loadAttnInc = rs // k-? rs = o.LMCCacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV}) loadAttnInc += rs // v-? } if o.FlashAttention { // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7387. offloadAttnInc = GGMLTypeF16.RowSizeOf([]uint64{nKV, nTokens}) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv|q_b)\.weight`)) { if strings.HasSuffix(l.Name, ".attn_norm.weight") { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs continue } rs := l.Bytes() offloadAttnInc += rs } // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L6986-L6992. rs := o.LMCCacheKeyType.RowSizeOf([]uint64{uint64(a.AttentionKeyLength), nKV, a.AttentionHeadCountKV}) offloadAttnInc += rs // https://github.com/ggerganov/llama.cpp/blob/172c8256840ffd882ab9992ecedbb587d9b21f15/llama.cpp#L7000-L7007. rs = o.LMCCacheValueType.RowSizeOf([]uint64{uint64(a.AttentionValueLength), nKV, a.AttentionHeadCountKV}) offloadAttnInc += rs } else { offloadAttnInc = uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.attn_(norm|q|qkv|q_b)\.weight`)) { var rs uint64 switch { default: // norm. rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs case strings.HasSuffix(l.Name, ".attn_q.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens}) offloadAttnInc += rs * 2 // Qcur. rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) offloadAttnInc += rs // kq. if !zeroOffload && !fullOffload { offloadAttnInc += loadAttnInc } case strings.HasSuffix(l.Name, ".attn_qkv.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[0], nTokens}) offloadAttnInc += rs * 2 // Qcur. rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) offloadAttnInc += rs // kq. rs = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, a.EmbeddingLength * 3}) offloadAttnInc += rs // wqkv. if !zeroOffload && !fullOffload { offloadAttnInc += loadAttnInc } case strings.HasSuffix(l.Name, ".attn_q_b.weight"): rs = GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) offloadAttnInc += rs * 2 // q-? rs = GGMLTypeF32.RowSizeOf([]uint64{nKV, nTokens, a.AttentionHeadCount}) offloadAttnInc += rs // kq. 
} } } ffnInc := uint64(0) for _, l := range tfLs[len(tfLs)-1].Search(regexp.MustCompile(`.*\.\d+\.(attn_norm|ffn_norm|ffn_gate|ffn_up)\.weight`)) { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) ffnInc += rs } if a.ExpertCount > 0 || a.ExpertUsedCount > 0 { rs := GGMLTypeF32.RowSizeOf([]uint64{uint64(a.ExpertCount), a.EmbeddingLength}) ffnInc += rs // ffn_gate_input rs = GGMLTypeF32.RowSizeOf([]uint64{uint64(a.ExpertCount), nTokens}) ffnInc += rs // ffn_moe_logits rs = GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, uint64(a.ExpertUsedCount), nTokens}) ffnInc += rs // ffn_moe_down } if !zeroOffload { e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc + ffnInc) } else { e.Devices[0].Computation.Compute = GGUFBytesScalar(loadAttnInc) } { cp := GGUFBytesScalar(max(offloadAttnInc, ffnInc)) for i := range e.Devices[1:] { e.Devices[i+1].Computation.Compute = cp } if nLoadLayers > 1 { for i := range e.Devices[1:] { if e.Devices[i+1].Remote { continue } e.Devices[i+1].Computation.Compute += GGUFBytesScalar(loadAttnInc) break } } } } // Finally, get the usage of output layer. if a.AttentionCausal { var outInc uint64 if a.AttentionRecurrent { outInc += inpSMask + inpSSeq } if l, ok := opLs.Get("output_norm.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs } if l, ok := opLs.Get("output.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs } else if l, ok := ipLs.Get("token_embd.weight"); ok { rs := GGMLTypeF32.RowSizeOf([]uint64{l.Dimensions[l.NDimensions-1], nTokens}) outInc += rs } e.Devices[idxOutputDevice].Computation.Output += GGUFBytesScalar(outInc) } } // Drafter. e.Drafter = o.LMCDrafter // Projector. e.Projector = o.LMCProjector // Adapters. e.Adapters = o.LMCAdapters // Maximum tokens per second. if ds, dmss := e.Devices, o.DeviceMetrics; len(dmss) != 0 { ltss := make([]float64, len(dmss)) bs := anyx.Number[float64](*o.LMCLogicalBatchSize) / float64(nBatch) for i, dm := range dmss { fl, upbw, dwbw := float64(max(dm.FLOPS, 1)), float64(max(dm.UpBandwidth, 1)), float64(max(dm.DownBandwidth, 1)) cmpops := float64(ds[i].Parameter.Compute+ds[i].Parameter.ComputeOverridden)*2 /* FMA */ *bs + float64(ds[i].Parameter.Input) + float64(ds[i].Parameter.Output) // nolint: lll cmps := float64(ds[i].Weight.Sum()) cmplat := max(cmpops/fl, cmps/upbw) kvcops := float64(ds[i].Parameter.KVCache) * 2 /* FMA */ * bs kvcs := float64(ds[i].KVCache.Sum()) * bs kvclat := max(kvcops/fl, kvcs/upbw) ffs := float64(GGMLTypeF32.RowSizeOf([]uint64{a.EmbeddingLength, nBatch})) ffslat := ffs / dwbw lays := float64(ds[i].HandleLayers) if ds[i].HandleOutputLayer { lays += 1 } ltss[i] = (cmplat + kvclat + ffslat) * lays / float64(a.BlockCount+2) } lt := float64(0) ltmax := slices.Max(ltss) for i := range ltss { lt += ltss[i] / ltmax * ltss[i] } e.MaximumTokensPerSecond = ptr.To(GGUFTokensPerSecondScalar(1 / lt)) } } // estimateLLaMACppRunInProjector estimates the usages of the GGUF file for projector. func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "mm.*", // Vision specific IO layers. "v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", "v.post_ln.*", "model.*", "resampler.*", // Audio specific IO layers. "a.position_embd.*", "a.conv1d.*", "a.post_ln.*", }) ipLs, opLs, _ := ioLs.Cut([]string{ // Vision specific Input layers. 
"v.patch_embd.*", "v.class_embd", "v.position_embd.*", "v.pre_ln.*", "model.*", // Audio specific Input layers. "a.position_embd.*", "a.conv1d.*", }) // Block count. if a.ClipHasVisionEncoder && a.ClipVisionBlockCount == 0 { if len(tfLs) == 1 { if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"v"}, ntfLs.Name) { a.ClipVisionBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) } } if a.ClipVisionBlockCount == 0 { a.ClipVisionBlockCount = uint64(len(tfLs)) } } if a.ClipHasAudioEncoder && a.ClipAudioBlockCount == 0 { if len(tfLs) == 1 { if ntfLs, ok := tfLs[0].(*GGUFNamedTensorInfos); ok && slices.Contains([]string{"a"}, ntfLs.Name) { a.ClipAudioBlockCount = uint64(len(ntfLs.GGUFLayerTensorInfos)) } } if a.ClipAudioBlockCount == 0 { a.ClipAudioBlockCount = uint64(len(tfLs)) } } // Offload layers. if *o.LMCOffloadLayers == math.MaxUint64 { e.FullOffloaded = true e.OffloadLayers = a.ClipVisionBlockCount + a.ClipAudioBlockCount o.LMCOffloadLayers = ptr.To(e.OffloadLayers) } else { e.FullOffloaded = false e.OffloadLayers = 0 } // Footprint. { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } idx := 0 // Default to the main host's RAM. if e.FullOffloaded { for i := 1; i < len(e.Devices); i++ { if !e.Devices[i].Remote { idx = i break } } } // Weight & Parameter. { // Compute. e.Devices[idx].HandleLayers = *o.LMCOffloadLayers e.Devices[idx].HandleLastLayer = int(e.Devices[idx].HandleLayers - 1) e.Devices[idx].Weight.Compute = GGUFBytesScalar(tfLs.Bytes()) e.Devices[idx].Parameter.Compute = GGUFParametersScalar(tfLs.Elements()) // IO. e.Devices[idx].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) e.Devices[idx].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) e.Devices[idx].Weight.Output = GGUFBytesScalar(opLs.Bytes()) e.Devices[idx].Parameter.Output = GGUFParametersScalar(opLs.Elements()) } if a.ClipHasVisionEncoder { // Init hyperparameters, // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L599-L636. var ( heightMaxSize uint64 // y widthMaxSize uint64 // x // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L3462. nPatches uint64 patchesMaxSize uint64 // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/clip.cpp#L4016. projectionDim uint64 // NB(thxCode): do not sure if there is the correct name. ) // See https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L397-L411, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2323-L2345, // https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/clip.cpp#L2767-L2794. heightMaxSize = uint64(a.ClipVisionImageSize) widthMaxSize = heightMaxSize if a.ClipHasQwen2VLMerger || a.ClipProjectorType == "qwen2vl_merger" || a.ClipProjectorType == "qwen2.5vl_merger" || a.ClipProjectorType == "qwen2.5o" || a.ClipProjectorType == "pixtral" { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L2217. 
heightMaxSize = uint64(ptr.Deref(o.LMCVisualMaxImageSize, 1024)) widthMaxSize = heightMaxSize } nPatchSize := uint64(a.ClipVisionPatchSize) nPatchesHeight := heightMaxSize / nPatchSize nPatchesWidth := widthMaxSize / nPatchSize nPatches = nPatchesHeight * nPatchesWidth patchesMaxSize = 1 switch { case a.ClipHasLLaVAProjector || a.ClipProjectorType == "mlp" || a.ClipProjectorType == "mlp_norm" || a.ClipProjectorType == "ldp" || a.ClipProjectorType == "ldpv2": // LLaVA 1.6 uses up to 6 patches if a.ClipVisionMMPatchMergeType != "flat" { patchesMaxSize = 6 } case a.ClipHasMiniCPMVProjector || a.ClipProjectorType == "resampler": // MiniCPM-V uses up to 10 patches patchesMaxSize = 10 case a.ClipProjectorType == "adapter": // Granite vision uses up to 10 patches + base patch patchesMaxSize = 11 } if o.LMCMaxProjectedCache != nil { patchesMaxSize += uint64(*o.LMCMaxProjectedCache) } switch a.ClipProjectorType { case "ldp": nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.mb_block.1.block.2.1.bias"); ok { projectionDim = ti.Dimensions[0] } case "ldpv2": nPatches /= 4 if ti, ok := gf.TensorInfos.Get("mm.model.peg.0.bias"); ok { projectionDim = ti.Dimensions[0] } case "mlp": if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "mlp_norm": if ti, ok := gf.TensorInfos.Get("mm.3.bias"); ok { projectionDim = ti.Dimensions[0] } case "resampler": if ti, ok := gf.TensorInfos.Get("resampler.query"); ok { nPatches = ti.Dimensions[1] projectionDim = ti.Dimensions[0] } case "adapter": nPatches /= 4 nPatches += 2 if ti, ok := gf.TensorInfos.Get("adapter.linear.dense_4h_to_h.weight"); ok { projectionDim = ti.Dimensions[1] } case "qwen2vl_merger", "qwen2.5vl_merger", "qwen2.5o": nSizePatch := uint64(a.ClipVisionPatchSize * 2) heightPatchSize := heightMaxSize / nSizePatch if heightMaxSize%nSizePatch > 0 { heightPatchSize++ } widthPatchSize := widthMaxSize / nSizePatch if widthMaxSize%nSizePatch > 0 { widthPatchSize++ } nPatches = heightPatchSize * widthPatchSize if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "gemma3": nPerSide := uint64(a.ClipVisionImageSize) / uint64(a.ClipVisionPatchSize) nPerSide2DPool := nPerSide / uint64(a.ClipVisionProjectorScaleFactor) nPatches = nPerSide2DPool * nPerSide2DPool if ti, ok := gf.TensorInfos.Get("mm.input_projection.weight"); ok { projectionDim = ti.Dimensions[0] } case "idefics3", "llama4": nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) if ti, ok := gf.TensorInfos.Get("mm.model.fc.weight"); ok { projectionDim = ti.Dimensions[1] } case "pixtral": heightPatchSize := heightMaxSize / uint64(a.ClipVisionPatchSize) if a.ClipVisionSpatialMergeSize > 0 { heightPatchSize /= uint64(a.ClipVisionSpatialMergeSize) } widthPatchSize := widthMaxSize / uint64(a.ClipVisionPatchSize) if a.ClipVisionSpatialMergeSize > 0 { widthPatchSize /= uint64(a.ClipVisionSpatialMergeSize) } nPatches = heightPatchSize*widthPatchSize + heightPatchSize - 1 /* [IMG_BREAK] per row */ if ti, ok := gf.TensorInfos.Get("mm.2.bias"); ok { projectionDim = ti.Dimensions[0] } case "internvl": nPatches /= uint64(a.ClipVisionProjectorScaleFactor * a.ClipVisionProjectorScaleFactor) if ti, ok := gf.TensorInfos.Get("mm.model.mlp.3.weight"); ok { projectionDim = ti.Dimensions[1] } } // Footprint { // Image Embed, // see https://github.com/ggerganov/llama.cpp/blob/0827b2c1da299805288abbd556d869318f2b121e/examples/llava/llava.cpp#L401-L407. 
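// The projected image embedding is held in host memory as float32, so the footprint below is patchesMaxSize * nPatches * projectionDim * 4 bytes.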
e.Devices[0].Footprint += GGUFBytesScalar(patchesMaxSize * nPatches * projectionDim * 4 /* float32 size */) } // Computation. { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. var maxNodes uint64 = 8192 // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) // Scheduler overhead, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipVisionBlockCount*3) e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) // Tensor usage. var ( hasClassEmbd bool nPositions uint64 nBatch uint64 nEmbd uint64 nHead uint64 ) { _, hasClassEmbd = ipLs.Get("v.class_embd") nPositions = nPatches if hasClassEmbd { nPositions += 1 } if a.ClipHasQwen2VLMerger || a.ClipProjectorType == "qwen2vl_merger" || a.ClipProjectorType == "qwen2.5vl_merger" || a.ClipProjectorType == "qwen2.5o" { nPositions *= 4 } nBatch = 1 nEmbd = a.ClipVisionEmbeddingLength nHead = a.ClipVisionAttentionHeadCount } // First, get the usage of input layer. { var ( inpRaw = GGMLTypeF32.RowSizeOf([]uint64{widthMaxSize, heightMaxSize, 3, nBatch}) // F32 [img_width, img_height, 3, n_batch] inpRawCnt = GGMLTypeF32.RowSizeOf([]uint64{nPatches, nEmbd, nBatch}) // I32 [n_patches, n_embd, n_batch] inpEmbd = GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embd, n_positions, n_batch] inpPosEmbd = GGMLTypeF32.RowSizeOf([]uint64{projectionDim, nPatches, nBatch}) // F32 [mmproj, n_patches, n_batch] inpPos = GGMLTypeI32.RowSizeOf([]uint64{nPositions}) // I32 [n_positions] inpPatches = GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] ) e.Devices[idx].Computation.Input += GGUFBytesScalar(inpRaw + inpRawCnt + inpPos + inpPatches) if a.ClipHasMiniCPMVProjector || a.ClipProjectorType == "resampler" { e.Devices[idx].Computation.Input += GGUFBytesScalar(inpPosEmbd) } if hasClassEmbd { e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) } if a.ClipVisionWindowAttentionPattern > 0 { // Qwen2.5 VL inpWindowIndex := GGMLTypeI32.RowSizeOf([]uint64{nPatches}) // I32 [n_patches] inpWindowMask := GGMLTypeI32.RowSizeOf([]uint64{nPositions, nPositions}) // I32 [n_positions, n_positions] e.Devices[idx].Computation.Input += GGUFBytesScalar(inpWindowIndex + inpWindowMask) } } // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of a certain layer. { compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) * 2 compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) } } } if a.ClipHasAudioEncoder { // See https://github.com/ggml-org/llama.cpp/blob/6385b843a8dc8e15b8362196039720c58dd79fa2/tools/mtmd/mtmd-audio.cpp#L311. var projectionDim uint64 // NB(thxCode): not sure if this is the correct name.
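// For the audio tower the position count is taken from the second dimension of "a.position_embd.weight"; unlike the vision path there is no patch geometry to derive.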
{ if ti, ok := gf.TensorInfos.Get("a.position_embd.weight"); ok { projectionDim = ti.Dimensions[1] } } // Computation. { // See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/tools/mtmd/clip.cpp#L374. var maxNodes uint64 = 8192 // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation.Footprint += GGUFBytesScalar(cm) // Scheduler overhead, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L16149. e.Devices[0].Computation.Footprint += GGUFBytesScalar(4 * 1024 * 1024) // GGML context, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L5015-L5036. gc := 2 /* buffer count */ * GGMLTensorOverhead() * (uint64(len(gf.TensorInfos)) + 1 + a.ClipAudioBlockCount*3) e.Devices[0].Computation.Footprint += GGUFBytesScalar(gc) // Tensor usage. var ( nPositions uint64 nBatch uint64 nEmbd uint64 nHead uint64 ) { nPositions = projectionDim nBatch = 1 nEmbd = a.ClipAudioEmbeddingLength nHead = a.ClipAudioAttentionHeadCount } // First, get the usage of input layer. { inpEmbd := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions, nBatch}) // F32 [n_embed, n_positions, n_batch] e.Devices[idx].Computation.Input += GGUFBytesScalar(inpEmbd) } // Since the steps between transformer layers are serial, // the allocated memory can be reused for the next layer. // So, we only consider the usage of a certain layer. { compNorm := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compVcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKcur := GGMLTypeF32.RowSizeOf([]uint64{nEmbd, nPositions}) compKQcur := GGMLTypeF32.RowSizeOf([]uint64{nPositions, nPositions, nHead}) e.Devices[idx].Computation.Compute += GGUFBytesScalar(compNorm + compVcur + compKcur + compKQcur) } } } } // estimateLLaMACppRunInAdapter estimates the usages of the GGUF file for adapter. 
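// Adapters (e.g. LoRA) contribute only weight and bootstrap footprint: they carry no KV cache or computation graph of their own, and are marked non-distributable below.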
func (gf *GGUFFile) estimateLLaMACppRunInAdapter(o *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() ioLs, tfLs, _ := ls.Cut([]string{ "position_*", "token_*", "cls.*", "output.*", "output_*", }) ipLs, opLs, _ := ioLs.Cut([]string{ "position_*", "token_*", }) if a.BlockCount == 0 { a.BlockCount = uint64(len(tfLs)) } // Full offload: nLoadLayers == 0 && isOffloadOutputLayer // Zero offload: nOffloadLayers == 0 // Partial offload: !Full offload && !Zero offload var ( nOffloadLayers uint64 nActualOffloadLayers uint64 nLoadLayers = a.BlockCount idxOutputDevice int fullOffload bool ) { var isOffloadOutputLayer bool switch v := o.LMCOffloadLayers; { case v == nil: o.LMCOffloadLayers = ptr.To(a.BlockCount) nOffloadLayers = a.BlockCount isOffloadOutputLayer = true case *v != 0: nOffloadLayers = *v if nOffloadLayers > a.BlockCount { isOffloadOutputLayer = true nOffloadLayers = a.BlockCount } } nActualOffloadLayers = nOffloadLayers if isOffloadOutputLayer { nActualOffloadLayers += 1 } nLoadLayers -= nOffloadLayers fullOffload = nLoadLayers == 0 && isOffloadOutputLayer e.FullOffloaded = fullOffload e.OffloadLayers = nOffloadLayers for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { switch { case i < int(nLoadLayers): e.Devices[0].HandleLayers += 1 e.Devices[0].HandleLastLayer = i case i >= offloadStart: x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) e.Devices[j+1].HandleLayers += 1 e.Devices[j+1].HandleLastLayer = i if fullOffload && i == len(tfLs)-1 { idxOutputDevice = j + 1 } } } e.Devices[idxOutputDevice].HandleOutputLayer = true } // Distributable. e.Distributable = false // Footprint. { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } // Weight & Parameter. { // Compute. for i, j, offloadStart := 0, 0, len(tfLs)-int(nOffloadLayers); i < len(tfLs); i++ { idx := 0 if i >= offloadStart { x := float64(i-offloadStart) / float64(nActualOffloadLayers) j = slicex.UpperBound(o.TensorSplitFraction, x) idx = j + 1 } e.Devices[idx].Weight.Compute += GGUFBytesScalar(tfLs[i].Bytes()) e.Devices[idx].Parameter.Compute += GGUFParametersScalar(tfLs[i].Elements()) } // IO, // see https://github.com/ggerganov/llama.cpp/blob/d6ef0e77dd25f54fb5856af47e3926cf6f36c281/llama.cpp#L4930-L5002. e.Devices[0].Weight.Input = GGUFBytesScalar(ipLs.Bytes()) e.Devices[0].Parameter.Input = GGUFParametersScalar(ipLs.Elements()) var ( wg GGUFBytesScalar ps GGUFParametersScalar ) if _, ok := opLs.Get("output.weight"); ok { wg = GGUFBytesScalar(opLs.Bytes()) ps = GGUFParametersScalar(opLs.Elements()) } else { wg = GGUFBytesScalar(opLs.Bytes()) + e.Devices[0].Weight.Input /* duplicate the input layer */ ps = GGUFParametersScalar(opLs.Elements() + ipLs.Elements()) } e.Devices[0].Weight.Output = wg if fullOffload { e.Devices[idxOutputDevice].Weight.Output = wg e.Devices[idxOutputDevice].Parameter.Output = ps } else { e.Devices[0].Parameter.Output = ps } } } // estimateLLaMACppRunInIMatrix estimates the usages of the GGUF file for imatrix. func (gf *GGUFFile) estimateLLaMACppRunInIMatrix(_ *_GGUFRunEstimateOptions, a *GGUFArchitecture, e *LLaMACppRunEstimate) { ls := gf.Layers() if a.BlockCount == 0 { a.BlockCount = uint64(len(ls)) } // Distributable. e.Distributable = false // Footprint. { // Bootstrap. 
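// Same bootstrap cost as the other estimators: a small fixed model-load buffer plus the non-tensor bytes of the file (gf.Size - gf.ModelSize).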
e.Devices[0].Footprint = GGUFBytesScalar(5*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } // Weight & Parameter. { var ( wg GGUFBytesScalar ps GGUFParametersScalar ) wg = GGUFBytesScalar(ls.Bytes()) ps = GGUFParametersScalar(ls.Elements()) e.Devices[0].Weight.Compute = wg e.Devices[0].Parameter.Compute = ps } } // Types for LLaMACpp estimated summary. type ( // LLaMACppRunEstimateSummary represents the summary of the usage for loading the GGUF file in llama.cpp. LLaMACppRunEstimateSummary struct { /* Basic */ // Items Items []LLaMACppRunEstimateSummaryItem `json:"items"` /* Appendix */ // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // ClipProjectorType is the type of the projector used in the clip model. // // Only used when Architecture is "clip". ClipProjectorType string `json:"clipProjectorType,omitempty"` // AdapterType is the type of the adapter. // // Only used when Architecture is "adapter". AdapterType string `json:"adapterType,omitempty"` // ContextSize is the size of the context. ContextSize uint64 `json:"contextSize"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // EmbeddingOnly is the flag to indicate whether the model is used for embedding only, // true for embedding only. EmbeddingOnly bool `json:"embeddingOnly"` // Reranking is the flag to indicate whether the model is used for reranking, // true for reranking. // // Only available when EmbeddingOnly is true. Reranking bool `json:"reranking"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // LogicalBatchSize is the logical batch size. LogicalBatchSize int32 `json:"logicalBatchSize"` // PhysicalBatchSize is the physical batch size. PhysicalBatchSize int32 `json:"physicalBatchSize"` } // LLaMACppRunEstimateSummaryItem represents one summary item for loading the GGUF file in llama.cpp. LLaMACppRunEstimateSummaryItem struct { // OffloadLayers is the number of offloaded layers. OffloadLayers uint64 `json:"offloadLayers"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // MaximumTokensPerSecond is the maximum tokens per second for running the GGUF file. MaximumTokensPerSecond *GGUFTokensPerSecondScalar `json:"maximumTokensPerSecond,omitempty"` // RAM is the memory usage for loading the GGUF file in RAM. RAM LLaMACppRunEstimateMemory `json:"ram"` // VRAMs is the memory usage for loading the GGUF file in VRAM per device. VRAMs []LLaMACppRunEstimateMemory `json:"vrams"` } // LLaMACppRunEstimateMemory represents the memory usage for loading the GGUF file in llama.cpp. LLaMACppRunEstimateMemory struct { // HandleLayers is the number of layers that the device can handle. HandleLayers uint64 `json:"handleLayers"` // HandleLastLayer is the index of the last layer the device can handle. HandleLastLayer int `json:"handleLastLayer"` // HandleOutputLayer is the flag to indicate whether the device can handle the output layer, // true for handle. 
HandleOutputLayer bool `json:"handleOutputLayer"` // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA GGUFBytesScalar `json:"nonuma"` } ) // SummarizeItem returns the corresponding LLaMACppRunEstimateSummaryItem with the given options. func (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (emi LLaMACppRunEstimateSummaryItem) { emi.OffloadLayers, emi.FullOffloaded = e.OffloadLayers, e.FullOffloaded if emi.FullOffloaded { emi.OffloadLayers++ // The output layer is offloaded. } emi.MaximumTokensPerSecond = e.MaximumTokensPerSecond // RAM. { fp := e.Devices[0].Footprint wg := e.Devices[0].Weight.Sum() kv := e.Devices[0].KVCache.Sum() cp := e.Devices[0].Computation.Sum() emi.RAM.HandleLayers = e.Devices[0].HandleLayers emi.RAM.HandleLastLayer = e.Devices[0].HandleLastLayer emi.RAM.HandleOutputLayer = e.Devices[0].HandleOutputLayer // UMA. emi.RAM.UMA = fp + wg + kv + cp if !e.NoMMap && (mmap || e.FullOffloaded) { emi.RAM.UMA -= wg if !mmap { emi.RAM.UMA += e.Devices[0].Weight.Output emi.RAM.UMA += e.Devices[0].Weight.ComputeOverridden } } // NonUMA. emi.RAM.NonUMA = GGUFBytesScalar(nonUMARamFootprint) + emi.RAM.UMA } // VRAMs. emi.VRAMs = make([]LLaMACppRunEstimateMemory, len(e.Devices)-1) { for i, d := range e.Devices[1:] { fp := d.Footprint wg := d.Weight.Sum() kv := d.KVCache.Sum() cp := d.Computation.Sum() emi.VRAMs[i].HandleLayers = d.HandleLayers emi.VRAMs[i].HandleLastLayer = d.HandleLastLayer emi.VRAMs[i].HandleOutputLayer = d.HandleOutputLayer emi.VRAMs[i].Remote = d.Remote emi.VRAMs[i].Position = d.Position // UMA. emi.VRAMs[i].UMA = fp + wg + kv + /* cp */ 0 if !e.NoMMap && mmap { emi.VRAMs[i].UMA -= wg if d.Remote || d.Position > 0 && d.HandleLastLayer >= 0 || e.Type == "projector" { emi.VRAMs[i].UMA += wg } } // NonUMA. emi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + kv + cp if !d.Remote && d.Position > 0 && d.HandleLastLayer < 0 { emi.VRAMs[i].NonUMA -= wg + cp } } } // Add drafter's usage. if e.Drafter != nil { demi := e.Drafter.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += demi.RAM.UMA emi.RAM.NonUMA += demi.RAM.NonUMA for i, v := range demi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add projector's usage. if e.Projector != nil { pemi := e.Projector.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += pemi.RAM.UMA emi.RAM.NonUMA += pemi.RAM.NonUMA for i, v := range pemi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add adapters' usage. for i := range e.Adapters { aemi := e.Adapters[i].SummarizeItem(false, 0, 0) emi.RAM.UMA += aemi.RAM.UMA emi.RAM.NonUMA += aemi.RAM.NonUMA for j, v := range aemi.VRAMs { emi.VRAMs[j].UMA += v.UMA emi.VRAMs[j].NonUMA += v.NonUMA } } return emi } // Summarize returns the corresponding LLaMACppRunEstimateSummary with the given options. func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64) (es LLaMACppRunEstimateSummary) { // Items. 
es.Items = []LLaMACppRunEstimateSummaryItem{ e.SummarizeItem(mmap, nonUMARamFootprint, nonUMAVramFootprint), } // Just copy from the original estimate. es.Type = e.Type es.Architecture = e.Architecture es.ClipProjectorType = e.ClipProjectorType es.AdapterType = e.AdapterType es.ContextSize = e.ContextSize es.FlashAttention = e.FlashAttention es.NoMMap = e.NoMMap es.EmbeddingOnly = e.EmbeddingOnly es.Reranking = e.Reranking es.LogicalBatchSize = e.LogicalBatchSize es.PhysicalBatchSize = e.PhysicalBatchSize es.Distributable = e.Distributable return es } func (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar { return u.Input + u.Compute + u.ComputeOverridden + u.Output } func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar { return u.Key + u.Value } func (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar { return u.Footprint + u.Input + max(u.Compute, u.Output) } // ClipAligning returns the aligned value of x to the nearest multiple of n, // see https://github.com/ggml-org/llama.cpp/blob/cdf94a18023c92f41808ec874ba577d914674717/tools/mtmd/clip-impl.h#L114-L115. func ClipAligning(x, n uint64) uint64 { return ((x + n - 1) / n) * n } ================================================ FILE: file_estimate__llamacpp_test.go ================================================ package gguf_parser import ( "context" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_EstimateLLaMACppRun(t *testing.T) { ctx := context.Background() cases := []struct { name string given *GGUFFile }{ { name: "mixtral 7B", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "mixtral 8x7B", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO-GGUF", "Nous-Hermes-2-Mixtral-8x7B-DPO.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "wizardlm 8x22B", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "MaziyarPanahi/WizardLM-2-8x22B-GGUF", "WizardLM-2-8x22B.IQ1_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { f := tc.given t.Log("\n", spew.Sdump(f.EstimateLLaMACppRun()), "\n") }) } } func TestGGUFFile_EstimateLLaMACppRun_ContextSize(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } cases := []struct { name string opts []GGUFRunEstimateOption }{ {"1024(fp16)", []GGUFRunEstimateOption{WithLLaMACppContextSize(1024)}}, {"1024(fp32)", []GGUFRunEstimateOption{WithLLaMACppContextSize(1024), WithLLaMACppCacheKeyType(GGMLTypeF32), WithLLaMACppCacheValueType(GGMLTypeF32)}}, {"4096(fp16)", []GGUFRunEstimateOption{WithLLaMACppContextSize(4096)}}, {"4096(fp32)", []GGUFRunEstimateOption{WithLLaMACppContextSize(4096), WithLLaMACppCacheKeyType(GGMLTypeF32), WithLLaMACppCacheValueType(GGMLTypeF32)}}, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { t.Log("\n", spew.Sdump(f.EstimateLLaMACppRun(tc.opts...)), "\n") }) } } func TestGGUFFile_EstimateLLaMACppRun_OffloadLayers(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", 
SkipLargeMetadata()) if err != nil { t.Fatal(err) return } cases := []struct { name string opts []GGUFRunEstimateOption }{ {"offload 0 layer", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(0)}}, {"offload 1 layer", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(1)}}, {"offload 10 layers", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(10)}}, {"offload all layers", []GGUFRunEstimateOption{}}, {"offload 33 layers", []GGUFRunEstimateOption{WithLLaMACppOffloadLayers(33)}}, // exceeds the number of layers } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { t.Log("\n", spew.Sdump(f.EstimateLLaMACppRun(tc.opts...)), "\n") }) } } ================================================ FILE: file_estimate__stablediffusioncpp.go ================================================ package gguf_parser import ( "math" "strings" "golang.org/x/exp/maps" "github.com/gpustack/gguf-parser-go/util/ptr" "github.com/gpustack/gguf-parser-go/util/stringx" ) // Types for StableDiffusionCpp estimation. type ( // StableDiffusionCppRunEstimate represents the estimated result of loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimate struct { // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // ImageOnly is the flag to indicate whether the model is used for generating image, // true for generating image only. ImageOnly bool `json:"imageOnly"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` // Devices represents the usage for running the GGUF file, // the first device is the CPU, and the rest are GPUs. Devices []StableDiffusionCppRunDeviceUsage `json:"devices"` // Autoencoder is the estimated result of the autoencoder. Autoencoder *StableDiffusionCppRunEstimate `json:"autoencoder,omitempty"` // Conditioners is the estimated result of the conditioners. Conditioners []StableDiffusionCppRunEstimate `json:"conditioners,omitempty"` // Upscaler is the estimated result of the upscaler. Upscaler *StableDiffusionCppRunEstimate `json:"upscaler,omitempty"` // ControlNet is the estimated result of the control net. ControlNet *StableDiffusionCppRunEstimate `json:"controlNet,omitempty"` } // StableDiffusionCppRunDeviceUsage represents the usage for running the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunDeviceUsage struct { // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // Footprint is the memory footprint for bootstrapping. Footprint GGUFBytesScalar `json:"footprint"` // Parameter is the running parameters that the device processes.
Parameter GGUFParametersScalar `json:"parameter"` // Weight is the memory usage of weights that the device loads. Weight GGUFBytesScalar `json:"weight"` // Computation is the memory usage of computation that the device processes. Computation GGUFBytesScalar `json:"computation"` } ) // EstimateStableDiffusionCppRun estimates the usages of the GGUF file in stable-diffusion.cpp. func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) (e StableDiffusionCppRunEstimate) { // Options var o _GGUFRunEstimateOptions for _, opt := range opts { opt(&o) } switch { case o.TensorSplitFraction == nil: o.TensorSplitFraction = []float64{1} o.MainGPUIndex = 0 case o.MainGPUIndex < 0 || o.MainGPUIndex >= len(o.TensorSplitFraction): panic("main device index must be range of 0 to the length of tensor split fraction") } if len(o.DeviceMetrics) > 0 { for i, j := 0, len(o.DeviceMetrics)-1; i < len(o.TensorSplitFraction)-j; i++ { o.DeviceMetrics = append(o.DeviceMetrics, o.DeviceMetrics[j]) } o.DeviceMetrics = o.DeviceMetrics[:len(o.TensorSplitFraction)+1] } if o.SDCOffloadLayers == nil { o.SDCOffloadLayers = ptr.To[uint64](math.MaxUint64) } if o.SDCBatchCount == nil { o.SDCBatchCount = ptr.To[int32](1) } if o.SDCHeight == nil { o.SDCHeight = ptr.To[uint32](1024) } if o.SDCWidth == nil { o.SDCWidth = ptr.To[uint32](1024) } if o.SDCOffloadConditioner == nil { o.SDCOffloadConditioner = ptr.To(true) } if o.SDCOffloadAutoencoder == nil { o.SDCOffloadAutoencoder = ptr.To(true) } if o.SDCAutoencoderTiling == nil { o.SDCAutoencoderTiling = ptr.To(false) } if o.SDCFreeComputeMemoryImmediately == nil { o.SDCFreeComputeMemoryImmediately = ptr.To(false) } // Devices. initDevices := func(e *StableDiffusionCppRunEstimate) { for j := range e.Devices[1:] { e.Devices[j+1].Remote = j < len(o.RPCServers) if e.Devices[j+1].Remote { e.Devices[j+1].Position = j } else { e.Devices[j+1].Position = j - len(o.RPCServers) } } } e.Devices = make([]StableDiffusionCppRunDeviceUsage, len(o.TensorSplitFraction)+1) initDevices(&e) // Metadata. a := gf.Architecture() e.Type = a.Type e.Architecture = normalizeArchitecture(a.DiffusionArchitecture) // Flash attention. if o.FlashAttention && !strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3") { // NB(thxCode): Stable Diffusion 3 doesn't support flash attention yet, // see https://github.com/leejet/stable-diffusion.cpp/pull/386. e.FlashAttention = true } // Distributable. e.Distributable = true // Offload. e.FullOffloaded = *o.SDCOffloadLayers > 0 // NoMMap. e.NoMMap = true // TODO: Implement this. // ImageOnly. e.ImageOnly = true // TODO: Implement this. // Autoencoder. if a.DiffusionAutoencoder != nil { ae := &StableDiffusionCppRunEstimate{ Type: "model", Architecture: e.Architecture + "_vae", FlashAttention: e.FlashAttention, Distributable: e.Distributable, FullOffloaded: e.FullOffloaded && *o.SDCOffloadAutoencoder, NoMMap: e.NoMMap, Devices: make([]StableDiffusionCppRunDeviceUsage, len(e.Devices)), } initDevices(ae) e.Autoencoder = ae } // Conditioners. 
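// Each text encoder declared by the diffusion model becomes its own sub-estimate, inheriting the flash-attention, distributable and offload settings resolved above.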
if len(a.DiffusionConditioners) != 0 { e.Conditioners = make([]StableDiffusionCppRunEstimate, 0, len(a.DiffusionConditioners)) for i := range a.DiffusionConditioners { cd := StableDiffusionCppRunEstimate{ Type: "model", Architecture: normalizeArchitecture(a.DiffusionConditioners[i].Architecture), FlashAttention: e.FlashAttention, Distributable: e.Distributable, FullOffloaded: e.FullOffloaded && *o.SDCOffloadConditioner, NoMMap: e.NoMMap, Devices: make([]StableDiffusionCppRunDeviceUsage, len(e.Devices)), } initDevices(&cd) e.Conditioners = append(e.Conditioners, cd) } } // Footprint { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(10*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ } var cdLs, aeLs, dmLs GGUFLayerTensorInfos { ls := gf.Layers() cdLs, aeLs, _ = ls.Cut([]string{ "cond_stage_model.*", }) aeLs, dmLs, _ = aeLs.Cut([]string{ "first_stage_model.*", }) } var cdDevIdx, aeDevIdx, dmDevIdx int { if *o.SDCOffloadConditioner && *o.SDCOffloadLayers > 0 { cdDevIdx = 1 } if *o.SDCOffloadAutoencoder && *o.SDCOffloadLayers > 0 { aeDevIdx = 1 if len(e.Devices) > 3 { aeDevIdx = 2 } } if *o.SDCOffloadLayers > 0 { dmDevIdx = 1 switch { case len(e.Devices) > 3: dmDevIdx = 3 case len(e.Devices) > 2: dmDevIdx = 2 } } } // Weight & Parameter. { // Conditioners. for i := range cdLs { e.Conditioners[i].Devices[cdDevIdx].Weight = GGUFBytesScalar(cdLs[i].Bytes()) e.Conditioners[i].Devices[cdDevIdx].Parameter = GGUFParametersScalar(cdLs[i].Elements()) } // Autoencoder. if len(aeLs) != 0 { e.Autoencoder.Devices[aeDevIdx].Weight = GGUFBytesScalar(aeLs.Bytes()) e.Autoencoder.Devices[aeDevIdx].Parameter = GGUFParametersScalar(aeLs.Elements()) } // Model. e.Devices[dmDevIdx].Weight = GGUFBytesScalar(dmLs.Bytes()) e.Devices[dmDevIdx].Parameter = GGUFParametersScalar(dmLs.Elements()) } // Computation. { // See https://github.com/leejet/stable-diffusion.cpp/blob/10c6501bd05a697e014f1bee3a84e5664290c489/ggml_extend.hpp#L1058C9-L1058C23. var maxNodes uint64 = 32768 // Bootstrap, compute metadata. cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false) e.Devices[0].Computation = GGUFBytesScalar(cm) // Work context, // see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1467-L1481, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1572-L1586, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1675-L1679. // { zChannels := uint64(4) if a.DiffusionTransformer { zChannels = 16 } // See https://github.com/thxCode/stable-diffusion.cpp/blob/1ae97f8a8ca3615bdaf9c1fd32c13562e2471833/stable-diffusion.cpp#L2682-L2691. usage := uint64(128 * 1024 * 1024) /* 128MiB, LLaMA Box */ usage += uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * 3 /* output channels */ * 4 /* sizeof(float) */ * zChannels e.Devices[0].Computation += GGUFBytesScalar(usage * uint64(ptr.Deref(o.ParallelSize, 1)) /* max batch */) } // Encode usage, // see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L388-L391, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L758-L766, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L1083-L1085. 
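// Conditioner compute buffers are sized from the text-encoder output shapes (embedding width x token count) enumerated per architecture below, doubled to include the conditioner itself.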
{ var tes [][]uint64 switch { case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): // FLUX.1 tes = [][]uint64{ {768, 77}, {4096, 256}, } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): // SD 3.x tes = [][]uint64{ {768, 77}, {1280, 77}, {4096, 77}, } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion XL"): // SD XL/XL Refiner if strings.HasSuffix(a.DiffusionArchitecture, "Refiner") { tes = [][]uint64{ {1280, 77}, } } else { tes = [][]uint64{ {768, 77}, {1280, 77}, } } default: // SD 1.x/2.x tes = [][]uint64{ {768, 77}, } } for i := range cdLs { usage := GGMLTypeF32.RowSizeOf(tes[i]) * 2 /* include conditioner */ e.Conditioners[i].Devices[cdDevIdx].Computation += GGUFBytesScalar(usage) } // TODO VAE Encode } // Diffusing usage. if !*o.SDCFreeComputeMemoryImmediately { var usage uint64 switch { case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): // FLUX.1 usage = GuessFLUXDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): // SD 3.x const ( sd3MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3 Medium sd35MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.ln_k.weight" // SD 3.5 Medium sd35LargeKey = "model.diffusion_model.joint_blocks.37.x_block.attn.ln_k.weight" // SD 3.5 Large ) m, _ := dmLs.Index([]string{sd3MediumKey, sd35MediumKey, sd35LargeKey}) switch { case m[sd35LargeKey].Name != "": usage = GuessSD35LargeDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) case m[sd35MediumKey].Name != "": usage = GuessSD35MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) default: usage = GuessSD3MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion XL"): // SD XL/XL Refiner const ( sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner ) m, _ := dmLs.Index([]string{sdXlKey, sdXlRefinerKey}) if m[sdXlRefinerKey].Name != "" { usage = GuessSDXLRefinerDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } else { usage = GuessSDXLDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 2"): // SD 2.x usage = GuessSD2DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) default: // SD 1.x usage = GuessSD1DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) } e.Devices[dmDevIdx].Computation += GGUFBytesScalar(usage) } // Decode usage. if len(aeLs) != 0 && !*o.SDCFreeComputeMemoryImmediately { // Bootstrap. 
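// The decoder reserves a fixed ~100 MiB bootstrap, then scratch proportional to the output resolution (or a fixed 512x512 tile when tiling is enabled) times the first-convolution width discovered below.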
e.Autoencoder.Devices[aeDevIdx].Footprint += GGUFBytesScalar(100 * 1024 * 1024) /* 100 MiB */ var convDim uint64 { m, _ := aeLs.Index([]string{ "first_stage_model.decoder.conv_in.weight", "decoder.conv_in.weight", }) tis := maps.Values(m) if len(tis) != 0 && tis[0].NDimensions > 3 { convDim = max(tis[0].Dimensions[0], tis[0].Dimensions[3]) } } var usage uint64 if !*o.SDCAutoencoderTiling { usage = uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * (3 /* output channels */ * 4 /* sizeof(float) */ + 1) * convDim } else { usage = 512 * 512 * (3 /* output channels */ * 4 /* sizeof(float) */ + 1) * convDim } e.Autoencoder.Devices[aeDevIdx].Computation += GGUFBytesScalar(usage) } } return e } // Types for StableDiffusionCpp estimated summary. type ( // StableDiffusionCppRunEstimateSummary represents the estimated summary of loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimateSummary struct { /* Basic */ // Items Items []StableDiffusionCppRunEstimateSummaryItem `json:"items"` /* Appendix */ // Type describes what type this GGUF file is. Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // FlashAttention is the flag to indicate whether enable the flash attention, // true for enable. FlashAttention bool `json:"flashAttention"` // NoMMap is the flag to indicate whether the file must be loaded without mmap, // true for total loaded. NoMMap bool `json:"noMMap"` // ImageOnly is the flag to indicate whether the model is used for generating image, // true for generating image only. ImageOnly bool `json:"imageOnly"` // Distributable is the flag to indicate whether the model is distributable, // true for distributable. Distributable bool `json:"distributable"` } // StableDiffusionCppRunEstimateSummaryItem represents the estimated summary item of loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimateSummaryItem struct { // FullOffloaded is the flag to indicate whether the layers are fully offloaded, // false for partial offloaded or zero offloaded. FullOffloaded bool `json:"fullOffloaded"` // RAM is the memory usage for loading the GGUF file in RAM. RAM StableDiffusionCppRunEstimateMemory `json:"ram"` // VRAMs is the memory usage for loading the GGUF file in VRAM per device. VRAMs []StableDiffusionCppRunEstimateMemory `json:"vrams"` } // StableDiffusionCppRunEstimateMemory represents the memory usage for loading the GGUF file in stable-diffusion.cpp. StableDiffusionCppRunEstimateMemory struct { // Remote is the flag to indicate whether the device is remote, // true for remote. Remote bool `json:"remote"` // Position is the relative position of the device, // starts from 0. // // If Remote is true, Position is the position of the remote devices, // Otherwise, Position is the position of the device in the local devices. Position int `json:"position"` // UMA represents the usage of Unified Memory Architecture. UMA GGUFBytesScalar `json:"uma"` // NonUMA represents the usage of Non-Unified Memory Architecture. NonUMA GGUFBytesScalar `json:"nonuma"` } ) // SummarizeItem returns the corresponding StableDiffusionCppRunEstimateSummaryItem with the given options. func (e StableDiffusionCppRunEstimate) SummarizeItem( mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64, ) (emi StableDiffusionCppRunEstimateSummaryItem) { emi.FullOffloaded = e.FullOffloaded // RAM. { fp := e.Devices[0].Footprint wg := e.Devices[0].Weight cp := e.Devices[0].Computation // UMA.
emi.RAM.UMA = fp + wg + cp // NonUMA. emi.RAM.NonUMA = GGUFBytesScalar(nonUMARamFootprint) + emi.RAM.UMA } // VRAMs. emi.VRAMs = make([]StableDiffusionCppRunEstimateMemory, len(e.Devices)-1) { for i, d := range e.Devices[1:] { fp := d.Footprint wg := d.Weight cp := d.Computation emi.VRAMs[i].Remote = d.Remote emi.VRAMs[i].Position = d.Position // UMA. emi.VRAMs[i].UMA = fp + wg + /* cp */ 0 if d.Remote { emi.VRAMs[i].UMA += cp } // NonUMA. emi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + cp } } // Add autoencoder's usage. if e.Autoencoder != nil { aemi := e.Autoencoder.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += aemi.RAM.UMA emi.RAM.NonUMA += aemi.RAM.NonUMA for i, v := range aemi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add conditioners' usage. for i := range e.Conditioners { cemi := e.Conditioners[i].SummarizeItem(mmap, 0, 0) emi.RAM.UMA += cemi.RAM.UMA emi.RAM.NonUMA += cemi.RAM.NonUMA for i, v := range cemi.VRAMs { emi.VRAMs[i].UMA += v.UMA emi.VRAMs[i].NonUMA += v.NonUMA } } // Add upscaler's usage. if e.Upscaler != nil { uemi := e.Upscaler.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += uemi.RAM.UMA emi.RAM.NonUMA += uemi.RAM.NonUMA // NB(thxCode): all VRAMs should offload to the first device at present. var vramUMA, vramNonUMA GGUFBytesScalar for _, v := range uemi.VRAMs { vramUMA += v.UMA vramNonUMA += v.NonUMA } if e.Upscaler.FullOffloaded { emi.VRAMs[0].UMA += vramUMA emi.VRAMs[0].NonUMA += vramNonUMA } else { emi.RAM.UMA += vramUMA emi.RAM.NonUMA += vramNonUMA } } // Add control net's usage. if e.ControlNet != nil { cnemi := e.ControlNet.SummarizeItem(mmap, 0, 0) emi.RAM.UMA += cnemi.RAM.UMA emi.RAM.NonUMA += cnemi.RAM.NonUMA // NB(thxCode): all VRAMs should offload to the first device at present. var vramUMA, vramNonUMA GGUFBytesScalar for _, v := range cnemi.VRAMs { vramUMA += v.UMA vramNonUMA += v.NonUMA } if e.ControlNet.FullOffloaded { emi.VRAMs[0].UMA += vramUMA emi.VRAMs[0].NonUMA += vramNonUMA } else { emi.RAM.UMA += vramUMA emi.RAM.NonUMA += vramNonUMA } } return emi } // Summarize returns the corresponding StableDiffusionCppRunEstimateSummary with the given options. func (e StableDiffusionCppRunEstimate) Summarize( mmap bool, nonUMARamFootprint, nonUMAVramFootprint uint64, ) (es StableDiffusionCppRunEstimateSummary) { // Items. es.Items = []StableDiffusionCppRunEstimateSummaryItem{ e.SummarizeItem(mmap, nonUMARamFootprint, nonUMAVramFootprint), } // Just copy from the original estimate. es.Type = e.Type es.Architecture = e.Architecture es.FlashAttention = e.FlashAttention es.NoMMap = e.NoMMap es.ImageOnly = e.ImageOnly es.Distributable = e.Distributable return es } func normalizeArchitecture(arch string) string { return stringx.ReplaceAllFunc(arch, func(r rune) rune { switch r { case ' ', '.', '-', '/', ':': return '_' // Replace with underscore. } if r >= 'A' && r <= 'Z' { r += 'a' - 'A' // Lowercase.
} return r }) } ================================================ FILE: file_estimate__stablediffusioncpp_test.go ================================================ package gguf_parser import ( "context" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_EstimateStableDiffusionRun(t *testing.T) { ctx := context.Background() cases := []struct { name string given *GGUFFile }{ { name: "sd 1.5", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-v1-5-GGUF", "stable-diffusion-v1-5-FP16.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "sd 2.1", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-v2-1-GGUF", "stable-diffusion-v2-1-Q8_0.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "sd xl", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-xl-base-1.0-GGUF", "stable-diffusion-xl-base-1.0-FP16.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "sd 3.5 large", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/stable-diffusion-v3-5-large-GGUF", "stable-diffusion-v3-5-large-Q4_0.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, { name: "flux .1 dev", given: func() *GGUFFile { f, err := ParseGGUFFileFromHuggingFace( ctx, "gpustack/FLUX.1-dev-GGUF", "FLUX.1-dev-Q4_0.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) } return f }(), }, } for _, tc := range cases { t.Run(tc.name, func(t *testing.T) { f := tc.given t.Log("\n", spew.Sdump(f.EstimateStableDiffusionCppRun()), "\n") }) } } ================================================ FILE: file_estimate_option.go ================================================ package gguf_parser import ( "regexp" "slices" "strconv" "github.com/gpustack/gguf-parser-go/util/ptr" ) type ( _GGUFRunEstimateOptions struct { // Common ParallelSize *int32 FlashAttention bool MainGPUIndex int RPCServers []string TensorSplitFraction []float64 OverriddenTensors []*GGUFRunOverriddenTensor DeviceMetrics []GGUFRunDeviceMetric // LLaMACpp (LMC) specific LMCContextSize *int32 LMCRoPEFrequencyBase *float32 LMCRoPEFrequencyScale *float32 LMCRoPEScalingType *string LMCRoPEScalingOriginalContextSize *int32 LMCInMaxContextSize bool LMCLogicalBatchSize *int32 LMCPhysicalBatchSize *int32 LMCVisualMaxImageSize *uint32 LMCMaxProjectedCache *uint32 LMCCacheKeyType *GGMLType LMCCacheValueType *GGMLType LMCOffloadKVCache *bool LMCOffloadLayers *uint64 LMCSplitMode LLaMACppSplitMode LMCFullSizeSWACache bool LMCProjector *LLaMACppRunEstimate LMCDrafter *LLaMACppRunEstimate LMCAdapters []LLaMACppRunEstimate // StableDiffusionCpp (SDC) specific SDCOffloadLayers *uint64 SDCBatchCount *int32 SDCHeight *uint32 SDCWidth *uint32 SDCOffloadConditioner *bool SDCOffloadAutoencoder *bool SDCAutoencoderTiling *bool SDCFreeComputeMemoryImmediately *bool SDCUpscaler *StableDiffusionCppRunEstimate SDCControlNet *StableDiffusionCppRunEstimate } // GGUFRunOverriddenTensor holds the overridden tensor information for the estimate. // // When BufferType is CPU, // it indicates that the tensor should be loaded into the CPU memory, // even if it belongs to a GPU offload layer. GGUFRunOverriddenTensor struct { // PatternRegex is the regex pattern to match the tensor name. PatternRegex *regexp.Regexp // BufferType is the buffer type to override, // it can be "CPU", "CUDA0", "Metal" and others. 
BufferType string // _BufferType records the parsed buffer type, used internally. _BufferType GGUFRunOverriddenTensorBufferType // _Index records the parsed device index, used internally. _Index string } // GGUFRunDeviceMetric holds the device metric for the estimate. // // When the device represents a CPU, // FLOPS refers to the floating-point operations per second of that CPU, // while UpBandwidth indicates the bandwidth of the RAM (since SRAM is typically small and cannot hold all weights, // the RAM here refers to the bandwidth of DRAM, // unless the device's SRAM can accommodate the corresponding model weights). // // When the device represents a GPU, // FLOPS refers to the floating-point operations per second of that GPU, // while UpBandwidth indicates the bandwidth of the VRAM. // // When the device represents a specific node, // FLOPS depends on whether a CPU or GPU is being used, // while UpBandwidth refers to the network bandwidth between nodes. GGUFRunDeviceMetric struct { // FLOPS is the floating-point operations per second of the device. FLOPS FLOPSScalar // UpBandwidth is the bandwidth of the device to transmit data to calculate, // unit is Bps (bytes per second). UpBandwidth BytesPerSecondScalar // DownBandwidth is the bandwidth of the device to transmit calculated result to next layer, // unit is Bps (bytes per second). DownBandwidth BytesPerSecondScalar } // GGUFRunEstimateOption is an option for the estimate. GGUFRunEstimateOption func(*_GGUFRunEstimateOptions) ) // GGUFRunOverriddenTensorBufferType is the type of the overridden tensor buffer. type GGUFRunOverriddenTensorBufferType uint32 const ( _ GGUFRunOverriddenTensorBufferType = iota GGUFRunOverriddenTensorBufferTypeCPU GGUFRunOverriddenTensorBufferTypeGPU GGUFRunOverriddenTensorBufferTypeRPC GGUFRunOverriddenTensorBufferTypeUnknown ) var ( _GGUFRunOverriddenTensorBufferTypeCPURegex = regexp.MustCompile(`^(CPU|AMX)`) _GGUFRunOverriddenTensorBufferTypeUMAGPURegex = regexp.MustCompile(`^(Metal|OpenCL)`) _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex = regexp.MustCompile(`^(CUDA|CANN|ROCm|MUSA|SYCL|Vulkan|Kompute)(\d+)?`) _GGUFRunOverriddenTensorBufferTypeRPCRegex = regexp.MustCompile(`^RPC\[(.*)\]`) ) // ParseBufferType returns the parsed buffer type and the device index of the overridden tensor. // // The device index is used to determine which device the tensor belongs to, // and is derived from the buffer type description.
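//
// A brief sketch of what the regexes above imply (results are derived by reading the
// matching logic below, not from a verified run; the RPC endpoint is illustrative):
//
//	(&GGUFRunOverriddenTensor{BufferType: "CPU"}).ParseBufferType()             // CPU, "0"
//	(&GGUFRunOverriddenTensor{BufferType: "CUDA1"}).ParseBufferType()           // GPU, "1"
//	(&GGUFRunOverriddenTensor{BufferType: "RPC[host:50052]"}).ParseBufferType() // RPC, "host:50052"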
func (odt *GGUFRunOverriddenTensor) ParseBufferType() (GGUFRunOverriddenTensorBufferType, string) { if odt == nil { return GGUFRunOverriddenTensorBufferTypeUnknown, "" } if odt._BufferType == 0 { odt._BufferType = GGUFRunOverriddenTensorBufferTypeUnknown if ms := _GGUFRunOverriddenTensorBufferTypeCPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeCPU, "0" } if ms := _GGUFRunOverriddenTensorBufferTypeUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, "1" } if ms := _GGUFRunOverriddenTensorBufferTypeRPCRegex.FindStringSubmatch(odt.BufferType); len(ms) > 1 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeRPC, ms[1] } if ms := _GGUFRunOverriddenTensorBufferTypeNonUMAGPURegex.FindStringSubmatch(odt.BufferType); len(ms) > 2 { if idx, err := strconv.ParseInt(ms[2], 10, 64); err == nil && idx >= 0 { odt._BufferType, odt._Index = GGUFRunOverriddenTensorBufferTypeGPU, ms[2] } } } return odt._BufferType, odt._Index } // WithParallelSize sets the (decoding sequences) parallel size for the estimate. func WithParallelSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.ParallelSize = &size } } // WithFlashAttention sets the flash attention flag. func WithFlashAttention() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.FlashAttention = true } } // WithMainGPUIndex sets the main device for the estimate. // // When split mode is LLaMACppSplitModeNone, the main device is the only device. // When split mode is LLaMACppSplitModeRow, the main device handles the intermediate results and KV. // // WithMainGPUIndex needs to be combined with WithTensorSplitFraction. func WithMainGPUIndex(di int) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.MainGPUIndex = di } } // WithRPCServers sets the RPC servers for the estimate. func WithRPCServers(srvs []string) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(srvs) == 0 { return } o.RPCServers = srvs } } // WithTensorSplitFraction sets the tensor split cumulative fractions for the estimate. // // WithTensorSplitFraction accepts a slice of cumulative fractions, // all fraction values must be in the range of [0, 1], // and the last fraction must be 1. // // For example, WithTensorSplitFraction([]float64{0.2, 0.4, 0.6, 0.8, 1}) will split the tensors into five parts with 20% each. func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(fractions) == 0 { return } for _, f := range fractions { if f < 0 || f > 1 { return } } if fractions[len(fractions)-1] != 1 { return } o.TensorSplitFraction = fractions } } // WithOverriddenTensors sets the overridden tensors for the estimate. func WithOverriddenTensors(tensors []GGUFRunOverriddenTensor) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(tensors) == 0 { return } for _, t := range tensors { if t.PatternRegex == nil || t.BufferType == "" { return } } o.OverriddenTensors = make([]*GGUFRunOverriddenTensor, len(tensors)) for i := range tensors { o.OverriddenTensors[i] = &tensors[i] } } } // WithDeviceMetrics sets the device metrics for the estimate.
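//
// A hedged usage sketch; the numbers are illustrative only and assume the scalar
// types accept plain numeric literals:
//
//	WithDeviceMetrics([]GGUFRunDeviceMetric{
//		{FLOPS: 2e12, UpBandwidth: 50e9, DownBandwidth: 50e9},    // CPU backed by DRAM
//		{FLOPS: 80e12, UpBandwidth: 900e9, DownBandwidth: 900e9}, // GPU backed by VRAM
//	})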
func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(metrics) == 0 { return } o.DeviceMetrics = metrics } } // WithLLaMACppContextSize sets the context size for the estimate. func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.LMCContextSize = &size } } // WithLLaMACppRoPE sets the RoPE parameters for the estimate. func WithLLaMACppRoPE( frequencyBase float64, frequencyScale float64, scalingType string, scalingOriginalContextSize int32, ) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if frequencyBase > 0 { o.LMCRoPEFrequencyBase = ptr.Float32(float32(frequencyBase)) } if frequencyScale > 0 { o.LMCRoPEFrequencyScale = ptr.Float32(float32(frequencyScale)) } if slices.Contains([]string{"none", "linear", "yarn"}, scalingType) { o.LMCRoPEScalingType = &scalingType } if scalingOriginalContextSize > 0 { o.LMCRoPEScalingOriginalContextSize = ptr.To(scalingOriginalContextSize) } } } // WithinLLaMACppMaxContextSize limits the context size to the maximum, // if the context size is over the maximum. func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCInMaxContextSize = true } } // WithLLaMACppLogicalBatchSize sets the logical batch size for the estimate. func WithLLaMACppLogicalBatchSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.LMCLogicalBatchSize = &size } } // WithLLaMACppPhysicalBatchSize sets the physical batch size for the estimate. func WithLLaMACppPhysicalBatchSize(size int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size <= 0 { return } o.LMCPhysicalBatchSize = &size } } // _GGUFEstimateCacheTypeAllowList is the allow list of cache key and value types. var _GGUFEstimateCacheTypeAllowList = []GGMLType{ GGMLTypeF32, GGMLTypeF16, GGMLTypeBF16, GGMLTypeQ8_0, GGMLTypeQ4_0, GGMLTypeQ4_1, GGMLTypeIQ4_NL, GGMLTypeQ5_0, GGMLTypeQ5_1, } // WithLLaMACppCacheKeyType sets the cache key type for the estimate. func WithLLaMACppCacheKeyType(t GGMLType) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if slices.Contains(_GGUFEstimateCacheTypeAllowList, t) { o.LMCCacheKeyType = &t } } } // WithLLaMACppCacheValueType sets the cache value type for the estimate. func WithLLaMACppCacheValueType(t GGMLType) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if slices.Contains(_GGUFEstimateCacheTypeAllowList, t) { o.LMCCacheValueType = &t } } } // WithoutLLaMACppOffloadKVCache disables offloading the KV cache. func WithoutLLaMACppOffloadKVCache() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCOffloadKVCache = ptr.To(false) } } // WithLLaMACppOffloadLayers sets the number of layers to offload. func WithLLaMACppOffloadLayers(layers uint64) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCOffloadLayers = &layers } } // LLaMACppSplitMode is the split mode for LLaMACpp. type LLaMACppSplitMode uint const ( LLaMACppSplitModeLayer LLaMACppSplitMode = iota LLaMACppSplitModeRow LLaMACppSplitModeNone _LLAMACppSplitModeMax ) // WithLLaMACppSplitMode sets the split mode for the estimate. 
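//
// For example, to mirror llama.cpp's row-wise split, where the main GPU (see
// WithMainGPUIndex) keeps the intermediate results and KV:
//
//	WithLLaMACppSplitMode(LLaMACppSplitModeRow)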
func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if mode < _LLAMACppSplitModeMax { o.LMCSplitMode = mode } } } // WithLLaMACppFullSizeSWACache enables full size sliding window attention cache. func WithLLaMACppFullSizeSWACache() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCFullSizeSWACache = true } } // WithLLaMACppVisualMaxImageSize sets the visual maximum image size input for the estimate. func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if size == 0 { return } o.LMCVisualMaxImageSize = &size } } // WithLLaMACppMaxProjectedCache sets the maximum projected embedding cache for the estimate. func WithLLaMACppMaxProjectedCache(cacheSize uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if cacheSize == 0 { return } o.LMCMaxProjectedCache = ptr.To(cacheSize) } } // WithLLaMACppDrafter sets the drafter estimate usage. func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCDrafter = dft } } // WithLLaMACppProjector sets the multimodal projector estimate usage. func WithLLaMACppProjector(prj *LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.LMCProjector = prj } } // WithLLaMACppAdapters sets the adapters estimate usage. func WithLLaMACppAdapters(adp []LLaMACppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if len(adp) == 0 { return } o.LMCAdapters = adp } } // WithStableDiffusionCppOffloadLayers sets the number of layers to offload. func WithStableDiffusionCppOffloadLayers(layers uint64) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCOffloadLayers = &layers } } // WithStableDiffusionCppBatchCount sets the batch count for the estimate. func WithStableDiffusionCppBatchCount(count int32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if count == 0 { return } o.SDCBatchCount = ptr.To(count) } } // WithStableDiffusionCppHeight sets the image height for the estimate. func WithStableDiffusionCppHeight(height uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if height == 0 { return } o.SDCHeight = ptr.To(height) } } // WithStableDiffusionCppWidth sets the image width for the estimate. func WithStableDiffusionCppWidth(width uint32) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { if width == 0 { return } o.SDCWidth = ptr.To(width) } } // WithoutStableDiffusionCppOffloadConditioner disables offloading the conditioner(text encoder). func WithoutStableDiffusionCppOffloadConditioner() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCOffloadConditioner = ptr.To(false) } } // WithoutStableDiffusionCppOffloadAutoencoder disables offloading the autoencoder. func WithoutStableDiffusionCppOffloadAutoencoder() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCOffloadAutoencoder = ptr.To(false) } } // WithStableDiffusionCppAutoencoderTiling enables tiling for the autoencoder. func WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCAutoencoderTiling = ptr.To(true) } } // WithStableDiffusionCppFreeComputeMemoryImmediately enables freeing compute memory immediately. 
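//
// When enabled, the conditioning, diffusing, and decoding compute buffers are excluded
// from the estimate (see EstimateStableDiffusionCppRun), modeling a runtime that releases
// each compute graph as soon as it finishes. A minimal sketch, assuming the estimate
// entrypoint accepts options variadically:
//
//	e := f.EstimateStableDiffusionCppRun(WithStableDiffusionCppFreeComputeMemoryImmediately())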
func WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCFreeComputeMemoryImmediately = ptr.To(true) } } // WithStableDiffusionCppUpscaler sets the upscaler estimate usage. func WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCUpscaler = ups } } // WithStableDiffusionCppControlNet sets the control net estimate usage. func WithStableDiffusionCppControlNet(cn *StableDiffusionCppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { o.SDCControlNet = cn } } ================================================ FILE: file_from_distro.go ================================================ package gguf_parser import ( "context" "errors" "fmt" "net/http" "path/filepath" "time" "github.com/gpustack/gguf-parser-go/util/httpx" ) var ( ErrOllamaInvalidModel = errors.New("ollama invalid model") ErrOllamaBaseLayerNotFound = errors.New("ollama base layer not found") ) // ParseGGUFFileFromOllama parses a GGUF file from Ollama model's base layer, // and returns a GGUFFile, or an error if any. func ParseGGUFFileFromOllama(ctx context.Context, model string, opts ...GGUFReadOption) (*GGUFFile, error) { return ParseGGUFFileFromOllamaModel(ctx, ParseOllamaModel(model), opts...) } // ParseGGUFFileFromOllamaModel is similar to ParseGGUFFileFromOllama, // but inputs an OllamaModel instead of a string. // // The given OllamaModel will be completed(fetching MediaType, Config and Layers) after calling this function. func ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaModel, opts ...GGUFReadOption) (gf *GGUFFile, err error) { if model == nil { return nil, ErrOllamaInvalidModel } opts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection()) var o _GGUFReadOptions for _, opt := range opts { opt(&o) } // Cache. { if o.CachePath != "" { o.CachePath = filepath.Join(o.CachePath, "distro", "ollama") } c := GGUFFileCache(o.CachePath) // Get from cache. if gf, err = c.Get(model.String(), o.CacheExpiration); err == nil { return gf, nil } // Put to cache. defer func() { if err == nil { _ = c.Put(model.String(), gf) } }() } var cli *http.Client cli = httpx.Client( httpx.ClientOptions(). WithUserAgent(OllamaUserAgent()). If(o.Debug, func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithDebug() }). WithTimeout(0). WithRetryBackoff(1*time.Second, 5*time.Second, 10). WithRetryIf(func(resp *http.Response, err error) bool { return httpx.DefaultRetry(resp, err) || OllamaRegistryAuthorizeRetry(resp, cli) }). WithTransport( httpx.TransportOptions(). WithoutKeepalive(). TimeoutForDial(10*time.Second). TimeoutForTLSHandshake(5*time.Second). If(o.SkipProxy, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutProxy() }). If(o.ProxyURL != nil, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithProxy(http.ProxyURL(o.ProxyURL)) }). If(o.SkipTLSVerification, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutInsecureVerify() }). 
If(o.SkipDNSCache, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutDNSCache() }))) var ml OllamaModelLayer { err := model.Complete(ctx, cli) if err != nil { return nil, fmt.Errorf("complete ollama model: %w", err) } var ok bool ml, ok = model.GetLayer("application/vnd.ollama.image.model") if !ok { return nil, ErrOllamaBaseLayerNotFound } } return parseGGUFFileFromRemote(ctx, cli, ml.BlobURL().String(), o) } ================================================ FILE: file_from_remote.go ================================================ package gguf_parser import ( "context" "fmt" "io" "net/http" "path/filepath" "strings" "time" "github.com/gpustack/gguf-parser-go/util/httpx" "github.com/gpustack/gguf-parser-go/util/osx" ) // ParseGGUFFileFromHuggingFace parses a GGUF file from Hugging Face(https://huggingface.co/), // and returns a GGUFFile, or an error if any. func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) { ep := osx.Getenv("HF_ENDPOINT", "https://huggingface.co") return ParseGGUFFileRemote(ctx, fmt.Sprintf("%s/%s/resolve/main/%s", ep, repo, file), opts...) } // ParseGGUFFileFromModelScope parses a GGUF file from Model Scope(https://modelscope.cn/), // and returns a GGUFFile, or an error if any. func ParseGGUFFileFromModelScope(ctx context.Context, repo, file string, opts ...GGUFReadOption) (*GGUFFile, error) { ep := osx.Getenv("MS_ENDPOINT", "https://modelscope.cn") opts = append(opts[:len(opts):len(opts)], SkipRangeDownloadDetection()) return ParseGGUFFileRemote(ctx, fmt.Sprintf("%s/models/%s/resolve/master/%s", ep, repo, file), opts...) } // ParseGGUFFileRemote parses a GGUF file from a remote BlobURL, // and returns a GGUFFile, or an error if any. func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFReadOption) (gf *GGUFFile, err error) { var o _GGUFReadOptions for _, opt := range opts { opt(&o) } // Cache. { if o.CachePath != "" { o.CachePath = filepath.Join(o.CachePath, "remote") if o.SkipLargeMetadata { o.CachePath = filepath.Join(o.CachePath, "brief") } } c := GGUFFileCache(o.CachePath) // Get from cache. if gf, err = c.Get(url, o.CacheExpiration); err == nil { return gf, nil } // Put to cache. defer func() { if err == nil { _ = c.Put(url, gf) } }() } cli := httpx.Client( httpx.ClientOptions(). WithUserAgent("gguf-parser-go"). If(o.Debug, func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithDebug() }, ). If(o.BearerAuthToken != "", func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithBearerAuth(o.BearerAuthToken) }, ). If(len(o.Headers) > 0, func(x *httpx.ClientOption) *httpx.ClientOption { return x.WithHeaders(o.Headers) }, ). WithTimeout(0). WithTransport( httpx.TransportOptions(). WithoutKeepalive(). TimeoutForDial(5*time.Second). TimeoutForTLSHandshake(5*time.Second). TimeoutForResponseHeader(5*time.Second). If(o.SkipProxy, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutProxy() }, ). If(o.ProxyURL != nil, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithProxy(http.ProxyURL(o.ProxyURL)) }, ). If(o.SkipTLSVerification || !strings.HasPrefix(url, "https://"), func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutInsecureVerify() }, ). 
If(o.SkipDNSCache, func(x *httpx.TransportOption) *httpx.TransportOption { return x.WithoutDNSCache() }, ), ), ) return parseGGUFFileFromRemote(ctx, cli, url, o) } func parseGGUFFileFromRemote(ctx context.Context, cli *http.Client, url string, o _GGUFReadOptions) (*GGUFFile, error) { var urls []string { rs := CompleteShardGGUFFilename(url) if rs != nil { urls = rs } else { urls = []string{url} } } fs := make([]_GGUFFileReadSeeker, 0, len(urls)) defer func() { for i := range fs { osx.Close(fs[i]) } }() for i := range urls { req, err := httpx.NewGetRequestWithContext(ctx, urls[i]) if err != nil { return nil, fmt.Errorf("new request: %w", err) } sf, err := httpx.OpenSeekerFile(cli, req, httpx.SeekerFileOptions(). WithBufferSize(o.BufferSize). If(o.SkipRangeDownloadDetection, func(x *httpx.SeekerFileOption) *httpx.SeekerFileOption { return x.WithoutRangeDownloadDetect() }, ), ) if err != nil { return nil, fmt.Errorf("open http file: %w", err) } fs = append(fs, _GGUFFileReadSeeker{ Closer: sf, ReadSeeker: io.NewSectionReader(sf, 0, sf.Len()), Size: sf.Len(), }) } return parseGGUFFile(fs, o) } ================================================ FILE: file_metadata.go ================================================ package gguf_parser import ( "regexp" "slices" "sort" "strings" "golang.org/x/exp/maps" ) // GGUFMetadata represents the model metadata of a GGUF file. type GGUFMetadata struct { /* Basic */ // Type describes what type this GGUF file is, // default is "model". Type string `json:"type"` // Architecture describes what architecture this GGUF file implements. // // All lowercase ASCII. Architecture string `json:"architecture"` // QuantizationVersion describes the version of the quantization format. // // Not required if the model is not quantized (i.e. no tensors are quantized). // If any tensors are quantized, this must be present. // This is separate to the quantization scheme of the tensors itself, // the quantization version may change without changing the scheme's name, // e.g. the quantization scheme is Q5_K, and the QuantizationVersion is 4. QuantizationVersion uint32 `json:"quantizationVersion,omitempty"` // Alignment describes the alignment of the GGUF file. // // This can vary to allow for different alignment schemes, but it must be a multiple of 8. // Some writers may not write the alignment. // // Default is 32. Alignment uint32 `json:"alignment"` // Name to the model. // // This should be a human-readable name that can be used to identify the GGUF file. // It should be unique within the community that the model is defined in. Name string `json:"name,omitempty"` // Author to the model. Author string `json:"author,omitempty"` // URL to the model's homepage. // // This can be a GitHub repo, a paper, etc. URL string `json:"url,omitempty"` // Description to the model. Description string `json:"description,omitempty"` // License to the model. // // This is expressed as a SPDX license expression, e.g. "MIT OR Apache-2.0". License string `json:"license,omitempty"` // FileType describes the type of the majority of the tensors in the GGUF file. FileType GGUFFileType `json:"fileType"` // FileTypeDescriptor describes the type of the GGUF file according to the FileType and trait layer. // // This supplies the FileType with more detail. FileTypeDescriptor string `json:"fileTypeDetail"` /* Appendix */ // LittleEndian is true if the GGUF file is little-endian, // and false for big-endian. LittleEndian bool `json:"littleEndian"` // FileSize is the size of the GGUF file in bytes. 
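//
// Note that FileSize covers the entire file, including the header, metadata, and alignment
// padding, whereas Size below is expected to reflect the tensor (model) data only.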
FileSize GGUFBytesScalar `json:"fileSize"` // Size is the model size. Size GGUFBytesScalar `json:"size"` // Parameters is the parameters of the GGUF file. Parameters GGUFParametersScalar `json:"parameters"` // BitsPerWeight is the bits per weight of the GGUF file. BitsPerWeight GGUFBitsPerWeightScalar `json:"bitsPerWeight"` } // GGUFFileType is a type of GGUF file, // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L419-L445, // and https://github.com/huggingface/huggingface.js/blob/d67a464473ca07fee9811a129e5fac8cc7487098/packages/tasks/src/gguf.ts#L4-L52. type GGUFFileType uint32 // GGUFFileType constants. // // GGUFFileTypeMostlyQ4_2, GGUFFileTypeMostlyQ4_3 are deprecated. // GGUFFileTypeMostlyQ4_0_4_4, GGUFFileTypeMostlyQ4_0_4_8, GGUFFileTypeMostlyQ4_0_8_8 are deprecated. // // GGUFFileTypeMostlyQ4_1_SOME_F16 is a special case where the majority of the tensors are Q4_1, // but 'token_embd.weight' and 'output.weight' tensors are F16. const ( GGUFFileTypeMostlyF32 GGUFFileType = iota // MOSTLY_F32 GGUFFileTypeMostlyF16 // MOSTLY_F16 GGUFFileTypeMostlyQ4_0 // MOSTLY_Q4_0 GGUFFileTypeMostlyQ4_1 // MOSTLY_Q4_1 GGUFFileTypeMostlyQ4_1_SOME_F16 // MOSTLY_Q4_1_SOME_F16 GGUFFileTypeMostlyQ4_2 // MOSTLY_Q4_2 GGUFFileTypeMostlyQ4_3 // MOSTLY_Q4_3 GGUFFileTypeMostlyQ8_0 // MOSTLY_Q8_0 GGUFFileTypeMostlyQ5_0 // MOSTLY_Q5_0 GGUFFileTypeMostlyQ5_1 // MOSTLY_Q5_1 GGUFFileTypeMostlyQ2_K // MOSTLY_Q2_K GGUFFileTypeMostlyQ3_K_S // MOSTLY_Q3_K_S GGUFFileTypeMostlyQ3_K_M // MOSTLY_Q3_K_M GGUFFileTypeMostlyQ3_K_L // MOSTLY_Q3_K_L GGUFFileTypeMostlyQ4_K_S // MOSTLY_Q4_K_S GGUFFileTypeMostlyQ4_K_M // MOSTLY_Q4_K_M GGUFFileTypeMostlyQ5_K_S // MOSTLY_Q5_K_S GGUFFileTypeMostlyQ5_K_M // MOSTLY_Q5_K_M GGUFFileTypeMostlyQ6_K // MOSTLY_Q6_K GGUFFileTypeMostlyIQ2_XXS // MOSTLY_IQ2_XXS GGUFFileTypeMostlyIQ2_XS // MOSTLY_IQ2_XS GGUFFileTypeMostlyQ2_K_S // MOSTLY_Q2_K_S GGUFFileTypeMostlyIQ3_XS // MOSTLY_IQ3_XS GGUFFileTypeMostlyIQ3_XXS // MOSTLY_IQ3_XXS GGUFFileTypeMostlyIQ1_S // MOSTLY_IQ1_S GGUFFileTypeMostlyIQ4_NL // MOSTLY_IQ4_NL GGUFFileTypeMostlyIQ3_S // MOSTLY_IQ3_S GGUFFileTypeMostlyIQ3_M // MOSTLY_IQ3_M GGUFFileTypeMostlyIQ2_S // MOSTLY_IQ2_S GGUFFileTypeMostlyIQ2_M // MOSTLY_IQ2_M GGUFFileTypeMostlyIQ4_XS // MOSTLY_IQ4_XS GGUFFileTypeMostlyIQ1_M // MOSTLY_IQ1_M GGUFFileTypeMostlyBF16 // MOSTLY_BF16 GGUFFileTypeMostlyQ4_0_4_4 // MOSTLY_Q4_0_4_4 GGUFFileTypeMostlyQ4_0_4_8 // MOSTLY_Q4_0_4_8 GGUFFileTypeMostlyQ4_0_8_8 // MOSTLY_Q4_0_8_8 GGUFFileTypeMostlyTQ1_0 // MOSTLY_TQ1_0 GGUFFileTypeMostlyTQ2_0 // MOSTLY_TQ2_0 GGUFFileTypeMostlyMXFP4 // MOSTLY_MXFP4 _GGUFFileTypeCount // Unknown ) // _GGUFPotentialDiffusionArchitectures holds a list representing the potential diffusion architectures. // // Since we will unify all diffusion architectures to "diffusion" during processing, // we can use this list to match the value in explicit `general.architecture`. var _GGUFPotentialDiffusionArchitectures = []string{ "flux", "sd", "sd2.5", "sd3", "stable-diffusion", } // _GGUFPotentialDiffusionArchitectureTensorsRegexes holds a list of regexes to match the potential diffusion architecture tensors. // // This is used to detect if the GGUF file is a diffusion model, // when the `general.architecture` is not set to a known diffusion architecture. 
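//
// For example, tensor names such as "model.diffusion_model.input_blocks.0.0.weight" (a typical
// SD UNet name) or "double_blocks.0.img_attn.qkv.weight" (a typical FLUX name) match the
// patterns below and mark the file as a diffusion model.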
var _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{ regexp.MustCompile(`^model\.diffusion_model\..*`), regexp.MustCompile(`^double_blocks\..*`), regexp.MustCompile(`^joint_blocks\..*`), regexp.MustCompile(`^decoder\..*`), regexp.MustCompile(`^encoder\..*`), regexp.MustCompile(`^text_model\..*`), } // Metadata returns the metadata of the GGUF file. func (gf *GGUFFile) Metadata() (gm GGUFMetadata) { const ( typeKey = "general.type" architectureKey = "general.architecture" quantizationKey = "general.quantization_version" alignmentKey = "general.alignment" nameKey = "general.name" authorKey = "general.author" urlKey = "general.url" descriptionKey = "general.description" licenseKey = "general.license" controlVectorModelHintKey = "controlvector.model_hint" ) m, _ := gf.Header.MetadataKV.Index([]string{ typeKey, architectureKey, quantizationKey, alignmentKey, nameKey, authorKey, urlKey, descriptionKey, licenseKey, controlVectorModelHintKey, }) if v, ok := m[typeKey]; ok { gm.Type = v.ValueString() } else if _, ok = m[controlVectorModelHintKey]; ok { gm.Type = "adapter" } else { gm.Type = "model" } if v, ok := m[controlVectorModelHintKey]; ok { gm.Architecture = v.ValueString() } else if v, ok = m[architectureKey]; ok && !slices.Contains(_GGUFPotentialDiffusionArchitectures, v.ValueString()) { gm.Architecture = v.ValueString() if gm.Architecture == "clip" { gm.Type = "projector" } } else if gm.Type == "imatrix" { gm.Architecture = "imatrix" // Default to imatrix. } else { gm.Architecture = "llama" // Default to llama. for _, re := range _GGUFPotentialDiffusionArchitectureTensorsRegexes { if gf.TensorInfos.Match(re) { gm.Architecture = "diffusion" break } } } if v, ok := m[quantizationKey]; ok { gm.QuantizationVersion = ValueNumeric[uint32](v) } if v, ok := m[alignmentKey]; ok { gm.Alignment = ValueNumeric[uint32](v) } else { gm.Alignment = 32 } if v, ok := m[nameKey]; ok { gm.Name = v.ValueString() } if v, ok := m[authorKey]; ok { gm.Author = v.ValueString() } if v, ok := m[urlKey]; ok { gm.URL = v.ValueString() } if v, ok := m[descriptionKey]; ok { gm.Description = v.ValueString() } if v, ok := m[licenseKey]; ok { gm.License = v.ValueString() } gm.FileType, gm.FileTypeDescriptor = gf.extractFileType(gm.Architecture) gm.LittleEndian = gf.Header.Version < GGUFVersionV3 || gf.Header.Magic == GGUFMagicGGUFLe gm.FileSize = gf.Size gm.Size = gf.ModelSize gm.Parameters = gf.ModelParameters gm.BitsPerWeight = gf.ModelBitsPerWeight return gm } // GGMLType returns the GGMLType of the GGUFFileType, // which is inspired by // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2730-L2763. 
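//
// For example, per the switch below:
//
//	GGUFFileTypeMostlyQ4_0.GGMLType()   // GGMLTypeQ4_0
//	GGUFFileTypeMostlyQ5_K_M.GGMLType() // GGMLTypeQ5_K
//	GGUFFileTypeMostlyIQ3_M.GGMLType()  // GGMLTypeIQ3_S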
func (t GGUFFileType) GGMLType() GGMLType { switch t { case GGUFFileTypeMostlyF32: return GGMLTypeF32 case GGUFFileTypeMostlyF16: return GGMLTypeF16 case GGUFFileTypeMostlyQ4_0: return GGMLTypeQ4_0 case GGUFFileTypeMostlyQ4_1: return GGMLTypeQ4_1 case GGUFFileTypeMostlyQ4_1_SOME_F16: return GGMLTypeQ4_1 case GGUFFileTypeMostlyQ4_2: return GGMLTypeQ4_2 case GGUFFileTypeMostlyQ4_3: return GGMLTypeQ4_3 case GGUFFileTypeMostlyQ8_0: return GGMLTypeQ8_0 case GGUFFileTypeMostlyQ5_0: return GGMLTypeQ5_0 case GGUFFileTypeMostlyQ5_1: return GGMLTypeQ5_1 case GGUFFileTypeMostlyQ2_K: return GGMLTypeQ2_K case GGUFFileTypeMostlyQ3_K_S: return GGMLTypeQ3_K case GGUFFileTypeMostlyQ3_K_M: return GGMLTypeQ4_K case GGUFFileTypeMostlyQ3_K_L: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ4_K_S: return GGMLTypeQ6_K case GGUFFileTypeMostlyQ4_K_M: return GGMLTypeQ4_K case GGUFFileTypeMostlyQ5_K_S: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ5_K_M: return GGMLTypeQ5_K case GGUFFileTypeMostlyQ6_K: return GGMLTypeQ6_K case GGUFFileTypeMostlyIQ2_XXS: return GGMLTypeIQ2_XXS case GGUFFileTypeMostlyIQ2_XS: return GGMLTypeIQ2_XS case GGUFFileTypeMostlyQ2_K_S: return GGMLTypeQ2_K case GGUFFileTypeMostlyIQ3_XS: return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ3_XXS: return GGMLTypeIQ3_XXS case GGUFFileTypeMostlyIQ1_S: return GGMLTypeIQ1_S case GGUFFileTypeMostlyIQ4_NL: return GGMLTypeIQ4_NL case GGUFFileTypeMostlyIQ3_S: return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ3_M: return GGMLTypeIQ3_S case GGUFFileTypeMostlyIQ2_S: return GGMLTypeIQ2_XS case GGUFFileTypeMostlyIQ2_M: return GGMLTypeIQ2_S case GGUFFileTypeMostlyIQ4_XS: return GGMLTypeIQ4_XS case GGUFFileTypeMostlyIQ1_M: return GGMLTypeIQ1_M case GGUFFileTypeMostlyBF16: return GGMLTypeBF16 case GGUFFileTypeMostlyQ4_0_4_4: return GGMLTypeQ4_0_4_4 case GGUFFileTypeMostlyQ4_0_4_8: return GGMLTypeQ4_0_4_8 case GGUFFileTypeMostlyQ4_0_8_8: return GGMLTypeQ4_0_8_8 case GGUFFileTypeMostlyTQ1_0: return GGMLTypeTQ1_0 case GGUFFileTypeMostlyTQ2_0: return GGMLTypeTQ2_0 case GGUFFileTypeMostlyMXFP4: return GGMLTypeMXFP4 default: } return _GGMLTypeCount } // extractFileType extracts the GGUF file type from the metadata, // it tries to return the descriptor of the file type. func (gf *GGUFFile) extractFileType(arch string) (fileType GGUFFileType, fileTypeDescriptor string) { fileType, fileTypeDescriptor = _GGUFFileTypeCount, "Unknown" const fileTypeKey = "general.file_type" m, _ := gf.Header.MetadataKV.Index([]string{ fileTypeKey, }) if v, ok := m[fileTypeKey]; ok { fileType = GGUFFileType(ValueNumeric[uint32](v)) } if fileType == _GGUFFileTypeCount { // Guess. 
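// No usable general.file_type metadata, so guess: build a histogram of tensor types over a
// filtered set of representative tensors and let GetFileType pick the majority type.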
if len(gf.TensorInfos) != 0 { cm := make(map[GGMLType]int) for i := range gf.TensorInfos { switch { case arch != "diffusion" && !strings.HasPrefix(gf.TensorInfos[i].Name, "token_embd") && !strings.HasPrefix(gf.TensorInfos[i].Name, "blk.") && !strings.Contains(gf.TensorInfos[i].Name, "_norm") && !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): continue case arch == "diffusion" && !strings.HasSuffix(gf.TensorInfos[i].Name, ".weight"): continue } cm[gf.TensorInfos[i].Type]++ } fileType = GetFileType(cm) } } if fileType == _GGUFFileTypeCount { return fileType, fileTypeDescriptor } fileTypeDescriptor = strings.TrimPrefix(fileType.String(), "MOSTLY_") const tokenEmbedWeightTensorName = "token_embd.weight" switch fileType { case GGUFFileTypeMostlyQ4_0: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { fileTypeDescriptor = "Q4_0_L" } } case GGUFFileTypeMostlyQ4_1: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ5_0 || v.Type == GGMLTypeQ5_1 { fileTypeDescriptor = "Q4_1_L" } } case GGUFFileTypeMostlyQ5_0: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q5_0_L" } } case GGUFFileTypeMostlyQ5_1: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q5_1_L" } } case GGUFFileTypeMostlyQ2_K: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 || v.Type == GGMLTypeQ4_K { fileTypeDescriptor = "Q2_K_L" } } case GGUFFileTypeMostlyQ3_K_M: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q3_K_L" } } case GGUFFileTypeMostlyQ4_K_M: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q4_K_L" } } case GGUFFileTypeMostlyQ5_K_M: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q5_K_L" } } case GGUFFileTypeMostlyQ6_K: tis, _ := gf.TensorInfos.Index([]string{tokenEmbedWeightTensorName}) if v, ok := tis[tokenEmbedWeightTensorName]; ok { if v.Type == GGMLTypeQ8_0 { fileTypeDescriptor = "Q6_K_L" } } } return fileType, fileTypeDescriptor } // GetFileType returns the GGUFFileType represented the mostly GGMLType of the given tensors counter. // // The input `cm` is a map of GGMLType to the count of tensors of that type. func GetFileType(cm map[GGMLType]int) GGUFFileType { if len(cm) == 0 { return _GGUFFileTypeCount } // Sort. ts := maps.Keys(cm) sort.Slice(ts, func(i, j int) bool { return cm[ts[i]] > cm[ts[j]] }) // Guess. 
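// The most frequent type drives the guess; a leading F32 usually comes from norms and biases,
// so fall back to the runner-up type, then disambiguate K-quant and IQ variants by the counts
// of their companion types.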
if ts[0] == GGMLTypeF32 { if len(ts) == 1 { return GGUFFileTypeMostlyF32 } ts[0] = ts[1] } switch ts[0] { case GGMLTypeF16: return GGUFFileTypeMostlyF16 case GGMLTypeQ4_0: return GGUFFileTypeMostlyQ4_0 case GGMLTypeQ4_1: return GGUFFileTypeMostlyQ4_1 case GGMLTypeQ4_2: return GGUFFileTypeMostlyQ4_2 case GGMLTypeQ4_3: return GGUFFileTypeMostlyQ4_3 case GGMLTypeQ5_0: return GGUFFileTypeMostlyQ5_0 case GGMLTypeQ5_1: return GGUFFileTypeMostlyQ5_1 case GGMLTypeQ8_0: return GGUFFileTypeMostlyQ8_0 case GGMLTypeQ2_K: if ts[len(ts)-1] == GGMLTypeQ5_K { return GGUFFileTypeMostlyQ2_K_S } return GGUFFileTypeMostlyQ2_K case GGMLTypeQ3_K: if cm[GGMLTypeQ8_0] > 0 || (cm[GGMLTypeQ5_K] > 1 && cm[GGMLTypeQ4_K] == 0) { return GGUFFileTypeMostlyQ3_K_L } if cm[GGMLTypeQ4_K] > 1 { return GGUFFileTypeMostlyQ3_K_M } return GGUFFileTypeMostlyQ3_K_S case GGMLTypeQ4_K: if cm[GGMLTypeQ6_K] > 1 { return GGUFFileTypeMostlyQ4_K_M } if cm[GGMLTypeQ3_K] > 1 { return GGUFFileTypeMostlyQ3_K_M } return GGUFFileTypeMostlyQ4_K_S case GGMLTypeQ5_K: if cm[GGMLTypeQ6_K] > 1 { return GGUFFileTypeMostlyQ5_K_M } return GGUFFileTypeMostlyQ5_K_S case GGMLTypeQ6_K: return GGUFFileTypeMostlyQ6_K case GGMLTypeIQ2_XXS: return GGUFFileTypeMostlyIQ2_XXS case GGMLTypeIQ2_XS: if cm[GGMLTypeIQ4_XS] > 1 { return GGUFFileTypeMostlyIQ2_S } return GGUFFileTypeMostlyIQ2_XS case GGMLTypeIQ2_S: return GGUFFileTypeMostlyIQ2_M case GGMLTypeIQ3_XXS: return GGUFFileTypeMostlyIQ3_XXS case GGMLTypeIQ3_S: if cm[GGMLTypeIQ3_XXS] > 1 { return GGUFFileTypeMostlyIQ3_XS } return GGUFFileTypeMostlyIQ3_S case GGMLTypeIQ1_S: return GGUFFileTypeMostlyIQ1_S case GGMLTypeIQ4_NL: return GGUFFileTypeMostlyIQ4_NL case GGMLTypeIQ4_XS: return GGUFFileTypeMostlyIQ4_XS case GGMLTypeIQ1_M: return GGUFFileTypeMostlyIQ1_M case GGMLTypeBF16: return GGUFFileTypeMostlyBF16 case GGMLTypeQ4_0_4_4: return GGUFFileTypeMostlyQ4_0_4_4 case GGMLTypeQ4_0_4_8: return GGUFFileTypeMostlyQ4_0_4_8 case GGMLTypeQ4_0_8_8: return GGUFFileTypeMostlyQ4_0_8_8 case GGMLTypeTQ1_0: return GGUFFileTypeMostlyTQ1_0 case GGMLTypeTQ2_0: return GGUFFileTypeMostlyTQ2_0 case GGMLTypeMXFP4: return GGUFFileTypeMostlyMXFP4 default: } return _GGUFFileTypeCount } ================================================ FILE: file_metadata_test.go ================================================ package gguf_parser import ( "context" "fmt" "os" "strings" "testing" "github.com/davecgh/go-spew/spew" "github.com/stretchr/testify/assert" ) func TestGGUFFile_Metadata(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f.Metadata()), "\n") } func BenchmarkGGUFFile_Metadata(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } f, err := ParseGGUFFile(mp, UseMMap(), SkipLargeMetadata()) if err != nil { b.Fatal(err) return } b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = f.Metadata() } } func TestGGUFFile_extractFileType(t *testing.T) { ctx := context.Background() repo := "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF" cases := []string{ "Q2_K", "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q4_0", "Q4_K_M", "Q4_K_S", "Q5_0", "Q5_K_M", "Q5_K_S", "Q6_K", "Q8_0", } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("Hermes-2-Pro-Mistral-7B.%s.gguf", tc)) if err != nil { t.Fatal(err) return } md := 
gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } // Ignore unsupported cases for https://huggingface.co/Mungert/Qwen2.5-VL-3B-Instruct-GGUF/commit/42f8e463b233df7575f1e1e9a83cb5936db56d2a. repo = "Mungert/Qwen2.5-VL-3B-Instruct-GGUF" cases = []string{ "IQ2_M", "IQ2_S", "IQ2_XS", "IQ2_XXS", "IQ3_M", "IQ3_S", "IQ3_XS", "IQ3_XXS", "IQ4_NL", "IQ4_XS", // "Q2_K_L", "Q2_K_S", // "Q3_K_L", "Q3_K_M", "Q3_K_S", "Q4_0", // "Q4_0_L", "Q4_1", // "Q4_1_L", // "Q4_K_L", "Q4_K_M", "Q4_K_S", "Q5_0", // "Q5_0_L", // "Q5_K_L", "Q5_K_M", "Q5_K_S", // "Q6_K_L", // "Q6_K_M", == "Q6_K" "Q8_0", } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("Qwen2.5-VL-3B-Instruct-%s.gguf", strings.ToLower(tc))) if err != nil { t.Fatal(err) return } md := gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } repo = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF" cases = []string{ "BF16", "Q2_K", "Q2_K_L", "Q3_K_M", "Q4_K_M", "Q5_K_M", "Q6_K", "Q8_0", } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("DeepSeek-R1-Distill-Qwen-1.5B-%s.gguf", tc)) if err != nil { t.Fatal(err) return } md := gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } repo = "unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF" cases = []string{ "IQ1_M", "IQ1_S", "IQ2_M", "IQ2_XXS", "IQ3_XXS", "IQ4_XS", // "Q2_K_XL" == "Q2_K_L" // "Q3_K_XL" == "Q3_K_M" // "Q4_K_XL" == "Q4_K_M" } for _, tc := range cases { t.Run(repo+"/"+tc, func(t *testing.T) { gf, err := ParseGGUFFileFromHuggingFace( ctx, repo, fmt.Sprintf("DeepSeek-R1-Distill-Qwen-1.5B-UD-%s.gguf", tc)) if err != nil { t.Fatal(err) return } md := gf.Metadata() ft, ftd := gf.extractFileType(md.Architecture) assert.Equal(t, md.FileType.String(), ft.String(), tc+" file type should be equal") assert.Equal(t, tc, ftd, tc+" file type descriptor should be equal") }) } } ================================================ FILE: file_option.go ================================================ package gguf_parser import ( "net/url" "path/filepath" "runtime" "strings" "time" "github.com/gpustack/gguf-parser-go/util/osx" ) type ( _GGUFReadOptions struct { Debug bool SkipLargeMetadata bool // Local. MMap bool // Remote. BearerAuthToken string Headers map[string]string ProxyURL *url.URL SkipProxy bool SkipTLSVerification bool SkipDNSCache bool BufferSize int SkipRangeDownloadDetection bool CachePath string CacheExpiration time.Duration } // GGUFReadOption is the option for reading the file. GGUFReadOption func(o *_GGUFReadOptions) ) // UseDebug uses debug mode to read the file. func UseDebug() GGUFReadOption { return func(o *_GGUFReadOptions) { o.Debug = true } } // SkipLargeMetadata skips reading large GGUFMetadataKV items, // which are not necessary for most cases. func SkipLargeMetadata() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipLargeMetadata = true } } // UseMMap uses mmap to read the local file. 
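//
// A hedged sketch (the path is illustrative):
//
//	f, err := ParseGGUFFile("/path/to/model.gguf", UseMMap(), SkipLargeMetadata())
//	if err != nil {
//		return err
//	}
//	_ = f.Metadata()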
func UseMMap() GGUFReadOption { return func(o *_GGUFReadOptions) { o.MMap = true } } // UseBearerAuth uses the given token as a bearer auth when reading from remote. func UseBearerAuth(token string) GGUFReadOption { return func(o *_GGUFReadOptions) { o.BearerAuthToken = token } } // UseHeaders uses the given headers when reading from remote. func UseHeaders(headers map[string]string) GGUFReadOption { return func(o *_GGUFReadOptions) { o.Headers = headers } } // UseProxy uses the given url as a proxy when reading from remote. func UseProxy(url *url.URL) GGUFReadOption { return func(o *_GGUFReadOptions) { o.ProxyURL = url } } // SkipProxy skips the proxy when reading from remote. func SkipProxy() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipProxy = true } } // SkipTLSVerification skips the TLS verification when reading from remote. func SkipTLSVerification() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipTLSVerification = true } } // SkipDNSCache skips the DNS cache when reading from remote. func SkipDNSCache() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipDNSCache = true } } // UseBufferSize sets the buffer size when reading from remote. func UseBufferSize(size int) GGUFReadOption { const minSize = 32 * 1024 if size < minSize { size = minSize } return func(o *_GGUFReadOptions) { o.BufferSize = size } } // SkipRangeDownloadDetection skips the range download detection when reading from remote. func SkipRangeDownloadDetection() GGUFReadOption { return func(o *_GGUFReadOptions) { o.SkipRangeDownloadDetection = true } } // UseCache caches the remote reading result. func UseCache() GGUFReadOption { return func(o *_GGUFReadOptions) { o.CachePath = DefaultCachePath() o.CacheExpiration = 24 * time.Hour } } // SkipCache skips the cache when reading from remote. func SkipCache() GGUFReadOption { return func(o *_GGUFReadOptions) { o.CachePath = "" o.CacheExpiration = 0 } } // DefaultCachePath returns the default cache path. func DefaultCachePath() string { cd := filepath.Join(osx.UserHomeDir(), ".cache") if runtime.GOOS == "windows" { cd = osx.Getenv("APPDATA", cd) } return filepath.Join(cd, "gguf-parser") } // UseCachePath uses the given path to cache the remote reading result. func UseCachePath(path string) GGUFReadOption { path = strings.TrimSpace(filepath.Clean(osx.InlineTilde(path))) return func(o *_GGUFReadOptions) { if path == "" { return } o.CachePath = path } } // UseCacheExpiration uses the given expiration to cache the remote reading result. // // Disable cache expiration by setting it to 0. func UseCacheExpiration(expiration time.Duration) GGUFReadOption { if expiration < 0 { expiration = 0 } return func(o *_GGUFReadOptions) { o.CacheExpiration = expiration } } ================================================ FILE: file_test.go ================================================ package gguf_parser import ( "bytes" "context" "encoding/binary" "os" "testing" "time" "github.com/davecgh/go-spew/spew" ) func TestParseGGUFFile(t *testing.T) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { t.Skip("TEST_MODEL_PATH is not set") return } // Slow read. { f, err := ParseGGUFFile(mp) if err != nil { t.Fatal(err) return } s := spew.ConfigState{ Indent: " ", MaxDepth: 5, // Avoid console overflow. } t.Log("\n", s.Sdump(f), "\n") } // Fast read. 
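// Skipping large metadata avoids materializing big KV arrays (e.g. tokenizer token lists), which, combined with mmap, is what makes this path fast.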
{ f, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") } } func BenchmarkParseGGUFFileMMap(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } b.ReportAllocs() b.ResetTimer() b.Run("Normal", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("UseMMap", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp, UseMMap()) if err != nil { b.Fatal(err) return } } }) } func BenchmarkParseGGUFFileSkipLargeMetadata(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } b.ReportAllocs() b.ResetTimer() b.Run("Normal", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp, UseMMap()) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("SkipLargeMetadata", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { b.Fatal(err) return } } }) } func TestParseGGUFFileRemote(t *testing.T) { const u = "https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF" + "/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf" ctx := context.Background() // Slow read. { f, err := ParseGGUFFileRemote(ctx, u, UseDebug()) if err != nil { t.Fatal(err) return } s := spew.ConfigState{ Indent: " ", MaxDepth: 5, // Avoid console overflow. } t.Log("\n", s.Sdump(f), "\n") } // Fast read. { f, err := ParseGGUFFileRemote(ctx, u, UseDebug(), SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") } } func BenchmarkParseGGUFFileRemoteWithBufferSize(b *testing.B) { const u = "https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF" + "/resolve/main/Hermes-2-Pro-Llama-3-Instruct-Merged-DPO-Q4_K_M.gguf" ctx := context.Background() b.ReportAllocs() b.ResetTimer() b.Run("256KibBuffer", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(256*1024)) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("1MibBuffer", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(1024*1024)) if err != nil { b.Fatal(err) return } } }) b.ResetTimer() b.Run("4MibBuffer", func(b *testing.B) { for i := 0; i < b.N; i++ { _, err := ParseGGUFFileRemote(ctx, u, SkipLargeMetadata(), UseBufferSize(4*1024*1024)) if err != nil { b.Fatal(err) return } } }) } func TestParseGGUFFileFromHuggingFace(t *testing.T) { ctx := context.Background() cases := [][2]string{ { "TheBloke/Llama-2-13B-chat-GGUF", "llama-2-13b-chat.Q8_0.gguf", }, { "lmstudio-community/Yi-1.5-9B-Chat-GGUF", "Yi-1.5-9B-Chat-Q5_K_M.gguf", }, { "bartowski/gemma-2-9b-it-GGUF", "gemma-2-9b-it-Q3_K_M.gguf", }, } for _, tc := range cases { t.Run(tc[0]+"/"+tc[1], func(t *testing.T) { f, err := ParseGGUFFileFromHuggingFace(ctx, tc[0], tc[1], SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") }) } } func TestParseGGUFFileFromModelScope(t *testing.T) { ctx := context.Background() cases := [][2]string{ { "qwen/Qwen1.5-0.5B-Chat-GGUF", "qwen1_5-0_5b-chat-q5_k_m.gguf", }, { "HIT-SCIR/huozi3-gguf", "huozi3-q2_k.gguf", }, { "shaowenchen/chinese-alpaca-2-13b-16k-gguf", "chinese-alpaca-2-13b-16k.Q5_K.gguf", }, } for _, tc := range cases { t.Run(tc[0]+"/"+tc[1], func(t 
*testing.T) { f, err := ParseGGUFFileFromModelScope(ctx, tc[0], tc[1], SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f), "\n") }) } } func TestParseGGUFFileFromOllama(t *testing.T) { ctx := context.Background() cases := []string{ "gemma2", "llama3.1", "qwen2:72b-instruct-q3_K_M", } for _, tc := range cases { t.Run(tc, func(t *testing.T) { start := time.Now() f, err := ParseGGUFFileFromOllama(ctx, tc, SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Logf("cost: %v\n", time.Since(start)) t.Log("\n", spew.Sdump(f), "\n") }) } } // FuzzParseGGUFFile writes the fuzz input to a temp file and calls ParseGGUFFile. // Any panic during parsing will be reported by the fuzzing harness. func FuzzParseGGUFFile(f *testing.F) { buf := new(bytes.Buffer) bo := binary.LittleEndian for _, v := range []GGUFMagic{GGUFMagicGGML, GGUFMagicGGMF, GGUFMagicGGJT, GGUFMagicGGUFLe, GGUFMagicGGUFBe} { _ = binary.Write(buf, bo, uint32(v)) f.Add(buf.Bytes()) buf.Reset() } f.Fuzz(func(t *testing.T, data []byte) { tmp, err := os.CreateTemp("", "gguf_fuzz_*.gguf") if err != nil { t.Fatalf("create tmp: %v", err) } defer os.Remove(tmp.Name()) if _, err := tmp.Write(data); err != nil { t.Fatalf("write tmp: %v", err) } if err := tmp.Close(); err != nil { t.Fatalf("close tmp: %v", err) } // Call the public ParseGGUFFile which exercises parseGGUFFile. _, _ = ParseGGUFFile(tmp.Name()) }) } func TestParseGGUFFileWithFuzzInput(t *testing.T) { // Use the fuzz-generated data // data := []byte("GGUF\x00\x00\x00\x030000000000000000") data := []byte("FUGG\x00\x00\x00\x00GG>?\x00\x00\x00\x000000") // Create temp file tmpFile, err := os.CreateTemp("", "fuzz_test_gguf_*.gguf") if err != nil { t.Fatal(err) } defer os.Remove(tmpFile.Name()) _, err = tmpFile.Write(data) if err != nil { t.Fatal(err) } tmpFile.Close() // Parse should return error (since it's invalid or triggers the check) _, err = ParseGGUFFile(tmpFile.Name()) if err == nil { t.Error("expected error for fuzz-generated data") } else { t.Logf("got expected error: %v", err) } } ================================================ FILE: file_tokenizer.go ================================================ package gguf_parser // GGUFTokenizer represents the tokenizer metadata of a GGUF file. type GGUFTokenizer struct { /* Basic */ // Model is the model of the tokenizer. Model string `json:"model"` // TokensLength is the number of tokens. TokensLength uint64 `json:"tokensLength"` // MergesLength is the number of merges. MergesLength uint64 `json:"mergesLength"` // AddedTokensLength is the number of added tokens after training. AddedTokensLength uint64 `json:"addedTokenLength"` // BOSTokenID is the ID of the beginning of sentence token. // // Use -1 if the token is not found. BOSTokenID int64 `json:"bosTokenID"` // EOSTokenID is the ID of the end of sentence token. // // Use -1 if the token is not found. EOSTokenID int64 `json:"eosTokenID"` // EOTTokenID is the ID of the end of text token. // // Use -1 if the token is not found. EOTTokenID int64 `json:"eotTokenID"` // EOMTokenID is the ID of the end of message token. // // Use -1 if the token is not found. EOMTokenID int64 `json:"eomTokenID"` // UnknownTokenID is the ID of the unknown token. // // Use -1 if the token is not found. UnknownTokenID int64 `json:"unknownTokenID"` // SeparatorTokenID is the ID of the separator token. // // Use -1 if the token is not found. SeparatorTokenID int64 `json:"separatorTokenID"` // PaddingTokenID is the ID of the padding token. // // Use -1 if the token is not found.
PaddingTokenID int64 `json:"paddingTokenID"` /* Appendix */ // TokensSize is the size of tokens in bytes. TokensSize int64 `json:"tokensSize"` // MergesSize is the size of merges in bytes. MergesSize int64 `json:"mergesSize"` } // Tokenizer returns the tokenizer metadata of a GGUF file. func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer) { const ( modelKey = "tokenizer.ggml.model" tokensKey = "tokenizer.ggml.tokens" mergesKey = "tokenizer.ggml.merges" addedTokensKey = "tokenizer.ggml.added_tokens" bosTokenIDKey = "tokenizer.ggml.bos_token_id" eosTokenIDKey = "tokenizer.ggml.eos_token_id" eotTokenIDKey = "tokenizer.ggml.eot_token_id" eomTokenIDKey = "tokenizer.ggml.eom_token_id" unknownTokenIDKey = "tokenizer.ggml.unknown_token_id" separatorTokenIDKey = "tokenizer.ggml.separator_token_id" paddingTokenIDKey = "tokenizer.ggml.padding_token_id" ) m, _ := gf.Header.MetadataKV.Index([]string{ modelKey, tokensKey, mergesKey, addedTokensKey, bosTokenIDKey, eosTokenIDKey, eotTokenIDKey, eomTokenIDKey, unknownTokenIDKey, separatorTokenIDKey, paddingTokenIDKey, }) gt.BOSTokenID = -1 gt.EOSTokenID = -1 gt.EOTTokenID = -1 gt.EOMTokenID = -1 gt.UnknownTokenID = -1 gt.SeparatorTokenID = -1 gt.PaddingTokenID = -1 if v, ok := m[modelKey]; ok { gt.Model = v.ValueString() } if v, ok := m[tokensKey]; ok { arr := v.ValueArray() gt.TokensLength = arr.Len gt.TokensSize = arr.Size } if v, ok := m[mergesKey]; ok { arr := v.ValueArray() gt.MergesLength = arr.Len gt.MergesSize = arr.Size } if v, ok := m[addedTokensKey]; ok { gt.AddedTokensLength = v.ValueArray().Len } if v, ok := m[bosTokenIDKey]; ok { gt.BOSTokenID = ValueNumeric[int64](v) } if v, ok := m[eosTokenIDKey]; ok { gt.EOSTokenID = ValueNumeric[int64](v) } if v, ok := m[eotTokenIDKey]; ok { gt.EOTTokenID = ValueNumeric[int64](v) } if v, ok := m[eomTokenIDKey]; ok { gt.EOMTokenID = ValueNumeric[int64](v) } if v, ok := m[unknownTokenIDKey]; ok { gt.UnknownTokenID = ValueNumeric[int64](v) } if v, ok := m[separatorTokenIDKey]; ok { gt.SeparatorTokenID = ValueNumeric[int64](v) } if v, ok := m[paddingTokenIDKey]; ok { gt.PaddingTokenID = ValueNumeric[int64](v) } return gt } ================================================ FILE: file_tokenizer_test.go ================================================ package gguf_parser import ( "context" "os" "testing" "github.com/davecgh/go-spew/spew" ) func TestGGUFFile_Tokenizer(t *testing.T) { ctx := context.Background() f, err := ParseGGUFFileFromHuggingFace( ctx, "NousResearch/Hermes-2-Pro-Mistral-7B-GGUF", "Hermes-2-Pro-Mistral-7B.Q5_K_M.gguf", SkipLargeMetadata()) if err != nil { t.Fatal(err) return } t.Log("\n", spew.Sdump(f.Tokenizer()), "\n") } func BenchmarkGGUFFile_Tokenizer(b *testing.B) { mp, ok := os.LookupEnv("TEST_MODEL_PATH") if !ok { b.Skip("TEST_MODEL_PATH is not set") return } f, err := ParseGGUFFile(mp, SkipLargeMetadata(), UseMMap()) if err != nil { b.Fatal(err) return } b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { _ = f.Tokenizer() } } ================================================ FILE: filename.go ================================================ package gguf_parser import ( "fmt" "regexp" "strconv" "strings" "github.com/gpustack/gguf-parser-go/util/funcx" "github.com/gpustack/gguf-parser-go/util/ptr" ) // GGUFFilename represents a GGUF filename, // see https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention.
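As a minimal usage sketch of the Tokenizer accessor above (./model.gguf is a hypothetical path; SkipLargeMetadata suffices here, as the benchmark above shows):

package main

import (
	"fmt"
	"log"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Hypothetical path; any local GGUF file works.
	f, err := gguf.ParseGGUFFile("./model.gguf", gguf.SkipLargeMetadata())
	if err != nil {
		log.Fatal(err)
	}
	tk := f.Tokenizer()
	fmt.Println("model:", tk.Model)         // e.g. "llama" or "gpt2"
	fmt.Println("tokens:", tk.TokensLength) // vocabulary size
	if tk.BOSTokenID >= 0 {                 // -1 means the token is absent
		fmt.Println("bos:", tk.BOSTokenID)
	}
}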
type GGUFFilename struct { BaseName string `json:"baseName"` SizeLabel string `json:"sizeLabel"` FineTune string `json:"fineTune"` Version string `json:"version"` Encoding string `json:"encoding"` Type string `json:"type"` Shard *int `json:"shard,omitempty"` ShardTotal *int `json:"shardTotal,omitempty"` } var GGUFFilenameRegex = regexp.MustCompile(`^(?P<BaseName>[A-Za-z\s][A-Za-z0-9._\s]*(?:(?:-(?:(?:[A-Za-z\s][A-Za-z0-9._\s]*)|(?:[0-9._\s]*)))*))-(?:(?P<SizeLabel>(?:\d+x)?(?:\d+\.)?\d+[A-Za-z](?:-[A-Za-z]+(\d+\.)?\d+[A-Za-z]+)?)(?:-(?P<FineTune>[A-Za-z][A-Za-z0-9\s_-]+[A-Za-z](?i:[^BFKIQ])))?)?(?:-(?P<Version>[vV]\d+(?:\.\d+)*))?(?i:-(?P<Encoding>(BF16|F32|F16|([KI]?Q[0-9][A-Z0-9_]*))))?(?:-(?P<Type>LoRA|vocab))?(?:-(?P<Shard>\d{5})-of-(?P<ShardTotal>\d{5}))?\.gguf$`) // nolint:lll // ParseGGUFFilename parses the given GGUF filename string, // and returns the GGUFFilename, or nil if the filename is invalid. func ParseGGUFFilename(name string) *GGUFFilename { n := name if !strings.HasSuffix(n, ".gguf") { n += ".gguf" } m := make(map[string]string) { r := GGUFFilenameRegex.FindStringSubmatch(n) for i, ne := range GGUFFilenameRegex.SubexpNames() { if i != 0 && i <= len(r) { m[ne] = r[i] } } } if m["BaseName"] == "" { return nil } var gn GGUFFilename gn.BaseName = strings.ReplaceAll(m["BaseName"], "-", " ") gn.SizeLabel = m["SizeLabel"] gn.FineTune = m["FineTune"] gn.Version = m["Version"] gn.Encoding = m["Encoding"] gn.Type = m["Type"] if v := m["Shard"]; v != "" { gn.Shard = ptr.To(parseInt(v)) } if v := m["ShardTotal"]; v != "" { gn.ShardTotal = ptr.To(parseInt(v)) } return &gn } func (gn GGUFFilename) String() string { if gn.BaseName == "" { return "" } var sb strings.Builder sb.WriteString(strings.ReplaceAll(gn.BaseName, " ", "-")) if gn.SizeLabel != "" { sb.WriteString("-") sb.WriteString(gn.SizeLabel) } if gn.FineTune != "" { sb.WriteString("-") sb.WriteString(gn.FineTune) } if gn.Version != "" { sb.WriteString("-") sb.WriteString(gn.Version) } if gn.Encoding != "" { sb.WriteString("-") sb.WriteString(gn.Encoding) } if gn.Type != "" { sb.WriteString("-") sb.WriteString(gn.Type) } if m, n := ptr.Deref(gn.Shard, 0), ptr.Deref(gn.ShardTotal, 0); m > 0 && n > 0 { sb.WriteString("-") sb.WriteString(fmt.Sprintf("%05d", m)) sb.WriteString("-of-") sb.WriteString(fmt.Sprintf("%05d", n)) } sb.WriteString(".gguf") return sb.String() } // IsShard returns true if the GGUF filename is a shard. func (gn GGUFFilename) IsShard() bool { return ptr.Deref(gn.Shard, 0) > 0 && ptr.Deref(gn.ShardTotal, 0) > 0 } var ShardGGUFFilenameRegex = regexp.MustCompile(`^(?P<Prefix>.*)-(?:(?P<Shard>\d{5})-of-(?P<ShardTotal>\d{5}))\.gguf$`) // IsShardGGUFFilename returns true if the given filename is a shard GGUF filename. func IsShardGGUFFilename(name string) bool { n := name if !strings.HasSuffix(n, ".gguf") { n += ".gguf" } m := make(map[string]string) { r := ShardGGUFFilenameRegex.FindStringSubmatch(n) for i, ne := range ShardGGUFFilenameRegex.SubexpNames() { if i != 0 && i <= len(r) { m[ne] = r[i] } } } var shard, shardTotal int if v := m["Shard"]; v != "" { shard = parseInt(v) } if v := m["ShardTotal"]; v != "" { shardTotal = parseInt(v) } return shard > 0 && shardTotal > 0 } // CompleteShardGGUFFilename returns the list of shard GGUF filenames that are related to the given shard GGUF filename. // // Only available if the given filename is a shard GGUF filename.
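A quick sketch of the naming-convention helpers above, using vectors mirrored from filename_test.go below:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// Base name, size label, version, encoding, and shard info are split out of the name.
	n := gguf.ParseGGUFFilename("Grok-100B-v1.0-Q4_0-00003-of-00009.gguf")
	fmt.Println(n.BaseName, n.SizeLabel, n.Version, n.Encoding) // Grok 100B v1.0 Q4_0
	fmt.Println(*n.Shard, *n.ShardTotal)                        // 3 9

	// Expanding a shard name lists every sibling shard.
	for _, s := range gguf.CompleteShardGGUFFilename("qwen2-72b-instruct-q6_k-00001-of-00002.gguf") {
		fmt.Println(s)
	}
}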
func CompleteShardGGUFFilename(name string) []string { n := name if !strings.HasSuffix(n, ".gguf") { n += ".gguf" } m := make(map[string]string) { r := ShardGGUFFilenameRegex.FindStringSubmatch(n) for i, ne := range ShardGGUFFilenameRegex.SubexpNames() { if i != 0 && i <= len(r) { m[ne] = r[i] } } } var shard, shardTotal int if v := m["Shard"]; v != "" { shard = parseInt(v) } if v := m["ShardTotal"]; v != "" { shardTotal = parseInt(v) } if shard <= 0 || shardTotal <= 0 { return nil } names := make([]string, 0, shardTotal) for i := 1; i <= shardTotal; i++ { names = append(names, fmt.Sprintf("%s-%05d-of-%05d.gguf", m["Prefix"], i, shardTotal)) } return names } func parseInt(v string) int { return int(funcx.MustNoError(strconv.ParseInt(v, 10, 64))) } ================================================ FILE: filename_test.go ================================================ package gguf_parser import ( "testing" "github.com/stretchr/testify/assert" "github.com/gpustack/gguf-parser-go/util/ptr" ) func TestParseGGUFFilename(t *testing.T) { cases := []struct { given string expected *GGUFFilename }{ { given: "Mixtral-8x7B-V0.1-KQ2.gguf", expected: &GGUFFilename{ BaseName: "Mixtral", SizeLabel: "8x7B", Version: "V0.1", Encoding: "KQ2", }, }, { given: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", expected: &GGUFFilename{ BaseName: "Grok", SizeLabel: "100B", Version: "v1.0", Encoding: "Q4_0", Shard: ptr.To(3), ShardTotal: ptr.To(9), }, }, { given: "Hermes-2-Pro-Llama-3-8B-F16.gguf", expected: &GGUFFilename{ BaseName: "Hermes 2 Pro Llama 3", SizeLabel: "8B", Encoding: "F16", }, }, { given: "Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf", expected: &GGUFFilename{ BaseName: "Phi 3 mini", SizeLabel: "3.8B-ContextLength4k", FineTune: "instruct", Version: "v1.0", }, }, { given: "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", expected: &GGUFFilename{ BaseName: "Meta Llama 3.1", SizeLabel: "405B", FineTune: "Instruct-XelotX", Encoding: "BF16", Shard: ptr.To(1), ShardTotal: ptr.To(18), }, }, { given: "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", expected: &GGUFFilename{ BaseName: "qwen2", SizeLabel: "72b", FineTune: "instruct", Encoding: "q6_k", Shard: ptr.To(1), ShardTotal: ptr.To(2), }, }, { given: "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", expected: nil, }, { given: "not-a-known-arrangement.gguf", expected: nil, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := ParseGGUFFilename(tc.given) assert.Equal(t, tc.expected, actual) }) } } func TestGGUFFilenameString(t *testing.T) { cases := []struct { given GGUFFilename expected string }{ { given: GGUFFilename{ BaseName: "Mixtral", SizeLabel: "8x7B", Version: "v0.1", Encoding: "KQ2", }, expected: "Mixtral-8x7B-v0.1-KQ2.gguf", }, { given: GGUFFilename{ BaseName: "Grok", SizeLabel: "100B", Version: "v1.0", Encoding: "Q4_0", Shard: ptr.To(3), ShardTotal: ptr.To(9), }, expected: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", }, { given: GGUFFilename{ BaseName: "Hermes 2 Pro Llama 3", SizeLabel: "8B", Encoding: "F16", }, expected: "Hermes-2-Pro-Llama-3-8B-F16.gguf", }, { given: GGUFFilename{ BaseName: "Phi 3 mini", SizeLabel: "3.8B-ContextLength4k", FineTune: "instruct", Version: "v1.0", }, expected: "Phi-3-mini-3.8B-ContextLength4k-instruct-v1.0.gguf", }, { given: GGUFFilename{}, expected: "", }, } for _, tc := range cases { t.Run(tc.expected, func(t *testing.T) { actual := tc.given.String() assert.Equal(t, tc.expected, actual) }) } } func TestIsShardGGUFFilename(t *testing.T) { cases := []struct { given 
string expected bool }{ { given: "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", expected: true, }, { given: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", expected: true, }, { given: "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", expected: true, }, { given: "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", expected: true, }, { given: "not-a-known-arrangement.gguf", expected: false, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := IsShardGGUFFilename(tc.given) assert.Equal(t, tc.expected, actual) }) } } func TestCompleteShardGGUFFilename(t *testing.T) { cases := []struct { given string expected []string }{ { given: "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", expected: []string{ "qwen2-72b-instruct-q6_k-00001-of-00002.gguf", "qwen2-72b-instruct-q6_k-00002-of-00002.gguf", }, }, { given: "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", expected: []string{ "Grok-100B-v1.0-Q4_0-00001-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00002-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00003-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00004-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00005-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00006-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00007-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00008-of-00009.gguf", "Grok-100B-v1.0-Q4_0-00009-of-00009.gguf", }, }, { given: "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", expected: []string{ "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00001-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00002-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00003-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00004-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00005-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00006-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00007-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00008-of-00009.gguf", "Meta-Llama-3.1-405B-Instruct.Q2_K.gguf-00009-of-00009.gguf", }, }, { given: "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", expected: []string{ "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00001-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00002-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00003-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00004-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00005-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00006-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00007-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00008-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00009-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00010-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00011-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00012-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00013-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00014-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00015-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00016-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00017-of-00018.gguf", "Meta-Llama-3.1-405B-Instruct-XelotX-BF16-00018-of-00018.gguf", }, }, { given: "not-a-known-arrangement.gguf", expected: nil, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := CompleteShardGGUFFilename(tc.given) assert.Equal(t, tc.expected, actual) }) } } ================================================ FILE: gen.go ================================================ //go:generate 
go generate -tags stringer gen.stringer.go //go:generate go generate -tags regression gen.regression.go package gguf_parser ================================================ FILE: gen.regression.go ================================================ //go:build regression //go:generate go run -tags regression gen.regression.go package main import ( "fmt" "strconv" "math" "os" "text/template" "bytes" "go/format" "gonum.org/v1/gonum/mat" "golang.org/x/exp/maps" "sort" ) type LinearRegression struct { Intercept float64 Slope float64 } func (lr *LinearRegression) Fit(xs, ys []float64) { if len(xs) != len(ys) { panic("length of xs and ys must be the same") } var sX, sY, sXY, sXX float64 for i := 0; i < len(xs); i++ { sX += xs[i] sY += ys[i] sXY += xs[i] * ys[i] sXX += xs[i] * xs[i] } n := float64(len(xs)) d := n*sXX - sX*sX if d == 0 { d = 1 } lr.Slope = (n*sXY - sX*sY) / d lr.Intercept = (sY*sXX - sX*sXY) / d } func (lr *LinearRegression) Predict(x float64) (y float64) { return lr.Intercept + lr.Slope*x } type PolynomialRegression struct { Degree int Coefficients []float64 } func (pr *PolynomialRegression) Fit(xs, ys []float64) { samples := len(xs) feats := pr.Degree + 1 feat := mat.NewDense(samples, feats, nil) { for i := 0; i < samples; i++ { for j := 0; j < feats; j++ { feat.Set(i, j, math.Pow(xs[i], float64(j))) } } var qr mat.QR qr.Factorize(feat) } yVec := mat.NewVecDense(samples, ys) var coef mat.VecDense if err := coef.SolveVec(feat, yVec); err != nil { panic("failed to solve") } pr.Coefficients = coef.RawVector().Data } func (pr *PolynomialRegression) Predict(x float64) (y float64) { y = 0 for i := 0; i < pr.Degree+1; i++ { y += pr.Coefficients[i] * math.Pow(x, float64(i)) } return } func DiffusionModelMemoryUsageRegression(output string) { type Regression struct { Name string LinearRegression *LinearRegression PolynomialRegression *PolynomialRegression } const tmplStr = ` package gguf_parser import "math" {{ range . -}} // {{ .Name }} returns the memory usage in bytes for the given width and height, // which is calculated by linear regression or polynomial regression. 
func {{ .Name }}(width, height uint32, flashAttention bool) uint64 { coefficients := []float64{ {{ range $i, $c := .PolynomialRegression.Coefficients }}{{ if eq $i 0 }}{{ printf "%.4f" $c }}{{ else }}{{ printf "%.10f" $c }}{{ end }}, {{ end }} } degree := {{ .PolynomialRegression.Degree }} x := float64(width * height) {{ if .LinearRegression -}} if flashAttention { coefficients = []float64{ {{ printf "%.5f" .LinearRegression.Intercept }}, {{ printf "%.10f" .LinearRegression.Slope }} } degree = 1 } {{- end }} y := float64(0) for i := 0; i <= degree; i++ { y += coefficients[i] * math.Pow(x, float64(i)) } return uint64(y) } {{ end }} ` ts := []struct { n string x2y map[float64]float64 c map[float64]float64 fax2y map[float64]float64 fac map[float64]float64 }{ { n: "GuessSD1DiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 49.57 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 559.90 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8360.93 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 18681.62 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 25377.96 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 41842.65 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 77333.77 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 49.57, 512 * 512: 559.90, 1024 * 1024: 8360.93, 1024 * 1536: 18681.62, 1024 * 1792: 25377.96, 1536 * 1536: 41842.65, 1792 * 1792: 77333.77, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 56879.17 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 100924.37 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 56879.17, 1792 * 2048: 100924.37, }, }, { n: "GuessSD2DiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 37.65 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 367.98 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 11769.69 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 15970.04 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 26290.73 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 48521.84 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 37.65, 512 * 512: 367.98, 1024 * 1024: 830.86, 1024 * 1536: 11769.69, 1024 * 1792: 15970.04, 1536 * 1536: 26290.73, 1792 * 1792: 48521.84, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 35711.24 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 63292.44 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 35711.24, 1792 * 2048: 63292.44, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 34.52 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 130.48 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 519.01 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 774.69 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 902.54 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1158.23 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute 
buffer size: 1573.72 MB(VRAM) // 1792*1792 fax2y: map[float64]float64{ 256 * 256: 34.52, 512 * 512: 130.48, 1024 * 1024: 519.01, 1024 * 1536: 774.69, 1024 * 1792: 902.54, 1536 * 1536: 1158.23, 1792 * 1792: 1573.72, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1349.99 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1797.44 MB(VRAM) // 1792*2048 fac: map[float64]float64{ 1536 * 1792: 1349.99, 1792 * 2048: 1797.44, }, }, { n: "GuessSDXLDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.76 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1701.55 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2256.90 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 3607.58 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 6484.95 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 60.76, 512 * 512: 132.05, 1024 * 1024: 830.86, 1024 * 1536: 1701.55, 1024 * 1792: 2256.90, 1536 * 1536: 3607.58, 1792 * 1792: 6484.95, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4830.60 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8384.30 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 4830.60, 1792 * 2048: 8384.30, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.13 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 440.86 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 726.55 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 874.40 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1110.08 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1554.33 MB(VRAM) // 1792*1792 fax2y: map[float64]float64{ 256 * 256: 60.13, 512 * 512: 132.05, 1024 * 1024: 440.86, 1024 * 1536: 726.55, 1024 * 1792: 874.40, 1536 * 1536: 1110.08, 1792 * 1792: 1554.33, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1339.35 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1769.30 MB(VRAM) // 1792*2048 fac: map[float64]float64{ 1536 * 1792: 1339.35, 1792 * 2048: 1769.30, }, }, { // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 968.43 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2013.12 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2679.46 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4300.15 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 7752.77 MB(VRAM) // 1792*1792 n: "GuessSDXLRefinerDiffusionModelMemoryUsage", x2y: map[float64]float64{ 256 * 256: 44.57, 512 * 512: 154.40, 1024 * 1024: 968.43, 1024 * 1536: 2013.12, 1024 * 1792: 2679.46, 1536 * 1536: 4300.15, 1792 * 1792: 7752.77, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 5767.67 MB(VRAM) // 1536*1792 
// [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 10031.87 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 5767.67, 1792 * 2048: 10031.87, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 596.43 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 915.12 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1062.46 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1357.15 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1836.02 MB(VRAM) // 1792*1792 fax2y: map[float64]float64{ 256 * 256: 44.57, 512 * 512: 154.40, 1024 * 1024: 596.43, 1024 * 1536: 915.12, 1024 * 1792: 1062.46, 1536 * 1536: 1357.15, 1792 * 1792: 1836.02, }, // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1578.17 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2014.02 MB(VRAM) // 1792*2048 fac: map[float64]float64{ 1536 * 1792: 1578.17, 1792 * 2048: 2014.02, }, }, { n: "GuessSD3MediumDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 37.09 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 169.64 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 1786.11 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 3824.36 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 5131.48 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 8319.03 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 15141.18 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 37.09, 512 * 512: 169.64, 1024 * 1024: 1786.11, 1024 * 1536: 3824.36, 1024 * 1792: 5131.48, 1536 * 1536: 8319.03, 1792 * 1792: 15141.18, }, // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 11215.71 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19654.65 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 11215.71, 1792 * 2048: 19654.65, }, }, { n: "GuessSD35MediumDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 41.48 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 181.64 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 1834.11 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 3896.36 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 5215.48 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8427.03 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 15288.18 MiB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 41.48, 512 * 512: 181.64, 1024 * 1024: 1834.11, 1024 * 1536: 3896.36, 1024 * 1792: 5215.48, 1536 * 1536: 8427.03, 1792 * 1792: 15288.18, }, // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 11341.71 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19822.65 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 11341.71, 1792 * 2048: 19822.65, }, }, { n: "GuessSD35LargeDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer 
size: 57.27 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 276.54 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 2865.44 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 6109.95 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8188.92 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 13258.86 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 24091.01 MiB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 57.27, 512 * 512: 276.54, 1024 * 1024: 2865.44, 1024 * 1536: 6109.95, 1024 * 1792: 8188.92, 1536 * 1536: 13258.86, 1792 * 1792: 24091.01, }, // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 17859.31 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 31253.70 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 17859.31, 1792 * 2048: 31253.70, }, }, { n: "GuessFLUXDiffusionModelMemoryUsage", // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 103.35 MB(VRAM) // 256*256 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 398.05 MB(VRAM) // 512*512 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 2576.18 MB(VRAM) // 1024*1024 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 4978.31 MB(VRAM) // 1024*1536 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 6467.37 MB(VRAM) // 1024*1792 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 10021.49 MB(VRAM) // 1536*1536 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 17434.95 MB(VRAM) // 1792*1792 x2y: map[float64]float64{ 256 * 256: 103.35, 512 * 512: 398.05, 1024 * 1024: 2576.18, 1024 * 1536: 4978.31, 1024 * 1792: 6467.37, 1536 * 1536: 10021.49, 1792 * 1792: 17434.95, }, // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 13191.09 MB(VRAM) // 1536*1792 // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 22266.81 MB(VRAM) // 1792*2048 c: map[float64]float64{ 1536 * 1792: 13191.09, 1792 * 2048: 22266.81, }, }, } rs := make([]Regression, len(ts)) for i, t := range ts { rs[i].Name = t.n } fmt.Println("Polynomial Regression For None Flash Attention") for i, t := range ts { pr := PolynomialRegression{ Degree: 2, } xs := maps.Keys(t.x2y) sort.Float64s(xs) ys := make([]float64, len(xs)) for j, x := range xs { ys[j] = t.x2y[x] * 1024 * 1024 // MB to B } pr.Fit(xs, ys) for x, y := range t.c { y_ := pr.Predict(x) / 1024 / 1024 // B to MB d := (y_ - y) / y * 100 s := "+" if d < 0 { s = "" } c := "" if d > 10 { c = "?" } fmt.Printf("%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\n", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c) } rs[i].PolynomialRegression = &pr } fmt.Println("Linear Regression For Flash Attention") for i, t := range ts { if len(t.fax2y) == 0 { continue } lr := LinearRegression{} xs := maps.Keys(t.fax2y) sort.Float64s(xs) ys := make([]float64, len(xs)) for j, x := range xs { ys[j] = t.fax2y[x] * 1024 * 1024 // MB to B } lr.Fit(xs, ys) for x, y := range t.fac { y_ := lr.Predict(x) / 1024 / 1024 // B to MB d := (y_ - y) / y * 100 s := "+" if d < 0 { s = "" } c := "" if d > 10 { c = "?" 
} fmt.Printf("%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\n", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c) } rs[i].LinearRegression = &lr } var code []byte { var ( buff bytes.Buffer err error ) tmpl := template.Must(template.New("tmpl").Parse(tmplStr)) if err = tmpl.Execute(&buff, rs); err != nil { panic(fmt.Errorf("failed to execute template: %w", err)) } code, err = format.Source(buff.Bytes()) if err != nil { panic(fmt.Errorf("failed to format source: %w", err)) } } if err := os.WriteFile(output, code, 0644); err != nil { panic(fmt.Errorf("failed to write file: %w", err)) } } func main() { DiffusionModelMemoryUsageRegression("zz_generated.diffusion_model_memory_usage.regression.go") } ================================================ FILE: gen.stringer.go ================================================ //go:build stringer //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.stringer.go -trimprefix GGUFMetadataValueType //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix GGUFFileType //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLType package gguf_parser import _ "golang.org/x/tools/cmd/stringer" ================================================ FILE: ggml.go ================================================ package gguf_parser import ( "errors" "fmt" "slices" ) // Types for GGMLType. type ( // GGMLType is a type of GGML tensor, // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/include/ggml.h#L368-L410. GGMLType uint32 // GGMLTypeTrait holds the trait of a GGMLType, // see https://github.com/ggml-org/llama.cpp/blob/fd1234cb468935ea087d6929b2487926c3afff4b/ggml/src/ggml.c#L586-L876. GGMLTypeTrait struct { BlockSize uint64 // Original is int, in order to reduce conversion, here we use uint64. TypeSize uint64 // Original is uint32, in order to reduce conversion, here we use uint64. Quantized bool } ) // GGMLType constants. // // GGMLTypeQ4_2, GGMLTypeQ4_3 are deprecated. // GGMLTypeQ4_0_4_4, GGMLTypeQ4_0_4_8, GGMLTypeQ4_0_8_8 are deprecated. // GGMLTypeIQ4_NL_4_4, GGMLTypeIQ4_NL_4_8, GGMLTypeIQ4_NL_8_8 are deprecated. const ( GGMLTypeF32 GGMLType = iota GGMLTypeF16 GGMLTypeQ4_0 GGMLTypeQ4_1 GGMLTypeQ4_2 GGMLTypeQ4_3 GGMLTypeQ5_0 GGMLTypeQ5_1 GGMLTypeQ8_0 GGMLTypeQ8_1 GGMLTypeQ2_K GGMLTypeQ3_K GGMLTypeQ4_K GGMLTypeQ5_K GGMLTypeQ6_K GGMLTypeQ8_K GGMLTypeIQ2_XXS GGMLTypeIQ2_XS GGMLTypeIQ3_XXS GGMLTypeIQ1_S GGMLTypeIQ4_NL GGMLTypeIQ3_S GGMLTypeIQ2_S GGMLTypeIQ4_XS GGMLTypeI8 GGMLTypeI16 GGMLTypeI32 GGMLTypeI64 GGMLTypeF64 GGMLTypeIQ1_M GGMLTypeBF16 GGMLTypeQ4_0_4_4 GGMLTypeQ4_0_4_8 GGMLTypeQ4_0_8_8 GGMLTypeTQ1_0 GGMLTypeTQ2_0 GGMLTypeIQ4_NL_4_4 GGMLTypeIQ4_NL_4_8 GGMLTypeIQ4_NL_8_8 GGMLTypeMXFP4 _GGMLTypeCount // Unknown ) // _GGMLTypeTraits is a table of GGMLTypeTrait for GGMLType. 
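Stepping back to the regression generator above: LinearRegression.Fit is ordinary least squares in closed form, with slope = (n·Σxy − Σx·Σy) / (n·Σxx − (Σx)²) and intercept = (Σy·Σxx − Σx·Σxy) / (n·Σxx − (Σx)²). A self-contained sketch of the same formula, checked against points that lie exactly on a line (assuming a non-degenerate denominator, which the generator guards against separately):

package main

import "fmt"

// fit mirrors LinearRegression.Fit from gen.regression.go above.
func fit(xs, ys []float64) (intercept, slope float64) {
	var sX, sY, sXY, sXX float64
	for i := range xs {
		sX += xs[i]
		sY += ys[i]
		sXY += xs[i] * ys[i]
		sXX += xs[i] * xs[i]
	}
	n := float64(len(xs))
	d := n*sXX - sX*sX
	return (sY*sXX - sX*sXY) / d, (n*sXY - sX*sY) / d
}

func main() {
	// Points on y = 2x + 1 recover intercept 1 and slope 2 exactly.
	i, s := fit([]float64{1, 2, 3}, []float64{3, 5, 7})
	fmt.Println(i, s) // 1 2
}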
var _GGMLTypeTraits = map[GGMLType]GGMLTypeTrait{ GGMLTypeF32: {BlockSize: 1, TypeSize: 4}, GGMLTypeF16: {BlockSize: 1, TypeSize: 2}, GGMLTypeQ4_0: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeQ4_1: {BlockSize: 32, TypeSize: 20, Quantized: true}, GGMLTypeQ4_2: {BlockSize: 0, TypeSize: 0}, // Deprecated GGMLTypeQ4_3: {BlockSize: 0, TypeSize: 0}, // Deprecated GGMLTypeQ5_0: {BlockSize: 32, TypeSize: 22, Quantized: true}, GGMLTypeQ5_1: {BlockSize: 32, TypeSize: 24, Quantized: true}, GGMLTypeQ8_0: {BlockSize: 32, TypeSize: 34, Quantized: true}, GGMLTypeQ8_1: {BlockSize: 32, TypeSize: 36, Quantized: true}, GGMLTypeQ2_K: {BlockSize: 256, TypeSize: 84, Quantized: true}, GGMLTypeQ3_K: {BlockSize: 256, TypeSize: 110, Quantized: true}, GGMLTypeQ4_K: {BlockSize: 256, TypeSize: 144, Quantized: true}, GGMLTypeQ5_K: {BlockSize: 256, TypeSize: 176, Quantized: true}, GGMLTypeQ6_K: {BlockSize: 256, TypeSize: 210, Quantized: true}, GGMLTypeQ8_K: {BlockSize: 256, TypeSize: 292, Quantized: true}, GGMLTypeIQ2_XXS: {BlockSize: 256, TypeSize: 66, Quantized: true}, GGMLTypeIQ2_XS: {BlockSize: 256, TypeSize: 74, Quantized: true}, GGMLTypeIQ3_XXS: {BlockSize: 256, TypeSize: 98, Quantized: true}, GGMLTypeIQ1_S: {BlockSize: 256, TypeSize: 50, Quantized: true}, GGMLTypeIQ4_NL: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ3_S: {BlockSize: 256, TypeSize: 110, Quantized: true}, GGMLTypeIQ2_S: {BlockSize: 256, TypeSize: 82, Quantized: true}, GGMLTypeIQ4_XS: {BlockSize: 256, TypeSize: 136, Quantized: true}, GGMLTypeI8: {BlockSize: 1, TypeSize: 1}, GGMLTypeI16: {BlockSize: 1, TypeSize: 2}, GGMLTypeI32: {BlockSize: 1, TypeSize: 4}, GGMLTypeI64: {BlockSize: 1, TypeSize: 8}, GGMLTypeF64: {BlockSize: 1, TypeSize: 8}, GGMLTypeIQ1_M: {BlockSize: 256, TypeSize: 56, Quantized: true}, GGMLTypeBF16: {BlockSize: 1, TypeSize: 2}, GGMLTypeQ4_0_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeQ4_0_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeQ4_0_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeTQ1_0: {BlockSize: 256, TypeSize: 54, Quantized: true}, GGMLTypeTQ2_0: {BlockSize: 256, TypeSize: 66, Quantized: true}, GGMLTypeIQ4_NL_4_4: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_4_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeIQ4_NL_8_8: {BlockSize: 32, TypeSize: 18, Quantized: true}, GGMLTypeMXFP4: {BlockSize: 32, TypeSize: 17, Quantized: true}, } // Trait returns the GGMLTypeTrait of the GGMLType. func (t GGMLType) Trait() (GGMLTypeTrait, bool) { tt, ok := _GGMLTypeTraits[t] return tt, ok } // IsQuantized returns whether the GGMLType is quantized. func (t GGMLType) IsQuantized() bool { tt, ok := t.Trait() if !ok { return false } return tt.Quantized } // RowSizeOf returns the size of the given dimensions according to the GGMLType's GGMLTypeTrait, // which is inspired by // https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L3142-L3145. // // The index of the given dimensions means the number of dimension, // i.e. 0 is the first dimension, 1 is the second dimension, and so on. // // The value of the item is the number of elements in the corresponding dimension. 
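To make the row-size rule below concrete, a small sketch using the trait table above: the row cost is TypeSize * dimensions[0] / BlockSize, multiplied by each remaining dimension. Q4_K packs 256 elements into 144 bytes, so a 4096-wide row costs 4096 / 256 * 144 = 2304 bytes.

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGMLTypeQ4_K.RowSizeOf([]uint64{4096}))     // 2304
	fmt.Println(gguf.GGMLTypeQ4_K.RowSizeOf([]uint64{4096, 32})) // 73728 (2304 * 32)
	fmt.Println(gguf.GGMLTypeF16.IsQuantized())                  // false (plain 2-byte floats)
}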
func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 { if len(dimensions) == 0 { panic(errors.New("no dimensions")) } tt, ok := t.Trait() if !ok { panic(fmt.Errorf("invalid type: %v", t)) } // https://github.com/ggerganov/ggml/blob/a10a8b880c059b3b29356eb9a9f8df72f03cdb6a/src/ggml.c#L2640-L2643 ds := tt.TypeSize * dimensions[0] / tt.BlockSize // Row size for i := 1; i < len(dimensions); i++ { ds *= dimensions[i] } return ds } // GGMLMemoryPadding returns the padded size of the given size according to GGML memory padding, // see https://github.com/ggerganov/ggml/blob/0cbb7c0/include/ggml/ggml.h#L238-L243. func GGMLMemoryPadding(size uint64) uint64 { const align = 16 return GGMLPadding(size, align) } // GGMLPadding returns the padded size of the given size according to given align, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L255. func GGMLPadding(size, align uint64) uint64 { return (size + align - 1) &^ (align - 1) } // GGML tensor constants. const ( // GGMLTensorSize is the size of GGML tensor in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L606. GGMLTensorSize = 368 // GGMLObjectSize is the size of GGML object in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/include/ggml/ggml.h#L563. GGMLObjectSize = 32 ) // GGMLTensorOverhead is the overhead of GGML tensor in bytes, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L2765-L2767. func GGMLTensorOverhead() uint64 { return GGMLObjectSize + GGMLTensorSize } // GGML computation graph constants. const ( // GGMLComputationGraphSize is the size of GGML computation graph in bytes. GGMLComputationGraphSize = 80 // GGMLComputationBitsetSize is the size of GGML computation bitset in bytes, // see https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-impl.h#L165. GGMLComputationBitsetSize = 4 ) // GGMLComputationGraphOverhead is the overhead of GGML graph in bytes, // see https://github.com/ggml-org/ggml/blob/5592ffda9c417c3c12232c828247c23d17004c88/src/ggml.c#L5941-L5956. func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 { const ps = 8 // c++ pointer size hs := GGMLHashSize(nodes * 2) var g uint64 = GGMLComputationGraphSize // graph g += GGMLPadding(nodes*ps, ps) // nodes g += GGMLPadding(nodes*ps, ps) // leafs g += GGMLPadding(nodes*ps, ps) // parents g += GGMLPadding(hs*ps, ps) // hash keys if grads { g += GGMLPadding(hs*ps, ps) // grads g += GGMLPadding(hs*ps, ps) // grad_accs } g += GGMLPadding(GGMLBitsetSize(hs)*GGMLComputationBitsetSize, GGMLComputationBitsetSize) // bitset return GGMLObjectSize + GGMLMemoryPadding(g) } // GGMLHashSize returns the size of the hash table for the given base, // see https://github.com/ggerganov/ggml/blob/0cbb7c0e053f5419cfbebb46fbf4d4ed60182cf5/src/ggml.c#L17698-L17722. 
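The padding helpers above are plain bit arithmetic: (size + align - 1) &^ (align - 1) rounds up to the next multiple of align. A sketch with worked values:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGMLPadding(100, 16))   // 112, the next multiple of 16
	fmt.Println(gguf.GGMLMemoryPadding(100)) // 112, align is fixed at 16
	// Per-tensor bookkeeping is the object size plus the tensor struct size.
	fmt.Println(gguf.GGMLTensorOverhead()) // 400 (32 + 368)
}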
func GGMLHashSize(base uint64) uint64 { primes := []uint64{ 2, 3, 5, 11, 17, 37, 67, 131, 257, 521, 1031, 2053, 4099, 8209, 16411, 32771, 65537, 131101, 262147, 524309, 1048583, 2097169, 4194319, 8388617, 16777259, 33554467, 67108879, 134217757, 268435459, 536870923, 1073741827, 2147483659, } i, ok := slices.BinarySearchFunc(primes, base, func(e, t uint64) int { if t >= e { return 0 } return -1 }) if !ok { return base | 1 } return primes[i] } // GGMLBitsetSize returns the size of the bitset for the given number of bits, // see https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/ggml/src/ggml-impl.h#L166-L171. func GGMLBitsetSize(n uint64) uint64 { return (n + (GGMLComputationBitsetSize*8 - 1)) >> 5 } ================================================ FILE: go.mod ================================================ module github.com/gpustack/gguf-parser-go go 1.22.0 toolchain go1.22.9 require ( github.com/davecgh/go-spew v1.1.1 github.com/henvic/httpretty v0.1.4 github.com/json-iterator/go v1.1.12 github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d github.com/stretchr/testify v1.9.0 golang.org/x/crypto v0.29.0 golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f golang.org/x/sync v0.9.0 golang.org/x/sys v0.27.0 golang.org/x/tools v0.27.0 gonum.org/v1/gonum v0.15.1 ) require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.2 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect golang.org/x/mod v0.22.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) ================================================ FILE: go.sum ================================================ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU= github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM= github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY= github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify 
v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ= golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo= golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak= golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4= golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY= golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ= golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o= golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q= gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: ollama_model.go ================================================ package gguf_parser import ( "context" "fmt" "net/http" "net/url" "regexp" "strings" "golang.org/x/sync/errgroup" "github.com/gpustack/gguf-parser-go/util/httpx" "github.com/gpustack/gguf-parser-go/util/json" "github.com/gpustack/gguf-parser-go/util/stringx" ) // Inspired by https://github.com/ollama/ollama/blob/380e06e5bea06ae8ded37f47c37bd5d604194d3e/types/model/name.go, // and https://github.com/ollama/ollama/blob/380e06e5bea06ae8ded37f47c37bd5d604194d3e/server/modelpath.go. const ( OllamaDefaultScheme = "https" OllamaDefaultRegistry = "registry.ollama.ai" OllamaDefaultNamespace = "library" OllamaDefaultTag = "latest" ) type ( // OllamaModel represents an Ollama model, // its manifest(including MediaType, Config and Layers) can be completed further by calling the Complete method. OllamaModel struct { Schema string `json:"schema"` Registry string `json:"registry"` Namespace string `json:"namespace"` Repository string `json:"repository"` Tag string `json:"tag"` SchemaVersion uint32 `json:"schemaVersion"` MediaType string `json:"mediaType"` Config OllamaModelLayer `json:"config"` Layers []OllamaModelLayer `json:"layers"` // Client is the http client used to complete the OllamaModel's network operations. // // When this field is nil, // it will be set to the client used by OllamaModel.Complete. // // When this field is offered, // the network operations will be done with this client. Client *http.Client `json:"-"` } // OllamaModelLayer represents an Ollama model layer, // its digest can be used to download the artifact. 
OllamaModelLayer struct { MediaType string `json:"mediaType"` Size uint64 `json:"size"` Digest string `json:"digest"` // Root points to the root OllamaModel, // which is never serialized or deserialized. // // When called OllamaModel.Complete, // this field will be set to the OllamaModel itself. // If not, this field will be nil, // and must be set manually to the root OllamaModel before calling the method of OllamaModelLayer. Root *OllamaModel `json:"-"` } ) // ParseOllamaModel parses the given Ollama model string, // and returns the OllamaModel, or nil if the model is invalid. func ParseOllamaModel(model string, opts ...OllamaModelOption) *OllamaModel { if model == "" { return nil } var o _OllamaModelOptions for _, opt := range opts { opt(&o) } om := OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Tag: OllamaDefaultTag, } { if o.DefaultScheme != "" { om.Schema = o.DefaultScheme } if o.DefaultRegistry != "" { om.Registry = o.DefaultRegistry } if o.DefaultNamespace != "" { om.Namespace = o.DefaultNamespace } if o.DefaultTag != "" { om.Tag = o.DefaultTag } } m := model // Drop digest. m, _, _ = stringx.CutFromRight(m, "@") // Get tag. m, s, ok := stringx.CutFromRight(m, ":") if ok && s != "" { om.Tag = s } // Get repository. m, s, ok = stringx.CutFromRight(m, "/") if ok && s != "" { om.Repository = s } else if m != "" { om.Repository = m m = "" } // Get namespace. m, s, ok = stringx.CutFromRight(m, "/") if ok && s != "" { om.Namespace = s } else if m != "" { om.Namespace = m m = "" } // Get registry. m, s, ok = stringx.CutFromLeft(m, "://") if ok && s != "" { om.Schema = m om.Registry = s } else if m != "" { om.Registry = m } if om.Repository == "" { return nil } return &om } func (om *OllamaModel) String() string { var b strings.Builder if om.Registry != "" { b.WriteString(om.Registry) b.WriteByte('/') } if om.Namespace != "" { b.WriteString(om.Namespace) b.WriteByte('/') } b.WriteString(om.Repository) if om.Tag != "" { b.WriteByte(':') b.WriteString(om.Tag) } return b.String() } // GetLayer returns the OllamaModelLayer with the given media type, // and true if found, and false otherwise. func (om *OllamaModel) GetLayer(mediaType string) (OllamaModelLayer, bool) { for i := range om.Layers { if om.Layers[i].MediaType == mediaType { return om.Layers[i], true } } return OllamaModelLayer{}, false } // SearchLayers returns a list of OllamaModelLayer with the media type that matches the given regex. func (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []OllamaModelLayer { var ls []OllamaModelLayer for i := range om.Layers { if mediaTypeRegex.MatchString(om.Layers[i].MediaType) { ls = append(ls, om.Layers[i]) } } return ls } // WebPageURL returns the Ollama web page URL of the OllamaModel. func (om *OllamaModel) WebPageURL() *url.URL { u := &url.URL{ Scheme: om.Schema, Host: om.Registry, } return u.JoinPath(om.Namespace, om.Repository+":"+om.Tag) } // Complete completes the OllamaModel with the given context and http client. 
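A short sketch of ParseOllamaModel's reference splitting, mirroring the cases in ollama_model_test.go further down; defaults fill in the scheme, registry, namespace, and tag, and a trailing digest is dropped:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	om := gguf.ParseOllamaModel("gemma2:awesome@sha256:1234567890abcdef")
	fmt.Println(om.Registry)   // registry.ollama.ai
	fmt.Println(om.Namespace)  // library
	fmt.Println(om.Repository) // gemma2
	fmt.Println(om.Tag)        // awesome
	fmt.Println(om)            // registry.ollama.ai/library/gemma2:awesome
}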
func (om *OllamaModel) Complete(ctx context.Context, cli *http.Client) error { if om.Client == nil { om.Client = cli } u := &url.URL{ Scheme: om.Schema, Host: om.Registry, } u = u.JoinPath("v2", om.Namespace, om.Repository, "manifests", om.Tag) req, err := httpx.NewGetRequestWithContext(ctx, u.String()) if err != nil { return fmt.Errorf("new request: %w", err) } req.Header.Set("Accept", "application/vnd.docker.distribution.manifest.v2+json") err = httpx.Do(om.Client, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("status code %d", resp.StatusCode) } return json.NewDecoder(resp.Body).Decode(om) }) if err != nil { return fmt.Errorf("do request %s: %w", u, err) } // Connect. om.Config.Root = om for i := range om.Layers { om.Layers[i].Root = om } return nil } // Params returns the parameters of the OllamaModel. func (om *OllamaModel) Params(ctx context.Context, cli *http.Client) (map[string]any, error) { if cli == nil { cli = om.Client } if cli == nil { return nil, fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.params$`)) if len(mls) == 0 { return nil, nil } rs := make([]map[string]any, len(mls)) eg, ctx := errgroup.WithContext(ctx) for i := range mls { x := i eg.Go(func() error { bs, err := mls[x].FetchBlob(ctx, cli) if err == nil { p := make(map[string]any) if err = json.Unmarshal(bs, &p); err == nil { rs[x] = p } } return err }) } if err := eg.Wait(); err != nil { return nil, fmt.Errorf("fetch blob: %w", err) } r := make(map[string]any) for i := range rs { for k, v := range rs[i] { r[k] = v } } return r, nil } // Template returns the template of the OllamaModel. func (om *OllamaModel) Template(ctx context.Context, cli *http.Client) (string, error) { if cli == nil { cli = om.Client } if cli == nil { return "", fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.(prompt|template)$`)) if len(mls) == 0 { return "", nil } ml := mls[len(mls)-1] bs, err := ml.FetchBlob(ctx, cli) if err != nil { return "", fmt.Errorf("fetch blob: %w", err) } return stringx.FromBytes(&bs), nil } // System returns the system message of the OllamaModel. func (om *OllamaModel) System(ctx context.Context, cli *http.Client) (string, error) { if cli == nil { cli = om.Client } if cli == nil { return "", fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.system$`)) if len(mls) == 0 { return "", nil } ml := mls[len(mls)-1] bs, err := ml.FetchBlob(ctx, cli) if err != nil { return "", fmt.Errorf("fetch blob: %w", err) } return stringx.FromBytes(&bs), nil } // License returns the license of the OllamaModel. func (om *OllamaModel) License(ctx context.Context, cli *http.Client) ([]string, error) { if cli == nil { cli = om.Client } if cli == nil { return nil, fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.license$`)) if len(mls) == 0 { return nil, nil } rs := make([]string, len(mls)) eg, ctx := errgroup.WithContext(ctx) for i := range mls { x := i eg.Go(func() error { bs, err := mls[x].FetchBlob(ctx, cli) if err == nil { rs[x] = stringx.FromBytes(&bs) } return err }) } if err := eg.Wait(); err != nil { return nil, fmt.Errorf("fetch blob: %w", err) } return rs, nil } // Messages returns the messages of the OllamaModel. 
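A network sketch of the manifest and blob helpers above. It assumes the public registry answers anonymously; newer models may also need the user-agent and token handling from ollama_registry_authenticate.go below, and the weights media type used here, application/vnd.ollama.image.model, is an assumption based on Ollama's image layout rather than something this file defines.

package main

import (
	"context"
	"fmt"
	"log"
	"net/http"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	ctx := context.Background()
	om := gguf.ParseOllamaModel("llama3.1")
	// Complete fetches the v2 manifest and wires each layer's Root pointer.
	if err := om.Complete(ctx, http.DefaultClient); err != nil {
		log.Fatal(err)
	}
	// Params resolves its layers by media type and fetches the blobs;
	// a nil client falls back to the client stored by Complete.
	params, err := om.Params(ctx, nil)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(params)
	// Assumed media type for the GGUF weights layer, see the lead-in above.
	if l, ok := om.GetLayer("application/vnd.ollama.image.model"); ok {
		fmt.Println("weights blob:", l.BlobURL())
	}
}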
func (om *OllamaModel) Messages(ctx context.Context, cli *http.Client) ([]json.RawMessage, error) { if cli == nil { cli = om.Client } if cli == nil { return nil, fmt.Errorf("no client") } mls := om.SearchLayers(regexp.MustCompile(`^application/vnd\.ollama\.image\.messages$`)) if len(mls) == 0 { return nil, nil } rs := make([]json.RawMessage, len(mls)) eg, ctx := errgroup.WithContext(ctx) for i := range mls { x := i eg.Go(func() error { bs, err := mls[x].FetchBlob(ctx, cli) if err == nil { rs[x] = bs } return err }) } if err := eg.Wait(); err != nil { return nil, fmt.Errorf("fetch blob: %w", err) } return rs, nil } // BlobURL returns the blob URL of the OllamaModelLayer. func (ol *OllamaModelLayer) BlobURL() *url.URL { if ol.Root == nil { return nil } u := &url.URL{ Scheme: ol.Root.Schema, Host: ol.Root.Registry, } return u.JoinPath("v2", ol.Root.Namespace, ol.Root.Repository, "blobs", ol.Digest) } // FetchBlob fetches the blob of the OllamaModelLayer with the given context and http client, // and returns the response body as bytes. func (ol *OllamaModelLayer) FetchBlob(ctx context.Context, cli *http.Client) ([]byte, error) { var b []byte err := ol.FetchBlobFunc(ctx, cli, func(resp *http.Response) error { b = httpx.BodyBytes(resp) return nil }) return b, err } // FetchBlobFunc fetches the blob of the OllamaModelLayer with the given context and http client, // and processes the response with the given function. func (ol *OllamaModelLayer) FetchBlobFunc(ctx context.Context, cli *http.Client, process func(*http.Response) error) error { if cli == nil { cli = ol.Root.Client } if cli == nil { return fmt.Errorf("no client") } u := ol.BlobURL() if u == nil { return fmt.Errorf("no blob URL") } req, err := httpx.NewGetRequestWithContext(ctx, u.String()) if err != nil { return fmt.Errorf("new request: %w", err) } err = httpx.Do(cli, req, process) if err != nil { return fmt.Errorf("do request %s: %w", u, err) } return nil } ================================================ FILE: ollama_model_option.go ================================================ package gguf_parser import ( "net/url" "strings" ) type ( _OllamaModelOptions struct { DefaultScheme string DefaultRegistry string DefaultNamespace string DefaultTag string } OllamaModelOption func(*_OllamaModelOptions) ) // SetOllamaModelBaseURL parses the given base URL, // and sets default schema/registry for OllamaModel. func SetOllamaModelBaseURL(baseURL string) OllamaModelOption { baseURL = strings.TrimSpace(baseURL) return func(o *_OllamaModelOptions) { if baseURL == "" { return } if !strings.Contains(baseURL, "://") { baseURL = "https://" + baseURL } u, err := url.Parse(baseURL) if err != nil { return } o.DefaultScheme = u.Scheme o.DefaultRegistry = u.Host } } // SetOllamaModelDefaultScheme sets the default scheme for OllamaModel. func SetOllamaModelDefaultScheme(scheme string) OllamaModelOption { return func(o *_OllamaModelOptions) { if scheme == "" { return } o.DefaultScheme = scheme } } // SetOllamaModelDefaultRegistry sets the default registry for OllamaModel. func SetOllamaModelDefaultRegistry(registry string) OllamaModelOption { return func(o *_OllamaModelOptions) { if registry == "" { return } o.DefaultRegistry = registry } } // SetOllamaModelDefaultNamespace sets the default namespace for OllamaModel. func SetOllamaModelDefaultNamespace(namespace string) OllamaModelOption { return func(o *_OllamaModelOptions) { if namespace == "" { return } o.DefaultNamespace = namespace } } // SetOllamaModelDefaultTag sets the default tag for OllamaModel. 
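The options above combine with ParseOllamaModel like this; registry.example.com is a hypothetical self-hosted endpoint, and the model string itself still overrides any defaulted part:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	om := gguf.ParseOllamaModel("team/gemma2:latest",
		gguf.SetOllamaModelBaseURL("http://registry.example.com"),
		gguf.SetOllamaModelDefaultNamespace("library"), // overridden by "team" in the model string
	)
	fmt.Println(om) // registry.example.com/team/gemma2:latest
}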
func SetOllamaModelDefaultTag(tag string) OllamaModelOption { return func(o *_OllamaModelOptions) { if tag == "" { return } o.DefaultTag = tag } } ================================================ FILE: ollama_model_test.go ================================================ package gguf_parser import ( "testing" "github.com/stretchr/testify/assert" ) func TestParseOllamaModel(t *testing.T) { cases := []struct { given string expected *OllamaModel }{ { given: "gemma2", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Repository: "gemma2", Tag: OllamaDefaultTag, }, }, { given: "gemma2:awesome", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Repository: "gemma2", Tag: "awesome", }, }, { given: "gemma2:awesome@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: OllamaDefaultNamespace, Repository: "gemma2", Tag: "awesome", }, }, { given: "awesome/gemma2:latest@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: OllamaDefaultRegistry, Namespace: "awesome", Repository: "gemma2", Tag: "latest", }, }, { given: "mysite.com/library/gemma2:latest@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: OllamaDefaultScheme, Registry: "mysite.com", Namespace: "library", Repository: "gemma2", Tag: "latest", }, }, { given: "http://mysite.com/library/gemma2:latest@sha256:1234567890abcdef", expected: &OllamaModel{ Schema: "http", Registry: "mysite.com", Namespace: "library", Repository: "gemma2", Tag: "latest", }, }, } for _, tc := range cases { t.Run(tc.given, func(t *testing.T) { actual := ParseOllamaModel(tc.given) assert.Equal(t, tc.expected, actual) }) } } ================================================ FILE: ollama_registry_authenticate.go ================================================ package gguf_parser import ( "bytes" "context" "crypto/ed25519" "crypto/rand" "encoding/base64" "encoding/json" "encoding/pem" "errors" "fmt" "net/http" "net/url" "os" "path/filepath" "runtime" "strconv" "strings" "time" "golang.org/x/crypto/ssh" "github.com/gpustack/gguf-parser-go/util/funcx" "github.com/gpustack/gguf-parser-go/util/httpx" "github.com/gpustack/gguf-parser-go/util/osx" "github.com/gpustack/gguf-parser-go/util/stringx" ) const ( httpHeaderWWWAuthenticate = "WWW-Authenticate" httpHeaderAuthorization = "Authorization" ) // OllamaUserAgent returns the user agent string for Ollama; // since llama3.1, the user agent is required to be set, // otherwise the request will be rejected with a 412. func OllamaUserAgent() string { return fmt.Sprintf("ollama/9.9.9 (%s %s) Go/%s", runtime.GOARCH, runtime.GOOS, runtime.Version()) } // OllamaRegistryAuthorizeRetry returns true if the request should be retried with authorization. // // OllamaRegistryAuthorizeRetry leverages OllamaRegistryAuthorize to obtain an authorization token, // and configures the request with the token. func OllamaRegistryAuthorizeRetry(resp *http.Response, cli *http.Client) bool { if resp == nil || cli == nil { return false } if resp.StatusCode != http.StatusUnauthorized || resp.Request == nil { // Not unauthorized, or no request to retry with, return. return false } req := resp.Request if req.Header.Get(httpHeaderAuthorization) != "" { // Already authorized, return.
return false } const tokenPrefix = "Bearer " authnToken := strings.TrimPrefix(resp.Header.Get(httpHeaderWWWAuthenticate), tokenPrefix) if authnToken == "" { // No authentication token, return. return false } authzToken := funcx.MustNoError(OllamaRegistryAuthorize(req.Context(), cli, authnToken)) req.Header.Set(httpHeaderAuthorization, tokenPrefix+authzToken) return true } // OllamaRegistryAuthorize authorizes the request with the given authentication token, // and returns the authorization token. func OllamaRegistryAuthorize(ctx context.Context, cli *http.Client, authnToken string) (string, error) { priKey, err := OllamaSingKeyLoad() if err != nil { return "", fmt.Errorf("load sign key: %w", err) } var authzUrl string { ss := strings.Split(authnToken, ",") if len(ss) < 3 { return "", errors.New("invalid authn token") } var realm, service, scope string for _, s := range ss { sp := strings.SplitN(s, "=", 2) if len(sp) < 2 { continue } sp[1] = strings.TrimFunc(sp[1], func(r rune) bool { return r == '"' || r == '\'' }) switch sp[0] { case "realm": realm = sp[1] case "service": service = sp[1] case "scope": scope = sp[1] } } u, err := url.Parse(realm) if err != nil { return "", fmt.Errorf("parse realm: %w", err) } qs := u.Query() qs.Add("service", service) for _, s := range strings.Split(scope, " ") { qs.Add("scope", s) } qs.Add("ts", strconv.FormatInt(time.Now().Unix(), 10)) qs.Add("nonce", stringx.RandomBase64(16)) u.RawQuery = qs.Encode() authzUrl = u.String() } var authnData string { pubKey := ssh.MarshalAuthorizedKey(priKey.PublicKey()) pubKeyp := bytes.Split(pubKey, []byte(" ")) if len(pubKeyp) < 2 { return "", errors.New("malformed public key") } nc := base64.StdEncoding.EncodeToString([]byte(stringx.SumBytesBySHA256(nil))) py := []byte(fmt.Sprintf("%s,%s,%s", http.MethodGet, authzUrl, nc)) sd, err := priKey.Sign(rand.Reader, py) if err != nil { return "", fmt.Errorf("signing data: %w", err) } authnData = fmt.Sprintf("%s:%s", bytes.TrimSpace(pubKeyp[1]), base64.StdEncoding.EncodeToString(sd.Blob)) } req, err := httpx.NewGetRequestWithContext(ctx, authzUrl) if err != nil { return "", fmt.Errorf("new request: %w", err) } req.Header.Add(httpHeaderAuthorization, authnData) var authzToken string err = httpx.Do(cli, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("status code %d", resp.StatusCode) } var tok struct { Token string `json:"token"` } if err = json.NewDecoder(resp.Body).Decode(&tok); err != nil { return err } if tok.Token == "" { return errors.New("empty token") } authzToken = tok.Token return nil }) if err != nil { return "", fmt.Errorf("do request %s: %w", authzUrl, err) } return authzToken, nil } // OllamaSingKeyLoad loads the signing key for Ollama, // and generates a new key if not exists. func OllamaSingKeyLoad() (ssh.Signer, error) { hd := filepath.Join(osx.UserHomeDir(), ".ollama") priKeyPath := filepath.Join(hd, "id_ed25519") if !osx.ExistsFile(priKeyPath) { // Generate key if not exists. 
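// Note: this mirrors the key layout the Ollama CLI itself maintains, i.e.
// an ed25519 private key at ~/.ollama/id_ed25519 with the authorized-keys
// style public half alongside it, so both tools can share one identity.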
pubKey, priKey, err := ed25519.GenerateKey(rand.Reader) if err != nil { return nil, fmt.Errorf("generate key: %w", err) } priKeyPem, err := ssh.MarshalPrivateKey(priKey, "") if err != nil { return nil, fmt.Errorf("marshal private key: %w", err) } priKeyBs := pem.EncodeToMemory(priKeyPem) sshPubKey, err := ssh.NewPublicKey(pubKey) if err != nil { return nil, fmt.Errorf("new public key: %w", err) } pubKeyBs := ssh.MarshalAuthorizedKey(sshPubKey) if err = osx.WriteFile(priKeyPath, priKeyBs, 0o600); err != nil { return nil, fmt.Errorf("write private key: %w", err) } if err = osx.WriteFile(priKeyPath+".pub", pubKeyBs, 0o644); err != nil { _ = os.Remove(priKeyPath) return nil, fmt.Errorf("write public key: %w", err) } } priKeyBs, err := os.ReadFile(priKeyPath) if err != nil { return nil, fmt.Errorf("read private key: %w", err) } priKey, err := ssh.ParsePrivateKey(priKeyBs) if err != nil { return nil, fmt.Errorf("parse private key: %w", err) } return priKey, nil } ================================================ FILE: scalar.go ================================================ package gguf_parser import ( "errors" "strconv" "strings" ) const ( _Ki = 1 << ((iota + 1) * 10) _Mi _Gi _Ti _Pi ) const ( _K = 1e3 _M = 1e6 _G = 1e9 _T = 1e12 _P = 1e15 ) const ( _Thousand = 1e3 _Million = 1e6 _Billion = 1e9 _Trillion = 1e12 _Quadrillion = 1e15 ) type ( // SizeScalar is the scalar for size. SizeScalar uint64 // FLOPSScalar is the scalar for FLOPS. FLOPSScalar uint64 // BytesPerSecondScalar is the scalar for bytes per second (Bps). BytesPerSecondScalar uint64 ) var ( // _GeneralBaseUnitMatrix is the base unit matrix for bytes. _GeneralBaseUnitMatrix = []struct { Base float64 Unit string }{ {_Pi, "Pi"}, {_P, "P"}, {_Ti, "Ti"}, {_T, "T"}, {_Gi, "Gi"}, {_G, "G"}, {_Mi, "Mi"}, {_M, "M"}, {_Ki, "Ki"}, {_K, "K"}, } // _SizeBaseUnitMatrix is the base unit matrix for size. _SizeBaseUnitMatrix = []struct { Base float64 Unit string }{ {_Pi, "P"}, {_Ti, "T"}, {_Gi, "G"}, {_Mi, "M"}, {_Ki, "K"}, } // _NumberBaseUnitMatrix is the base unit matrix for numbers. _NumberBaseUnitMatrix = []struct { Base float64 Unit string }{ {_Quadrillion, "Q"}, {_Trillion, "T"}, {_Billion, "B"}, {_Million, "M"}, {_Thousand, "K"}, } ) // ParseSizeScalar parses the SizeScalar from the string. func ParseSizeScalar(s string) (_ SizeScalar, err error) { if s == "" { return 0, errors.New("invalid SizeScalar") } b := float64(1) for i := range _SizeBaseUnitMatrix { if strings.HasSuffix(s, _SizeBaseUnitMatrix[i].Unit) { b = _SizeBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _SizeBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return SizeScalar(f * b), nil } func (s SizeScalar) String() string { if s == 0 { return "0" } b, u := float64(1), "" for i := range _SizeBaseUnitMatrix { if float64(s) >= _SizeBaseUnitMatrix[i].Base { b = _SizeBaseUnitMatrix[i].Base u = _SizeBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u } // ParseFLOPSScalar parses the FLOPSScalar from the string. 
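//
// For example (consistent with the unit matrices above):
//
//	ParseFLOPSScalar("1TFLOPS")   // 1e12
//	ParseFLOPSScalar("2.5GFLOPS") // 2.5e9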
func ParseFLOPSScalar(s string) (_ FLOPSScalar, err error) { if s == "" { return 0, errors.New("invalid FLOPSScalar") } s = strings.TrimSuffix(s, "FLOPS") b := float64(1) for i := range _GeneralBaseUnitMatrix { if strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) { b = _GeneralBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return FLOPSScalar(f * b), nil } func (s FLOPSScalar) String() string { if s == 0 { return "0 FLOPS" } b, u := float64(1), "" for i := range _GeneralBaseUnitMatrix { if float64(s) >= _GeneralBaseUnitMatrix[i].Base { b = _GeneralBaseUnitMatrix[i].Base u = _GeneralBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u + "FLOPS" } // ParseBytesPerSecondScalar parses the BytesPerSecondScalar from the string. func ParseBytesPerSecondScalar(s string) (_ BytesPerSecondScalar, err error) { if s == "" { return 0, errors.New("invalid BytesPerSecondScalar") } b := float64(1) o := float64(1) switch { case strings.HasSuffix(s, "Bps") || strings.HasSuffix(s, "B/s"): s = strings.TrimSuffix(strings.TrimSuffix(s, "Bps"), "B/s") case strings.HasSuffix(s, "bps") || strings.HasSuffix(s, "b/s"): s = strings.TrimSuffix(strings.TrimSuffix(s, "bps"), "b/s") o = 8 } for i := range _GeneralBaseUnitMatrix { if strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) { b = _GeneralBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return BytesPerSecondScalar(f * b / o), nil } func (s BytesPerSecondScalar) String() string { if s == 0 { return "0 Bps" } b, u := float64(1), "" for i := range _GeneralBaseUnitMatrix { if float64(s) >= _GeneralBaseUnitMatrix[i].Base { b = _GeneralBaseUnitMatrix[i].Base u = _GeneralBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u + "Bps" } type ( // GGUFBytesScalar is the scalar for bytes. GGUFBytesScalar uint64 // GGUFParametersScalar is the scalar for parameters. GGUFParametersScalar uint64 // GGUFBitsPerWeightScalar is the scalar for bits per weight. GGUFBitsPerWeightScalar float64 // GGUFTokensPerSecondScalar is the scalar for tokens per second. GGUFTokensPerSecondScalar float64 ) // ParseGGUFBytesScalar parses the GGUFBytesScalar from the string. func ParseGGUFBytesScalar(s string) (_ GGUFBytesScalar, err error) { if s == "" { return 0, errors.New("invalid GGUFBytesScalar") } s = strings.TrimSuffix(s, "B") b := float64(1) for i := range _GeneralBaseUnitMatrix { if strings.HasSuffix(s, _GeneralBaseUnitMatrix[i].Unit) { b = _GeneralBaseUnitMatrix[i].Base s = strings.TrimSuffix(s, _GeneralBaseUnitMatrix[i].Unit) break } } f, err := strconv.ParseFloat(strings.TrimSpace(s), 64) if err != nil { return 0, err } return GGUFBytesScalar(f * b), nil } // GGUFBytesScalarStringInMiBytes is the flag to show the GGUFBytesScalar string in MiB. 
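//
// For example, a 3 GiB value renders as "3072 MiB" when the flag is set,
// which keeps reports diffable under a single fixed unit.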
var GGUFBytesScalarStringInMiBytes bool func (s GGUFBytesScalar) String() string { if s == 0 { return "0 B" } b, u := float64(1), "" if GGUFBytesScalarStringInMiBytes { b = _Mi u = "Mi" } else { for i := range _GeneralBaseUnitMatrix { if float64(s) >= _GeneralBaseUnitMatrix[i].Base { b = _GeneralBaseUnitMatrix[i].Base u = _GeneralBaseUnitMatrix[i].Unit break } } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u + "B" } func (s GGUFParametersScalar) String() string { if s == 0 { return "0" } b, u := float64(1), "" for i := range _NumberBaseUnitMatrix { if float64(s) >= _NumberBaseUnitMatrix[i].Base { b = _NumberBaseUnitMatrix[i].Base u = _NumberBaseUnitMatrix[i].Unit break } } f := strconv.FormatFloat(float64(s)/b, 'f', 2, 64) return strings.TrimSuffix(f, ".00") + " " + u } func (s GGUFBitsPerWeightScalar) String() string { if s <= 0 { return "0 bpw" } return strconv.FormatFloat(float64(s), 'f', 2, 64) + " bpw" } func (s GGUFTokensPerSecondScalar) String() string { if s <= 0 { return "0 tps" } return strconv.FormatFloat(float64(s), 'f', 2, 64) + " tps" } ================================================ FILE: scalar_test.go ================================================ package gguf_parser import ( "testing" "github.com/stretchr/testify/assert" ) func TestParseSizeScalar(t *testing.T) { testCases := []struct { given string expected SizeScalar }{ {"1", 1}, {"1K", 1 * _Ki}, {"1M", 1 * _Mi}, {"1G", 1 * _Gi}, {"1T", 1 * _Ti}, {"1P", 1 * _Pi}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseSizeScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } func TestParseFLOPSScalar(t *testing.T) { testCases := []struct { given string expected FLOPSScalar }{ {"1FLOPS", 1}, {"1KFLOPS", 1 * _K}, {"1MFLOPS", 1 * _M}, {"1GFLOPS", 1 * _G}, {"1TFLOPS", 1 * _T}, {"1PFLOPS", 1 * _P}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseFLOPSScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } func TestParseBytesPerSecondScalar(t *testing.T) { testCases := []struct { given string expected BytesPerSecondScalar }{ {"1B/s", 1}, {"1KB/s", 1 * _K}, {"1MB/s", 1 * _M}, {"1GB/s", 1 * _G}, {"1TB/s", 1 * _T}, {"1PB/s", 1 * _P}, {"1KiBps", 1 * _Ki}, {"1MiBps", 1 * _Mi}, {"1GiBps", 1 * _Gi}, {"1TiBps", 1 * _Ti}, {"1PiBps", 1 * _Pi}, {"8b/s", 1}, {"1Kbps", 1 * _K >> 3}, {"1Mbps", 1 * _M >> 3}, {"1Gbps", 1 * _G >> 3}, {"1Tbps", 1 * _T >> 3}, {"1Pbps", 1 * _P >> 3}, {"1Kibps", 1 * _Ki >> 3}, {"1Mibps", 1 * _Mi >> 3}, {"1Gibps", 1 * _Gi >> 3}, {"1Tibps", 1 * _Ti >> 3}, {"1Pibps", 1 * _Pi >> 3}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseBytesPerSecondScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } func TestParseGGUFBytesScalar(t *testing.T) { testCases := []struct { given string expected GGUFBytesScalar }{ {"1B", 1}, {"1KB", 1 * _K}, {"1MB", 1 * _M}, {"1GB", 1 * _G}, {"1TB", 1 * _T}, {"1PB", 1 * _P}, {"1KiB", 1 * _Ki}, {"1MiB", 1 * _Mi}, {"1GiB", 1 * _Gi}, {"1TiB", 1 * _Ti}, {"1PiB", 1 * _Pi}, } for _, tc := range testCases { t.Run(tc.given, func(t *testing.T) { actual, err := ParseGGUFBytesScalar(tc.given) if !assert.NoError(t, err) { return } assert.Equal(t, tc.expected, actual) }) } } ================================================ FILE: util/anyx/any.go ================================================ package 
anyx import ( "encoding/json" "fmt" "strconv" "golang.org/x/exp/constraints" ) // Number converts any type to the specified number type. func Number[T constraints.Integer | constraints.Float](v any) T { switch vv := v.(type) { case int: return T(vv) case int8: return T(vv) case int16: return T(vv) case int32: return T(vv) case int64: return T(vv) case uint: return T(vv) case uint8: return T(vv) case uint16: return T(vv) case uint32: return T(vv) case uint64: return T(vv) case float32: return T(vv) case float64: return T(vv) case bool: if vv { return T(1) } return T(0) case string: x, err := strconv.ParseInt(vv, 10, 64) if err != nil { y, err := strconv.ParseFloat(vv, 64) if err != nil { return T(0) } else { return T(y) } } return T(x) case json.Number: x, err := vv.Int64() if err != nil { y, err := vv.Float64() if err != nil { return T(0) } else { return T(y) } } return T(x) default: return T(0) } } // Bool converts any type to a bool. func Bool(v any) bool { switch vv := v.(type) { case bool: return vv case int, int8, int16, int32, int64, uint, uint8, uint16, uint32, uint64, uintptr: return vv != 0 case float32, float64: return vv != 0 case string: return vv != "0" case fmt.Stringer: return vv.String() != "0" default: return false } } // String converts any type to a string. func String(v any) string { switch vv := v.(type) { case string: return vv case []byte: return string(vv) case int: return strconv.FormatInt(int64(vv), 10) case int8: return strconv.FormatInt(int64(vv), 10) case int16: return strconv.FormatInt(int64(vv), 10) case int32: return strconv.FormatInt(int64(vv), 10) case int64: return strconv.FormatInt(vv, 10) case uint: return strconv.FormatUint(uint64(vv), 10) case uint8: return strconv.FormatUint(uint64(vv), 10) case uint16: return strconv.FormatUint(uint64(vv), 10) case uint32: return strconv.FormatUint(uint64(vv), 10) case uint64: return strconv.FormatUint(vv, 10) case float32: return strconv.FormatFloat(float64(vv), 'f', -1, 32) case float64: return strconv.FormatFloat(vv, 'f', -1, 64) case bool: return strconv.FormatBool(vv) case fmt.Stringer: return vv.String() case json.RawMessage: return string(vv) default: return fmt.Sprintf("%v", v) } } ================================================ FILE: util/bytex/pool.go ================================================ package bytex import ( "bytes" "sync" ) const defaultSize = 32 * 1024 type ( Bytes = []byte BytesBuffer = *bytes.Buffer ) var gp = sync.Pool{ New: func() any { buf := make(Bytes, defaultSize) return &buf }, } // GetBytes gets a bytes buffer from the pool, // which can specify with a size, // default is 32k. func GetBytes(size ...uint64) Bytes { buf := *(gp.Get().(*Bytes)) s := defaultSize if len(size) != 0 { s = int(size[0]) if s == 0 { s = defaultSize } } if cap(buf) >= s { return buf[:s] } gp.Put(&buf) ns := s if ns < defaultSize { ns = defaultSize } buf = make(Bytes, ns) return buf[:s] } // WithBytes relies on GetBytes to get a buffer, // calls the function with the buffer, // finally, puts it back to the pool after the function returns. func WithBytes(fn func(Bytes) error, size ...uint64) error { if fn == nil { return nil } buf := GetBytes(size...) defer Put(buf) return fn(buf) } // GetBuffer is similar to GetBytes, // but it returns the bytes buffer wrapped by bytes.Buffer. 
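//
// A minimal usage sketch, via the WithBuffer helper below:
//
//	_ = WithBuffer(func(buf BytesBuffer) error {
//		buf.WriteString("pooled")
//		return nil
//	})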
func GetBuffer(size ...uint64) BytesBuffer { return bytes.NewBuffer(GetBytes(size...)[:0]) } // WithBuffer relies on GetBuffer to get a buffer, // calls the function with the buffer, // finally, puts it back to the pool after the function returns. func WithBuffer(fn func(BytesBuffer) error, size ...uint64) error { if fn == nil { return nil } buf := GetBuffer(size...) defer Put(buf) return fn(buf) } // Put puts the buffer(either Bytes or BytesBuffer) back to the pool. func Put[T Bytes | BytesBuffer](buf T) { switch v := any(buf).(type) { case Bytes: gp.Put(&v) case BytesBuffer: bs := v.Bytes() gp.Put(&bs) v.Reset() } } ================================================ FILE: util/funcx/error.go ================================================ package funcx // NoError ignores the given error, // it is usually a nice helper for chain function calling. func NoError[T any](t T, _ error) T { return t } // NoError2 ignores the given error, // it is usually a nice helper for chain function calling. func NoError2[T, U any](t T, u U, _ error) (T, U) { return t, u } // NoError3 ignores the given error, // it is usually a nice helper for chain function calling. func NoError3[T, U, V any](t T, u U, v V, _ error) (T, U, V) { return t, u, v } // NoError4 ignores the given error, // it is usually a nice helper for chain function calling. func NoError4[T, U, V, W any](t T, u U, v V, w W, _ error) (T, U, V, W) { return t, u, v, w } // MustNoError is similar to NoError, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError[T any](t T, e error) T { if e != nil { panic(e) } return t } // MustNoError2 is similar to NoError2, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError2[T, U any](t T, u U, e error) (T, U) { if e != nil { panic(e) } return t, u } // MustNoError3 is similar to NoError3, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError3[T, U, V any](t T, u U, v V, e error) (T, U, V) { if e != nil { panic(e) } return t, u, v } // MustNoError4 is similar to NoError4, // but it panics if the given error is not nil, // it is usually a nice helper for chain function calling. func MustNoError4[T, U, V, W any](t T, u U, v V, w W, e error) (T, U, V, W) { if e != nil { panic(e) } return t, u, v, w } ================================================ FILE: util/httpx/client.go ================================================ package httpx import ( "context" "fmt" "io" "net/http" "time" "github.com/henvic/httpretty" "github.com/gpustack/gguf-parser-go/util/bytex" ) // DefaultClient is similar to the default http.Client used by the package. // // It is used for requests pooling. var DefaultClient = &http.Client{ Transport: DefaultTransport, } // DefaultInsecureClient is the default http.Client used by the package, // with TLS insecure skip verify. // // It is used for requests pooling. var DefaultInsecureClient = &http.Client{ Transport: DefaultInsecureTransport, } // Client returns a new http.Client with the given options, // the result http.Client is used for fast-consuming requests. // // If you want a requests pool management, use DefaultClient instead. 
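//
// A typical construction, as a sketch (the timeout and user agent values
// here are illustrative):
//
//	cli := Client(ClientOptions().
//		WithTimeout(10 * time.Second).
//		WithUserAgent("my-tool/0.1"))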
func Client(opts ...*ClientOption) *http.Client {
	var o *ClientOption
	if len(opts) > 0 {
		o = opts[0]
	} else {
		o = ClientOptions()
	}

	root := DefaultTransport
	if o.transport != nil {
		root = o.transport
	}
	if o.debug {
		pretty := &httpretty.Logger{
			Time:            true,
			TLS:             true,
			RequestHeader:   true,
			RequestBody:     true,
			MaxRequestBody:  1024,
			ResponseHeader:  true,
			ResponseBody:    true,
			MaxResponseBody: 1024,
			Formatters:      []httpretty.Formatter{&JSONFormatter{}},
		}
		root = pretty.RoundTripper(root)
	}

	rtc := RoundTripperChain{
		Next: root,
	}
	for i := range o.roundTrippers {
		rtc = RoundTripperChain{
			Do:   o.roundTrippers[i],
			Next: rtc,
		}
	}

	var rt http.RoundTripper = rtc
	if o.retryIf != nil {
		rt = RoundTripperFunc(func(req *http.Request) (*http.Response, error) {
			for i := 0; ; i++ {
				resp, err := rtc.RoundTrip(req)
				if !o.retryIf(resp, err) {
					return resp, err
				}
				w, ok := o.retryBackoff(i+1, resp)
				if !ok {
					return resp, err
				}
				wt := time.NewTimer(w)
				select {
				case <-req.Context().Done():
					wt.Stop()
					return resp, req.Context().Err()
				case <-wt.C:
				}
				// Discard the failed attempt before retrying,
				// so its body and connection are released.
				Close(resp)
			}
		})
	}

	return &http.Client{
		Transport: rt,
		Timeout:   o.timeout,
	}
}

// NewGetRequestWithContext returns a new http.MethodGet request,
// which is saving your life from http.NewRequestWithContext.
func NewGetRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodGet, uri, nil)
}

// NewGetRequest returns a new http.MethodGet request,
// which is saving your life from http.NewRequest.
func NewGetRequest(uri string) (*http.Request, error) {
	return http.NewRequest(http.MethodGet, uri, nil)
}

// NewHeadRequestWithContext returns a new http.MethodHead request,
// which is saving your life from http.NewRequestWithContext.
func NewHeadRequestWithContext(ctx context.Context, uri string) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodHead, uri, nil)
}

// NewHeadRequest returns a new http.MethodHead request,
// which is saving your life from http.NewRequest.
func NewHeadRequest(uri string) (*http.Request, error) {
	return http.NewRequest(http.MethodHead, uri, nil)
}

// NewPostRequestWithContext returns a new http.MethodPost request with the given context,
// which is saving your life from http.NewRequestWithContext.
func NewPostRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodPost, uri, body)
}

// NewPostRequest returns a new http.MethodPost request,
// which is saving your life from http.NewRequest.
func NewPostRequest(uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequest(http.MethodPost, uri, body)
}

// NewPutRequestWithContext returns a new http.MethodPut request with the given context,
// which is saving your life from http.NewRequestWithContext.
func NewPutRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequestWithContext(ctx, http.MethodPut, uri, body)
}

// NewPutRequest returns a new http.MethodPut request,
// which is saving your life from http.NewRequest.
func NewPutRequest(uri string, body io.Reader) (*http.Request, error) {
	return http.NewRequest(http.MethodPut, uri, body)
}

// NewPatchRequestWithContext returns a new http.MethodPatch request with the given context,
// which is saving your life from http.NewRequestWithContext.
func NewPatchRequestWithContext(ctx context.Context, uri string, body io.Reader) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodPatch, uri, body) } // NewPatchRequest returns a new http.MethodPatch request, // which is saving your life from http.NewRequest. func NewPatchRequest(uri string, body io.Reader) (*http.Request, error) { return http.NewRequest(http.MethodPatch, uri, body) } // NewDeleteRequestWithContext returns a new http.MethodDelete request with the given context, // which is saving your life from http.NewRequestWithContext. func NewDeleteRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodDelete, uri, nil) } // NewDeleteRequest returns a new http.MethodDelete request, // which is saving your life from http.NewRequest. func NewDeleteRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodDelete, uri, nil) } // NewConnectRequestWithContext returns a new http.MethodConnect request with the given context, // which is saving your life from http.NewRequestWithContext. func NewConnectRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodConnect, uri, nil) } // NewConnectRequest returns a new http.MethodConnect request, // which is saving your life from http.NewRequest. func NewConnectRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodConnect, uri, nil) } // NewOptionsRequestWithContext returns a new http.MethodOptions request with the given context, // which is saving your life from http.NewRequestWithContext. func NewOptionsRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodOptions, uri, nil) } // NewOptionsRequest returns a new http.MethodOptions request, // which is saving your life from http.NewRequest. func NewOptionsRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodOptions, uri, nil) } // NewTraceRequestWithContext returns a new http.MethodTrace request with the given context, // which is saving your life from http.NewRequestWithContext. func NewTraceRequestWithContext(ctx context.Context, uri string) (*http.Request, error) { return http.NewRequestWithContext(ctx, http.MethodTrace, uri, nil) } // NewTraceRequest returns a new http.MethodTrace request, // which is saving your life from http.NewRequest. func NewTraceRequest(uri string) (*http.Request, error) { return http.NewRequest(http.MethodTrace, uri, nil) } // Error is similar to http.Error, // but it can get the error message by the given code. func Error(rw http.ResponseWriter, code int) { http.Error(rw, http.StatusText(code), code) } // Close closes the http response body without error. func Close(resp *http.Response) { if resp != nil && resp.Body != nil { _ = resp.Body.Close() } } // BodyBytes returns the body of the http response as a byte slice. func BodyBytes(resp *http.Response) []byte { buf := bytex.GetBytes() defer bytex.Put(buf) w := bytex.GetBuffer() _, _ = io.CopyBuffer(w, resp.Body, buf) return w.Bytes() } // BodyString returns the body of the http response as a string. func BodyString(resp *http.Response) string { return string(BodyBytes(resp)) } // Do is a helper function to execute the given http request with the given http client, // and execute the given function with the http response. // // It is useful to avoid forgetting to close the http response body. 
// // Do will return the error if failed to execute the http request or the given function. func Do(cli *http.Client, req *http.Request, respFunc func(*http.Response) error) error { resp, err := cli.Do(req) if err != nil { return fmt.Errorf("do request: %w", err) } defer Close(resp) if respFunc == nil { return nil } return respFunc(resp) } ================================================ FILE: util/httpx/client_helper.go ================================================ package httpx import ( "bytes" "errors" "io" "net/http" "regexp" "github.com/henvic/httpretty" "github.com/gpustack/gguf-parser-go/util/json" ) var _ httpretty.Formatter = (*JSONFormatter)(nil) // JSONFormatter is copied from httpretty.JSONFormatter, // but use our own json package. type JSONFormatter struct{} var jsonTypeRE = regexp.MustCompile(`[/+]json($|;)`) // Match JSON media type. func (j *JSONFormatter) Match(mediatype string) bool { return jsonTypeRE.MatchString(mediatype) } // Format JSON content. func (j *JSONFormatter) Format(w io.Writer, src []byte) error { if !json.Valid(src) { // We want to get the error of json.checkValid, not unmarshal it. // The happy path has been optimized, maybe prematurely. if err := json.Unmarshal(src, &json.RawMessage{}); err != nil { return err } } // Avoiding allocation as we use *bytes.Buffer to store the formatted body before printing dst, ok := w.(*bytes.Buffer) if !ok { // Mitigating panic to avoid upsetting anyone who uses this directly return errors.New("underlying writer for JSONFormatter must be *bytes.Buffer") } return json.Indent(dst, src, "", " ") } type RoundTripperChain struct { Do func(req *http.Request) error Next http.RoundTripper } func (c RoundTripperChain) RoundTrip(req *http.Request) (*http.Response, error) { if c.Do != nil { if err := c.Do(req); err != nil { return nil, err } } if c.Next != nil { return c.Next.RoundTrip(req) } return nil, nil } type RoundTripperFunc func(*http.Request) (*http.Response, error) func (fn RoundTripperFunc) RoundTrip(req *http.Request) (*http.Response, error) { return fn(req) } ================================================ FILE: util/httpx/client_options.go ================================================ package httpx import ( "math" "net/http" "strconv" "strings" "time" ) type ClientOption struct { *TransportOption timeout time.Duration debug bool retryIf RetryFunc retryBackoff func(attemptNum int, resp *http.Response) (wait time.Duration, ok bool) roundTrippers []func(req *http.Request) error } func ClientOptions() *ClientOption { return &ClientOption{ TransportOption: TransportOptions().WithoutKeepalive(), timeout: 30 * time.Second, retryIf: DefaultRetry, retryBackoff: createRetryBackoff(100*time.Millisecond, 5*time.Second, 5), } } // WithTransport sets the TransportOption. func (o *ClientOption) WithTransport(opt *TransportOption) *ClientOption { if o == nil || opt == nil { return o } o.TransportOption = opt return o } // WithTimeout sets the request timeout. // // This timeout controls the sum of [network dial], [tls handshake], [request], [response header reading] and [response body reading]. // // Use 0 to disable timeout. func (o *ClientOption) WithTimeout(timeout time.Duration) *ClientOption { if o == nil || timeout < 0 { return o } o.timeout = timeout return o } // WithDebug sets the debug mode. 
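//
// When enabled, Client wires an httpretty logger around the transport,
// pretty-printing request and response headers and bodies (bodies truncated
// to 1024 bytes; see Client above).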
func (o *ClientOption) WithDebug() *ClientOption { if o == nil { return o } o.debug = true return o } type RetryFunc func(resp *http.Response, err error) (retry bool) // WithRetryIf specifies the if-condition of retry operation for request, // or stops retrying if setting with `nil`. func (o *ClientOption) WithRetryIf(retryIf RetryFunc) *ClientOption { if o == nil { return o } o.retryIf = retryIf return o } // WithRetryBackoff specifies the retry-backoff mechanism for request. func (o *ClientOption) WithRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) *ClientOption { if o == nil || waitMin < 0 || waitMax < 0 || waitMax < waitMin || attemptMax <= 0 { return o } o.retryBackoff = createRetryBackoff(waitMin, waitMax, attemptMax) return o } // WithUserAgent sets the user agent. func (o *ClientOption) WithUserAgent(ua string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.Header.Set("User-Agent", ua) return nil }) } // WithBearerAuth sets the bearer token. func (o *ClientOption) WithBearerAuth(token string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.Header.Set("Authorization", "Bearer "+token) return nil }) } // WithBasicAuth sets the basic authentication. func (o *ClientOption) WithBasicAuth(username, password string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.SetBasicAuth(username, password) return nil }) } // WithHeader sets the header. func (o *ClientOption) WithHeader(key, value string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { req.Header.Set(key, value) return nil }) } // WithHeaders sets the headers. func (o *ClientOption) WithHeaders(headers map[string]string) *ClientOption { return o.WithRoundTripper(func(req *http.Request) error { for k, v := range headers { req.Header.Set(k, v) } return nil }) } // WithRoundTripper sets the round tripper. func (o *ClientOption) WithRoundTripper(rt func(req *http.Request) error) *ClientOption { if o == nil || rt == nil { return o } o.roundTrippers = append(o.roundTrippers, rt) return o } // If is a conditional option, // which receives a boolean condition to trigger the given function or not. func (o *ClientOption) If(condition bool, then func(*ClientOption) *ClientOption) *ClientOption { if condition { return then(o) } return o } // DefaultRetry is the default retry condition, // inspired by https://github.com/hashicorp/go-retryablehttp/blob/40b0cad1633fd521cee5884724fcf03d039aaf3f/client.go#L68-L86. func DefaultRetry(resp *http.Response, respErr error) bool { if respErr != nil { switch errMsg := respErr.Error(); { case strings.Contains(errMsg, `redirects`): return false case strings.Contains(errMsg, `unsupported protocol scheme`): return false case strings.Contains(errMsg, `certificate is not trusted`): return false case strings.Contains(errMsg, `invalid header`): return false case strings.Contains(errMsg, `failed to verify certificate`): return false } // Retry if receiving connection closed. return true } // Retry if receiving rate-limited of server. if resp.StatusCode == http.StatusTooManyRequests { return true } // Retry if receiving unexpected responses. if resp.StatusCode == 0 || (resp.StatusCode >= 500 && resp.StatusCode != http.StatusNotImplemented) { return true } return false } // createRetryBackoff creates a backoff function for retry operation. 
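//
// The wait doubles per attempt and is clamped to waitMax, honoring a numeric
// Retry-After header on 429/503 responses when the server sends one. With
// the ClientOptions defaults (waitMin=100ms, waitMax=5s, attemptMax=5):
//
//	attempt 1: min(2^1*100ms, 5s) = 200ms
//	attempt 2: min(2^2*100ms, 5s) = 400ms
//	attempt 5: min(2^5*100ms, 5s) = 3.2s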
func createRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) func(int, *http.Response) (time.Duration, bool) { return func(attemptNum int, resp *http.Response) (wait time.Duration, ok bool) { if attemptNum > attemptMax { return 0, false } if resp != nil && (resp.StatusCode == http.StatusTooManyRequests || resp.StatusCode == http.StatusServiceUnavailable) { if retryAfter := resp.Header.Get("Retry-After"); retryAfter != "" { if seconds, err := strconv.Atoi(retryAfter); err == nil { return time.Duration(seconds) * time.Second, true } } } wait = time.Duration(math.Pow(2, float64(attemptNum)) * float64(waitMin)) return min(wait, waitMax), true } } ================================================ FILE: util/httpx/file.go ================================================ package httpx import ( "errors" "fmt" "io" "net/http" "strings" "syscall" "github.com/smallnest/ringbuffer" "github.com/gpustack/gguf-parser-go/util/bytex" ) type SeekerFile struct { cli *http.Client req *http.Request b *ringbuffer.RingBuffer c int64 l int64 } // OpenSeekerFile tries the GET http.Request as a SeekerFile, // and returns a SeekerFile, or an error if any. func OpenSeekerFile(cli *http.Client, req *http.Request, opts ...*SeekerFileOption) (*SeekerFile, error) { if cli == nil { return nil, errors.New("client is nil") } if req == nil { return nil, errors.New("request is nil") } if req.Method != http.MethodGet { return nil, errors.New("request method is not GET") } var o *SeekerFileOption if len(opts) > 0 { o = opts[0] } else { o = SeekerFileOptions() } if o.bufSize <= 0 { o.bufSize = 4 * 1024 * 1024 // 4mb } var l int64 { if !o.skipRangeDownloadDetect { req := req.Clone(req.Context()) req.Method = http.MethodHead err := Do(cli, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("stat: status code %d", resp.StatusCode) } if !strings.EqualFold(resp.Header.Get("Accept-Ranges"), "bytes") { return fmt.Errorf("stat: not support range download") } l = resp.ContentLength return nil }) if err != nil { return nil, fmt.Errorf("stat: do head request: %w", err) } } else { req := req.Clone(req.Context()) err := Do(cli, req, func(resp *http.Response) error { if resp.StatusCode != http.StatusOK { return fmt.Errorf("stat: status code %d", resp.StatusCode) } l = resp.ContentLength return nil }) if err != nil { return nil, fmt.Errorf("stat: do get request: %w", err) } } switch sz := int64(o.size); { case sz > l: return nil, fmt.Errorf("size %d is greater than limit %d", o.size, l) case sz <= 0: default: l = sz } } b := ringbuffer.New(o.bufSize).WithCancel(req.Context()) return &SeekerFile{cli: cli, req: req, b: b, c: 1<<63 - 1, l: l}, nil } func (f *SeekerFile) Close() error { if f.b != nil { f.b.CloseWriter() } return nil } func (f *SeekerFile) Len() int64 { return f.l } func (f *SeekerFile) ReadAt(p []byte, off int64) (int, error) { if off < 0 { return 0, syscall.EINVAL } if off > f.Len() { return 0, io.EOF } // Sync and move to new offset, if backward or empty buffer. if f.c > off || f.b.IsEmpty() { if err := f.sync(off, true); err != nil { return 0, err } } var ( remain = int64(f.b.Length()) capacity = int64(f.b.Capacity()) need = int64(len(p)) ) switch { case f.c+remain >= off+need: // Skip and move to new offset, if enough to forward. if err := f.skip(off - f.c); err != nil { return 0, err } return f.Read(p) case f.c+capacity >= off+need: // Sync and move to new offset, if enough to forward after synced. 
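// The ring buffer currently holds the absolute range [f.c, f.c+remain);
// since the requested window still fits within one refill of the ring,
// top it up from the tail and skip forward, instead of issuing a fresh
// ranged GET.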
if err := f.sync(f.c+remain, false); err != nil { return 0, err } if err := f.skip(off - f.c); err != nil { return 0, err } return f.Read(p) default: } // Otherwise, read directly. f.b.Reset() f.c = off // Request remain needing. lim := off + int64(len(p)) - 1 if lim > f.Len() { lim = f.Len() } req := f.req.Clone(f.req.Context()) req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", off, lim)) resp, err := f.cli.Do(req) if err != nil { return 0, err } defer Close(resp) if resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK { return 0, errors.New(resp.Status) } n, err := resp.Body.Read(p) f.c += int64(n) return n, err } func (f *SeekerFile) Read(p []byte) (int, error) { n, err := f.b.Read(p) f.c += int64(n) return n, err } func (f *SeekerFile) sync(off int64, reset bool) error { lim := off + int64(f.b.Free()) - 1 if lim > f.Len() { lim = f.Len() } req := f.req.Clone(f.req.Context()) req.Header.Set("Range", fmt.Sprintf("bytes=%d-%d", off, lim)) resp, err := f.cli.Do(req) if err != nil { return err } defer Close(resp) if resp.StatusCode != http.StatusPartialContent && resp.StatusCode != http.StatusOK { return errors.New(resp.Status) } buf := bytex.GetBytes() defer bytex.Put(buf) if reset { f.b.Reset() f.c = off } _, err = io.CopyBuffer(_WriterOnly{w: f.b}, resp.Body, buf) if err != nil { return err } return nil } func (f *SeekerFile) skip(dif int64) error { if dif <= 0 { return nil } buf := bytex.GetBytes(uint64(dif)) defer bytex.Put(buf) n, err := f.b.Read(buf) f.c += int64(n) if err != nil { return err } return nil } // _WriterOnly is a wrapper to expose the io.Writer method only, // which to avoid calling the io.ReaderFrom method. type _WriterOnly struct { w io.Writer } func (w _WriterOnly) Write(p []byte) (int, error) { return w.w.Write(p) } ================================================ FILE: util/httpx/file_options.go ================================================ package httpx type SeekerFileOption struct { bufSize int size int skipRangeDownloadDetect bool } func SeekerFileOptions() *SeekerFileOption { return &SeekerFileOption{ bufSize: 4 * 1024 * 1024, // 4mb } } // WithBufferSize sets the size of the buffer to read the file, // // Default is 4mb. func (o *SeekerFileOption) WithBufferSize(bufSize int) *SeekerFileOption { if o == nil || bufSize <= 0 { return o } o.bufSize = bufSize return o } // WithSize sets the size of the file to read, // // If the size is greater than the content size of the file, it will return an error. func (o *SeekerFileOption) WithSize(size int) *SeekerFileOption { if o == nil || size <= 0 { return o } o.size = size return o } // WithoutRangeDownloadDetect disables range download detection. // // Usually, OpenSeekerFile sends a "HEAD" HTTP request to destination to get the content size from the "Content-Length" header, // and confirms whether supports range download via the "Accept-Ranges" header. // However, some servers may not support the "HEAD" method, or the "Accept-Ranges" header is not set correctly. // // With this option, OpenSeekerFile sends "GET" HTTP request to get the content size as usual, // and does not confirm whether supports range download. But during the seeking read, // it still uses the "Range" header to read the file. func (o *SeekerFileOption) WithoutRangeDownloadDetect() *SeekerFileOption { if o == nil { return o } o.skipRangeDownloadDetect = true return o } // If is a conditional option, // which receives a boolean condition to trigger the given function or not. 
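//
// A sketch of conditional configuration (noHead is a hypothetical flag of
// the caller's):
//
//	o := SeekerFileOptions().
//		If(noHead, func(o *SeekerFileOption) *SeekerFileOption {
//			return o.WithoutRangeDownloadDetect()
//		})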
func (o *SeekerFileOption) If(condition bool, then func(*SeekerFileOption) *SeekerFileOption) *SeekerFileOption { if condition { return then(o) } return o } ================================================ FILE: util/httpx/proxy.go ================================================ package httpx import ( "net" "net/http" "net/url" "strings" "github.com/gpustack/gguf-parser-go/util/osx" ) var noProxies []*net.IPNet func init() { noProxyEnv := osx.Getenv("NO_PROXY", osx.Getenv("no_proxy")) noProxyRules := strings.Split(noProxyEnv, ",") for i := range noProxyRules { _, cidr, _ := net.ParseCIDR(noProxyRules[i]) if cidr != nil { noProxies = append(noProxies, cidr) } } } // ProxyFromEnvironment is similar to http.ProxyFromEnvironment, // but it also respects the NO_PROXY environment variable. func ProxyFromEnvironment(r *http.Request) (*url.URL, error) { if ip := net.ParseIP(r.URL.Hostname()); ip != nil { for i := range noProxies { if noProxies[i].Contains(ip) { return nil, nil } } } return http.ProxyFromEnvironment(r) } ================================================ FILE: util/httpx/resolver.go ================================================ package httpx import ( "context" "net" ) func DNSCacheDialContext(dialer *net.Dialer) func(context.Context, string, string) (net.Conn, error) { cs := map[string][]net.IP{} return func(ctx context.Context, nw, addr string) (conn net.Conn, err error) { h, p, err := net.SplitHostPort(addr) if err != nil { return nil, err } ips, ok := cs[h] if !ok { ips, err = net.DefaultResolver.LookupIP(ctx, "ip4", h) if len(ips) == 0 { ips, err = net.DefaultResolver.LookupIP(ctx, "ip", h) } if err != nil { return nil, err } cs[h] = ips } // Try to connect to each IP address in order. for _, ip := range ips { conn, err = dialer.DialContext(ctx, nw, net.JoinHostPort(ip.String(), p)) if err == nil { break } } return conn, err } } ================================================ FILE: util/httpx/transport.go ================================================ package httpx import ( "net/http" ) // DefaultTransport is similar to the default http.DefaultTransport used by the package. var DefaultTransport http.RoundTripper = Transport() // DefaultInsecureTransport is the default http.DefaultTransport used by the package, // with TLS insecure skip verify. var DefaultInsecureTransport http.RoundTripper = Transport(TransportOptions().WithoutInsecureVerify()) // Transport returns a new http.Transport with the given options, // the result http.Transport is used for constructing http.Client. func Transport(opts ...*TransportOption) *http.Transport { var o *TransportOption if len(opts) > 0 { o = opts[0] } else { o = TransportOptions() } return o.transport } ================================================ FILE: util/httpx/transport_options.go ================================================ package httpx import ( "crypto/tls" "net" "net/http" "net/url" "time" ) type TransportOption struct { dialer *net.Dialer transport *http.Transport } func TransportOptions() *TransportOption { dialer := &net.Dialer{ Timeout: 30 * time.Second, KeepAlive: 30 * time.Second, } transport := &http.Transport{ Proxy: ProxyFromEnvironment, TLSClientConfig: &tls.Config{ MinVersion: tls.VersionTLS12, }, DialContext: DNSCacheDialContext(dialer), ForceAttemptHTTP2: true, MaxIdleConns: 100, IdleConnTimeout: 90 * time.Second, TLSHandshakeTimeout: 10 * time.Second, ExpectContinueTimeout: 1 * time.Second, } return &TransportOption{ dialer: dialer, transport: transport, } } // WithProxy sets the proxy. 
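//
// For example, to pin a single proxy regardless of the environment
// (proxyURL being a *url.URL you supply):
//
//	TransportOptions().WithProxy(http.ProxyURL(proxyURL))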
func (o *TransportOption) WithProxy(proxy func(*http.Request) (*url.URL, error)) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.Proxy = proxy return o } // WithoutProxy disables the proxy. func (o *TransportOption) WithoutProxy() *TransportOption { if o == nil || o.transport == nil { return o } o.transport.Proxy = nil return o } // WithKeepalive sets the keepalive. func (o *TransportOption) WithKeepalive(timeoutAndKeepalive ...time.Duration) *TransportOption { if o == nil || o.transport == nil || o.dialer == nil { return o } tak := [2]time.Duration{30 * time.Second, 30 * time.Second} if len(timeoutAndKeepalive) > 0 { tak[0] = timeoutAndKeepalive[0] if len(timeoutAndKeepalive) > 1 { tak[1] = timeoutAndKeepalive[1] } } o.dialer.Timeout, o.dialer.KeepAlive = tak[0], tak[1] o.transport.MaxIdleConns = 100 o.transport.IdleConnTimeout = 90 * time.Second return o } // WithoutKeepalive disables the keepalive. func (o *TransportOption) WithoutKeepalive() *TransportOption { if o == nil || o.transport == nil { return o } o.dialer.KeepAlive = -1 o.transport.MaxIdleConns = 0 o.transport.IdleConnTimeout = 0 return o } // WithInsecureVerify verifies the insecure connection. func (o *TransportOption) WithInsecureVerify() *TransportOption { if o == nil || o.transport == nil || o.transport.TLSClientConfig == nil { return o } o.transport.TLSClientConfig.InsecureSkipVerify = false return o } // WithoutInsecureVerify skips the insecure connection verify. func (o *TransportOption) WithoutInsecureVerify() *TransportOption { if o == nil || o.transport == nil || o.transport.TLSClientConfig == nil { return o } o.transport.TLSClientConfig.InsecureSkipVerify = true return o } // TimeoutForDial sets the timeout for network dial. // // This timeout controls the [network dial] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForDial(timeout time.Duration) *TransportOption { if o == nil || o.dialer == nil { return o } o.dialer.Timeout = timeout return o } // TimeoutForResponseHeader sets the timeout for response header. // // This timeout controls the [response header reading] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForResponseHeader(timeout time.Duration) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.ResponseHeaderTimeout = timeout return o } // TimeoutForTLSHandshake sets the timeout for tls handshake. // // This timeout controls the [tls handshake] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForTLSHandshake(timeout time.Duration) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.TLSHandshakeTimeout = timeout return o } // TimeoutForIdleConn sets the timeout for idle connection. // // This timeout controls the [idle connection lifetime] only. // // Use 0 to disable timeout. func (o *TransportOption) TimeoutForIdleConn(timeout time.Duration) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.IdleConnTimeout = timeout return o } // WithTLSClientConfig sets the tls.Config. func (o *TransportOption) WithTLSClientConfig(config *tls.Config) *TransportOption { if o == nil || o.transport == nil { return o } o.transport.TLSClientConfig = config return o } // WithoutDNSCache disables the dns cache. func (o *TransportOption) WithoutDNSCache() *TransportOption { if o == nil || o.transport == nil || o.dialer == nil { return o } o.transport.DialContext = o.dialer.DialContext return o } // WithDialer sets the dialer. 
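//
// Note that the supplied dialer is re-wrapped by DNSCacheDialContext, so
// per-host lookups remain cached; combine with WithoutDNSCache to opt out.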
func (o *TransportOption) WithDialer(dialer *net.Dialer) *TransportOption { if o == nil || o.transport == nil || dialer == nil { return o } o.dialer = dialer o.transport.DialContext = DNSCacheDialContext(o.dialer) return o } // Customize sets the transport. func (o *TransportOption) Customize(fn func(*http.Transport)) *TransportOption { if o == nil || o.transport == nil { return o } o.dialer = nil fn(o.transport) return o } // If is a conditional option, // which receives a boolean condition to trigger the given function or not. func (o *TransportOption) If(condition bool, then func(*TransportOption) *TransportOption) *TransportOption { if condition { return then(o) } return o } ================================================ FILE: util/json/common.go ================================================ package json import ( stdjson "encoding/json" "fmt" ) type RawMessage = stdjson.RawMessage var ( MarshalIndent = stdjson.MarshalIndent Indent = stdjson.Indent NewEncoder = stdjson.NewEncoder Valid = stdjson.Valid ) // MustMarshal is similar to Marshal, // but panics if found error. func MustMarshal(v any) []byte { bs, err := Marshal(v) if err != nil { panic(fmt.Errorf("error marshaling json: %w", err)) } return bs } // MustUnmarshal is similar to Unmarshal, // but panics if found error. func MustUnmarshal(data []byte, v any) { err := Unmarshal(data, v) if err != nil { panic(fmt.Errorf("error unmarshaling json: %w", err)) } } // MustMarshalIndent is similar to MarshalIndent, // but panics if found error. func MustMarshalIndent(v any, prefix, indent string) []byte { bs, err := MarshalIndent(v, prefix, indent) if err != nil { panic(fmt.Errorf("error marshaling indent json: %w", err)) } return bs } // ShouldMarshal is similar to Marshal, // but never return error. func ShouldMarshal(v any) []byte { bs, _ := Marshal(v) return bs } // ShouldUnmarshal is similar to Unmarshal, // but never return error. func ShouldUnmarshal(data []byte, v any) { _ = Unmarshal(data, v) } // ShouldMarshalIndent is similar to MarshalIndent, // but never return error. 
func ShouldMarshalIndent(v any, prefix, indent string) []byte { bs, _ := MarshalIndent(v, prefix, indent) return bs } ================================================ FILE: util/json/jsoniter.go ================================================ //go:build !stdjson package json import ( stdjson "encoding/json" "strconv" "unsafe" jsoniter "github.com/json-iterator/go" ) var json = jsoniter.ConfigCompatibleWithStandardLibrary func init() { // borrowed from https://github.com/json-iterator/go/issues/145#issuecomment-323483602 decodeNumberAsInt64IfPossible := func(ptr unsafe.Pointer, iter *jsoniter.Iterator) { switch iter.WhatIsNext() { case jsoniter.NumberValue: var number stdjson.Number iter.ReadVal(&number) i, err := strconv.ParseInt(string(number), 10, 64) if err == nil { *(*any)(ptr) = i return } f, err := strconv.ParseFloat(string(number), 64) if err == nil { *(*any)(ptr) = f return } default: *(*any)(ptr) = iter.Read() } } jsoniter.RegisterTypeDecoderFunc("interface {}", decodeNumberAsInt64IfPossible) jsoniter.RegisterTypeDecoderFunc("any", decodeNumberAsInt64IfPossible) } var ( Marshal = json.Marshal Unmarshal = json.Unmarshal NewDecoder = json.NewDecoder ) ================================================ FILE: util/json/stdjson.go ================================================ //go:build stdjson package json import ( "encoding/json" ) var ( Marshal = json.Marshal Unmarshal = json.Unmarshal NewDecoder = json.NewDecoder ) ================================================ FILE: util/osx/env.go ================================================ package osx import ( "os" ) // ExistEnv checks if the environment variable named by the key exists. func ExistEnv(key string) bool { _, ok := os.LookupEnv(key) return ok } // Getenv retrieves the value of the environment variable named by the key. // It returns the default, which will be empty if the variable is not present. // To distinguish between an empty value and an unset value, use LookupEnv. func Getenv(key string, def ...string) string { e, ok := os.LookupEnv(key) if !ok && len(def) != 0 { return def[0] } return e } // ExpandEnv is similar to Getenv, // but replaces ${var} or $var in the result. func ExpandEnv(key string, def ...string) string { return os.ExpandEnv(Getenv(key, def...)) } ================================================ FILE: util/osx/file.go ================================================ package osx import ( "io" "os" "path/filepath" "strings" ) // InlineTilde replaces the leading ~ with the home directory. func InlineTilde(path string) string { if path == "" { return path } if strings.HasPrefix(path, "~"+string(filepath.Separator)) { hd, err := os.UserHomeDir() if err == nil { path = filepath.Join(hd, path[2:]) } } return path } // Open is similar to os.Open but supports ~ as the home directory. func Open(path string) (*os.File, error) { p := filepath.Clean(path) p = InlineTilde(p) return os.Open(p) } // Exists checks if the given path exists. func Exists(path string, checks ...func(os.FileInfo) bool) bool { p := filepath.Clean(path) p = InlineTilde(p) stat, err := os.Lstat(p) if err != nil { return false } for i := range checks { if checks[i] == nil { continue } if !checks[i](stat) { return false } } return true } // ExistsDir checks if the given path exists and is a directory. func ExistsDir(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode().IsDir() }) } // ExistsLink checks if the given path exists and is a symbolic link. 
func ExistsLink(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode()&os.ModeSymlink != 0 }) } // ExistsFile checks if the given path exists and is a regular file. func ExistsFile(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode().IsRegular() }) } // ExistsSocket checks if the given path exists and is a socket. func ExistsSocket(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode()&os.ModeSocket != 0 }) } // ExistsDevice checks if the given path exists and is a device. func ExistsDevice(path string) bool { return Exists(path, func(stat os.FileInfo) bool { return stat.Mode()&os.ModeDevice != 0 }) } // Close closes the given io.Closer without error. func Close(c io.Closer) { if c == nil { return } _ = c.Close() } // WriteFile is similar to os.WriteFile but supports ~ as the home directory, // and also supports the parent directory creation. func WriteFile(name string, data []byte, perm os.FileMode) error { p := filepath.Clean(name) p = InlineTilde(p) if err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil { return err } return os.WriteFile(p, data, perm) } // CreateFile is similar to os.Create but supports ~ as the home directory, // and also supports the parent directory creation. func CreateFile(name string, perm os.FileMode) (*os.File, error) { p := filepath.Clean(name) p = InlineTilde(p) if err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil { return nil, err } return os.OpenFile(p, os.O_RDWR|os.O_CREATE|os.O_TRUNC, perm) } // OpenFile is similar to os.OpenFile but supports ~ as the home directory, // and also supports the parent directory creation. func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) { p := filepath.Clean(name) p = InlineTilde(p) if err := os.MkdirAll(filepath.Dir(p), 0o700); err != nil { return nil, err } return os.OpenFile(p, flag, perm) } ================================================ FILE: util/osx/file_mmap.go ================================================ // Copyright 2018 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
package osx import ( "errors" "fmt" "io" "os" "path/filepath" "runtime/debug" "syscall" ) type MmapFile struct { f *os.File b []byte } func OpenMmapFile(path string) (*MmapFile, error) { return OpenMmapFileWithSize(path, 0) } func OpenMmapFileWithSize(path string, size int) (*MmapFile, error) { p := filepath.Clean(path) p = InlineTilde(p) f, err := os.Open(p) if err != nil { return nil, fmt.Errorf("try lock file: %w", err) } if size <= 0 { info, err := f.Stat() if err != nil { Close(f) return nil, fmt.Errorf("stat: %w", err) } size = int(info.Size()) } b, err := mmap(f, size) if err != nil { Close(f) return nil, fmt.Errorf("mmap, size %d: %w", size, err) } return &MmapFile{f: f, b: b}, nil } func (f *MmapFile) Close() error { err0 := munmap(f.b) err1 := f.f.Close() if err0 != nil { return err0 } return err1 } func (f *MmapFile) Bytes() []byte { return f.b } func (f *MmapFile) Len() int64 { return int64(len(f.b)) } var ErrPageFault = errors.New("page fault occurred while reading from memory map") func (f *MmapFile) ReadAt(p []byte, off int64) (_ int, err error) { if off < 0 { return 0, syscall.EINVAL } if off > f.Len() { return 0, io.EOF } old := debug.SetPanicOnFault(true) defer func() { debug.SetPanicOnFault(old) if recover() != nil { err = ErrPageFault } }() n := copy(p, f.b[off:]) if n < len(p) { err = io.EOF } return n, err } ================================================ FILE: util/osx/file_mmap_js.go ================================================ // Copyright 2022 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package osx import ( "errors" "os" ) func mmap(f *os.File, length int) ([]byte, error) { return nil, errors.New("unsupported") } func munmap(b []byte) (err error) { return errors.New("unsupported") } ================================================ FILE: util/osx/file_mmap_unix.go ================================================ // Copyright 2017 The Prometheus Authors // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
================================================
FILE: util/osx/file_mmap_js.go
================================================
// Copyright 2022 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package osx

import (
	"errors"
	"os"
)

func mmap(f *os.File, length int) ([]byte, error) {
	return nil, errors.New("unsupported")
}

func munmap(b []byte) (err error) {
	return errors.New("unsupported")
}

================================================
FILE: util/osx/file_mmap_unix.go
================================================
// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris

package osx

import (
	"os"

	"golang.org/x/sys/unix"
)

func mmap(f *os.File, length int) ([]byte, error) {
	return unix.Mmap(int(f.Fd()), 0, length, unix.PROT_READ, unix.MAP_SHARED)
}

func munmap(b []byte) (err error) {
	return unix.Munmap(b)
}

================================================
FILE: util/osx/file_mmap_windows.go
================================================
package osx

import (
	"os"
	"syscall"
	"unsafe"
)

func mmap(f *os.File, size int) ([]byte, error) {
	low, high := uint32(size), uint32(size>>32)
	h, errno := syscall.CreateFileMapping(syscall.Handle(f.Fd()), nil, syscall.PAGE_READONLY, high, low, nil)
	if h == 0 {
		return nil, os.NewSyscallError("CreateFileMapping", errno)
	}

	addr, errno := syscall.MapViewOfFile(h, syscall.FILE_MAP_READ, 0, 0, uintptr(size))
	if addr == 0 {
		return nil, os.NewSyscallError("MapViewOfFile", errno)
	}

	if err := syscall.CloseHandle(h); err != nil {
		return nil, os.NewSyscallError("CloseHandle", err)
	}

	return (*[maxMapSize]byte)(unsafe.Pointer(addr))[:size], nil
}

func munmap(b []byte) error {
	if err := syscall.UnmapViewOfFile((uintptr)(unsafe.Pointer(&b[0]))); err != nil {
		return os.NewSyscallError("UnmapViewOfFile", err)
	}
	return nil
}

================================================
FILE: util/osx/file_mmap_windows_386.go
================================================
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package osx

const maxMapSize = 0x7FFFFFFF // 2GB

================================================
FILE: util/osx/file_mmap_windows_non386.go
================================================
// Copyright 2018 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//go:build windows && !386

package osx

const maxMapSize = 0xFFFFFFFFFFFF // 256TB
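On Windows, the mapping size is handed to CreateFileMapping as two 32-bit halves, as in `low, high := uint32(size), uint32(size>>32)` above. A self-contained sketch of that split and its round-trip (illustrative only; assumes a 64-bit int):

package main

import "fmt"

func main() {
	size := 0x1_2345_6789 // > 4 GiB, so both halves are non-zero
	low, high := uint32(size), uint32(size>>32)

	// CreateFileMapping receives (high, low); recombining the two
	// words restores the original 64-bit size.
	back := int(uint64(high)<<32 | uint64(low))
	fmt.Println(low, high, back == size) // 591751049 1 true
}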
func UserHomeDir() string {
	hd, err := os.UserHomeDir()
	if err != nil {
		hd = filepath.Join(os.TempDir(), time.Now().Format(time.DateOnly))
	}
	return hd
}

================================================
FILE: util/ptr/pointer.go
================================================
package ptr

import (
	"time"

	"golang.org/x/exp/constraints"
)

func Int(v int) *int { return Ref(v) }
func IntDeref(v *int, def int) int { return Deref(v, def) }

func Int8(v int8) *int8 { return Ref(v) }
func Int8Deref(v *int8, def int8) int8 { return Deref(v, def) }

func Int16(v int16) *int16 { return Ref(v) }
func Int16Deref(v *int16, def int16) int16 { return Deref(v, def) }

func Int32(v int32) *int32 { return Ref(v) }
func Int32Deref(v *int32, def int32) int32 { return Deref(v, def) }

func Int64(v int64) *int64 { return Ref(v) }
func Int64Deref(v *int64, def int64) int64 { return Deref(v, def) }

func Uint(v uint) *uint { return Ref(v) }
func UintDeref(v *uint, def uint) uint { return Deref(v, def) }

func Uint8(v uint8) *uint8 { return Ref(v) }
func Uint8Deref(v *uint8, def uint8) uint8 { return Deref(v, def) }

func Uint16(v uint16) *uint16 { return Ref(v) }
func Uint16Deref(v *uint16, def uint16) uint16 { return Deref(v, def) }

func Uint32(v uint32) *uint32 { return Ref(v) }
func Uint32Deref(v *uint32, def uint32) uint32 { return Deref(v, def) }

func Uint64(v uint64) *uint64 { return Ref(v) }
func Uint64Deref(v *uint64, def uint64) uint64 { return Deref(v, def) }

func Float32(v float32) *float32 { return Ref(v) }
func Float32Deref(v *float32, def float32) float32 { return Deref(v, def) }

func Float64(v float64) *float64 { return Ref(v) }
func Float64Deref(v *float64, def float64) float64 { return Deref(v, def) }

func String(v string) *string { return Ref(v) }
func StringDeref(v *string, def string) string { return Deref(v, def) }

func Bool(v bool) *bool { return Ref(v) }
func BoolDeref(v *bool, def bool) bool { return Deref(v, def) }

func Duration(v time.Duration) *time.Duration { return Ref(v) }
func DurationDeref(v *time.Duration, def time.Duration) time.Duration { return Deref(v, def) }

func Time(v time.Time) *time.Time { return Ref(v) }
func TimeDeref(v *time.Time, def time.Time) time.Time { return Deref(v, def) }

type Pointerable interface {
	constraints.Ordered | ~bool | time.Time
}

// Ref returns a pointer to the given value.
func Ref[T Pointerable](v T) *T {
	return &v
}

// To is an alias of Ref.
func To[T Pointerable](v T) *T {
	return Ref(v)
}

// Deref returns the value pointed to by ptr, or def if ptr is nil.
func Deref[T Pointerable](ptr *T, def T) T {
	if ptr != nil {
		return *ptr
	}
	return def
}

// Equal reports whether a and b are both non-nil and point to equal values.
func Equal[T Pointerable](a, b *T) bool {
	if a != nil && b != nil {
		return *a == *b
	}
	return false
}
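A small sketch of the Ref/To/Deref pattern these helpers enable; the Options struct here is hypothetical, not a type from this repository:

package main

import (
	"fmt"

	"github.com/gpustack/gguf-parser-go/util/ptr"
)

// Options is a hypothetical struct with an optional field:
// nil means "unset", which a plain int cannot express.
type Options struct {
	CtxSize *int
}

func main() {
	a := Options{}                      // CtxSize unset
	b := Options{CtxSize: ptr.To(4096)} // CtxSize explicitly 4096

	fmt.Println(ptr.Deref(a.CtxSize, 2048)) // 2048 (the default)
	fmt.Println(ptr.Deref(b.CtxSize, 2048)) // 4096
}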
================================================
FILE: util/signalx/handler.go
================================================
package signalx

import (
	"context"
	"os"
	"os/signal"
)

var registered = make(chan struct{})

// Handler registers for signals and returns a context,
// which is canceled when the first signal arrives.
// A second signal forces the process to exit immediately.
func Handler() context.Context {
	close(registered) // Panics when called twice.

	sigChan := make(chan os.Signal, len(sigs))
	ctx, cancel := context.WithCancel(context.Background())

	// Register for signals.
	signal.Notify(sigChan, sigs...)

	// Process signals.
	go func() {
		var exited bool
		for range sigChan {
			if exited {
				os.Exit(1)
			}
			cancel()
			exited = true
		}
	}()

	return ctx
}

================================================
FILE: util/signalx/handler_unix.go
================================================
//go:build !windows

package signalx

import (
	"os"
	"syscall"
)

var sigs = []os.Signal{syscall.SIGINT, syscall.SIGTERM}

================================================
FILE: util/signalx/handler_windows.go
================================================
package signalx

import (
	"os"
	"syscall"
)

var sigs = []os.Signal{syscall.SIGINT}

================================================
FILE: util/slicex/search.go
================================================
package slicex

import "golang.org/x/exp/constraints"

// UpperBound returns the index of the first element in s that is greater than e,
// or len(s) if there is no such element. s must be sorted in ascending order.
func UpperBound[T constraints.Integer | constraints.Float](s []T, e T) int {
	l, r := 0, len(s)
	for l < r {
		m := l + (r-l)/2
		if s[m] <= e {
			l = m + 1
		} else {
			r = m
		}
	}
	return l
}

================================================
FILE: util/stringx/bytes.go
================================================
package stringx

import "unsafe"

// FromBytes converts a byte slice to a string without copying.
// The input slice must not be modified afterwards.
func FromBytes(b *[]byte) string {
	return unsafe.String(unsafe.SliceData(*b), len(*b))
}

// ToBytes converts a string to a byte slice without copying.
// The returned slice must not be modified.
func ToBytes(s *string) (bs []byte) {
	return unsafe.Slice(unsafe.StringData(*s), len(*s))
}
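The unsafe conversions above are zero-copy, which is why both directions carry a no-modification contract. A short illustrative round trip:

package main

import (
	"fmt"

	"github.com/gpustack/gguf-parser-go/util/stringx"
)

func main() {
	s := "gguf"
	bs := stringx.ToBytes(&s) // zero-copy view of s's bytes
	fmt.Println(bs)           // [103 103 117 102]

	// bs[0] = 'G' // would fault: string data is read-only

	b := []byte{0x47, 0x47, 0x55, 0x46}
	fmt.Println(stringx.FromBytes(&b)) // "GGUF"
}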
================================================
FILE: util/stringx/random.go
================================================
package stringx

// Borrowed from github.com/thanhpk/randstr.

import (
	"bytes"
	"crypto/rand"
	"encoding/binary"
	"encoding/hex"
)

// defLetters is the default alphabet used by RandomString
// when no letters are provided.
var defLetters = []rune("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")

// RandomBytes generates n random bytes.
func RandomBytes(n int) []byte {
	b := make([]byte, n)
	_, err := rand.Read(b)
	if err != nil {
		panic(err)
	}
	return b
}

// RandomHex hex-encodes n random bytes, so the returned string has length 2*n,
// e.g. 67aab2d956bd7cc621af22cfb169cba8 for n = 16.
func RandomHex(n int) string {
	return hex.EncodeToString(RandomBytes(n))
}

// RandomString generates a random string of length n using only the letters provided;
// if the letters parameter is omitted, defLetters is used instead.
func RandomString(n int, letters ...string) string {
	var (
		letterRunes []rune
		bb          bytes.Buffer
	)
	if len(letters) == 0 {
		letterRunes = defLetters
	} else {
		letterRunes = []rune(letters[0])
	}

	bb.Grow(n)
	l := uint32(len(letterRunes))
	// On each loop, generate one random rune and append to output.
	for i := 0; i < n; i++ {
		bb.WriteRune(letterRunes[binary.BigEndian.Uint32(RandomBytes(4))%l])
	}
	return bb.String()
}

// RandomBase64 generates a random base64 string with length of n,
// safe for URL.
func RandomBase64(n int) string {
	return RandomString(n, "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_")
}

================================================
FILE: util/stringx/strings.go
================================================
package stringx

import "strings"

// CutFromLeft is the same as strings.Cut:
// starting from the left, it slices s around the first instance of sep,
// returning the text before and after sep.
// The found result reports whether sep appears in s.
// If sep does not appear in s, it returns s, "", false.
func CutFromLeft(s, sep string) (before, after string, found bool) {
	return strings.Cut(s, sep)
}

// CutFromRight takes the same arguments as CutFromLeft,
// but starts from the right: it slices s around the last instance of sep,
// returning the text before and after sep.
// The found result reports whether sep appears in s.
// If sep does not appear in s, it returns s, "", false.
func CutFromRight(s, sep string) (before, after string, found bool) {
	if i := strings.LastIndex(s, sep); i >= 0 {
		return s[:i], s[i+len(sep):], true
	}
	return s, "", false
}

// ReplaceAllFunc replaces each rune in s with the result of f(r),
// similar to strings.Map but without dropping any runes.
func ReplaceAllFunc(s string, f func(rune) rune) string {
	var b strings.Builder
	for _, r := range s {
		b.WriteRune(f(r))
	}
	return b.String()
}

// HasSuffixes checks if s has any of the given suffixes.
func HasSuffixes(s string, suffixes ...string) bool {
	for _, suffix := range suffixes {
		if strings.HasSuffix(s, suffix) {
			return true
		}
	}
	return false
}

================================================
FILE: util/stringx/sum.go
================================================
package stringx

import (
	"crypto/sha256"
	"encoding/hex"
	"hash/fnv"
)

// SumByFNV64a sums up the string(s) by the FNV-64a hash algorithm.
func SumByFNV64a(s string, ss ...string) string {
	h := fnv.New64a()
	_, _ = h.Write(ToBytes(&s))
	for i := range ss {
		_, _ = h.Write(ToBytes(&ss[i]))
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBytesByFNV64a sums up the byte slice(s) by the FNV-64a hash algorithm.
func SumBytesByFNV64a(bs []byte, bss ...[]byte) string {
	h := fnv.New64a()
	_, _ = h.Write(bs)
	for i := range bss {
		_, _ = h.Write(bss[i])
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBySHA256 sums up the string(s) by the SHA256 hash algorithm.
func SumBySHA256(s string, ss ...string) string {
	h := sha256.New()
	_, _ = h.Write(ToBytes(&s))
	for i := range ss {
		_, _ = h.Write(ToBytes(&ss[i]))
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBytesBySHA256 sums up the byte slice(s) by the SHA256 hash algorithm.
func SumBytesBySHA256(bs []byte, bss ...[]byte) string {
	h := sha256.New()
	_, _ = h.Write(bs)
	for i := range bss {
		_, _ = h.Write(bss[i])
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBySHA224 sums up the string(s) by the SHA224 hash algorithm.
func SumBySHA224(s string, ss ...string) string {
	h := sha256.New224()
	_, _ = h.Write(ToBytes(&s))
	for i := range ss {
		_, _ = h.Write(ToBytes(&ss[i]))
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}

// SumBytesBySHA224 sums up the byte slice(s) by the SHA224 hash algorithm.
func SumBytesBySHA224(bs []byte, bss ...[]byte) string {
	h := sha256.New224()
	_, _ = h.Write(bs)
	for i := range bss {
		_, _ = h.Write(bss[i])
	}
	sum := h.Sum(nil)
	return hex.EncodeToString(sum)
}
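A brief sketch contrasting CutFromLeft and CutFromRight on a GGUF-style filename (illustrative values only):

package main

import (
	"fmt"

	"github.com/gpustack/gguf-parser-go/util/stringx"
)

func main() {
	name := "llama-2-7b.Q4_K_M.gguf"

	// The left cut splits at the first "."; the right cut at the last.
	l1, l2, _ := stringx.CutFromLeft(name, ".")
	r1, r2, _ := stringx.CutFromRight(name, ".")
	fmt.Println(l1, "|", l2) // llama-2-7b | Q4_K_M.gguf
	fmt.Println(r1, "|", r2) // llama-2-7b.Q4_K_M | gguf
}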
================================================
FILE: zz_generated.diffusion_model_memory_usage.regression.go
================================================
package gguf_parser

import "math"

// GuessSD1DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD1DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{7876368.5672, 161.4230198633, 0.0078124893}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSD2DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD2DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{-355043979.0562, -1193.3271458642, 0.0054023818}
	degree := 2
	x := float64(width * height)

	if flashAttention {
		coefficients = []float64{3780681.28078, 513.2102510935}
		degree = 1
	}

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSDXLDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSDXLDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{55541290.3893, 138.3196116655, 0.0006109455}
	degree := 2
	x := float64(width * height)

	if flashAttention {
		coefficients = []float64{-5958802.78052, 500.0687898915}
		degree = 1
	}

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSDXLRefinerDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSDXLRefinerDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{49395992.3449, 155.2477810191, 0.0007351736}
	degree := 2
	x := float64(width * height)

	if flashAttention {
		coefficients = []float64{7031343.31998, 599.4137437227}
		degree = 1
	}

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSD3MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD3MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{16529921.3700, 234.6656247718, 0.0014648995}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessSD35MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD35MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{17441103.4726, 281.6956819806, 0.0014651233}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}
// GuessSD35LargeDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessSD35LargeDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{23204369.2029, 410.3731196298, 0.0023195947}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}

// GuessFLUXDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height,
// which is calculated by linear regression or polynomial regression.
func GuessFLUXDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 {
	coefficients := []float64{46511668.6742, 997.7758807792, 0.0014573393}
	degree := 2
	x := float64(width * height)

	y := float64(0)
	for i := 0; i <= degree; i++ {
		y += coefficients[i] * math.Pow(x, float64(i))
	}
	return uint64(y)
}
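Each of these generated estimators evaluates the fitted polynomial y = Σ c_i·x^i at x = width·height and returns y in bytes. A usage sketch with the SDXL coefficients above (the result is a regression estimate, not a measurement; for x = 1,048,576 the terms sum to roughly 8.7e8 bytes):

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	// 1024x1024 SDXL without flash attention:
	// x = 1048576, so y ≈ 5.55e7 + 138.32*x + 6.11e-4*x^2 ≈ 872 MB.
	b := gguf.GuessSDXLDiffusionModelMemoryUsage(1024, 1024, false)
	fmt.Printf("%.2f GiB\n", float64(b)/(1<<30)) // ≈ 0.81 GiB
}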
================================================
FILE: zz_generated.ggmltype.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLType"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGMLTypeF32-0]
	_ = x[GGMLTypeF16-1]
	_ = x[GGMLTypeQ4_0-2]
	_ = x[GGMLTypeQ4_1-3]
	_ = x[GGMLTypeQ4_2-4]
	_ = x[GGMLTypeQ4_3-5]
	_ = x[GGMLTypeQ5_0-6]
	_ = x[GGMLTypeQ5_1-7]
	_ = x[GGMLTypeQ8_0-8]
	_ = x[GGMLTypeQ8_1-9]
	_ = x[GGMLTypeQ2_K-10]
	_ = x[GGMLTypeQ3_K-11]
	_ = x[GGMLTypeQ4_K-12]
	_ = x[GGMLTypeQ5_K-13]
	_ = x[GGMLTypeQ6_K-14]
	_ = x[GGMLTypeQ8_K-15]
	_ = x[GGMLTypeIQ2_XXS-16]
	_ = x[GGMLTypeIQ2_XS-17]
	_ = x[GGMLTypeIQ3_XXS-18]
	_ = x[GGMLTypeIQ1_S-19]
	_ = x[GGMLTypeIQ4_NL-20]
	_ = x[GGMLTypeIQ3_S-21]
	_ = x[GGMLTypeIQ2_S-22]
	_ = x[GGMLTypeIQ4_XS-23]
	_ = x[GGMLTypeI8-24]
	_ = x[GGMLTypeI16-25]
	_ = x[GGMLTypeI32-26]
	_ = x[GGMLTypeI64-27]
	_ = x[GGMLTypeF64-28]
	_ = x[GGMLTypeIQ1_M-29]
	_ = x[GGMLTypeBF16-30]
	_ = x[GGMLTypeQ4_0_4_4-31]
	_ = x[GGMLTypeQ4_0_4_8-32]
	_ = x[GGMLTypeQ4_0_8_8-33]
	_ = x[GGMLTypeTQ1_0-34]
	_ = x[GGMLTypeTQ2_0-35]
	_ = x[GGMLTypeIQ4_NL_4_4-36]
	_ = x[GGMLTypeIQ4_NL_4_8-37]
	_ = x[GGMLTypeIQ4_NL_8_8-38]
	_ = x[GGMLTypeMXFP4-39]
	_ = x[_GGMLTypeCount-40]
}

const _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_KQ6_KQ8_KIQ2_XXSIQ2_XSIQ3_XXSIQ1_SIQ4_NLIQ3_SIQ2_SIQ4_XSI8I16I32I64F64IQ1_MBF16Q4_0_4_4Q4_0_4_8Q4_0_8_8TQ1_0TQ2_0IQ4_NL_4_4IQ4_NL_4_8IQ4_NL_8_8MXFP4Unknown"

var _GGMLType_index = [...]uint8{0, 3, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, 69, 75, 82, 87, 93, 98, 103, 109, 111, 114, 117, 120, 123, 128, 132, 140, 148, 156, 161, 166, 176, 186, 196, 201, 208}

func (i GGMLType) String() string {
	if i >= GGMLType(len(_GGMLType_index)-1) {
		return "GGMLType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _GGMLType_name[_GGMLType_index[i]:_GGMLType_index[i+1]]
}

================================================
FILE: zz_generated.gguffiletype.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix GGUFFileType"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFFileTypeMostlyF32-0]
	_ = x[GGUFFileTypeMostlyF16-1]
	_ = x[GGUFFileTypeMostlyQ4_0-2]
	_ = x[GGUFFileTypeMostlyQ4_1-3]
	_ = x[GGUFFileTypeMostlyQ4_1_SOME_F16-4]
	_ = x[GGUFFileTypeMostlyQ4_2-5]
	_ = x[GGUFFileTypeMostlyQ4_3-6]
	_ = x[GGUFFileTypeMostlyQ8_0-7]
	_ = x[GGUFFileTypeMostlyQ5_0-8]
	_ = x[GGUFFileTypeMostlyQ5_1-9]
	_ = x[GGUFFileTypeMostlyQ2_K-10]
	_ = x[GGUFFileTypeMostlyQ3_K_S-11]
	_ = x[GGUFFileTypeMostlyQ3_K_M-12]
	_ = x[GGUFFileTypeMostlyQ3_K_L-13]
	_ = x[GGUFFileTypeMostlyQ4_K_S-14]
	_ = x[GGUFFileTypeMostlyQ4_K_M-15]
	_ = x[GGUFFileTypeMostlyQ5_K_S-16]
	_ = x[GGUFFileTypeMostlyQ5_K_M-17]
	_ = x[GGUFFileTypeMostlyQ6_K-18]
	_ = x[GGUFFileTypeMostlyIQ2_XXS-19]
	_ = x[GGUFFileTypeMostlyIQ2_XS-20]
	_ = x[GGUFFileTypeMostlyQ2_K_S-21]
	_ = x[GGUFFileTypeMostlyIQ3_XS-22]
	_ = x[GGUFFileTypeMostlyIQ3_XXS-23]
	_ = x[GGUFFileTypeMostlyIQ1_S-24]
	_ = x[GGUFFileTypeMostlyIQ4_NL-25]
	_ = x[GGUFFileTypeMostlyIQ3_S-26]
	_ = x[GGUFFileTypeMostlyIQ3_M-27]
	_ = x[GGUFFileTypeMostlyIQ2_S-28]
	_ = x[GGUFFileTypeMostlyIQ2_M-29]
	_ = x[GGUFFileTypeMostlyIQ4_XS-30]
	_ = x[GGUFFileTypeMostlyIQ1_M-31]
	_ = x[GGUFFileTypeMostlyBF16-32]
	_ = x[GGUFFileTypeMostlyQ4_0_4_4-33]
	_ = x[GGUFFileTypeMostlyQ4_0_4_8-34]
	_ = x[GGUFFileTypeMostlyQ4_0_8_8-35]
	_ = x[GGUFFileTypeMostlyTQ1_0-36]
	_ = x[GGUFFileTypeMostlyTQ2_0-37]
	_ = x[GGUFFileTypeMostlyMXFP4-38]
	_ = x[_GGUFFileTypeCount-39]
}

const _GGUFFileType_name = "MOSTLY_F32MOSTLY_F16MOSTLY_Q4_0MOSTLY_Q4_1MOSTLY_Q4_1_SOME_F16MOSTLY_Q4_2MOSTLY_Q4_3MOSTLY_Q8_0MOSTLY_Q5_0MOSTLY_Q5_1MOSTLY_Q2_KMOSTLY_Q3_K_SMOSTLY_Q3_K_MMOSTLY_Q3_K_LMOSTLY_Q4_K_SMOSTLY_Q4_K_MMOSTLY_Q5_K_SMOSTLY_Q5_K_MMOSTLY_Q6_KMOSTLY_IQ2_XXSMOSTLY_IQ2_XSMOSTLY_Q2_K_SMOSTLY_IQ3_XSMOSTLY_IQ3_XXSMOSTLY_IQ1_SMOSTLY_IQ4_NLMOSTLY_IQ3_SMOSTLY_IQ3_MMOSTLY_IQ2_SMOSTLY_IQ2_MMOSTLY_IQ4_XSMOSTLY_IQ1_MMOSTLY_BF16MOSTLY_Q4_0_4_4MOSTLY_Q4_0_4_8MOSTLY_Q4_0_8_8MOSTLY_TQ1_0MOSTLY_TQ2_0MOSTLY_MXFP4Unknown"

var _GGUFFileType_index = [...]uint16{0, 10, 20, 31, 42, 62, 73, 84, 95, 106, 117, 128, 141, 154, 167, 180, 193, 206, 219, 230, 244, 257, 270, 283, 297, 309, 322, 334, 346, 358, 370, 383, 395, 406, 421, 436, 451, 463, 475, 487, 494}

func (i GGUFFileType) String() string {
	if i >= GGUFFileType(len(_GGUFFileType_index)-1) {
		return "GGUFFileType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _GGUFFileType_name[_GGUFFileType_index[i]:_GGUFFileType_index[i+1]]
}
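These generated name tables back the String() methods used when the parser prints tensor and file types; the trailing "Unknown" entry covers the _Count sentinel, and out-of-range values fall back to the numeric form. A tiny sketch of the lookup behavior (outputs follow from the tables above):

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGMLTypeQ4_K)             // Q4_K
	fmt.Println(gguf.GGUFFileTypeMostlyQ4_K_M) // MOSTLY_Q4_K_M
	fmt.Println(gguf.GGMLType(99))             // GGMLType(99)
}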
================================================
FILE: zz_generated.ggufmagic.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFMagicGGML-1734831468]
	_ = x[GGUFMagicGGMF-1734831462]
	_ = x[GGUFMagicGGJT-1734830708]
	_ = x[GGUFMagicGGUFLe-1179993927]
	_ = x[GGUFMagicGGUFBe-1195857222]
}

const (
	_GGUFMagic_name_0 = "GGUF"
	_GGUFMagic_name_1 = "GGUF"
	_GGUFMagic_name_2 = "GGJT"
	_GGUFMagic_name_3 = "GGMF"
	_GGUFMagic_name_4 = "GGML"
)

func (i GGUFMagic) String() string {
	switch {
	case i == 1179993927:
		return _GGUFMagic_name_0
	case i == 1195857222:
		return _GGUFMagic_name_1
	case i == 1734830708:
		return _GGUFMagic_name_2
	case i == 1734831462:
		return _GGUFMagic_name_3
	case i == 1734831468:
		return _GGUFMagic_name_4
	default:
		return "GGUFMagic(" + strconv.FormatInt(int64(i), 10) + ")"
	}
}

================================================
FILE: zz_generated.ggufmetadatavaluetype.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.stringer.go -trimprefix GGUFMetadataValueType"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFMetadataValueTypeUint8-0]
	_ = x[GGUFMetadataValueTypeInt8-1]
	_ = x[GGUFMetadataValueTypeUint16-2]
	_ = x[GGUFMetadataValueTypeInt16-3]
	_ = x[GGUFMetadataValueTypeUint32-4]
	_ = x[GGUFMetadataValueTypeInt32-5]
	_ = x[GGUFMetadataValueTypeFloat32-6]
	_ = x[GGUFMetadataValueTypeBool-7]
	_ = x[GGUFMetadataValueTypeString-8]
	_ = x[GGUFMetadataValueTypeArray-9]
	_ = x[GGUFMetadataValueTypeUint64-10]
	_ = x[GGUFMetadataValueTypeInt64-11]
	_ = x[GGUFMetadataValueTypeFloat64-12]
	_ = x[_GGUFMetadataValueTypeCount-13]
}

const _GGUFMetadataValueType_name = "Uint8Int8Uint16Int16Uint32Int32Float32BoolStringArrayUint64Int64Float64Unknown"

var _GGUFMetadataValueType_index = [...]uint8{0, 5, 9, 15, 20, 26, 31, 38, 42, 48, 53, 59, 64, 71, 78}

func (i GGUFMetadataValueType) String() string {
	if i >= GGUFMetadataValueType(len(_GGUFMetadataValueType_index)-1) {
		return "GGUFMetadataValueType(" + strconv.FormatInt(int64(i), 10) + ")"
	}
	return _GGUFMetadataValueType_name[_GGUFMetadataValueType_index[i]:_GGUFMetadataValueType_index[i+1]]
}

================================================
FILE: zz_generated.ggufversion.stringer.go
================================================
// Code generated by "stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion"; DO NOT EDIT.

package gguf_parser

import "strconv"

func _() {
	// An "invalid array index" compiler error signifies that the constant values have changed.
	// Re-run the stringer command to generate them again.
	var x [1]struct{}
	_ = x[GGUFVersionV1-1]
	_ = x[GGUFVersionV2-2]
	_ = x[GGUFVersionV3-3]
}

const _GGUFVersion_name = "V1V2V3"

var _GGUFVersion_index = [...]uint8{0, 2, 4, 6}

func (i GGUFVersion) String() string {
	i -= 1
	if i >= GGUFVersion(len(_GGUFVersion_index)-1) {
		return "GGUFVersion(" + strconv.FormatInt(int64(i+1), 10) + ")"
	}
	return _GGUFVersion_name[_GGUFVersion_index[i]:_GGUFVersion_index[i+1]]
}
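Finally, note that the GGUFVersion stringer shifts i down by one because the version constants start at 1, while GGUFMagic switches on the raw magic values (1179993927 is 0x46554747, the little-endian reading of the bytes "GGUF"). A closing sketch of both behaviors:

package main

import (
	"fmt"

	gguf "github.com/gpustack/gguf-parser-go"
)

func main() {
	fmt.Println(gguf.GGUFVersionV3)         // V3
	fmt.Println(gguf.GGUFMagic(1179993927)) // GGUF
	fmt.Println(gguf.GGUFVersion(9))        // GGUFVersion(9)
}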