Showing preview only (622K chars total). Download the full file or copy to clipboard to get everything.
Repository: gpustack/gguf-parser-go
Branch: main
Commit: 35c4501b75f7
Files: 85
Total size: 593.2 KB
Directory structure:
gitextract_sklq_6lp/
├── .gitattributes
├── .github/
│ └── workflows/
│ ├── ci.yml
│ ├── cmd.yml
│ ├── prune.yml
│ └── sync.yml
├── .gitignore
├── .golangci.yaml
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── cache.go
├── cmd/
│ └── gguf-parser/
│ ├── README.md
│ ├── go.mod
│ ├── go.sum
│ └── main.go
├── file.go
├── file_architecture.go
├── file_architecture_test.go
├── file_estimate__llamacpp.go
├── file_estimate__llamacpp_test.go
├── file_estimate__stablediffusioncpp.go
├── file_estimate__stablediffusioncpp_test.go
├── file_estimate_option.go
├── file_from_distro.go
├── file_from_remote.go
├── file_metadata.go
├── file_metadata_test.go
├── file_option.go
├── file_test.go
├── file_tokenizer.go
├── file_tokenizer_test.go
├── filename.go
├── filename_test.go
├── gen.go
├── gen.regression.go
├── gen.stringer.go
├── ggml.go
├── go.mod
├── go.sum
├── ollama_model.go
├── ollama_model_option.go
├── ollama_model_test.go
├── ollama_registry_authenticate.go
├── scalar.go
├── scalar_test.go
├── util/
│ ├── anyx/
│ │ └── any.go
│ ├── bytex/
│ │ └── pool.go
│ ├── funcx/
│ │ └── error.go
│ ├── httpx/
│ │ ├── client.go
│ │ ├── client_helper.go
│ │ ├── client_options.go
│ │ ├── file.go
│ │ ├── file_options.go
│ │ ├── proxy.go
│ │ ├── resolver.go
│ │ ├── transport.go
│ │ └── transport_options.go
│ ├── json/
│ │ ├── common.go
│ │ ├── jsoniter.go
│ │ └── stdjson.go
│ ├── osx/
│ │ ├── env.go
│ │ ├── file.go
│ │ ├── file_mmap.go
│ │ ├── file_mmap_js.go
│ │ ├── file_mmap_unix.go
│ │ ├── file_mmap_windows.go
│ │ ├── file_mmap_windows_386.go
│ │ ├── file_mmap_windows_non386.go
│ │ └── homedir.go
│ ├── ptr/
│ │ └── pointer.go
│ ├── signalx/
│ │ ├── handler.go
│ │ ├── handler_unix.go
│ │ └── handler_windows.go
│ ├── slicex/
│ │ └── search.go
│ └── stringx/
│ ├── bytes.go
│ ├── random.go
│ ├── strings.go
│ └── sum.go
├── zz_generated.diffusion_model_memory_usage.regression.go
├── zz_generated.ggmltype.stringer.go
├── zz_generated.gguffiletype.stringer.go
├── zz_generated.ggufmagic.stringer.go
├── zz_generated.ggufmetadatavaluetype.stringer.go
└── zz_generated.ggufversion.stringer.go
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
* text=auto eol=lf
**/go.sum linguist-generated=true
**/zz_generated.*.go linguist-generated=true
================================================
FILE: .github/workflows/ci.yml
================================================
name: ci

permissions:
  contents: read
  pull-requests: read
  actions: read

defaults:
  run:
    shell: bash

on:
  push:
    branches:
      - 'main'
      - 'branch-v*.*'
    paths-ignore:
      - "docs/**"
      - "**.md"
      - "**.mdx"
      - "**.png"
      - "**.jpg"
      - ".github/workflows/cmd.yml"
      - ".github/workflows/prune.yml"
      - ".github/workflows/sync.yml"
  pull_request:
    branches:
      - 'main'
    paths-ignore:
      - "docs/**"
      - "**.md"
      - "**.mdx"
      - "**.png"
      - "**.jpg"
      - ".github/workflows/cmd.yml"
      - ".github/workflows/prune.yml"
      - ".github/workflows/sync.yml"

jobs:
  ci:
    timeout-minutes: 15
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          persist-credentials: false
      - name: Setup Go
        timeout-minutes: 15
        uses: actions/setup-go@v5
        with:
          go-version: "1.22.9"
          cache-dependency-path: |
            **/go.sum
      # Cache the .sbin toolbox directory (lint tools fetched by the Makefile).
      - name: Setup Toolbox
        timeout-minutes: 5
        uses: actions/cache@v4
        with:
          key: toolbox-${{ runner.os }}
          path: |
            ${{ github.workspace }}/.sbin
      - name: Make
        run: make ci
        env:
          # Fail the build if linting/formatting leaves the tree dirty.
          LINT_DIRTY: "true"
================================================
FILE: .github/workflows/cmd.yml
================================================
name: cmd

permissions:
  contents: write
  actions: read
  id-token: write

defaults:
  run:
    shell: bash

on:
  push:
    branches:
      - 'main'
      - 'branch-v*.*'
    paths-ignore:
      - "docs/**"
      - "**.md"
      - "**.mdx"
      - "**.png"
      - "**.jpg"
      - ".github/workflows/ci.yml"
      - ".github/workflows/prune.yml"
      - ".github/workflows/sync.yml"
    tags:
      - "v*.*.*"

jobs:
  build:
    timeout-minutes: 15
    runs-on: ubuntu-22.04
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          persist-credentials: false
      - name: Setup Go
        timeout-minutes: 15
        uses: actions/setup-go@v5
        with:
          go-version: "1.22.9"
          cache-dependency-path: |
            cmd/**/go.sum
      - name: Make
        run: make build
        env:
          VERSION: "${{ github.ref_name }}"
      - name: Upload Artifact
        uses: actions/upload-artifact@v4
        with:
          include-hidden-files: true
          path: ${{ github.workspace }}/.dist/*
      # Only publish a GitHub release for tag pushes.
      - name: Release
        if: ${{ startsWith(github.ref, 'refs/tags/') }}
        uses: softprops/action-gh-release@v2
        with:
          fail_on_unmatched_files: true
          tag_name: "${{ github.ref_name }}"
          prerelease: ${{ contains(github.ref, 'rc') }}
          files: ${{ github.workspace }}/.dist/*
  publish:
    needs:
      - build
    permissions:
      contents: write
      actions: read
      id-token: write
    timeout-minutes: 15
    runs-on: ubuntu-22.04
    env:
      PACKAGE_REGISTRY: "gpustack"
      PACKAGE_IMAGE: "gguf-parser"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 1
          persist-credentials: false
      - name: Setup QEMU
        uses: docker/setup-qemu-action@v3
        with:
          image: tonistiigi/binfmt:qemu-v9.2.2
          platforms: "arm64"
      - name: Setup Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login DockerHub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.CI_DOCKERHUB_USERNAME }}
          password: ${{ secrets.CI_DOCKERHUB_PASSWORD }}
      # Pull the binaries produced by the `build` job into .dist for packaging.
      - name: Download Artifact
        uses: actions/download-artifact@v4
        with:
          path: ${{ github.workspace }}/.dist
          merge-multiple: true
      - name: Get Metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: "${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}"
      - name: Package
        uses: docker/build-push-action@v6
        with:
          push: true
          file: ${{ github.workspace }}/Dockerfile
          context: ${{ github.workspace }}
          platforms: "linux/amd64,linux/arm64"
          tags: ${{ steps.metadata.outputs.tags }}
          labels: ${{ steps.metadata.outputs.labels }}
          cache-from: |
            type=registry,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache
          cache-to: |
            type=registry,mode=max,compression=gzip,ref=${{ env.PACKAGE_REGISTRY }}/${{ env.PACKAGE_IMAGE }}:build-cache,ignore-error=true
          provenance: true
          sbom: true
================================================
FILE: .github/workflows/prune.yml
================================================
name: prune

permissions:
  contents: write
  pull-requests: write
  actions: write
  issues: write

defaults:
  run:
    shell: bash

on:
  workflow_dispatch:
    inputs:
      prune:
        description: 'Prune all caches'
        required: false
        type: boolean
        default: false
  schedule:
    - cron: "0 0 * * *" # every day at 00:00 UTC

jobs:
  close-stale-issues-and-prs:
    uses: gpustack/.github/.github/workflows/close-stale-issues-and-prs.yml@main
  clean-stale-caches:
    uses: gpustack/.github/.github/workflows/clean-stale-caches.yml@main
    with:
      # allow to prune all caches on demand
      prune: ${{ github.event_name != 'schedule' && inputs.prune || false }}
================================================
FILE: .github/workflows/sync.yml
================================================
name: sync

permissions:
  contents: read
  pull-requests: read
  actions: read

defaults:
  run:
    shell: bash

on:
  workflow_dispatch:
    inputs:
      max_releases:
        description: "Maximum number of latest releases to sync"
        required: false
        default: 1
        type: number
      specific_release_tag:
        description: "Specific release tag to sync"
        required: false
        default: ""
        type: string
      dry_run:
        description: "Skip the actual sync"
        required: false
        default: false
        type: boolean
  schedule:
    - cron: "0 */12 * * *" # every 12 hours

jobs:
  gitcode:
    runs-on: ubuntu-22.04
    timeout-minutes: 240
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          persist-credentials: false
      - name: Sync
        uses: gpustack/.github/.github/actions/mirror-release-gitcode@main
        with:
          gitcode-username: "${{ secrets.CI_GITCODE_USERNAME }}"
          gitcode-password: "${{ secrets.CI_GITCODE_PASSWORD }}"
          gitcode-token: "${{ secrets.CI_GITCODE_TOKEN }}"
          # On scheduled runs, inputs are empty; fall back to the defaults.
          max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}"
          specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}"
          code-only: true
          dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}"
  gitee:
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    steps:
      - name: Checkout
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          persist-credentials: false
      - name: Sync
        uses: gpustack/.github/.github/actions/mirror-release-gitee@main
        with:
          gitee-username: "${{ secrets.CI_GITEE_USERNAME }}"
          gitee-token: "${{ secrets.CI_GITEE_TOKEN }}"
          max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}"
          specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}"
          code-only: true
          dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}"
  tencent-cos:
    runs-on: ubuntu-22.04
    timeout-minutes: 120
    steps:
      - name: Sync
        uses: gpustack/.github/.github/actions/mirror-release-tencent-cos@main
        with:
          # NOTE(review): secret names spell "TECENTCOS" (sic); kept as-is since
          # they must match the names configured in the repository settings.
          tencent-secret-id: "${{ secrets.CI_TECENTCOS_SECRET_ID }}"
          tencent-secret-key: "${{ secrets.CI_TECENTCOS_SECRET_KEY }}"
          tencent-cos-region: "ap-guangzhou"
          tencent-cos-bucket: "gpustack-1303613262"
          max-releases: "${{ inputs.max_releases && inputs.max_releases || '1' }}"
          specific-release-tag: "${{ inputs.specific_release_tag && inputs.specific_release_tag || '' }}"
          dry-run: "${{ inputs.dry_run && inputs.dry_run || 'false' }}"
================================================
FILE: .gitignore
================================================
# Ignore rules for gguf-parser-go.
# Files
.DS_Store
*.lock
*.test
*.out
*.swp
*.swo
*.db
*.exe
*.exe~
*.dll
*.so
*.dylib
*.log
go.work
go.work.*
# Dirs
/.idea
/.vscode
/.kube
/.terraform
/.vagrant
/.bundle
/.cache
/.docker
/.entc
/.sbin
/.dist
/log
/certs
================================================
FILE: .golangci.yaml
================================================
version: "1"

run:
  timeout: 10m
  tests: true
  modules-download-mode: readonly
  go: "1.22"

# output configuration options
output:
  print-issued-lines: true
  print-linter-name: true
  path-prefix: ""
  sort-results: true

linters:
  disable-all: true
  enable:
    - asciicheck
    - bidichk
    - decorder
    - durationcheck
    - errcheck
    - errname
    - errorlint
    - copyloopvar
    - godot
    - goconst
    - gocritic
    - gosimple
    - gosec
    - govet
    - gofumpt
    - gofmt
    - ineffassign
    - importas
    - lll
    - makezero
    - misspell
    - nakedret
    - nilerr
    - prealloc
    - predeclared
    - revive
    - staticcheck
    - stylecheck
    - typecheck
    - unconvert
    - unparam
    - unused
    - usestdlibvars
    - whitespace

linters-settings:
  decorder:
    dec-order:
      - const
      - var
      - func
    disable-init-func-first-check: false
    disable-dec-order-check: true
  errorlint:
    errorf: true
    asserts: true
    comparison: true
  godot:
    scope: all
    exclude:
      - "(?i)^ FIXME:"
      - "(?i)^ TODO:"
      - "(?i)^ SPDX\\-License\\-Identifier:"
      - "(?i)^ +"
    period: true
    capital: false
  goconst:
    min-len: 3
    min-occurrences: 10
  gosimple:
    checks: [ "all" ]
  gosec:
    severity: "low"
    confidence: "low"
    excludes:
      - G101
      - G107
      - G112
      - G115
      - G404
  gofumpt:
    extra-rules: true
  gofmt:
    simplify: true
    rewrite-rules:
      - pattern: 'interface{}'
        replacement: 'any'
      - pattern: 'a[b:len(a)]'
        replacement: 'a[b:]'
  importas:
    no-unaliased: true
  lll:
    line-length: 150
    tab-width: 1
  makezero:
    always: false
  misspell:
    locale: US
  nakedret:
    max-func-lines: 60
  revive:
    rules:
      - name: var-naming
        disabled: true
        arguments:
          - [ "HTTP", "ID", "TLS", "TCP", "UDP", "API", "CA", "URL", "DNS" ]
  staticcheck:
    checks: [ "all", "-SA1019", "-SA2002", "-SA5008" ]
  stylecheck:
    checks: [ "all", "-ST1003" ]
  unparam:
    check-exported: false
  unused:
    field-writes-are-uses: true
    post-statements-are-reads: true
    exported-fields-are-used: true
    parameters-are-used: true
    local-variables-are-used: true
    generated-is-used: true
  usestdlibvars:
    http-method: true
    http-status-code: true
    time-weekday: true
    time-month: true
    time-layout: true
    crypto-hash: true

issues:
  uniq-by-line: true
  exclude-files:
    - "doc.go"
    - "zz_generated.*.go"
    - "gen.*.go"
  exclude-rules:
    - path: _test\.go
      linters:
        - errcheck
        - gosec
        - makezero
        - lll
================================================
FILE: Dockerfile
================================================
# Minimal runtime image: ships only the statically prebuilt gguf-parser binary.
FROM scratch
# Populated by buildx per target platform (e.g. linux/amd64, linux/arm64).
ARG TARGETOS
ARG TARGETARCH
# Binary is produced beforehand into .dist/ (see Makefile / cmd.yml workflow);
# --chmod=755 makes it executable inside the image.
COPY --chmod=755 .dist/gguf-parser-${TARGETOS}-${TARGETARCH} /bin/gguf-parser
ENTRYPOINT ["/bin/gguf-parser"]
================================================
FILE: LICENSE
================================================
MIT License
Copyright (c) 2024 gguf-parser-go authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
# Makefile for gguf-parser-go; default goal runs the full CI pipeline.
.SILENT:
.DEFAULT_GOAL := ci
SHELL := /bin/bash
# Absolute path of the directory containing this Makefile.
SRCDIR := $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
GOOS := $(shell go env GOOS)
GOARCH := $(shell go env GOARCH)
# When "true", `lint` fails if linting/formatting leaves the tree dirty (set in CI).
LINT_DIRTY ?= false
# Defaults to the lowercased current git branch; "unknown" outside a repository.
VERSION ?= $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '[:upper:]' '[:lower:]' || echo "unknown")
# When "true", `deps` also upgrades all module dependencies.
DEPS_UPDATE ?= false
# deps: tidy and download Go modules for the root module and cmd/gguf-parser;
# with DEPS_UPDATE=true, also upgrade all dependencies in both modules.
deps:
@echo "+++ $@ +++"
cd $(SRCDIR) && go mod tidy && go mod download
cd $(SRCDIR)/cmd/gguf-parser && go mod tidy && go mod download
if [[ "$(DEPS_UPDATE)" == "true" ]]; then \
cd $(SRCDIR) && go get -u -v ./...; \
cd $(SRCDIR)/cmd/gguf-parser && go get -u -v ./...; \
fi
@echo "--- $@ ---"
# generate: run `go generate` in both modules (stringers, regressions, etc.).
generate:
@echo "+++ $@ +++"
cd $(SRCDIR) && go generate ./...
cd $(SRCDIR)/cmd/gguf-parser && go generate ./...
@echo "--- $@ ---"
# lint: fetch goimports-reviser and golangci-lint into .sbin (cached between
# runs), fix import ordering in all non-generated Go files of both modules,
# run golangci-lint with --fix, and optionally fail if the tree became dirty
# (LINT_DIRTY=true, used by CI to enforce committed formatting).
lint:
@echo "+++ $@ +++"
[[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin"
[[ -f "$(SRCDIR)/.sbin/goimports-reviser" ]] || \
curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL "https://github.com/incu6us/goimports-reviser/releases/download/v3.8.2/goimports-reviser_3.8.2_$(GOOS)_$(GOARCH).tar.gz" \
| tar -zxvf - --directory "$(SRCDIR)/.sbin" --no-same-owner --exclude ./LICENSE --exclude ./README.md && chmod +x "$(SRCDIR)/.sbin/goimports-reviser"
cd $(SRCDIR) && \
go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \
| xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1
cd $(SRCDIR)/cmd/gguf-parser && \
go list -f "{{.Dir}}" ./... | xargs -I {} find {} -maxdepth 1 -type f -name '*.go' ! -name 'gen.*' ! -name 'zz_generated.*' \
| xargs -I {} "$(SRCDIR)/.sbin/goimports-reviser" -use-cache -imports-order=std,general,company,project,blanked,dotted -output=file {} 1>/dev/null 2>&1
[[ -f "$(SRCDIR)/.sbin/golangci-lint" ]] || \
curl --retry 3 --retry-all-errors --retry-delay 3 -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh \
| sh -s -- -b "$(SRCDIR)/.sbin" "v1.63.4"
cd $(SRCDIR) && \
"$(SRCDIR)/.sbin/golangci-lint" run --fix ./...
cd $(SRCDIR)/cmd/gguf-parser && \
"$(SRCDIR)/.sbin/golangci-lint" run --fix ./...
if [[ "$(LINT_DIRTY)" == "true" ]]; then \
if [[ -n $$(git status --porcelain) ]]; then \
echo "Code tree is dirty."; \
git diff --exit-code; \
fi; \
fi
@echo "--- $@ ---"
# test: run all unit tests with the race detector and coverage enabled.
test:
@echo "+++ $@ +++"
go test -v -failfast -race -cover -timeout=30m $(SRCDIR)/...
@echo "--- $@ ---"
# benchmark: run Benchmark* functions only, with memory allocation stats.
benchmark:
@echo "+++ $@ +++"
go test -v -failfast -run="^Benchmark[A-Z]+" -bench=. -benchmem -timeout=30m $(SRCDIR)/...
@echo "--- $@ ---"
# gguf-parser: cross-compile the CLI for darwin/linux/windows x amd64/arm64
# into .dist/, produce a darwin universal binary via lipo, and copy the
# host-platform build to .dist/gguf-parser for local convenience.
# NOTE(review): the final host-copy `if` runs after the inner arch loop has
# finished, so $$arch holds its last value ("arm64") — on amd64 hosts the
# local copy would seemingly never be made; verify this is intended.
gguf-parser:
[[ -d "$(SRCDIR)/.dist" ]] || mkdir -p "$(SRCDIR)/.dist"
cd "$(SRCDIR)/cmd/gguf-parser" && for os in darwin linux windows; do \
tags="netgo"; \
if [[ $$os == "windows" ]]; then \
suffix=".exe"; \
tags="netcgo"; \
else \
suffix=""; \
fi; \
for arch in amd64 arm64; do \
echo "Building gguf-parser for $$os-$$arch $(VERSION)"; \
GOOS="$$os" GOARCH="$$arch" CGO_ENABLED=1 go build \
-trimpath \
-ldflags="-w -s -X main.Version=$(VERSION)" \
-tags="urfave_cli_no_docs $$tags" \
-o $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix; \
done; \
if [[ $$os == "darwin" ]]; then \
[[ -d "$(SRCDIR)/.sbin" ]] || mkdir -p "$(SRCDIR)/.sbin"; \
[[ -f "$(SRCDIR)/.sbin/lipo" ]] || \
GOBIN="$(SRCDIR)/.sbin" go install github.com/konoui/lipo@v0.9.2; \
"$(SRCDIR)/.sbin/lipo" -create -output $(SRCDIR)/.dist/gguf-parser-darwin-universal $(SRCDIR)/.dist/gguf-parser-darwin-amd64 $(SRCDIR)/.dist/gguf-parser-darwin-arm64; \
fi;\
if [[ $$os == "$(GOOS)" ]] && [[ $$arch == "$(GOARCH)" ]]; then \
cp -rf $(SRCDIR)/.dist/gguf-parser-$$os-$$arch$$suffix $(SRCDIR)/.dist/gguf-parser$$suffix; \
fi; \
done
# build: alias for the gguf-parser cross-compilation target.
build: gguf-parser
# When "true", `package` builds multi-arch images and pushes them to the registry.
PACKAGE_PUBLISH ?= false
PACKAGE_REGISTRY ?= "gpustack"
PACKAGE_IMAGE ?= "gguf-parser"
# package: wrap the built binaries into a Docker image. With PACKAGE_PUBLISH=true,
# set up a buildx builder (with QEMU for arm64) and push linux/amd64+arm64;
# otherwise build a single host-arch image into the local Docker daemon.
package: build
@echo "+++ $@ +++"
if [[ -z $$(command -v docker) ]]; then \
echo "Docker is not installed."; \
exit 1; \
fi; \
platform="linux/amd64,linux/arm64"; \
image="$(PACKAGE_IMAGE):$(VERSION)"; \
if [[ -n "$(PACKAGE_REGISTRY)" ]]; then \
image="$(PACKAGE_REGISTRY)/$$image"; \
fi; \
if [[ "$(PACKAGE_PUBLISH)" == "true" ]]; then \
if [[ -z $$(docker buildx inspect --builder "gguf-parser") ]]; then \
docker run --rm --privileged tonistiigi/binfmt:qemu-v9.2.2 --install $$platform; \
docker buildx create --name "gguf-parser" --driver "docker-container" --buildkitd-flags "--allow-insecure-entitlement security.insecure --allow-insecure-entitlement network.host" --bootstrap; \
fi; \
docker buildx build --progress=plain --platform=$$platform --builder="gguf-parser" --output="type=image,name=$$image,push=true" "$(SRCDIR)"; \
else \
platform="linux/$(GOARCH)"; \
docker buildx build --progress=plain --platform=$$platform --output="type=docker,name=$$image" "$(SRCDIR)"; \
fi
@echo "--- $@ ---"
# ci: full pipeline — dependencies, code generation, lint, tests, cross-build.
ci: deps generate lint test build
================================================
FILE: README.md
================================================
# GGUF Parser
> tl;dr, Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files and estimate the memory
> usage.
[](https://goreportcard.com/report/github.com/gpustack/gguf-parser-go)
[](https://github.com/gpustack/gguf-parser-go/actions)
[](https://github.com/gpustack/gguf-parser-go#license)
[](https://github.com/gpustack/gguf-parser-go/releases)
[](https://hub.docker.com/r/gpustack/gguf-parser)
[](https://github.com/gpustack/gguf-parser-go/releases/latest)
[GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) is a file format for storing models for inference
with GGML and executors based on GGML. GGUF is a binary format that is designed for fast loading and saving of models,
and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to
GGUF for use in GGML.
GGUF Parser helps in reviewing and estimating the usage and maximum tokens per second of a GGUF format model without
downloading it.
## Key Features
- **No File Required**: GGUF Parser uses chunking reading to parse the metadata of remote GGUF file, which means you
don't need to download the entire file and load it.
- **Accurate Prediction**: The evaluation results of GGUF Parser usually deviate from the actual usage by about 100MiB.
- **Quick Verification**: You can provide device metrics to calculate the maximum tokens per second (TPS) without
running the model.
- **Type Screening**: GGUF Parser can distinguish what the GGUF file is used for, such as Embedding, Reranking, LoRA, etc.
- **Fast**: GGUF Parser is written in Go, which is fast and efficient.
## Agenda
- [Notes](#notes)
- [Installation](#installation)
- [Overview](#overview)
+ [Parse](#parse)
* [Local File](#parse-local-file)
* [Remote File](#parse-remote-file)
* [From HuggingFace](#parse-from-huggingface)
* [From ModelScope](#parse-from-modelscope)
* [From Ollama Library](#parse-from-ollama-library)
* [Others](#others)
* [Image Model](#parse-image-model)
* [None Model](#parse-none-model)
+ [Estimate](#estimate)
* [Across Multiple GPU devices](#across-multiple-gpu-devices)
* [Maximum Tokens Per Second](#maximum-tokens-per-second)
* [Full Layers Offload (default)](#full-layers-offload-default)
* [Zero Layers Offload](#zero-layers-offload)
* [Specific Layers Offload](#specific-layers-offload)
* [Specific Context Size](#specific-context-size)
* [Enable Flash Attention](#enable-flash-attention)
* [Disable MMap](#disable-mmap)
* [With Adapter](#with-adapter)
* [Get Proper Offload Layers](#get-proper-offload-layers)
## Notes
- **Since v0.20.0**, GGUF Parser supports leveraging `--override-tensor` to indicate how to place the model tensors.
- **Since v0.19.0**, GGUF Parser supports estimating Audio projector model file, like Ultravox series, Qwen2 Audio
series, etc.
- **Since v0.18.0**, GGUF Parser supports estimating SWA-supported(sliding window attention) model file, like LLaMA 4
series, Gemma2/3 series, etc.
- **Since v0.17.0**, GGUF Parser aligns the `QUANTIZATION`(
aka. [`general.file_type`](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#general-metadata))
to [HuggingFace processing](https://github.com/huggingface/huggingface.js/blob/2475d6d316135c0a4fceff6b3fe2aed0dde36ac1/packages/gguf/src/types.ts#L11-L48),
but there are still many model files whose naming does not fully follow `general.file_type`.
- **Since v0.16.0**, GGUF Parser supports estimating MLA-supported model file, like DeepSeek series.
- **Since v0.14.0 (BREAKING CHANGE)**, GGUF Parser parses `*.feed_forward_length` metadata as `[]uint64`,
which means the architecture `feedForwardLength` is a list of integers.
- **Since v0.13.0 (BREAKING CHANGE)**, GGUF Parser can parse files
for [StableDiffusion.Cpp](https://github.com/leejet/stable-diffusion.cpp) or StableDiffusion.Cpp like application.
+ [LLaMA Box](https://github.com/gpustack/llama-box) is able to offload different components of the all-in-one model
to different devices, e.g. with `-ts 1,1,1`, GGUF Parser returns the usage of Text Encoder Models in 1st device,
VAE Model in 2nd device, and Diffusion Model in 3rd device.
- Experimentally, GGUF Parser can estimate the maximum tokens per second(`MAX TPS`) for a (V)LM model according to the
`--device-metric` options.
- GGUF Parser distinguishes the remote devices from `--tensor-split` via `--rpc`.
+ For one host multiple GPU devices, you can use `--tensor-split` to get the estimated memory usage of each GPU.
+ For multiple hosts multiple GPU devices, you can use `--tensor-split` and `--rpc` to get the estimated memory
usage of each GPU. Since v0.11.0, `--rpc` flag masks the devices specified by `--tensor-split` in front.
- Table result usage:
+ `DISTRIBUTABLE` indicates the GGUF file supports distribution inference or not, if the file doesn't support
distribution inference, you can not offload it
with [RPC servers](https://github.com/ggerganov/llama.cpp/tree/master/examples/rpc).
+ `RAM` indicates the system memory usage.
+ `VRAM *` indicates the local GPU memory usage.
+ `RPC * (V)RAM` indicates the remote memory usage. The kind of memory is determined by which backend the RPC server
uses, check the running logs for more details.
+ `UMA` indicates the memory usage of Apple macOS only. `NONUMA` adapts to other cases, including non-GPU devices.
+ `LAYERS`(`I`/`T`/`O`) indicates the count for input layers, transformer layers, and output layers. Input layers
are not offloaded at present.
## Installation
Install from [releases](https://github.com/gpustack/gguf-parser-go/releases).
## Overview
### Parse
#### Parse Local File
```shell
$ gguf-parser --path ~/.cache/lm-studio/models/unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF/DeepSeek-R1-Distill-Qwen-7B-Q4_K_M.gguf
+-----------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
| model | DeepSeek R1 Distill ... | qwen2 | Q4_K_M | true | 4.36 GiB | 7.62 B | 4.91 bpw |
+-------+-------------------------+-------+--------------+---------------+----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 18.89 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
$ # Retrieve the model's metadata via split file,
$ # which needs all split files has been downloaded.
$ gguf-parser --path ~/.cache/lm-studio/models/Qwen/Qwen2.5-7B-Instruct-GGUF/qwen2.5-7b-instruct-q8_0-00001-of-00003.gguf
+-------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+---------------------+-------+--------------+---------------+----------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+---------------------+-------+--------------+---------------+----------+------------+----------+
| model | qwen2.5-7b-instruct | qwen2 | Q8_0 | true | 7.54 GiB | 7.62 B | 8.50 bpw |
+-------+---------------------+-------+--------------+---------------+----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 21.82 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
```
#### Parse Remote File
```shell
$ gguf-parser --url="https://huggingface.co/bartowski/Qwen2.5-72B-Instruct-GGUF/resolve/main/Qwen2.5-72B-Instruct-Q4_K_M.gguf"
+---------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+----------------------+-------+--------------+---------------+-----------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+----------------------+-------+--------------+---------------+-----------+------------+----------+
| model | Qwen2.5 72B Instruct | qwen2 | Q4_K_M | true | 44.15 GiB | 72.71 B | 5.22 bpw |
+-------+----------------------+-------+--------------+---------------+-----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 32768 | 8192 | true | 64 | 80 | 29568 | 0 | 152064 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2.47 MiB | 152064 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+----------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+------------+------------+----------------+-----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+
| qwen2 | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 426.57 MiB | 576.57 MiB | 80 + 1 | 10.31 GiB | 58.18 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+-----------+-----------+
$ # Retrieve the model's metadata via a split file
$ gguf-parser --url="https://huggingface.co/unsloth/DeepSeek-R1-GGUF/resolve/main/DeepSeek-R1-UD-IQ1_S/DeepSeek-R1-UD-IQ1_S-00001-of-00003.gguf"
+----------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+------------------+-----------+--------------+---------------+------------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+------------------+-----------+--------------+---------------+------------+------------+----------+
| model | DeepSeek R1 BF16 | deepseek2 | IQ1_S | true | 130.60 GiB | 671.03 B | 1.67 bpw |
+-------+------------------+-----------+--------------+---------------+------------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 163840 | 7168 | true | N/A | 61 | 18432 | 256 | 129280 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2.21 MiB | 129280 | N/A | 0 | 1 | N/A | N/A | N/A | N/A | 128815 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------------------------------+--------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+-----------+-----------+----------------+------------+--------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+
| deepseek2 | 163840 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 62 (61 + 1) | Yes | 1 + 0 + 0 | 13.03 GiB | 13.18 GiB | 61 + 1 | 762.76 GiB | 1 TB |
+-----------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+-----------+-----------+----------------+------------+--------+
```
#### Parse From HuggingFace
> [!NOTE]
>
> Allow using `HF_ENDPOINT` to override the default HuggingFace endpoint: `https://huggingface.co`.
```shell
$ gguf-parser --hf-repo="bartowski/Qwen2-VL-2B-Instruct-GGUF" --hf-file="Qwen2-VL-2B-Instruct-f16.gguf" --hf-mmproj-file="mmproj-Qwen2-VL-2B-Instruct-f32.gguf" --visual-max-image-size 1344
+-----------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+----------------------+---------+--------------+---------------+----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+----------------------+---------+--------------+---------------+----------+------------+-----------+
| model | Qwen2 VL 2B Instruct | qwen2vl | F16 | true | 2.88 GiB | 1.54 B | 16.00 bpw |
+-------+----------------------+---------+--------------+---------------+----------+------------+-----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 32768 | 1536 | true | 12 | 28 | 8960 | 0 | 151936 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2.47 MiB | 151936 | N/A | 151643 | 151645 | N/A | N/A | N/A | N/A | 151643 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
| qwen2vl | 32768 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 236.87 MiB | 386.87 MiB | 28 + 1 | 3.65 GiB | 12.86 GiB |
+---------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
$ # Retrieve the model's metadata via a split file
$ gguf-parser --hf-repo="bartowski/openbuddy-llama3.3-70b-v24.1-131k-GGUF" --hf-file="openbuddy-llama3.3-70b-v24.1-131k-Q4_0.gguf"
+------------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
| model | Openbuddy Llama3.3 7... | llama | Q4_0 | true | 37.35 GiB | 70.55 B | 4.55 bpw |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2 MiB | 128256 | N/A | 128000 | 128048 | N/A | N/A | N/A | N/A | 128044 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+
| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 93.62 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+
```
#### Parse From ModelScope
> [!NOTE]
>
> Allow using `MS_ENDPOINT` to override the default ModelScope endpoint: `https://modelscope.cn`.
```shell
$ gguf-parser --ms-repo="unsloth/DeepSeek-R1-Distill-Qwen-7B-GGUF" --ms-file="DeepSeek-R1-Distill-Qwen-7B-F16.gguf"
+-------------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+
| model | DeepSeek R1 Distill ... | qwen2 | F16 | true | 14.19 GiB | 7.62 B | 16.00 bpw |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+-----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 131072 | 3584 | true | 28 | 28 | 18944 | 0 | 152064 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2.47 MiB | 152064 | N/A | 151646 | 151643 | N/A | N/A | N/A | N/A | 151654 |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+---------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+------------+------------+----------------+----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
| qwen2 | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 29 (28 + 1) | Yes | 1 + 0 + 0 | 677.44 MiB | 827.44 MiB | 28 + 1 | 7.30 GiB | 27.99 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+----------+-----------+
```
#### Parse From Ollama Library
> [!NOTE]
>
> Allow using `--ol-base-url` to override the default Ollama registry endpoint: `https://registry.ollama.ai`.
```shell
$ gguf-parser --ol-model="llama3.3"
+------------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
| model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+-----------------------------------------+----------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+---------+----------+----------------+-----------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+
| llama | 131072 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 1.06 GB | 1.13 GiB | 80 + 1 | 40.26 GiB | 95.86 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+---------+----------+----------------+-----------+-----------+
$ # An Ollama Model includes preset params and other artifacts, like multimodal projectors or LoRA adapters.
$ # You can estimate the resources an Ollama run actually uses with the `--ol-usage` option.
+------------------------------------------------------------------------------------------------------------+
| METADATA |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
| model | Llama 3.1 70B Instru... | llama | Q4_K_M | true | 39.59 GiB | 70.55 B | 4.82 bpw |
+-------+-------------------------+-------+--------------+---------------+-----------+------------+----------+
+-----------------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| MAX CONTEXT LEN | EMBEDDING LEN | ATTENTION CAUSAL | ATTENTION HEAD CNT | LAYERS | FEED FORWARD LEN | EXPERT CNT | VOCABULARY LEN |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
| 131072 | 8192 | true | 64 | 80 | 28672 | 0 | 128256 |
+-----------------+---------------+------------------+--------------------+--------+------------------+------------+----------------+
+-------------------------------------------------------------------------------------------------------------------------------------------------------+
| TOKENIZER |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| MODEL | TOKENS SIZE | TOKENS LEN | ADDED TOKENS LEN | BOS TOKEN | EOS TOKEN | EOT TOKEN | EOM TOKEN | UNKNOWN TOKEN | SEPARATOR TOKEN | PADDING TOKEN |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
| gpt2 | 2 MiB | 128256 | N/A | 128000 | 128009 | N/A | N/A | N/A | N/A | N/A |
+-------+-------------+------------+------------------+-----------+-----------+-----------+-----------+---------------+-----------------+---------------+
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+----------------------------------------------+-----------------------------------------+
| ARCH | CONTEXT SIZE | BATCH SIZE (L / P) | FLASH ATTENTION | MMAP LOAD | EMBEDDING ONLY | RERANKING | DISTRIBUTABLE | OFFLOAD LAYERS | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | | | | | | +--------------------+------------+------------+----------------+------------+-----------+
| | | | | | | | | | | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+
| llama | 2048 | 2048 / 512 | Disabled | Enabled | No | Unsupported | Supported | 81 (80 + 1) | Yes | 1 + 0 + 0 | 255.27 MiB | 405.27 MiB | 80 + 1 | 906.50 MiB | 40.49 GiB |
+-------+--------------+--------------------+-----------------+-----------+----------------+-------------+---------------+----------------+----------------+--------------------+------------+------------+----------------+------------+-----------+
```
#### Others
##### Parse Image Model
```shell
$ # Parse FLUX.1-dev Model
$ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf"
+----------------------------------------------------------------------------------------------+
| METADATA |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
+----------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+---------------------------------------------------------------+-------------------------+
| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER |
+----------------+---------------------------------------------------------------+-------------------------+
| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |
+----------------+---------------------------------------------------------------+-------------------------+
+---------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+
| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | +------------+------------+-----------+-----------+
| | | | | | UMA | NONUMA | UMA | NONUMA |
+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+
| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 41.15 GiB |
+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+
$ # Parse FLUX.1-dev Model without offloading the Conditioner and Autoencoder
$ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --clip-on-cpu --vae-on-cpu
+----------------------------------------------------------------------------------------------+
| METADATA |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
+----------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+---------------------------------------------------------------+-------------------------+
| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER |
+----------------+---------------------------------------------------------------+-------------------------+
| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |
+----------------+---------------------------------------------------------------+-------------------------+
+-------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+--------+-----------------+-------------+---------------+----------------+-----------------------+-----------------------+
| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | +-----------+-----------+-----------+-----------+
| | | | | | UMA | NONUMA | UMA | NONUMA |
+--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+
| flux_1 | Disabled | Unsupported | Supported | Yes | 16.44 GiB | 16.59 GiB | 22.29 GiB | 25.05 GiB |
+--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+
$ # Parse FLUX.1-dev Model with Autoencoder tiling
$ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --vae-tiling
+----------------------------------------------------------------------------------------------+
| METADATA |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
+----------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+---------------------------------------------------------------+-------------------------+
| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER |
+----------------+---------------------------------------------------------------+-------------------------+
| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |
+----------------+---------------------------------------------------------------+-------------------------+
+---------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+--------+-----------------+-------------+---------------+----------------+-------------------------+-----------------------+
| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 |
| | | | | +------------+------------+-----------+-----------+
| | | | | | UMA | NONUMA | UMA | NONUMA |
+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+
| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 31.89 GiB | 36.28 GiB |
+--------+-----------------+-------------+---------------+----------------+------------+------------+-----------+-----------+
$ # Parse FLUX.1-dev Model with multiple devices offloading
$ # Support by LLaMA Box v0.0.106+, https://github.com/gpustack/llama-box.
$ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --tensor-split="1,1,1"
+----------------------------------------------------------------------------------------------+
| METADATA |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw |
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+
+----------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+---------------------------------------------------------------+-------------------------+
| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER |
+----------------+---------------------------------------------------------------+-------------------------+
| FLUX.1 | OpenAI CLIP ViT-L/14 (MOSTLY_F16), Google T5-xxl (MOSTLY_F16) | FLUX.1 VAE (MOSTLY_F16) |
+----------------+---------------------------------------------------------------+-------------------------+
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+--------+-----------------+-------------+---------------+----------------+-------------------------+---------------------+---------------------+-----------------------+
| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | VRAM 1 | VRAM 2 |
| | | | | +------------+------------+----------+----------+------------+--------+-----------+-----------+
| | | | | | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA | UMA | NONUMA |
+--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+
| flux_1 | Disabled | Unsupported | Supported | Yes | 343.89 MiB | 493.89 MiB | 9.34 GiB | 9.60 GiB | 259.96 MiB | 7 GiB | 22.29 GiB | 25.05 GiB |
+--------+-----------------+-------------+---------------+----------------+------------+------------+----------+----------+------------+--------+-----------+-----------+
```
##### Parse None Model
```shell
$ # Parse Multi-Modal Projector
$ gguf-parser --hf-repo="unsloth/Qwen2.5-Omni-3B-GGUF" --hf-file="mmproj-F32.gguf"
+-------------------------------------------------------------------------------------------------------+
| METADATA |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
| projector | Qwen2.5-Omni-3B | clip | F32 | true | 4.86 GiB | 1.31 B | 31.93 bpw |
+-----------+-----------------+------+--------------+---------------+----------+------------+-----------+
+-------------------------------------------------------------------------------------------------------------------------+
| ARCHITECTURE |
+----------------+-------------------------------+-----------------+-------------------------------------+----------------+
| PROJECTOR TYPE | EMBEDDING LEN | LAYERS | FEED FORWARD LEN | ENCODER |
| +---------------+---------------+--------+--------+------------------+------------------+ |
| | VISION | AUDIO | VISION | AUDIO | VISION | AUDIO | |
+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+
| qwen2.5o | 1280 | 1280 | 32 | 32 | 1280 | 5120 | Vision & Audio |
+----------------+---------------+---------------+--------+--------+------------------+------------------+----------------+
$ # Parse LoRA Adapter
$ gguf-parser --hf-repo="ngxson/test_gguf_lora_adapter" --hf-file="lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf"
+---------------------------------------------------------------------------------------------+
| METADATA |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
| adapter | N/A | llama | F16 | true | 168.08 MiB | 88.12 M | 16.00 bpw |
+---------+------+-------+--------------+---------------+------------+------------+-----------+
+---------------------------+
| ARCHITECTURE |
+--------------+------------+
| ADAPTER TYPE | LORA ALPHA |
+--------------+------------+
| lora | 32 |
+--------------+------------+
```
### Estimate
#### Across Multiple GPU Devices
Imagine you're preparing to run
the [hierholzer/Llama-3.1-70B-Instruct-GGUF](https://huggingface.co/hierholzer/Llama-3.1-70B-Instruct-GGUF) model file
across several hosts in your local network. Some of these hosts are equipped with GPU devices, while others do not have
any GPU capabilities.
```mermaid
flowchart TD
subgraph host4["Windows 11 (host4)"]
ram40(["11GiB RAM remaining"])
end
subgraph host3["Apple macOS (host3)"]
gpu10["Apple M1 Max (6GiB VRAM remaining)"]
end
subgraph host2["Windows 11 (host2)"]
gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
end
subgraph host1["Ubuntu (host1)"]
gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"]
gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
end
```
##### Single Host Multiple GPU Devices
Let's assume you plan to run the model on `host1` only.
```mermaid
flowchart TD
subgraph host1["Ubuntu (host1)"]
gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"]
gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
end
```
```shell
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10" --estimate --in-short
+------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+--------------------------------------+----------------------------------------+
| RAM | VRAM 0 | VRAM 1 |
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 36 + 0 | 144 MiB | 17.83 GiB | 44 + 1 | 22.27 GiB | 22.83 GiB |
+--------------------+------------+------------+----------------+---------+-----------+----------------+-----------+-----------+
```
Based on the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host1` has the following
resource consumption:
| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host1 | ENOUGH | 399.27 MiB | | | :thumbsup: |
| host1 (NVIDIA 4080 0) | | | 8 GiB | 17.83 GiB | |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 22.83 GiB | |
It appears that running the model on `host1` alone is not feasible.
##### Multiple Hosts Multiple GPU Devices
Next, let's consider the scenario where you plan to run the model on `host4`, while offloading all layers to `host1`,
`host2`,
and `host3`.
```mermaid
flowchart TD
host4 -->|TCP| gpu10
host4 -->|TCP| gpu20
host4 -->|TCP| gpu30
host4 -->|TCP| gpu31
subgraph host4["Windows 11 (host4)"]
ram40(["11GiB RAM remaining"])
end
subgraph host3["Apple macOS (host3)"]
gpu10["Apple M1 Max (6GiB VRAM remaining)"]
end
subgraph host2["Windows 11 (host2)"]
gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
end
subgraph host1["Ubuntu (host1)"]
gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"]
gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
end
```
```shell
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="8,10,12,6" --rpc="host1:50052,host1:50053,host2:50052,host3:50052" --estimate --in-short
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+
| RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM |
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+
| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 18 + 0 | 8.85 GiB | 9.28 GiB | 23 + 0 | 10.88 GiB | 11.32 GiB | 27 + 0 | 12.75 GiB | 13.19 GiB | 12 + 1 | 7.13 GiB | 7.64 GiB |
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+
```
According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host4` results in the
following resource consumption:
| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host4 | 11 GiB | 399.27 MiB | | | :thumbsup: |
| host1 (NVIDIA 4080 0) | | | 8 GiB | 9.28 GiB | |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 11.32 GiB | |
| host2 (NVIDIA 4090) | | | 12 GiB | 13.19 GiB | |
| host3 (Apple M1 Max) | ENOUGH | | 6 GiB | 7.13 GiB | |
It seems that the model cannot be served on `host4`, even with all layers offloaded to `host1`, `host2`, and `host3`.
We should consider a different approach: running the model on `host3` while offloading all layers to `host1`, `host2`,
and `host4`.
```mermaid
flowchart TD
host3 -->|TCP| ram40
host3 -->|TCP| gpu20
host3 -->|TCP| gpu30
host3 -->|TCP| gpu31
subgraph host4["Windows 11 (host4)"]
ram40(["11GiB RAM remaining"])
end
subgraph host3["Apple macOS (host3)"]
gpu10["Apple M1 Max (6GiB VRAM remaining)"]
end
subgraph host2["Windows 11 (host2)"]
gpu20["NVIDIA 4090 (12GiB VRAM remaining)"]
end
subgraph host1["Ubuntu (host1)"]
gpu30["NVIDIA 4080 0 (8GiB VRAM remaining)"]
gpu31["NVIDIA 4080 1 (10GiB VRAM remaining)"]
end
```
```shell
$ gguf-parser --hf-repo="hierholzer/Llama-3.1-70B-Instruct-GGUF" --hf-file="Llama-3.1-70B-Instruct-Q4_K_M.gguf" --ctx-size=1024 --tensor-split="11,12,8,10,6" --rpc="host4:50052,host2:50052,host1:50052,host1:50053" --estimate --in-short
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------------+----------------------------------------+
| RAM | RPC 0 (V)RAM | RPC 1 (V)RAM | RPC 2 (V)RAM | RPC 3 (V)RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+
| 1 + 0 + 0 | 249.27 MiB | 399.27 MiB | 19 + 0 | 9.36 GiB | 9.79 GiB | 21 + 0 | 9.92 GiB | 10.35 GiB | 14 + 0 | 6.57 GiB | 7.01 GiB | 17 + 0 | 8.11 GiB | 8.54 GiB | 9 + 1 | 302.50 MiB | 6.16 GiB |
+--------------------+------------+------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+--------------+--------------+----------------+------------+----------+
```
According to the output provided, serving the `hierholzer/Llama-3.1-70B-Instruct-GGUF` model on `host3` results in the
following resource consumption:
| Host | Available RAM | Request RAM | Available VRAM | Request VRAM | Result |
|-----------------------|---------------|-------------|----------------|--------------|------------|
| host3 (Apple M1 Max) | ENOUGH | 249.27 MiB | | | :thumbsup: |
| host4 | 11 GiB | 9.79 GiB | | | :thumbsup: |
| host2 (NVIDIA 4090) | | | 12 GiB | 10.35 GiB | :thumbsup: |
| host1 (NVIDIA 4080 0) | | | 8 GiB | 7.01 GiB | :thumbsup: |
| host1 (NVIDIA 4080 1) | | | 10 GiB | 8.54 GiB | :thumbsup: |
| host3 (Apple M1 Max) | | | 6 GiB | 302.50 MiB | :thumbsup: |
Now, the model can be successfully served on `host3`, with all layers offloaded to `host1`, `host2`, and `host4`.
#### Maximum Tokens Per Second
The maximum TPS estimation for the GGUF Parser is determined by the model's parameter size, context size, model
offloaded layers, and devices on which the model runs. Among these factors, the device's specifications are particularly
important.
Inspired
by [LLM inference speed of light](https://zeux.io/2024/03/15/llm-inference-sol/), GGUF Parser uses the **FLOPS** and
**bandwidth** of the device as evaluation metrics:
- When the device is a CPU, FLOPS refers to the performance of that CPU, while bandwidth corresponds to the DRAM
bandwidth.
- When the device is a (i)GPU, FLOPS indicates the performance of that (i)GPU, and bandwidth corresponds to the VRAM
bandwidth.
- When the device is a specific host, FLOPS depends on whether the CPU or (i)GPU of that host is being used, while
bandwidth corresponds to the bandwidth connecting the main node to that host. **After all, a chain is only as strong
as
its weakest link.** If the connection bandwidth between the
main node and the host is equal to or greater than the *RAM bandwidth, then the bandwidth should be taken as the *RAM
bandwidth value.
##### CPU FLOPS Calculation
The performance of a single CPU can be calculated using the following formula:
$$ CPU\ FLOPS = Number\ of \ Cores \times Core\ Frequency \times Floating\ Point\ Operations\ per\ Cycle $$
The Apple M1 Max CPU features a total of 10 cores, consisting of 8 performance cores and 2 efficiency cores. The
performance cores operate at a clock speed of 3.2 GHz, while the efficiency cores run at 2.2 GHz. All cores support
the [ARM NEON instruction set](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)), which
enables 128-bit SIMD operations, allowing multiple floating-point numbers to be processed simultaneously within a
single CPU cycle. Specifically, using single-precision (32-bit) floating-point numbers, each cycle can handle 4
floating-point operations.
The peak floating-point performance for a single performance core is calculated as follows:
$$ Peak\ Performance = 3.2\ GHz \times 4\ FLOPS = 12.8\ GFLOPS $$
For a single efficiency core, the calculation is:
$$ Peak\ Performance = 2.2\ GHz \times 4\ FLOPS = 8.8\ GFLOPS $$
Thus, the overall peak floating-point performance of the entire CPU can be determined by combining the contributions
from both types of cores:
$$ Peak\ Performance = 8\ Cores \times 12.8\ GFLOPS + 2\ Cores \times 8.8\ GFLOPS = 120\ GFLOPS $$
> This results in an average performance of 12 GFLOPS per core. It is evident that the average performance achieved by
> utilizing both performance and efficiency cores is lower than that obtained by exclusively using performance cores.
##### Run LLaMA2-7B-Chat with Apple Silicon M-series
Take [TheBloke/Llama-2-7B-Chat-GGUF](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF) as an
example and estimate the maximum tokens per second for Apple Silicon M-series using the GGUF Parser.
```shell
$ # Estimate full offloaded Q8_0 model
$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q8_0.gguf --estimate --in-short \
-c 512 \
--device-metric "<CPU FLOPS>;<RAM BW>,<iGPU FLOPS>;<VRAM BW>"
$ # Estimate full offloaded Q4_0 model
$ gguf-parser --hf-repo TheBloke/LLaMA-7b-GGUF --hf-file llama-7b.Q4_0.gguf --estimate --in-short \
-c 512 \
--device-metric "<CPU FLOPS>;<RAM BW>,<iGPU FLOPS>;<VRAM BW>"
```
| Variant | CPU FLOPS (Performance Core) | iGPU FLOPS | (V)RAM Bandwidth | Q8_0 Max TPS | Q4_0 Max TPS |
|----------|------------------------------|------------------------|------------------|--------------|--------------|
| M1 | 51.2 GFLOPS (4 cores) | 2.6 TFLOPS (8 cores) | 68.3 GBps | 8.68 | 14.56 |
| M1 Pro | 102.4 GFLOPS (8 cores) | 5.2 TFLOPS (16 cores) | 204.8 GBps | 26.04 | 43.66 |
| M1 Max | 102.4 GFLOPS (8 cores) | 10.4 TFLOPS (32 cores) | 409.6 GBps | 52.08 | 87.31 |
| M1 Ultra | 204.8 GFLOPS (16 cores) | 21 TFLOPS (64 cores) | 819.2 GBps | 104.16 | 174.62 |
| M2 | 56 GFLOPS (4 cores) | 3.6 TFLOPS (10 cores) | 102.4 GBps | 13.02 | 21.83 |
| M2 Pro | 112 GFLOPS (8 cores) | 6.8 TFLOPS (19 cores) | 204.8 GBps | 26.04 | 43.66 |
| M2 Max | 112 GFLOPS (8 cores) | 13.6 TFLOPS (38 cores) | 409.6 GBps | 52.08 | 87.31 |
| M2 Ultra | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 104.16 | 174.62 |
| M3 | 64.96 GFLOPS (4 cores) | 4.1 TFLOPS (10 cores) | 102.4 GBps | 13.02 | 21.83 |
| M3 Pro | 97.44 GFLOPS (6 cores) | 7.4 TFLOPS (18 cores) | 153.6 GBps | 19.53 | 32.74 |
| M3 Max | 194.88 GFLOPS (12 cores) | 16.4 TFLOPS (40 cores) | 409.6 GBps | 52.08 | 87.31 |
| M4 | 70.56 GFLOPS (4 cores) | 4.1 TFLOPS | 120 GBps | 15.26 | 25.58 |
> References:
> - https://www.cpu-monkey.com/en/cpu_family-apple_m_series
> - https://nanoreview.net/
> - https://en.wikipedia.org/wiki/Apple_M1#Variants
> - https://en.wikipedia.org/wiki/Apple_M2#Variants
> - https://en.wikipedia.org/wiki/Apple_M3#Variants
> - https://en.wikipedia.org/wiki/Apple_M4#Variants
You can further verify the above results in [Performance of llama.cpp on Apple Silicon M-series
](https://github.com/ggerganov/llama.cpp/discussions/4167#user-content-fn-1-e9a4caf2848534167e450e18fc4ede7f).
##### Run LLaMA3.1-405B-Instruct with Apple Mac Studio devices combined with Thunderbolt
Take
[leafspark/Meta-Llama-3.1-405B-Instruct-GGUF](https://huggingface.co/leafspark/Meta-Llama-3.1-405B-Instruct-GGUF)
as an example and estimate the maximum tokens per second for three Apple Mac Studio devices combined with Thunderbolt.
| Device | CPU FLOPS (Performance Core) | iGPU FLOPS | (V)RAM Bandwidth | Thunderbolt Bandwidth | Role |
|-------------------------------|------------------------------|------------------------|------------------|-----------------------|------------|
| Apple Mac Studio (M2 Ultra) 0 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | Main |
| Apple Mac Studio (M2 Ultra) 1 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | RPC Server |
| Apple Mac Studio (M2 Ultra) 2 | 224 GFLOPS (16 cores) | 27.2 TFLOPS (76 cores) | 819.2 GBps | 40 Gbps | RPC Server |
Get the maximum tokens per second with the following command:
```shell
$ # Explain the command:
$ # --device-metric "224GFLOPS;819.2GBps" <-- Apple Mac Studio 0 CPU FLOPS and RAM Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 1 (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio 2 (RPC 1) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps" <-- Apple Mac Studio 0 iGPU FLOPS and VRAM Bandwidth
$ gguf-parser --hf-repo leafspark/Meta-Llama-3.1-405B-Instruct-GGUF --hf-file Llama-3.1-405B-Instruct.Q4_0.gguf/Llama-3.1-405B-Instruct.Q4_0-00001-of-00012.gguf --estimate --in-short \
--no-mmap \
-c 512 \
--rpc host1:port,host2:port \
--tensor-split "<Proportions>" \
--device-metric "224GFLOPS;819.2GBps" \
--device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
--device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
--device-metric "27.2TFLOPS;819.2GBps"
```
| Tensor Split | Apple Mac Studio 0 RAM | Apple Mac Studio 1 VRAM (RPC 0) | Apple Mac Studio 2 VRAM (RPC 1) | Apple Mac Studio 0 VRAM | Q4_0 Max TPS |
|--------------|------------------------|---------------------------------|----------------------------------|-------------------------|--------------|
| 1,1,1 | 1.99 GiB | 72.74 GiB | 71.04 GiB | 70.96 GiB | 10.71 |
| 2,1,1 | 1.99 GiB | 108.26 GiB | 54.13 GiB | 52.35 GiB | 11.96 |
| 3,1,1 | 1.99 GiB | 130.25 GiB | 42.29 GiB | 42.20 GiB | 9.10 |
| 4,1,1 | 1.99 GiB | 143.78 GiB | 35.52 GiB | 35.44 GiB | 7.60 |
##### Run Qwen2.5-72B-Instruct with NVIDIA RTX 4080 and remote RPC by Apple Mac Studio (M2)
Example by [Qwen/Qwen2.5-72B-Instruct-GGUF](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct-GGUF) and estimate the
maximum tokens per second for NVIDIA RTX 4080.
| Hardware | FLOPS | Bandwidth |
|---------------------------------------------|--------------|------------|
| Intel i5-14600k | 510.4 GFLOPS | |
| 2 x Corsair Vengeance RGB DDR5-6000 (32GiB) | | 96 GBps |
| 2 x NVIDIA GeForce RTX 4080 | 48.74 TFLOPS | 736.3 GBps |
| Apple Mac Studio (M2) | 27.2 TFLOPS | 819.2 GBps |
```shell
$ # Explain the command:
$ # --tensor-split 20369,12935,13325 <-- Available Memory in MiB for each device
$ # --device-metric "510.4GFLOPS;96GBps" <-- Intel i5-14600k CPU FLOPS and RAM Bandwidth
$ # --device-metric "27.2TFLOPS;819.2GBps;40Gbps" <-- Apple Mac Studio (M2) (RPC 0) iGPU FLOPS, VRAM Bandwidth, and Thunderbolt Bandwidth
$ # --device-metric "48.74TFLOPS;736.3GBps;64GBps" <-- NVIDIA GeForce RTX 0 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 5.0 x16 Bandwidth
$ # --device-metric "48.74TFLOPS;736.3GBps;8GBps" <-- NVIDIA GeForce RTX 1 4080 GPU FLOPS, VRAM Bandwidth, and PCIe 4.0 x4 Bandwidth
$ gguf-parser --hf-repo Qwen/Qwen2.5-72B-Instruct-GGUF --hf-file qwen2.5-72b-instruct-q4_k_m-00001-of-00012.gguf --estimate --in-short \
--no-mmap \
-c 8192 \
--rpc host:port \
--tensor-split 20369,12935,13325 \
--device-metric "510.4GFLOPS;96GBps" \
--device-metric "27.2TFLOPS;819.2GBps;40Gbps" \
--device-metric "48.74TFLOPS;736.3GBps;64GBps" \
--device-metric "48.74TFLOPS;736.3GBps;8GBps"
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| ESTIMATE |
+-----------+------------------------------------------+----------------------------------------------+----------------------------------------+----------------------------------------+
| MAX TPS | RAM | RPC 0 (V)RAM | VRAM 0 | VRAM 1 |
| +--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+
| | LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+
| 51.82 tps | 1 + 0 + 0 | 1.19 GiB | 1.34 GiB | 36 + 0 | 18.85 GiB | 20.17 GiB | 22 + 0 | 11.34 GiB | 12.66 GiB | 22 + 1 | 12.65 GiB | 13.97 GiB |
+-----------+--------------------+----------+----------+----------------+--------------+--------------+----------------+-----------+-----------+----------------+-----------+-----------+
```
#### Full Layers Offload (default)
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```
#### Zero Layers Offload
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=0 --estimate --in-short
+------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+-------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------+-----------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 33.62 GiB |
+--------------------+------------+------------+----------------+--------+-----------+
```
#### Specific Layers Offload
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers=10 --estimate --in-short
+----------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+-----------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+--------+---------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+--------+---------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB |
+--------------------+------------+------------+----------------+--------+---------+
```
#### Specific Context Size
By default, the context size is retrieved from the model's metadata.
Use `--ctx-size` to specify the context size.
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --ctx-size=4096 --estimate --in-short
+--------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+---------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+-----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+-----------+
| 1 + 0 + 0 | 404.53 MiB | 554.53 MiB | 126 + 1 | 3.94 GiB | 93.28 GiB |
+--------------------+------------+------------+----------------+----------+-----------+
```
#### Enable Flash Attention
By default, LLaMA.cpp disables the Flash Attention.
Enable Flash Attention will reduce the VRAM usage, but it also increases the GPU/CPU usage.
Use `--flash-attention` to enable the Flash Attention.
Please note that not all models support Flash Attention; if the model does not support it, the "FLASH ATTENTION" column
shows "Disabled" even if you enable it.
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --flash-attention --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 215.98 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```
#### Disable MMap
By default, LLaMA.cpp loads the model via Memory-Mapped.
For Apple MacOS, Memory-Mapped is an efficient way to load the model, and results in a lower VRAM usage.
For other platforms, Memory-Mapped affects the first-time model loading speed only.
Use `--no-mmap` to disable loading the model via Memory-Mapped.
Please note that some models require loading the whole weight into memory; if the model does not support MMap, the "MMAP
LOAD" column shows "Unsupported".
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --no-mmap --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+----------+----------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+----------+----------+----------------+------------+------------+
| 1 + 0 + 0 | 2.97 GiB | 3.12 GiB | 126 + 1 | 214.24 GiB | 246.86 GiB |
+--------------------+----------+----------+----------------+------------+------------+
```
#### With Adapter
Use `--lora`/`--control-vector` to estimate the usage when loading a model with adapters.
```shell
$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+--------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+----------+
| 1 + 0 + 0 | 210.80 MiB | 360.80 MiB | 32 + 1 | 1.25 GiB | 7.04 GiB |
+--------------------+------------+------------+----------------+----------+----------+
$ # With a LoRA adapter.
$ gguf-parser --hf-repo="QuantFactory/Meta-Llama-3-8B-Instruct-GGUF" --hf-file="Meta-Llama-3-8B-Instruct.Q5_K_M.gguf" --lora-url="https://huggingface.co/ngxson/test_gguf_lora_adapter/resolve/main/lora-Llama-3-Instruct-abliteration-LoRA-8B-f16.gguf" --estimate --in-short
+-------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+--------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+----------+----------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+----------+----------+
| 1 + 0 + 0 | 223.91 MiB | 373.91 MiB | 32 + 1 | 1.42 GiB | 7.20 GiB |
+--------------------+------------+------------+----------------+----------+----------+
```
#### Get Proper Offload Layers
Use `--gpu-layers-step` to get the proper offload layers number when the model is too large to fit into the GPU's memory.
```shell
$ gguf-parser --hf-repo="etemiz/Llama-3.1-405B-Inst-GGUF" --hf-file="llama-3.1-405b-IQ1_M-00019-of-00019.gguf" --gpu-layers-step=6 --estimate --in-short
+-----------------------------------------------------------------------------------------+
| ESTIMATE |
+----------------------------------------------+------------------------------------------+
| RAM | VRAM 0 |
+--------------------+------------+------------+----------------+------------+------------+
| LAYERS (I + T + O) | UMA | NONUMA | LAYERS (T + O) | UMA | NONUMA |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 126 + 1 | 127.64 GiB | 127.79 GiB | 0 + 0 | 0 B | 250 MiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 120 + 1 | 121.90 GiB | 122.05 GiB | 6 + 0 | 6 GiB | 44.68 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 114 + 1 | 115.90 GiB | 116.05 GiB | 12 + 0 | 12 GiB | 54.74 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 108 + 1 | 109.90 GiB | 110.05 GiB | 18 + 0 | 18 GiB | 64.80 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 102 + 1 | 103.90 GiB | 104.05 GiB | 24 + 0 | 24 GiB | 74.86 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 96 + 1 | 97.90 GiB | 98.05 GiB | 30 + 0 | 30 GiB | 84.93 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 90 + 1 | 91.90 GiB | 92.05 GiB | 36 + 0 | 36 GiB | 94.99 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 84 + 1 | 85.90 GiB | 86.05 GiB | 42 + 0 | 42 GiB | 105.05 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 78 + 1 | 79.90 GiB | 80.05 GiB | 48 + 0 | 48 GiB | 115.11 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 72 + 1 | 73.90 GiB | 74.05 GiB | 54 + 0 | 54 GiB | 125.17 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 66 + 1 | 67.90 GiB | 68.05 GiB | 60 + 0 | 60 GiB | 135.23 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 60 + 1 | 61.90 GiB | 62.05 GiB | 66 + 0 | 66 GiB | 145.29 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 54 + 1 | 55.90 GiB | 56.05 GiB | 72 + 0 | 72 GiB | 155.35 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 48 + 1 | 49.90 GiB | 50.05 GiB | 78 + 0 | 78 GiB | 165.42 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 42 + 1 | 43.90 GiB | 44.05 GiB | 84 + 0 | 84 GiB | 175.48 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 36 + 1 | 37.90 GiB | 38.05 GiB | 90 + 0 | 90 GiB | 185.54 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 30 + 1 | 31.90 GiB | 32.05 GiB | 96 + 0 | 96 GiB | 195.60 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 24 + 1 | 25.90 GiB | 26.05 GiB | 102 + 0 | 102 GiB | 205.66 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 18 + 1 | 19.90 GiB | 20.05 GiB | 108 + 0 | 108 GiB | 215.72 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 12 + 1 | 13.90 GiB | 14.05 GiB | 114 + 0 | 114 GiB | 226.05 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 6 + 1 | 7.90 GiB | 8.05 GiB | 120 + 0 | 120 GiB | 236.64 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 0 + 1 | 1.90 GiB | 2.05 GiB | 126 + 0 | 126 GiB | 246.24 GiB |
+--------------------+------------+------------+----------------+------------+------------+
| 1 + 0 + 0 | 1.63 GiB | 1.78 GiB | 126 + 1 | 126.28 GiB | 246.86 GiB |
+--------------------+------------+------------+----------------+------------+------------+
```
## License
MIT
================================================
FILE: cache.go
================================================
package gguf_parser
import (
"errors"
"fmt"
"os"
"path/filepath"
"time"
"github.com/gpustack/gguf-parser-go/util/json"
"github.com/gpustack/gguf-parser-go/util/osx"
"github.com/gpustack/gguf-parser-go/util/stringx"
)
var (
	// ErrGGUFFileCacheDisabled is returned when the cache path is empty,
	// i.e. caching is turned off.
	ErrGGUFFileCacheDisabled  = errors.New("GGUF file cache disabled")
	// ErrGGUFFileCacheMissed is returned when no (fresh) cache entry exists
	// for the requested key.
	ErrGGUFFileCacheMissed    = errors.New("GGUF file cache missed")
	// ErrGGUFFileCacheCorrupted is returned when a cached entry decodes but
	// carries no tensor infos; the entry is removed before returning.
	ErrGGUFFileCacheCorrupted = errors.New("GGUF file cache corrupted")
)
type GGUFFileCache string
// getKeyPath maps a cache key to its on-disk location:
// <cache dir>/<first hex char of the FNV-64a sum>/<full sum>.
func (c GGUFFileCache) getKeyPath(key string) string {
	sum := stringx.SumByFNV64a(key)
	return filepath.Join(string(c), sum[:1], sum)
}
// Get loads the cached GGUFFile stored under key.
//
// It returns ErrGGUFFileCacheDisabled when the cache path is empty,
// ErrGGUFFileCacheMissed when there is no regular file for the key or the
// entry is older than exp (exp == 0 means never expire), and
// ErrGGUFFileCacheCorrupted — after removing the entry — when the decoded
// file carries no tensor infos.
func (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, error) {
	switch {
	case c == "":
		return nil, ErrGGUFFileCacheDisabled
	case key == "":
		return nil, ErrGGUFFileCacheMissed
	}

	entry := c.getKeyPath(key)
	// A usable entry must be a regular file that has not outlived exp.
	fresh := func(stat os.FileInfo) bool {
		if !stat.Mode().IsRegular() {
			return false
		}
		return exp == 0 || time.Since(stat.ModTime()) < exp
	}
	if !osx.Exists(entry, fresh) {
		return nil, ErrGGUFFileCacheMissed
	}

	bs, err := os.ReadFile(entry)
	if err != nil {
		return nil, fmt.Errorf("GGUF file cache get: %w", err)
	}
	var gf GGUFFile
	if err = json.Unmarshal(bs, &gf); err != nil {
		return nil, fmt.Errorf("GGUF file cache get: %w", err)
	}

	// An entry without tensor infos is useless; drop it and report corruption.
	if len(gf.TensorInfos) == 0 {
		_ = os.Remove(entry)
		return nil, ErrGGUFFileCacheCorrupted
	}
	return &gf, nil
}
// Put serializes gf as JSON and stores it under key with 0600 permissions.
//
// It returns ErrGGUFFileCacheDisabled when the cache path is empty; an empty
// key or a nil file is a silent no-op.
func (c GGUFFileCache) Put(key string, gf *GGUFFile) error {
	if c == "" {
		return ErrGGUFFileCacheDisabled
	}
	if key == "" || gf == nil {
		return nil
	}

	payload, err := json.Marshal(gf)
	if err != nil {
		return fmt.Errorf("GGUF file cache put: %w", err)
	}
	if err = osx.WriteFile(c.getKeyPath(key), payload, 0o600); err != nil {
		return fmt.Errorf("GGUF file cache put: %w", err)
	}
	return nil
}
// Delete removes the cache entry stored under key.
//
// It returns ErrGGUFFileCacheDisabled when the cache path is empty, and
// ErrGGUFFileCacheMissed when the key is empty or no entry file exists.
func (c GGUFFileCache) Delete(key string) error {
	switch {
	case c == "":
		return ErrGGUFFileCacheDisabled
	case key == "":
		return ErrGGUFFileCacheMissed
	}

	entry := c.getKeyPath(key)
	if !osx.ExistsFile(entry) {
		return ErrGGUFFileCacheMissed
	}
	if err := os.Remove(entry); err != nil {
		return fmt.Errorf("GGUF file cache delete: %w", err)
	}
	return nil
}
================================================
FILE: cmd/gguf-parser/README.md
================================================
# GGUF Parser
Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files,
estimate the memory usage
for [llama.cpp](https://github.com/ggerganov/llama.cpp), [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)
and [llama-box](https://github.com/gpustack/llama-box).
See [GGUF Parser detail introduction](https://github.com/gpustack/gguf-parser-go) for more information.
## Usage
```shell
$ gguf-parser --help
NAME:
gguf-parser - Review/Check GGUF files and estimate the memory usage.
USAGE:
gguf-parser [GLOBAL OPTIONS]
VERSION:
...
GLOBAL OPTIONS:
--debug Enable debugging, verbosity. (default: false)
--help, -h Print the usage.
--version, -v Print the version.
Estimate
--device-metric value [ --device-metric value ] Specify the device metrics, which is used to estimate the throughput, in form of "FLOPS;Up Bandwidth[;Down Bandwidth]". The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. For example, "--device-metric 10TFLOPS;400GBps" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, "--device-metric 10TFLOPS;400GBps;5000MBps" means the device has 5000MBps Down bandwidth. If the quantity specified by "--device-metric" is less than the number of estimation devices(determined by "--tensor-split" and "--rpc" to infer the device count), then replicate the last "--device-metric" to meet the required number of evaluation devices.
--flash-attention, --flash-attn, --fa, --diffusion-fa Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false)
--gpu-layers value, --ngl value, --n-gpu-layers value Specify how many layers of the main model to offload, which is used to estimate the usage, default is full offloaded. (default: -1)
--main-gpu value, --mg value Specify the GPU to use for the model (with "--split-mode=none") or for intermediate results and KV (with "--split-mode=row"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, "--main-gpu" only works when "--tensor-split" is set. (default: 0)
--no-flash-attention, --no-flash-attn Specify disabling Flash Attention. (default: false)
--override-tensor value, --ot value [ --override-tensor value, --ot value ] Override tensor buffer type, for example, use --override-tensor "[2-9][0-9]\.ffn_.*_exps\.=CPU" to keep experts of layers 20-99 in the CPU
--parallel-size value, --parallel value, --np value, --threads-http value Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1)
--platform-footprint value Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is "150,250". Different platform always gets different RAM and VRAM footprints, for example, within CUDA, "cudaMemGetInfo" or "cudaSetDevice" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250")
      --rpc value                                        Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Works with "--tensor-split".
--tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set "--tensor-split" to indicate how many devices are used. To declare the devices belong to RPC servers, set "--rpc" please.
Estimate/LLaMACpp
--batch-size value, -b value Specify the logical batch size, which is used to estimate the usage. (default: 2048)
--cache-type-k value, --ctk value Specify the type of Key cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16")
--cache-type-v value, --ctv value Specify the type of Value cache, which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1]. (default: "f16")
--ctx-size value, -c value Specify the size of prompt context, which is used to estimate the usage, default is equal to the model's maximum context size. (default: 0)
--gpu-layers-draft value, --ngld value, --n-gpu-layers-draft value Specify how many layers of the draft model to offload, which is used to estimate the usage, default is full offloaded. (default: -1)
--gpu-layers-step value Specify the step of layers to offload, works with "--gpu-layers". (default: 0)
--in-max-ctx-size Limit the context size to the maximum context size of the model, if the context size is larger than the maximum context size. (default: false)
--max-projected-cache value, --visual-max-image-cache value Specify how many projected embedding to be cached. (default: 0)
--mmap Specify enabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false)
--no-kv-offload, --nkvo Specify disabling Key-Value offloading, which is used to estimate the usage. Disable Key-Value offloading can reduce the usage of VRAM. (default: false)
--no-mmap Specify disabling Memory-Mapped using, which is used to estimate the usage. Memory-Mapped can avoid loading the entire model weights into RAM. (default: false)
--rope-freq-base value RoPE base frequency, used by NTK-aware scaling. (default: 0)
--rope-freq-scale value RoPE frequency scaling factor, expands context by a factor of 1/N. (default: 0)
--rope-scale value RoPE context scaling factor, expands context by a factor of N. (default: 0)
--rope-scaling value RoPE frequency scaling method, defaults to linear unless specified by the model, select from [none, linear, yarn].
--split-mode value, --sm value Specify how to split the model across multiple devices, which is used to estimate the usage, select from [layer, row, none]. Since gguf-parser always estimates the usage of VRAM, "none" is meaningless here, keep for compatibility. (default: "layer")
--swa-full Specify using full-size SWA cache. (default: false)
--ubatch-size value, --ub value Specify the physical maximum batch size, which is used to estimate the usage. (default: 512)
--visual-max-image-size value Specify maximum image size when completion with vision model. (default: 0)
--yarn-orig-ctx value YaRN original context size of model, defaults to model training context size. (default: 0)
Estimate/StableDiffusionCpp
--image-autoencoder-tiling, --vae-tiling, --image-vae-tiling Specify to enable tiling for the vae model. (default: false)
--image-batch-count value, --batch-count value, --image-max-batch value Specify the batch(generation) count of the image. (default: 1)
--image-free-compute-memory-immediately Specify to free the compute memory immediately after the generation, which burst using VRAM. (default: false)
--image-height value, --height value, --image-max-height value Specify the (maximum) height of the image. (default: 1024)
--image-no-autoencoder-offload, --vae-on-cpu, --image-no-vae-model-offload Specify to offload the vae model to CPU. (default: false)
--image-no-autoencoder-tiling, --image-no-vae-tiling Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling. (default: false)
--image-no-conditioner-offload, --clip-on-cpu, --image-no-text-encoder-model-offload Specify to offload the text encoder model to CPU. (default: false)
--image-no-control-net-offload, --control-net-cpu, --image-no-control-net-model-offload Specify to offload the control net model to CPU. (default: false)
--image-width value, --width value, --image-max-width value Specify the (maximum) width of the image. (default: 1024)
Load
--cache-expiration value Specify the expiration of cache, works with "--url/--hf-*/--ms-*/--ol-*". (default: 24h0m0s)
      --cache-path value             Cache the read result to the path, works with "--url/--hf-*/--ms-*/--ol-*". (default: "~/.cache/gguf-parser")
--skip-cache Skip cache, works with "--url/--hf-*/--ms-*/--ol-*", default is caching the read result. (default: false) [$SKIP_CACHE]
--skip-dns-cache Skip DNS cache, works with "--url/--hf-*/--ms-*/--ol-*", default is caching the DNS lookup result. (default: false) [$SKIP_DNS_CACHE]
--skip-proxy Skip proxy settings, works with "--url/--hf-*/--ms-*/--ol-*", default is respecting the environment variables "HTTP_PROXY/HTTPS_PROXY/NO_PROXY". (default: false) [$SKIP_PROXY]
--skip-range-download-detect Skip range download detect, works with "--url/--hf-*/--ms-*/--ol-*", default is detecting the range download support. (default: false) [$SKIP_RANGE_DOWNLOAD_DETECT]
--skip-tls-verify Skip TLS verification, works with "--url/--hf-*/--ms-*/--ol-*", default is verifying the TLS certificate on HTTPs request. (default: false) [$SKIP_TLS_VERIFY]
Model/Local
--control-net-path value, --control-net value, --image-control-net-model value Path where the GGUF file to load for the Control Net model, optional.
--control-vector-path value, --control-vector value [ --control-vector-path value, --control-vector value ] Path where the GGUF file to load for the Control Vector adapter, optional.
--draft-path value, --model-draft value, --md value Path where the GGUF file to load for the draft model, optional, e.g. "~/.cache/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF/Qwen2-1.5B-Instruct.Q5_K_M.gguf".
--lora-path value, --lora value [ --lora-path value, --lora value ] Path where the GGUF file to load for the LoRA adapter, optional.
--mmproj-path value, --mmproj value Path where the GGUF file to load for the multimodal projector, optional.
--path value, --model value, -m value Path where the GGUF file to load for the main model, e.g. "~/.cache/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF/Qwen2-7B-Instruct.Q5_K_M.gguf".
--upscale-path value, --upscale-model value, --image-upscale-model value Path where the GGUF file to load for the Upscale model, optional.
Model/Remote
--control-net-url value Url where the GGUF file to load for the Control Net model, optional.
--control-vector-url value [ --control-vector-url value ] Url where the GGUF file to load for the Control Vector adapter, optional.
--draft-url value Url where the GGUF file to load for the draft model, optional, e.g. "https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf". Note that gguf-parser does not need to download the entire GGUF file.
--header value [ --header value ] Custom HTTP header in "Key: Value" format, works with "--url/--draft-url".
--lora-url value [ --lora-url value ] Url where the GGUF file to load for the LoRA adapter, optional.
--mmproj-url value Url where the GGUF file to load for the multimodal projector, optional.
--token value Bearer auth token to load GGUF file, optional, works with "--url/--draft-url". [$TOKEN]
--upscale-url value Url where the GGUF file to load for the Upscale model, optional.
--url value, --model-url value, --mu value Url where the GGUF file to load for the main model, e.g. "https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf". Note that gguf-parser does not need to download the entire GGUF file.
Model/Remote/HuggingFace
--hf-control-net-file value Model file below the "--hf-control-net-repo", optional.
--hf-control-net-repo value Repository of HuggingFace which the GGUF file store for the Control Net model, optional, works with "--hf-control-net-file".
--hf-control-vector-file value [ --hf-control-vector-file value ] Control Vector adapter file below the "--hf-repo".
--hf-draft-file value Model file below the "--hf-draft-repo", optional, e.g. "Qwen2-1.5B-Instruct.Q5_K_M.gguf".
--hf-draft-repo value Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. "QuantFactory/Qwen2-1.5B-Instruct-GGUF", works with "--hf-draft-file".
--hf-file value, --hff value Model file below the "--hf-repo", e.g. "Qwen2-7B-Instruct.Q5_K_M.gguf".
--hf-lora-file value [ --hf-lora-file value ] LoRA adapter file below the "--hf-repo".
--hf-mmproj-file value Multimodal projector file below the "--hf-repo".
--hf-repo value, --hfr value Repository of HuggingFace which the GGUF file store for the main model, e.g. "QuantFactory/Qwen2-7B-Instruct-GGUF", works with "--hf-file".
--hf-token value, --hft value User access token of HuggingFace, optional, works with "--hf-repo/--hf-file pair" or "--hf-draft-repo/--hf-draft-file" pair. See https://huggingface.co/settings/tokens. [$HF_TOKEN]
--hf-upscale-file value Model file below the "--hf-upscale-repo", optional.
--hf-upscale-repo value Repository of HuggingFace which the GGUF file store for the Upscale model, optional, works with "--hf-upscale-file".
Model/Remote/ModelScope
--ms-control-net-file value Model file below the "--ms-control-net-repo", optional.
--ms-control-net-repo value Repository of ModelScope which the GGUF file store for the Control Net model, optional, works with "--ms-control-net-file".
--ms-control-vector-file value [ --ms-control-vector-file value ] Control Vector adapter file below the "--ms-repo".
--ms-draft-file value Model file below the "--ms-draft-repo", optional, e.g. "qwen1_5-1_8b-chat-q5_k_m.gguf".
--ms-draft-repo value Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. "qwen/Qwen1.5-1.8B-Chat-GGUF", works with "--ms-draft-file".
--ms-file value Model file below the "--ms-repo", e.g. "qwen1_5-7b-chat-q5_k_m.gguf".
--ms-lora-file value [ --ms-lora-file value ] LoRA adapter file below the "--ms-repo".
--ms-mmproj-file value Multimodal projector file below the "--ms-repo".
--ms-repo value Repository of ModelScope which the GGUF file store for the main model, e.g. "qwen/Qwen1.5-7B-Chat-GGUF", works with "--ms-file".
--ms-token value Git access token of ModelScope, optional, works with "--ms-repo/--ms-file" pair or "--ms-draft-repo/--ms-draft-file" pair. See https://modelscope.cn/my/myaccesstoken. [$HF_TOKEN, $MS_TOKEN]
--ms-upscale-file value Model file below the "--ms-upscale-repo", optional.
--ms-upscale-repo value Repository of ModelScope which the GGUF file store for the Upscale model, optional, works with "--ms-upscale-file".
Model/Remote/Ollama
--ol-base-url value Model base URL of Ollama, e.g. https://registry.ollama.ai. (default: "https://registry.ollama.ai")
--ol-model value Model name of Ollama, e.g. "gemma2".
--ol-usage Specify respecting the extending layers introduced by Ollama, works with "--ol-model", which affects the usage estimation. (default: false)
Output
--estimate Skip all the information except the estimate result. (default: false)
--in-mib Display the estimated result in table with MiB. (default: false)
--in-short Display the estimated result in table in short form. (default: false)
--json Output as JSON. (default: false)
--json-pretty Works with "--json", to output pretty format JSON. (default: true)
--raw Output the GGUF file information as JSON only, skip anything. (default: false)
--raw-output value Works with "--raw", to save the result to the file
--skip-architecture Skip to display architecture. (default: false)
--skip-estimate Skip to estimate. By default, gguf-parser always estimates the file which types with "model". (default: false)
--skip-metadata Skip to display metadata. (default: false)
--skip-tokenizer Skip to display tokenizer. By default, gguf-parser always displays the tokenizer of the file which types with "model". (default: false)
```
### Environment Variables Support
- `TOKEN`: The bearer auth token to load GGUF file, works with `--url/--draft-url`.
- `HF_ENDPOINT`: The HuggingFace endpoint, default is `https://huggingface.co`.
- `HF_TOKEN`: The HuggingFace token, see [HuggingFace token](https://huggingface.co/settings/tokens).
- `MS_ENDPOINT`: The ModelScope endpoint, default is `https://modelscope.cn`.
- `MS_TOKEN`: The ModelScope token, see [ModelScope token](https://modelscope.cn/my/myaccesstoken).
## License
MIT
================================================
FILE: cmd/gguf-parser/go.mod
================================================
module github.com/gpustack/gguf-parser-go/cmd/gguf-parser
go 1.22.0
toolchain go1.22.9
replace github.com/gpustack/gguf-parser-go => ../../
require (
github.com/gpustack/gguf-parser-go v0.6.0
github.com/jedib0t/go-pretty/v6 v6.6.1
github.com/urfave/cli/v2 v2.27.5
)
require (
github.com/cpuguy83/go-md2man/v2 v2.0.5 // indirect
github.com/henvic/httpretty v0.1.4 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/mattn/go-runewidth v0.0.16 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d // indirect
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 // indirect
golang.org/x/crypto v0.29.0 // indirect
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f // indirect
golang.org/x/mod v0.22.0 // indirect
golang.org/x/sync v0.9.0 // indirect
golang.org/x/sys v0.27.0 // indirect
golang.org/x/tools v0.27.0 // indirect
gonum.org/v1/gonum v0.15.1 // indirect
)
================================================
FILE: cmd/gguf-parser/go.sum
================================================
github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=
github.com/cpuguy83/go-md2man/v2 v2.0.5/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
github.com/henvic/httpretty v0.1.4 h1:Jo7uwIRWVFxkqOnErcoYfH90o3ddQyVrSANeS4cxYmU=
github.com/henvic/httpretty v0.1.4/go.mod h1:Dn60sQTZfbt2dYsdUSNsCljyF4AfdqnuJFDLJA1I4AM=
github.com/jedib0t/go-pretty/v6 v6.6.1 h1:iJ65Xjb680rHcikRj6DSIbzCex2huitmc7bDtxYVWyc=
github.com/jedib0t/go-pretty/v6 v6.6.1/go.mod h1:zbn98qrYlh95FIhwwsbIip0LYpwSG8SUOScs+v9/t0E=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/mattn/go-runewidth v0.0.16 h1:E5ScNMtiwvlvB5paMFdw9p4kSQzbXFikJ5SQO6TULQc=
github.com/mattn/go-runewidth v0.0.16/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M=
github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/russross/blackfriday/v2 v2.1.0 h1:JIOH55/0cWyOuilr9/qlrm0BSXldqnqwMsf35Ld67mk=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d h1:3VwvTjiRPA7cqtgOWddEL+JrcijMlXUmj99c/6YyZoY=
github.com/smallnest/ringbuffer v0.0.0-20241116012123-461381446e3d/go.mod h1:tAG61zBM1DYRaGIPloumExGvScf08oHuo0kFoOqdbT0=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/urfave/cli/v2 v2.27.5 h1:WoHEJLdsXr6dDWoJgMq/CboDmyY/8HMMH1fTECbih+w=
github.com/urfave/cli/v2 v2.27.5/go.mod h1:3Sevf16NykTbInEnD0yKkjDAeZDS0A6bzhBH5hrMvTQ=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1 h1:gEOO8jv9F4OT7lGCjxCBTO/36wtF6j2nSip77qHd4x4=
github.com/xrash/smetrics v0.0.0-20240521201337-686a1a2994c1/go.mod h1:Ohn+xnUBiLI6FVj/9LpzZWtj1/D6lUovWYBkxHVV3aM=
golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f h1:XdNn9LlyWAhLVp6P/i8QYBW+hlyhrhei9uErw2B5GJo=
golang.org/x/exp v0.0.0-20241108190413-2d47ceb2692f/go.mod h1:D5SMRVC3C2/4+F/DB1wZsLRnSNimn2Sp/NPsCrsv8ak=
golang.org/x/mod v0.22.0 h1:D4nJWe9zXqHOmWqj4VMOJhvzj7bEZg4wEYa759z1pH4=
golang.org/x/mod v0.22.0/go.mod h1:6SkKJ3Xj0I0BrPOZoBy3bdMptDDU9oJrpohJ3eWZ1fY=
golang.org/x/sync v0.9.0 h1:fEo0HyrW1GIgZdpbhCRO0PkJajUS5H9IFUztCgEo2jQ=
golang.org/x/sync v0.9.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU=
golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E=
golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o=
golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q=
gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0=
gonum.org/v1/gonum v0.15.1/go.mod h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
================================================
FILE: cmd/gguf-parser/main.go
================================================
package main
import (
"errors"
"fmt"
"net"
"os"
"path/filepath"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/gpustack/gguf-parser-go/util/anyx"
"github.com/gpustack/gguf-parser-go/util/json"
"github.com/gpustack/gguf-parser-go/util/osx"
"github.com/gpustack/gguf-parser-go/util/signalx"
"github.com/jedib0t/go-pretty/v6/table"
"github.com/jedib0t/go-pretty/v6/text"
"github.com/urfave/cli/v2"
. "github.com/gpustack/gguf-parser-go" // nolint: stylecheck
)
// Version is the CLI's reported version string; presumably overridden at
// build time (e.g. via -ldflags "-X main.Version=...") — TODO confirm in the Makefile.
var Version = "v0.0.0"
// init swaps urfave/cli's built-in version/help flags for variants that
// suppress the "(default: ...)" text in the generated usage output.
func init() {
	versionFlag := &cli.BoolFlag{
		Name:               "version",
		Aliases:            []string{"v"},
		Usage:              "Print the version.",
		DisableDefaultText: true,
	}
	helpFlag := &cli.BoolFlag{
		Name:               "help",
		Aliases:            []string{"h"},
		Usage:              "Print the usage.",
		DisableDefaultText: true,
	}
	cli.VersionFlag = versionFlag
	cli.HelpFlag = helpFlag
}
// main wires up the gguf-parser CLI. Every flag writes directly into one of
// the package-level variables declared below; the actual parsing/estimation
// work happens in mainAction. The app runs until completion or signal
// (signalx.Handler supplies a signal-aware context).
func main() {
	name := filepath.Base(os.Args[0])
	app := &cli.App{
		Name:            name,
		Usage:           "Review/Check GGUF files and estimate the memory usage and provide optimization suggestions.",
		UsageText:       name + " [GLOBAL OPTIONS]",
		Version:         Version,
		Reader:          os.Stdin,
		Writer:          os.Stdout,
		ErrWriter:       os.Stderr,
		HideHelpCommand: true,
		// On a usage error, show help instead of failing with the raw error.
		OnUsageError: func(c *cli.Context, _ error, _ bool) error {
			return cli.ShowAppHelp(c)
		},
		Flags: []cli.Flag{
			// Global switches.
			&cli.BoolFlag{
				Destination: &debug,
				Value:       debug,
				Name:        "debug",
				Usage:       "Enable debugging, verbosity.",
			},
			// Local GGUF sources (Category "Model/Local").
			&cli.StringFlag{
				Destination: &path,
				Value:       path,
				Category:    "Model/Local",
				Name:        "path",
				Aliases: []string{ // LLaMACpp compatibility
					"model",
					"m",
				},
				Usage: "Path where the GGUF file to load for the main model, e.g. \"~/.cache" +
					"/lm-studio/models/QuantFactory/Qwen2-7B-Instruct-GGUF" +
					"/Qwen2-7B-Instruct.Q5_K_M.gguf\".",
			},
			&cli.StringFlag{
				Destination: &draftPath,
				Value:       draftPath,
				Category:    "Model/Local",
				Name:        "draft-path",
				Aliases: []string{ // LLaMACpp compatibility
					"model-draft",
					"md",
				},
				Usage: "Path where the GGUF file to load for the draft model, optional, e.g. \"~/.cache" +
					"/lm-studio/models/QuantFactory/Qwen2-1.5B-Instruct-GGUF" +
					"/Qwen2-1.5B-Instruct.Q5_K_M.gguf\".",
			},
			&cli.StringFlag{
				Destination: &mmprojPath,
				Value:       mmprojPath,
				Category:    "Model/Local",
				Name:        "mmproj-path",
				Aliases: []string{ // LLaMACpp compatibility
					"mmproj",
				},
				Usage: "Path where the GGUF file to load for the multimodal projector, optional.",
			},
			&cli.StringSliceFlag{
				Destination: &loraPaths,
				Category:    "Model/Local",
				Name:        "lora-path",
				Aliases: []string{ // LLaMACpp compatibility
					"lora",
				},
				Usage: "Path where the GGUF file to load for the LoRA adapter, optional.",
			},
			&cli.StringSliceFlag{
				Destination: &controlVectorPaths,
				Category:    "Model/Local",
				Name:        "control-vector-path",
				Aliases: []string{ // LLaMACpp compatibility
					"control-vector",
				},
				Usage: "Path where the GGUF file to load for the Control Vector adapter, optional.",
			},
			&cli.StringFlag{
				Destination: &upscalePath,
				Value:       upscalePath,
				Category:    "Model/Local",
				Name:        "upscale-path",
				Aliases: []string{
					"upscale-model",       // StableDiffusionCpp compatibility
					"image-upscale-model", // LLaMABox compatibility
				},
				Usage: "Path where the GGUF file to load for the Upscale model, optional.",
			},
			&cli.StringFlag{
				Destination: &controlNetPath,
				Value:       controlNetPath,
				Category:    "Model/Local",
				Name:        "control-net-path",
				Aliases: []string{
					"control-net",             // StableDiffusionCpp compatibility
					"image-control-net-model", // LLaMABox compatibility
				},
				Usage: "Path where the GGUF file to load for the Control Net model, optional.",
			},
			// Remote GGUF sources via direct URL (Category "Model/Remote").
			&cli.StringFlag{
				Destination: &url,
				Value:       url,
				Category:    "Model/Remote",
				Name:        "url",
				Aliases: []string{
					"model-url",
					"mu",
				},
				Usage: "Url where the GGUF file to load for the main model, e.g. " +
					"\"https://huggingface.co/QuantFactory/Qwen2-7B-Instruct-GGUF" +
					"/resolve/main/Qwen2-7B-Instruct.Q5_K_M.gguf\". " +
					"Note that gguf-parser does not need to download the entire GGUF file.",
			},
			&cli.StringFlag{
				Destination: &draftUrl,
				Value:       draftUrl,
				Category:    "Model/Remote",
				Name:        "draft-url",
				Usage: "Url where the GGUF file to load for the draft model, optional, e.g. " +
					"\"https://huggingface.co/QuantFactory/Qwen2-1.5B-Instruct-GGUF" +
					"/resolve/main/Qwen2-1.5B-Instruct.Q5_K_M.gguf\". " +
					"Note that gguf-parser does not need to download the entire GGUF file.",
			},
			&cli.StringFlag{
				Destination: &mmprojUrl,
				Value:       mmprojUrl,
				Category:    "Model/Remote",
				Name:        "mmproj-url",
				Usage:       "Url where the GGUF file to load for the multimodal projector, optional.",
			},
			&cli.StringSliceFlag{
				Destination: &loraUrls,
				Category:    "Model/Remote",
				Name:        "lora-url",
				Usage:       "Url where the GGUF file to load for the LoRA adapter, optional.",
			},
			&cli.StringSliceFlag{
				Destination: &controlVectorUrls,
				Category:    "Model/Remote",
				Name:        "control-vector-url",
				Usage:       "Url where the GGUF file to load for the Control Vector adapter, optional.",
			},
			&cli.StringFlag{
				Destination: &upscaleUrl,
				Value:       upscaleUrl,
				Category:    "Model/Remote",
				Name:        "upscale-url",
				Usage:       "Url where the GGUF file to load for the Upscale model, optional.",
			},
			&cli.StringFlag{
				Destination: &controlNetUrl,
				Value:       controlNetUrl,
				Category:    "Model/Remote",
				Name:        "control-net-url",
				Usage:       "Url where the GGUF file to load for the Control Net model, optional.",
			},
			&cli.StringFlag{
				Destination: &token,
				Value:       token,
				Category:    "Model/Remote",
				Name:        "token",
				EnvVars: []string{
					"TOKEN",
				},
				Usage: "Bearer auth token to load GGUF file, optional, " +
					"works with \"--url/--draft-url\".",
			},
			&cli.StringSliceFlag{
				Destination: &headers,
				Category:    "Model/Remote",
				Name:        "header",
				Usage: "Custom HTTP header in \"Key: Value\" format, " +
					"works with \"--url/--draft-url\".",
			},
			// HuggingFace repo/file sources (Category "Model/Remote/HuggingFace").
			&cli.StringFlag{
				Destination: &hfRepo,
				Value:       hfRepo,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-repo",
				Aliases: []string{ // LLaMACpp compatibility
					"hfr",
				},
				Usage: "Repository of HuggingFace which the GGUF file store for the main model, e.g. " +
					"\"QuantFactory/Qwen2-7B-Instruct-GGUF\", works with \"--hf-file\".",
			},
			&cli.StringFlag{
				Destination: &hfFile,
				Value:       hfFile,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-file",
				Aliases: []string{ // LLaMACpp compatibility
					"hff",
				},
				Usage: "Model file below the \"--hf-repo\", e.g. " +
					"\"Qwen2-7B-Instruct.Q5_K_M.gguf\".",
			},
			&cli.StringFlag{
				Destination: &hfDraftRepo,
				Value:       hfDraftRepo,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-draft-repo",
				Usage: "Repository of HuggingFace which the GGUF file store for the draft model, optional, e.g. " +
					"\"QuantFactory/Qwen2-1.5B-Instruct-GGUF\", works with \"--hf-draft-file\".",
			},
			&cli.StringFlag{
				Destination: &hfDraftFile,
				Value:       hfDraftFile,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-draft-file",
				Usage: "Model file below the \"--hf-draft-repo\", optional, e.g. " +
					"\"Qwen2-1.5B-Instruct.Q5_K_M.gguf\".",
			},
			&cli.StringFlag{
				Destination: &hfMMProjFile,
				Value:       hfMMProjFile,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-mmproj-file",
				Usage:       "Multimodal projector file below the \"--hf-repo\".",
			},
			&cli.StringSliceFlag{
				Destination: &hfLoRAFiles,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-lora-file",
				Usage:       "LoRA adapter file below the \"--hf-repo\".",
			},
			&cli.StringSliceFlag{
				Destination: &hfControlVectorFiles,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-control-vector-file",
				Usage:       "Control Vector adapter file below the \"--hf-repo\".",
			},
			&cli.StringFlag{
				Destination: &hfUpscaleRepo,
				Value:       hfUpscaleRepo,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-upscale-repo",
				Usage: "Repository of HuggingFace which the GGUF file store for the Upscale model, optional, " +
					"works with \"--hf-upscale-file\".",
			},
			&cli.StringFlag{
				Destination: &hfUpscaleFile,
				Value:       hfUpscaleFile,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-upscale-file",
				Usage:       "Model file below the \"--hf-upscale-repo\", optional.",
			},
			&cli.StringFlag{
				Destination: &hfControlNetRepo,
				Value:       hfControlNetRepo,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-control-net-repo",
				Usage: "Repository of HuggingFace which the GGUF file store for the Control Net model, optional, " +
					"works with \"--hf-control-net-file\".",
			},
			&cli.StringFlag{
				Destination: &hfControlNetFile,
				Value:       hfControlNetFile,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-control-net-file",
				Usage:       "Model file below the \"--hf-control-net-repo\", optional.",
			},
			&cli.StringFlag{
				Destination: &hfToken,
				Value:       hfToken,
				Category:    "Model/Remote/HuggingFace",
				Name:        "hf-token",
				Aliases: []string{ // LLaMACpp compatibility
					"hft",
				},
				EnvVars: []string{
					"HF_TOKEN",
				},
				Usage: "User access token of HuggingFace, optional, " +
					"works with \"--hf-repo/--hf-file pair\" or \"--hf-draft-repo/--hf-draft-file\" pair. " +
					"See https://huggingface.co/settings/tokens.",
			},
			// ModelScope repo/file sources (Category "Model/Remote/ModelScope").
			&cli.StringFlag{
				Destination: &msRepo,
				Value:       msRepo,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-repo",
				Usage: "Repository of ModelScope which the GGUF file store for the main model, e.g. " +
					"\"qwen/Qwen1.5-7B-Chat-GGUF\", works with \"--ms-file\".",
			},
			&cli.StringFlag{
				Destination: &msFile,
				Value:       msFile,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-file",
				Usage: "Model file below the \"--ms-repo\", e.g. " +
					"\"qwen1_5-7b-chat-q5_k_m.gguf\".",
			},
			&cli.StringFlag{
				Destination: &msDraftRepo,
				Value:       msDraftRepo,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-draft-repo",
				Usage: "Repository of ModelScope which the GGUF file store for the draft model, optional, e.g. " +
					"\"qwen/Qwen1.5-1.8B-Chat-GGUF\", works with \"--ms-draft-file\".",
			},
			&cli.StringFlag{
				Destination: &msDraftFile,
				Value:       msDraftFile,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-draft-file",
				Usage: "Model file below the \"--ms-draft-repo\", optional, e.g. " +
					"\"qwen1_5-1_8b-chat-q5_k_m.gguf\".",
			},
			&cli.StringFlag{
				Destination: &msMMProjFile,
				Value:       msMMProjFile,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-mmproj-file",
				Usage:       "Multimodal projector file below the \"--ms-repo\".",
			},
			&cli.StringSliceFlag{
				Destination: &msLoRAFiles,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-lora-file",
				Usage:       "LoRA adapter file below the \"--ms-repo\".",
			},
			&cli.StringSliceFlag{
				Destination: &msControlVectorFiles,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-control-vector-file",
				Usage:       "Control Vector adapter file below the \"--ms-repo\".",
			},
			&cli.StringFlag{
				Destination: &msUpscaleRepo,
				Value:       msUpscaleRepo,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-upscale-repo",
				Usage: "Repository of ModelScope which the GGUF file store for the Upscale model, optional, " +
					"works with \"--ms-upscale-file\".",
			},
			&cli.StringFlag{
				Destination: &msUpscaleFile,
				Value:       msUpscaleFile,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-upscale-file",
				Usage:       "Model file below the \"--ms-upscale-repo\", optional.",
			},
			&cli.StringFlag{
				Destination: &msControlNetRepo,
				Value:       msControlNetRepo,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-control-net-repo",
				Usage: "Repository of ModelScope which the GGUF file store for the Control Net model, optional, " +
					"works with \"--ms-control-net-file\".",
			},
			&cli.StringFlag{
				Destination: &msControlNetFile,
				Value:       msControlNetFile,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-control-net-file",
				Usage:       "Model file below the \"--ms-control-net-repo\", optional.",
			},
			&cli.StringFlag{
				Destination: &msToken,
				Value:       msToken,
				Category:    "Model/Remote/ModelScope",
				Name:        "ms-token",
				EnvVars: []string{
					"HF_TOKEN", // Compatible with HuggingFace
					"MS_TOKEN",
				},
				Usage: "Git access token of ModelScope, optional, " +
					"works with \"--ms-repo/--ms-file\" pair or \"--ms-draft-repo/--ms-draft-file\" pair. " +
					"See https://modelscope.cn/my/myaccesstoken.",
			},
			// Ollama sources (Category "Model/Remote/Ollama").
			&cli.StringFlag{
				Destination: &olBaseURL,
				Value:       olBaseURL,
				Category:    "Model/Remote/Ollama",
				Name:        "ol-base-url",
				Usage: "Model base URL of Ollama, e.g. " +
					"https://registry.ollama.ai.",
			},
			&cli.StringFlag{
				Destination: &olModel,
				Value:       olModel,
				Category:    "Model/Remote/Ollama",
				Name:        "ol-model",
				Usage: "Model name of Ollama, e.g. " +
					"\"gemma2\".",
			},
			&cli.BoolFlag{
				Destination: &olUsage,
				Value:       olUsage,
				Category:    "Model/Remote/Ollama",
				Name:        "ol-usage",
				Usage: "Specify respecting the extending layers introduced by Ollama, " +
					"works with \"--ol-model\", which affects the usage estimation.",
			},
			// Loading behavior (Category "Load").
			&cli.BoolFlag{
				Destination: &skipProxy,
				Value:       skipProxy,
				Category:    "Load",
				Name:        "skip-proxy",
				EnvVars: []string{
					"SKIP_PROXY",
				},
				Usage: "Skip proxy settings, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\", " +
					"default is respecting the environment variables \"HTTP_PROXY/HTTPS_PROXY/NO_PROXY\".",
			},
			&cli.BoolFlag{
				Destination: &skipTLSVerify,
				Value:       skipTLSVerify,
				Category:    "Load",
				Name:        "skip-tls-verify",
				EnvVars: []string{
					"SKIP_TLS_VERIFY",
				},
				Usage: "Skip TLS verification, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\", " +
					"default is verifying the TLS certificate on HTTPs request.",
			},
			&cli.BoolFlag{
				Destination: &skipDNSCache,
				Value:       skipDNSCache,
				Category:    "Load",
				Name:        "skip-dns-cache",
				EnvVars: []string{
					"SKIP_DNS_CACHE",
				},
				Usage: "Skip DNS cache, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\", " +
					"default is caching the DNS lookup result.",
			},
			&cli.BoolFlag{
				Destination: &skipRangDownloadDetect,
				Value:       skipRangDownloadDetect,
				Category:    "Load",
				Name:        "skip-range-download-detect",
				EnvVars: []string{
					"SKIP_RANGE_DOWNLOAD_DETECT",
				},
				Usage: "Skip range download detect, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\", " +
					"default is detecting the range download support.",
			},
			&cli.DurationFlag{
				Destination: &cacheExpiration,
				Value:       cacheExpiration,
				Category:    "Load",
				Name:        "cache-expiration",
				Usage: "Specify the expiration of cache, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\".",
			},
			&cli.StringFlag{
				Destination: &cachePath,
				Value:       cachePath,
				Category:    "Load",
				Name:        "cache-path",
				Usage: "Cache the read result to the path, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\".",
			},
			&cli.BoolFlag{
				Destination: &skipCache,
				Value:       skipCache,
				Category:    "Load",
				Name:        "skip-cache",
				EnvVars: []string{
					"SKIP_CACHE",
				},
				Usage: "Skip cache, " +
					"works with \"--url/--hf-*/--ms-*/--ol-*\", " +
					"default is caching the read result.",
			},
			// Common estimation knobs (Category "Estimate").
			&cli.IntFlag{
				Destination: &parallelSize,
				Value:       parallelSize,
				Category:    "Estimate",
				Name:        "parallel-size",
				Aliases: []string{ // LLaMACpp compatibility
					"parallel",
					"np",
					"threads-http", // LLaMABox v0.0.140+ compatibility
				},
				Usage: "Specify the number of parallel sequences to decode, " +
					"which is used to estimate the usage.",
			},
			&cli.BoolFlag{
				Destination: &flashAttention,
				Value:       flashAttention,
				Category:    "Estimate",
				Name:        "flash-attention",
				Aliases: []string{
					"flash-attn",
					"fa",
					"diffusion-fa", // StableDiffusionCpp compatibility
				},
				Usage: "Specify enabling Flash Attention, " +
					"which is used to estimate the usage. " +
					"Flash Attention can reduce the usage of RAM/VRAM.",
			},
			&cli.BoolFlag{ // LLaMABox compatibility
				Category: "Estimate",
				Name:     "no-flash-attention",
				Aliases: []string{
					"no-flash-attn",
				},
				Usage: "Specify disabling Flash Attention.",
				// Inverse toggle: writes into the same flashAttention variable.
				Action: func(context *cli.Context, b bool) error {
					flashAttention = !b
					return nil
				},
			},
			&cli.UintFlag{
				Destination: &mainGPU,
				Value:       mainGPU,
				Category:    "Estimate",
				Name:        "main-gpu",
				Aliases: []string{ // LLaMACpp compatibility
					"mg",
				},
				Usage: "Specify the GPU to use for the model (with \"--split-mode=none\") " +
					"or for intermediate results and KV (with \"--split-mode=row\"), " +
					"which is used to estimate the usage. " +
					"Since gguf-parser cannot recognize the host GPU devices or RPC servers, " +
					"\"--main-gpu\" only works when \"--tensor-split\" is set.",
			},
			&cli.StringFlag{
				Destination: &rpcServers,
				Value:       rpcServers,
				Category:    "Estimate",
				Name:        "rpc",
				Usage: "Specify the RPC servers, " +
					"which is used to estimate the usage, " +
					"it is a comma-separated list of host:port. " +
					"Woks with \"--tensor-split\".",
			},
			&cli.StringFlag{
				Destination: &tensorSplit,
				Value:       tensorSplit,
				Category:    "Estimate",
				Name:        "tensor-split",
				Aliases: []string{ // LLaMACpp compatibility
					"ts",
				},
				Usage: "Specify the fraction of the model to offload to each device, " +
					"which is used to estimate the usage, " +
					"it is a comma-separated list of integer. " +
					"Since gguf-parser cannot recognize the host GPU devices or RPC servers, " +
					"must explicitly set \"--tensor-split\" to indicate how many devices are used. " +
					"To declare the devices belong to RPC servers, set \"--rpc\" please.",
			},
			&cli.IntFlag{
				Destination: &offloadLayers,
				Value:       offloadLayers,
				Category:    "Estimate",
				Name:        "gpu-layers",
				Aliases: []string{ // LLaMACpp compatibility
					"ngl",
					"n-gpu-layers",
				},
				Usage: "Specify how many layers of the main model to offload, " +
					"which is used to estimate the usage, " +
					"default is full offloaded.",
			},
			&cli.StringSliceFlag{
				Destination: &overrideTensors,
				Category:    "Estimate",
				Name:        "override-tensor",
				Aliases: []string{ // LLaMACpp compatibility
					"ot",
				},
				Usage: "Override tensor buffer type, " +
					"for example, use --override-tensor \"[2-9][0-9]\\.ffn_.*_exps\\.=CPU\" to keep experts of layers 20-99 in the CPU",
			},
			&cli.StringSliceFlag{
				Destination: &deviceMetrics,
				Category:    "Estimate",
				Name:        "device-metric",
				Usage: "Specify the device metrics, " +
					"which is used to estimate the throughput, in form of \"FLOPS;Up Bandwidth[;Down Bandwidth]\". " +
					"The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. " +
					"The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. " +
					"Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, " +
					"and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. " +
					"For example, \"--device-metric 10TFLOPS;400GBps\" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, " +
					"\"--device-metric 10TFLOPS;400GBps;5000MBps\" means the device has 5000MBps Down bandwidth. " +
					"If the quantity specified by \"--device-metric\" is less than the number of estimation devices(" +
					"determined by \"--tensor-split\" and \"--rpc\" to infer the device count), " +
					"then replicate the last \"--device-metric\" to meet the required number of evaluation devices.",
			},
			&cli.StringFlag{
				Destination: &platformFootprint,
				Value:       platformFootprint,
				Category:    "Estimate",
				Name:        "platform-footprint",
				Usage: "Specify the platform footprint(RAM,VRAM) of running host in MiB, " +
					"which is used to estimate the NonUMA usage, " +
					"default is \"150,250\". " +
					"Different platform always gets different RAM and VRAM footprints, " +
					"for example, within CUDA, \"cudaMemGetInfo\" or \"cudaSetDevice\" would occupy some RAM and VRAM, " +
					"see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo.",
			},
			// llama.cpp-specific estimation knobs (Category "Estimate/LLaMACpp").
			&cli.IntFlag{
				Destination: &lmcCtxSize,
				Value:       lmcCtxSize,
				Category:    "Estimate/LLaMACpp",
				Name:        "ctx-size",
				Aliases: []string{ // LLaMACpp compatibility
					"c",
				},
				Usage: "Specify the size of prompt context, " +
					"which is used to estimate the usage, " +
					"default is equal to the model's maximum context size.",
			},
			&cli.StringFlag{
				Destination: &lmcRoPEScalingType,
				Category:    "Estimate/LLaMACpp",
				Name:        "rope-scaling",
				Usage: "RoPE frequency scaling method, " +
					"defaults to linear unless specified by the model, select from [none, linear, yarn].",
			},
			&cli.Float64Flag{
				Category: "Estimate/LLaMACpp",
				Name:     "rope-scale",
				Usage: "RoPE context scaling factor, " +
					"expands context by a factor of N.",
				// Stored as its reciprocal in lmcRoPEFreqScale; zero is ignored
				// to avoid a division by zero.
				Action: func(context *cli.Context, f float64) error {
					if f != 0 {
						lmcRoPEFreqScale = 1 / f
					}
					return nil
				},
			},
			&cli.Float64Flag{
				Destination: &lmcRoPEFreqBase,
				Category:    "Estimate/LLaMACpp",
				Name:        "rope-freq-base",
				Usage: "RoPE base frequency, " +
					"used by NTK-aware scaling.",
			},
			&cli.Float64Flag{
				Destination: &lmcRoPEFreqScale,
				Category:    "Estimate/LLaMACpp",
				Name:        "rope-freq-scale",
				Usage: "RoPE frequency scaling factor, " +
					"expands context by a factor of 1/N.",
			},
			&cli.IntFlag{
				Destination: &lmcRoPEScalingOrigCtxSize,
				Category:    "Estimate/LLaMACpp",
				Name:        "yarn-orig-ctx",
				Usage: "YaRN original context size of model, " +
					"defaults to model training context size.",
			},
			&cli.BoolFlag{
				Destination: &lmcInMaxCtxSize,
				Value:       lmcInMaxCtxSize,
				Category:    "Estimate/LLaMACpp",
				Name:        "in-max-ctx-size",
				Usage: "Limit the context size to the maximum context size of the model, " +
					"if the context size is larger than the maximum context size.",
			},
			&cli.IntFlag{
				Destination: &lmcLogicalBatchSize,
				Value:       lmcLogicalBatchSize,
				Category:    "Estimate/LLaMACpp",
				Name:        "batch-size",
				Aliases: []string{ // LLaMACpp compatibility
					"b",
				},
				Usage: "Specify the logical batch size, " +
					"which is used to estimate the usage.",
			},
			&cli.IntFlag{
				Destination: &lmcPhysicalBatchSize,
				Value:       lmcPhysicalBatchSize,
				Category:    "Estimate/LLaMACpp",
				Name:        "ubatch-size",
				Aliases: []string{ // LLaMACpp compatibility
					"ub",
				},
				Usage: "Specify the physical maximum batch size, " +
					"which is used to estimate the usage.",
			},
			&cli.StringFlag{
				Destination: &lmcCacheKeyType,
				Value:       lmcCacheKeyType,
				Category:    "Estimate/LLaMACpp",
				Name:        "cache-type-k",
				Aliases: []string{ // LLaMACpp compatibility
					"ctk",
				},
				Usage: "Specify the type of Key cache, " +
					"which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].",
			},
			&cli.StringFlag{
				Destination: &lmcCacheValueType,
				Value:       lmcCacheValueType,
				Category:    "Estimate/LLaMACpp",
				Name:        "cache-type-v",
				Aliases: []string{ // LLaMACpp compatibility
					"ctv",
				},
				Usage: "Specify the type of Value cache, " +
					"which is used to estimate the usage, select from [f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1].",
			},
			&cli.BoolFlag{
				Destination: &lmcNoKVOffload,
				Value:       lmcNoKVOffload,
				Category:    "Estimate/LLaMACpp",
				Name:        "no-kv-offload",
				Aliases: []string{ // LLaMACpp compatibility
					"nkvo",
				},
				Usage: "Specify disabling Key-Value offloading, " +
					"which is used to estimate the usage. " +
					"Disable Key-Value offloading can reduce the usage of VRAM.",
			},
			&cli.StringFlag{
				Destination: &lmcSplitMode,
				Value:       lmcSplitMode,
				Category:    "Estimate/LLaMACpp",
				Name:        "split-mode",
				Aliases: []string{ // LLaMACpp compatibility
					"sm",
				},
				Usage: "Specify how to split the model across multiple devices, " +
					"which is used to estimate the usage, select from [layer, row, none]. " +
					"Since gguf-parser always estimates the usage of VRAM, " +
					"\"none\" is meaningless here, keep for compatibility.",
			},
			&cli.BoolFlag{
				Destination: &lmcSWAFull,
				Value:       lmcSWAFull,
				Category:    "Estimate/LLaMACpp",
				Name:        "swa-full",
				Usage:       "Specify using full-size SWA cache.",
			},
			&cli.BoolFlag{
				Destination: &lmcNoMMap,
				Value:       lmcNoMMap,
				Category:    "Estimate/LLaMACpp",
				Name:        "no-mmap",
				Usage: "Specify disabling Memory-Mapped using, " +
					"which is used to estimate the usage. " +
					"Memory-Mapped can avoid loading the entire model weights into RAM.",
			},
			&cli.BoolFlag{ // LLaMABox compatibility
				Category: "Estimate/LLaMACpp",
				Name:     "mmap",
				Usage: "Specify enabling Memory-Mapped using, " +
					"which is used to estimate the usage. " +
					"Memory-Mapped can avoid loading the entire model weights into RAM.",
				// Inverse toggle: writes into the same lmcNoMMap variable.
				Action: func(context *cli.Context, b bool) error {
					lmcNoMMap = !b
					return nil
				},
			},
			&cli.UintFlag{ // LLaMABox compatibility
				Destination: &lmcVisualMaxImageSize,
				Value:       lmcVisualMaxImageSize,
				Category:    "Estimate/LLaMACpp",
				Name:        "visual-max-image-size",
				Usage:       "Specify maximum image size when completion with vision model.",
			},
			&cli.UintFlag{ // LLaMABox compatibility
				Destination: &lmcMaxProjectedCache,
				Value:       lmcMaxProjectedCache,
				Category:    "Estimate/LLaMACpp",
				Name:        "max-projected-cache",
				Aliases: []string{
					"visual-max-image-cache", // Deprecated argument name
				},
				Usage: "Specify how many projected embedding to be cached.",
			},
			&cli.IntFlag{
				Destination: &lmcOffloadLayersDraft,
				Value:       lmcOffloadLayersDraft,
				Category:    "Estimate/LLaMACpp",
				Name:        "gpu-layers-draft",
				Aliases: []string{ // LLaMACpp compatibility
					"ngld",
					"n-gpu-layers-draft",
				},
				Usage: "Specify how many layers of the draft model to offload, " +
					"which is used to estimate the usage, " +
					"default is full offloaded.",
			},
			&cli.Uint64Flag{
				Destination: &lmcOffloadLayersStep,
				Value:       lmcOffloadLayersStep,
				Category:    "Estimate/LLaMACpp",
				Name:        "gpu-layers-step",
				Usage: "Specify the step of layers to offload, " +
					"works with \"--gpu-layers\".",
			},
			// stable-diffusion.cpp-specific estimation knobs
			// (Category "Estimate/StableDiffusionCpp").
			&cli.UintFlag{
				Destination: &sdcBatchCount,
				Value:       sdcBatchCount,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-batch-count",
				Aliases: []string{
					"batch-count",     // StableDiffusionCpp compatibility
					"image-max-batch", // LLaMABox compatibility
				},
				Usage: "Specify the batch(generation) count of the image.",
			},
			&cli.UintFlag{
				Destination: &sdcHeight,
				Value:       sdcHeight,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-height",
				Aliases: []string{
					"height",           // StableDiffusionCpp compatibility
					"image-max-height", // LLaMABox compatibility
				},
				Usage: "Specify the (maximum) height of the image.",
			},
			&cli.UintFlag{
				Destination: &sdcWidth,
				Value:       sdcWidth,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-width",
				Aliases: []string{
					"width",           // StableDiffusionCpp compatibility
					"image-max-width", // LLaMABox compatibility
				},
				Usage: "Specify the (maximum) width of the image.",
			},
			&cli.BoolFlag{
				Destination: &sdcNoConditionerOffload,
				Value:       sdcNoConditionerOffload,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-no-conditioner-offload",
				Aliases: []string{
					"clip-on-cpu",                          // StableDiffusionCpp compatibility
					"image-no-text-encoder-model-offload",  // LLaMABox compatibility
				},
				Usage: "Specify to offload the text encoder model to CPU.",
			},
			&cli.BoolFlag{
				Destination: &sdcNoAutoencoderOffload,
				Value:       sdcNoAutoencoderOffload,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-no-autoencoder-offload",
				Aliases: []string{
					"vae-on-cpu",                  // StableDiffusionCpp compatibility
					"image-no-vae-model-offload",  // LLaMABox compatibility
				},
				Usage: "Specify to offload the vae model to CPU.",
			},
			&cli.BoolFlag{
				Destination: &sdcNoControlNetOffload,
				Value:       sdcNoControlNetOffload,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-no-control-net-offload",
				Aliases: []string{
					"control-net-cpu",                     // StableDiffusionCpp compatibility
					"image-no-control-net-model-offload",  // LLaMABox compatibility
				},
				Usage: "Specify to offload the control net model to CPU.",
			},
			&cli.BoolFlag{
				Destination: &sdcAutoencoderTiling,
				Value:       sdcAutoencoderTiling,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-autoencoder-tiling",
				Aliases: []string{
					"vae-tiling",        // StableDiffusionCpp compatibility
					"image-vae-tiling",  // LLaMABox compatibility
				},
				Usage: "Specify to enable tiling for the vae model.",
			},
			&cli.BoolFlag{
				Destination: &sdcNoAutoencoderTiling,
				Value:       sdcNoAutoencoderTiling,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-no-autoencoder-tiling",
				Aliases: []string{
					"image-no-vae-tiling", // LLaMABox compatibility
				},
				Usage: "Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling.",
			},
			&cli.BoolFlag{
				Destination: &sdcFreeComputeMemoryImmediately,
				Value:       sdcFreeComputeMemoryImmediately,
				Category:    "Estimate/StableDiffusionCpp",
				Name:        "image-free-compute-memory-immediately", // LLaMABox compatibility
				Usage:       "Specify to free the compute memory immediately after the generation, which burst using VRAM.",
			},
			// Output formatting (Category "Output").
			&cli.BoolFlag{
				Destination: &raw,
				Value:       raw,
				Category:    "Output",
				Name:        "raw",
				Usage:       "Output the GGUF file information as JSON only, skip anything.",
			},
			&cli.StringFlag{
				Destination: &rawOutput,
				Value:       rawOutput,
				Category:    "Output",
				Name:        "raw-output",
				Usage:       "Works with \"--raw\", to save the result to the file",
			},
			&cli.BoolFlag{
				Destination: &skipMetadata,
				Value:       skipMetadata,
				Category:    "Output",
				Name:        "skip-metadata",
				Usage:       "Skip to display metadata.",
			},
			&cli.BoolFlag{
				Destination: &skipArchitecture,
				Value:       skipArchitecture,
				Category:    "Output",
				Name:        "skip-architecture",
				Usage:       "Skip to display architecture.",
			},
			&cli.BoolFlag{
				Destination: &skipTokenizer,
				Value:       skipTokenizer,
				Category:    "Output",
				Name:        "skip-tokenizer",
				Usage: "Skip to display tokenizer. " +
					"By default, gguf-parser always displays the tokenizer of the file which types with \"model\".",
			},
			&cli.BoolFlag{
				Destination: &skipEstimate,
				Value:       skipEstimate,
				Category:    "Output",
				Name:        "skip-estimate",
				Usage: "Skip to estimate. " +
					"By default, gguf-parser always estimates the file which types with \"model\".",
			},
			&cli.BoolFlag{
				Category: "Output",
				Name:     "estimate",
				Usage:    "Skip all the information except the estimate result.",
				// Convenience switch: folds into the three skip-* variables.
				Action: func(_ *cli.Context, estimateOnly bool) error {
					if estimateOnly {
						skipMetadata = true
						skipArchitecture = true
						skipTokenizer = true
					}
					return nil
				},
			},
			&cli.BoolFlag{
				Destination: &inShort,
				Value:       inShort,
				Category:    "Output",
				Name:        "in-short",
				Usage:       "Display the estimated result in table in short form.",
			},
			&cli.BoolFlag{
				Destination: &inMib,
				Value:       inMib,
				Category:    "Output",
				Name:        "in-mib",
				Usage:       "Display the estimated result in table with MiB.",
			},
			&cli.BoolFlag{
				Destination: &inJson,
				Value:       inJson,
				Category:    "Output",
				Name:        "json",
				Usage:       "Output as JSON.",
			},
			&cli.BoolFlag{
				Destination: &inPrettyJson,
				Value:       inPrettyJson,
				Category:    "Output",
				Name:        "json-pretty",
				Usage:       "Works with \"--json\", to output pretty format JSON.",
			},
		},
		Action: mainAction,
	}
	// Run with a signal-aware context; any error is printed to stderr and the
	// process exits non-zero.
	if err := app.RunContext(signalx.Handler(), os.Args); err != nil {
		_, _ = fmt.Fprintf(os.Stderr, "%v\n", err)
		os.Exit(1)
	}
}
// Flag destinations. Every CLI flag declared in main writes into one of these
// package-level variables; initializers here double as the flag defaults.
var (
	// model options
	path                 string
	draftPath            string          // for estimate
	mmprojPath           string          // for estimate
	loraPaths            cli.StringSlice // for estimate
	controlVectorPaths   cli.StringSlice // for estimate
	upscalePath          string          // for estimate
	controlNetPath       string          // for estimate
	url                  string
	draftUrl             string          // for estimate
	mmprojUrl            string          // for estimate
	loraUrls             cli.StringSlice // for estimate
	controlVectorUrls    cli.StringSlice // for estimate
	upscaleUrl           string          // for estimate
	controlNetUrl        string          // for estimate
	token                string
	headers              cli.StringSlice
	hfRepo               string
	hfFile               string
	hfDraftRepo          string          // for estimate
	hfDraftFile          string          // for estimate
	hfMMProjFile         string          // for estimate
	hfLoRAFiles          cli.StringSlice // for estimate
	hfControlVectorFiles cli.StringSlice // for estimate
	hfUpscaleRepo        string          // for estimate
	hfUpscaleFile        string          // for estimate
	hfControlNetRepo     string          // for estimate
	hfControlNetFile     string          // for estimate
	hfToken              string
	msRepo               string
	msFile               string
	msDraftRepo          string          // for estimate
	msDraftFile          string          // for estimate
	msMMProjFile         string          // for estimate
	msLoRAFiles          cli.StringSlice // for estimate
	msControlVectorFiles cli.StringSlice // for estimate
	msUpscaleRepo        string          // for estimate
	msUpscaleFile        string          // for estimate
	msControlNetRepo     string          // for estimate
	msControlNetFile     string          // for estimate
	msToken              string
	olBaseURL            = "https://registry.ollama.ai"
	olModel              string
	olUsage              bool
	// load options
	debug                  bool
	skipProxy              bool
	skipTLSVerify          bool
	skipDNSCache           bool
	skipRangDownloadDetect bool
	cacheExpiration        = 24 * time.Hour
	cachePath              = DefaultCachePath()
	skipCache              bool
	// estimate options
	parallelSize      = 1
	flashAttention    bool
	mainGPU           uint
	rpcServers        string
	tensorSplit       string
	offloadLayers     = -1 // negative means full offload
	overrideTensors   cli.StringSlice
	deviceMetrics     cli.StringSlice
	platformFootprint = "150,250"
	// estimate options for llama.cpp
	lmcCtxSize                = 0
	lmcRoPEFreqBase           float64
	lmcRoPEFreqScale          float64
	lmcRoPEScalingType        string
	lmcRoPEScalingOrigCtxSize int
	lmcInMaxCtxSize           bool
	lmcLogicalBatchSize       = 2048
	lmcPhysicalBatchSize      = 512
	lmcCacheKeyType           = "f16"
	lmcCacheValueType         = "f16"
	lmcNoKVOffload            bool
	lmcSplitMode              = "layer"
	lmcSWAFull                = false
	lmcNoMMap                 bool
	lmcVisualMaxImageSize     uint
	lmcMaxProjectedCache      uint
	lmcOffloadLayersDraft     = -1 // negative means full offload
	lmcOffloadLayersStep      uint64
	// estimate options for stable-diffusion.cpp
	sdcBatchCount                   uint = 1
	sdcHeight                       uint = 1024
	sdcWidth                        uint = 1024
	sdcNoConditionerOffload         bool
	sdcNoAutoencoderOffload         bool
	sdcNoControlNetOffload          bool
	sdcAutoencoderTiling            bool
	sdcNoAutoencoderTiling          bool
	sdcFreeComputeMemoryImmediately bool
	// output options
	raw              bool
	rawOutput        string
	inShort          bool
	skipMetadata     bool
	skipArchitecture bool
	skipTokenizer    bool
	skipEstimate     bool
	inMib            bool
	inJson           bool
	inPrettyJson     = true
)
// mainAction is the root cli action: it translates the package-level flag
// variables into GGUF read options (ropts) and run-estimate options (eopts),
// validates cross-flag constraints, and then parses the requested GGUF
// file(s) from a local path, URL, Hugging Face, ModelScope, or Ollama.
// It returns an error for invalid flag combinations or parse failures.
// NOTE(review): the function continues beyond this chunk; only the visible
// prefix is annotated here.
func mainAction(c *cli.Context) error {
	ctx := c.Context
	// Prepare options.
	ropts := []GGUFReadOption{
		SkipLargeMetadata(),
		UseMMap(),
		UseCache(),
	}
	// Fold repeated --header values of the form "Key: Value" into a map;
	// entries without a colon are silently dropped.
	if hs := headers.Value(); len(hs) > 0 {
		hm := make(map[string]string, len(hs))
		for _, h := range hs {
			parts := strings.SplitN(h, ":", 2)
			if len(parts) == 2 {
				hm[strings.TrimSpace(parts[0])] = strings.TrimSpace(parts[1])
			}
		}
		if len(hm) > 0 {
			ropts = append(ropts, UseHeaders(hm))
		}
	}
	if token != "" {
		ropts = append(ropts, UseBearerAuth(token))
	}
	if debug {
		ropts = append(ropts, UseDebug())
	}
	if skipProxy {
		ropts = append(ropts, SkipProxy())
	}
	if skipTLSVerify {
		ropts = append(ropts, SkipTLSVerification())
	}
	if skipDNSCache {
		ropts = append(ropts, SkipDNSCache())
	}
	if skipRangDownloadDetect {
		ropts = append(ropts, SkipRangeDownloadDetection())
	}
	if cacheExpiration >= 0 {
		ropts = append(ropts, UseCacheExpiration(cacheExpiration))
	}
	if cachePath != "" {
		ropts = append(ropts, UseCachePath(cachePath))
	}
	if skipCache {
		ropts = append(ropts, SkipCache())
	}
	// Estimate options start from F16 KV-cache defaults; the --cache-type-k /
	// --cache-type-v branches further below may override them.
	eopts := []GGUFRunEstimateOption{
		WithLLaMACppCacheValueType(GGMLTypeF16),
		WithLLaMACppCacheKeyType(GGMLTypeF16),
	}
	if parallelSize > 0 {
		eopts = append(eopts, WithParallelSize(int32(parallelSize)))
	}
	if flashAttention {
		eopts = append(eopts, WithFlashAttention())
	}
	// --tensor-split: comma-separated proportions, normalized to fractions of
	// their running sum; also gates --main-gpu and --rpc validation.
	if tensorSplit != "" {
		tss := strings.Split(tensorSplit, ",")
		if len(tss) > 128 {
			return errors.New("--tensor-split exceeds the number of devices")
		}
		var vs float64
		vv := make([]float64, len(tss)) // cumulative sums of the proportions
		vf := make([]float64, len(tss)) // cumulative sums normalized to [0, 1]
		for i, s := range tss {
			s = strings.TrimSpace(s)
			v, err := strconv.ParseFloat(s, 64)
			if err != nil {
				// NOTE(review): message says "integer" but the value is parsed
				// as a float above — wording looks stale; confirm intent.
				return errors.New("--tensor-split has invalid integer")
			}
			vs += v
			vv[i] = vs
		}
		for i, v := range vv {
			vf[i] = v / vs
		}
		eopts = append(eopts, WithTensorSplitFraction(vf))
		// --main-gpu must index one of the --tensor-split devices.
		if mainGPU < uint(len(vv)) {
			eopts = append(eopts, WithMainGPUIndex(int(mainGPU)))
		} else {
			return errors.New("--main-gpu must be less than item size of --tensor-split")
		}
		// --rpc is only honored together with --tensor-split; each entry must
		// be a valid host:port pair.
		if rpcServers != "" {
			rss := strings.Split(rpcServers, ",")
			if len(rss) > len(tss) {
				return errors.New("--rpc has more items than --tensor-split")
			}
			rpc := make([]string, len(rss))
			for i, s := range rss {
				s = strings.TrimSpace(s)
				if _, _, err := net.SplitHostPort(s); err != nil {
					return errors.New("--rpc has invalid host:port")
				}
				rpc[i] = s
			}
			eopts = append(eopts, WithRPCServers(rpc))
		}
	}
	// --override-tensor: each flag value is a comma-separated list of
	// "pattern=buffer-type" pairs, where pattern is a regular expression.
	if otss := overrideTensors.Value(); len(otss) > 0 {
		var ots []GGUFRunOverriddenTensor
		for i := range otss {
			pots := strings.Split(otss[i], ",")
			for j := range pots {
				ss := strings.SplitN(strings.TrimSpace(pots[j]), "=", 2)
				if len(ss) != 2 {
					return errors.New("--override-tensor has invalid format")
				}
				pr, err := regexp.Compile(strings.TrimSpace(ss[0]))
				if err != nil {
					return fmt.Errorf("--override-tensor has invalid pattern: %w", err)
				}
				bt := strings.TrimSpace(ss[1])
				if bt == "" {
					return errors.New("--override-tensor has empty buffer type")
				}
				ots = append(ots, GGUFRunOverriddenTensor{
					PatternRegex: pr,
					BufferType:   bt,
				})
			}
		}
		eopts = append(eopts, WithOverriddenTensors(ots))
	}
	// --device-metric: "FLOPS;UpBandwidth[;DownBandwidth]"; when the third
	// field is omitted, down-bandwidth defaults to up-bandwidth.
	if dmss := deviceMetrics.Value(); len(dmss) > 0 {
		dms := make([]GGUFRunDeviceMetric, len(dmss))
		for i := range dmss {
			ss := strings.Split(dmss[i], ";")
			if len(ss) < 2 {
				return errors.New("--device-metric has invalid format")
			}
			var err error
			dms[i].FLOPS, err = ParseFLOPSScalar(strings.TrimSpace(ss[0]))
			if err != nil {
				return fmt.Errorf("--device-metric has invalid FLOPS: %w", err)
			}
			dms[i].UpBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[1]))
			if err != nil {
				return fmt.Errorf("--device-metric has invalid Up Bandwidth: %w", err)
			}
			if len(ss) > 2 {
				dms[i].DownBandwidth, err = ParseBytesPerSecondScalar(strings.TrimSpace(ss[2]))
				if err != nil {
					return fmt.Errorf("--device-metric has invalid Down Bandwidth: %w", err)
				}
			} else {
				dms[i].DownBandwidth = dms[i].UpBandwidth
			}
		}
		eopts = append(eopts, WithDeviceMetrics(dms))
	}
	// llama.cpp-specific estimate options.
	if lmcCtxSize > 0 {
		eopts = append(eopts, WithLLaMACppContextSize(int32(lmcCtxSize)))
	}
	if lmcRoPEFreqBase > 0 || lmcRoPEFreqScale > 0 || lmcRoPEScalingType != "" || lmcRoPEScalingOrigCtxSize > 0 {
		eopts = append(eopts, WithLLaMACppRoPE(lmcRoPEFreqBase, lmcRoPEFreqScale, lmcRoPEScalingType, int32(lmcRoPEScalingOrigCtxSize)))
	}
	if lmcInMaxCtxSize {
		eopts = append(eopts, WithinLLaMACppMaxContextSize())
	}
	// Logical batch size is clamped to a minimum of 32.
	if lmcLogicalBatchSize > 0 {
		eopts = append(eopts, WithLLaMACppLogicalBatchSize(int32(max(32, lmcLogicalBatchSize))))
	}
	if lmcPhysicalBatchSize > 0 {
		if lmcPhysicalBatchSize > lmcLogicalBatchSize {
			return errors.New("--ubatch-size must be less than or equal to --batch-size")
		}
		eopts = append(eopts, WithLLaMACppPhysicalBatchSize(int32(lmcPhysicalBatchSize)))
	}
	if lmcCacheKeyType != "" {
		eopts = append(eopts, WithLLaMACppCacheKeyType(toGGMLType(lmcCacheKeyType)))
	}
	if lmcCacheValueType != "" {
		eopts = append(eopts, WithLLaMACppCacheValueType(toGGMLType(lmcCacheValueType)))
	}
	if lmcNoKVOffload {
		eopts = append(eopts, WithoutLLaMACppOffloadKVCache())
	}
	// Unknown --split-mode values silently fall back to "layer".
	switch lmcSplitMode {
	case "row":
		eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeRow))
	case "none":
		eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeNone))
	default:
		eopts = append(eopts, WithLLaMACppSplitMode(LLaMACppSplitModeLayer))
	}
	if lmcSWAFull {
		eopts = append(eopts, WithLLaMACppFullSizeSWACache())
	}
	if lmcVisualMaxImageSize > 0 {
		eopts = append(eopts, WithLLaMACppVisualMaxImageSize(uint32(lmcVisualMaxImageSize)))
	}
	if lmcMaxProjectedCache > 0 {
		eopts = append(eopts, WithLLaMACppMaxProjectedCache(uint32(lmcMaxProjectedCache)))
	}
	// stable-diffusion.cpp-specific estimate options.
	// NOTE(review): this guard is "> 1" while the height/width guards below are
	// "> 0"; presumably deliberate since the default batch count is 1 — confirm.
	if sdcBatchCount > 1 {
		eopts = append(eopts, WithStableDiffusionCppBatchCount(int32(sdcBatchCount)))
	}
	if sdcHeight > 0 {
		eopts = append(eopts, WithStableDiffusionCppHeight(uint32(sdcHeight)))
	}
	if sdcWidth > 0 {
		eopts = append(eopts, WithStableDiffusionCppWidth(uint32(sdcWidth)))
	}
	if sdcNoConditionerOffload {
		eopts = append(eopts, WithoutStableDiffusionCppOffloadConditioner())
	}
	if sdcNoAutoencoderOffload {
		eopts = append(eopts, WithoutStableDiffusionCppOffloadAutoencoder())
	}
	// --vae-tiling only applies when not explicitly negated.
	if sdcAutoencoderTiling && !sdcNoAutoencoderTiling {
		eopts = append(eopts, WithStableDiffusionCppAutoencoderTiling())
	}
	if sdcFreeComputeMemoryImmediately {
		eopts = append(eopts, WithStableDiffusionCppFreeComputeMemoryImmediately())
	}
	// offloadLayers defaults to -1 ("not specified"); only explicit values
	// (including 0) are forwarded to both estimators.
	if offloadLayers >= 0 {
		eopts = append(eopts, WithLLaMACppOffloadLayers(uint64(offloadLayers)), WithStableDiffusionCppOffloadLayers(uint64(offloadLayers)))
	}
	// Parse GGUF file.
	var (
		// Common.
		gf         *GGUFFile
		adapterGfs []*GGUFFile
		// LLaMACpp specific.
		lmcProjectGf *GGUFFile
		lmcDrafterGf *GGUFFile
		// StableDiffusionCpp specific.
		sdcControlNetGf *GGUFFile
		sdcUpscaleGf    *GGUFFile
	)
	{
		var err error
		// Full-slice expression (len == cap) so the appends below allocate a
		// new backing array instead of mutating the shared ropts slice.
		ropts := ropts[:len(ropts):len(ropts)]
		// Main model: exactly one source must be provided; the default case
		// rejects the "nothing specified" combination.
		switch {
		default:
			return errors.New("no model specified")
		case path != "":
			gf, err = ParseGGUFFile(path, ropts...)
		case url != "":
			gf, err = ParseGGUFFileRemote(ctx, url, ropts...)
		case hfRepo != "" && hfFile != "":
			if hfToken != "" {
				ropts = append(ropts, UseBearerAuth(hfToken))
			}
			gf, err = ParseGGUFFileFromHuggingFace(ctx, hfRepo, hfFile, ropts...)
		case msRepo != "" && msFile != "":
			if msToken != "" {
				ropts = append(ropts, UseBearerAuth(msToken))
			}
			gf, err = ParseGGUFFileFromModelScope(ctx, msRepo, msFile, ropts...)
		case olModel != "":
			om := ParseOllamaModel(olModel, SetOllamaModelBaseURL(olBaseURL))
			gf, err = ParseGGUFFileFromOllamaModel(ctx, om, ropts...)
			// With --ol-usage, mirror the Ollama model's recorded parameters
			// onto the estimate options.
			if err == nil && om != nil && olUsage {
				// Parameters override.
				{
					ps, _ := om.Params(ctx, nil)
					if v, ok := ps["num_ctx"]; ok {
						eopts = append(eopts, WithLLaMACppContextSize(anyx.Number[int32](v)))
					} else if lmcCtxSize <= 0 {
						// Ollama's implicit default context size.
						eopts = append(eopts, WithLLaMACppContextSize(2048))
					}
					if v, ok := ps["use_mmap"]; ok && !anyx.Bool(v) {
						lmcNoMMap = true
					}
					if v, ok := ps["num_gpu"]; ok {
						offloadLayers = anyx.Number[int](v)
					}
				}
				// Multimodal projector overlap.
gitextract_sklq_6lp/ ├── .gitattributes ├── .github/ │ └── workflows/ │ ├── ci.yml │ ├── cmd.yml │ ├── prune.yml │ └── sync.yml ├── .gitignore ├── .golangci.yaml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── cache.go ├── cmd/ │ └── gguf-parser/ │ ├── README.md │ ├── go.mod │ ├── go.sum │ └── main.go ├── file.go ├── file_architecture.go ├── file_architecture_test.go ├── file_estimate__llamacpp.go ├── file_estimate__llamacpp_test.go ├── file_estimate__stablediffusioncpp.go ├── file_estimate__stablediffusioncpp_test.go ├── file_estimate_option.go ├── file_from_distro.go ├── file_from_remote.go ├── file_metadata.go ├── file_metadata_test.go ├── file_option.go ├── file_test.go ├── file_tokenizer.go ├── file_tokenizer_test.go ├── filename.go ├── filename_test.go ├── gen.go ├── gen.regression.go ├── gen.stringer.go ├── ggml.go ├── go.mod ├── go.sum ├── ollama_model.go ├── ollama_model_option.go ├── ollama_model_test.go ├── ollama_registry_authenticate.go ├── scalar.go ├── scalar_test.go ├── util/ │ ├── anyx/ │ │ └── any.go │ ├── bytex/ │ │ └── pool.go │ ├── funcx/ │ │ └── error.go │ ├── httpx/ │ │ ├── client.go │ │ ├── client_helper.go │ │ ├── client_options.go │ │ ├── file.go │ │ ├── file_options.go │ │ ├── proxy.go │ │ ├── resolver.go │ │ ├── transport.go │ │ └── transport_options.go │ ├── json/ │ │ ├── common.go │ │ ├── jsoniter.go │ │ └── stdjson.go │ ├── osx/ │ │ ├── env.go │ │ ├── file.go │ │ ├── file_mmap.go │ │ ├── file_mmap_js.go │ │ ├── file_mmap_unix.go │ │ ├── file_mmap_windows.go │ │ ├── file_mmap_windows_386.go │ │ ├── file_mmap_windows_non386.go │ │ └── homedir.go │ ├── ptr/ │ │ └── pointer.go │ ├── signalx/ │ │ ├── handler.go │ │ ├── handler_unix.go │ │ └── handler_windows.go │ ├── slicex/ │ │ └── search.go │ └── stringx/ │ ├── bytes.go │ ├── random.go │ ├── strings.go │ └── sum.go ├── zz_generated.diffusion_model_memory_usage.regression.go ├── zz_generated.ggmltype.stringer.go ├── zz_generated.gguffiletype.stringer.go ├── 
zz_generated.ggufmagic.stringer.go ├── zz_generated.ggufmetadatavaluetype.stringer.go └── zz_generated.ggufversion.stringer.go
SYMBOL INDEX (687 symbols across 64 files)
FILE: cache.go
type GGUFFileCache (line 21) | type GGUFFileCache
method getKeyPath (line 23) | func (c GGUFFileCache) getKeyPath(key string) string {
method Get (line 29) | func (c GGUFFileCache) Get(key string, exp time.Duration) (*GGUFFile, ...
method Put (line 67) | func (c GGUFFileCache) Put(key string, gf *GGUFFile) error {
method Delete (line 88) | func (c GGUFFileCache) Delete(key string) error {
FILE: cmd/gguf-parser/main.go
function init (line 28) | func init() {
function main (line 43) | func main() {
function mainAction (line 1129) | func mainAction(c *cli.Context) error {
function sprintf (line 2138) | func sprintf(f any, a ...any) string {
function tprint (line 2148) | func tprint(title string, headers, bodies [][]any) {
function tenary (line 2176) | func tenary(c bool, t, f any) any {
function tenaryFunc (line 2183) | func tenaryFunc(c bool, t, f func() any) any {
function toGGMLType (line 2190) | func toGGMLType(s string) GGMLType {
FILE: file.go
type GGUFFile (line 26) | type GGUFFile struct
method Layers (line 523) | func (gf *GGUFFile) Layers(ignores ...string) GGUFLayerTensorInfos {
type GGUFMagic (line 79) | type GGUFMagic
constant GGUFMagicGGML (line 83) | GGUFMagicGGML GGUFMagic = 0x67676d6c
constant GGUFMagicGGMF (line 84) | GGUFMagicGGMF GGUFMagic = 0x67676d66
constant GGUFMagicGGJT (line 85) | GGUFMagicGGJT GGUFMagic = 0x67676a74
constant GGUFMagicGGUFLe (line 86) | GGUFMagicGGUFLe GGUFMagic = 0x46554747
constant GGUFMagicGGUFBe (line 87) | GGUFMagicGGUFBe GGUFMagic = 0x47475546
type GGUFVersion (line 92) | type GGUFVersion
constant GGUFVersionV1 (line 96) | GGUFVersionV1 GGUFVersion = iota + 1
constant GGUFVersionV2 (line 97) | GGUFVersionV2
constant GGUFVersionV3 (line 98) | GGUFVersionV3
type GGUFHeader (line 102) | type GGUFHeader struct
type GGUFMetadataValueType (line 117) | type GGUFMetadataValueType
constant GGUFMetadataValueTypeUint8 (line 121) | GGUFMetadataValueTypeUint8 GGUFMetadataValueType = iota
constant GGUFMetadataValueTypeInt8 (line 122) | GGUFMetadataValueTypeInt8
constant GGUFMetadataValueTypeUint16 (line 123) | GGUFMetadataValueTypeUint16
constant GGUFMetadataValueTypeInt16 (line 124) | GGUFMetadataValueTypeInt16
constant GGUFMetadataValueTypeUint32 (line 125) | GGUFMetadataValueTypeUint32
constant GGUFMetadataValueTypeInt32 (line 126) | GGUFMetadataValueTypeInt32
constant GGUFMetadataValueTypeFloat32 (line 127) | GGUFMetadataValueTypeFloat32
constant GGUFMetadataValueTypeBool (line 128) | GGUFMetadataValueTypeBool
constant GGUFMetadataValueTypeString (line 129) | GGUFMetadataValueTypeString
constant GGUFMetadataValueTypeArray (line 130) | GGUFMetadataValueTypeArray
constant GGUFMetadataValueTypeUint64 (line 131) | GGUFMetadataValueTypeUint64
constant GGUFMetadataValueTypeInt64 (line 132) | GGUFMetadataValueTypeInt64
constant GGUFMetadataValueTypeFloat64 (line 133) | GGUFMetadataValueTypeFloat64
constant _GGUFMetadataValueTypeCount (line 134) | _GGUFMetadataValueTypeCount
type GGUFMetadataKV (line 140) | type GGUFMetadataKV struct
method ValueUint8 (line 527) | func (kv GGUFMetadataKV) ValueUint8() uint8 {
method ValueInt8 (line 534) | func (kv GGUFMetadataKV) ValueInt8() int8 {
method ValueUint16 (line 541) | func (kv GGUFMetadataKV) ValueUint16() uint16 {
method ValueInt16 (line 548) | func (kv GGUFMetadataKV) ValueInt16() int16 {
method ValueUint32 (line 555) | func (kv GGUFMetadataKV) ValueUint32() uint32 {
method ValueInt32 (line 562) | func (kv GGUFMetadataKV) ValueInt32() int32 {
method ValueFloat32 (line 569) | func (kv GGUFMetadataKV) ValueFloat32() float32 {
method ValueBool (line 576) | func (kv GGUFMetadataKV) ValueBool() bool {
method ValueString (line 583) | func (kv GGUFMetadataKV) ValueString() string {
method ValueArray (line 590) | func (kv GGUFMetadataKV) ValueArray() GGUFMetadataKVArrayValue {
method ValueUint64 (line 615) | func (kv GGUFMetadataKV) ValueUint64() uint64 {
method ValueInt64 (line 622) | func (kv GGUFMetadataKV) ValueInt64() int64 {
method ValueFloat64 (line 629) | func (kv GGUFMetadataKV) ValueFloat64() float64 {
type GGUFMetadataKVArrayValue (line 151) | type GGUFMetadataKVArrayValue struct
method ValuesUint8 (line 661) | func (av GGUFMetadataKVArrayValue) ValuesUint8() []uint8 {
method ValuesInt8 (line 672) | func (av GGUFMetadataKVArrayValue) ValuesInt8() []int8 {
method ValuesUint16 (line 683) | func (av GGUFMetadataKVArrayValue) ValuesUint16() []uint16 {
method ValuesInt16 (line 694) | func (av GGUFMetadataKVArrayValue) ValuesInt16() []int16 {
method ValuesUint32 (line 705) | func (av GGUFMetadataKVArrayValue) ValuesUint32() []uint32 {
method ValuesInt32 (line 716) | func (av GGUFMetadataKVArrayValue) ValuesInt32() []int32 {
method ValuesFloat32 (line 727) | func (av GGUFMetadataKVArrayValue) ValuesFloat32() []float32 {
method ValuesBool (line 738) | func (av GGUFMetadataKVArrayValue) ValuesBool() []bool {
method ValuesString (line 749) | func (av GGUFMetadataKVArrayValue) ValuesString() []string {
method ValuesArray (line 760) | func (av GGUFMetadataKVArrayValue) ValuesArray() []GGUFMetadataKVArray...
method ValuesUint64 (line 789) | func (av GGUFMetadataKVArrayValue) ValuesUint64() []uint64 {
method ValuesInt64 (line 800) | func (av GGUFMetadataKVArrayValue) ValuesInt64() []int64 {
method ValuesFloat64 (line 811) | func (av GGUFMetadataKVArrayValue) ValuesFloat64() []float64 {
type GGUFMetadataKVs (line 173) | type GGUFMetadataKVs
method Get (line 855) | func (kvs GGUFMetadataKVs) Get(key string) (value GGUFMetadataKV, foun...
method Search (line 865) | func (kvs GGUFMetadataKVs) Search(keyRegex *regexp.Regexp) (values []G...
method Index (line 876) | func (kvs GGUFMetadataKVs) Index(keys []string) (values map[string]GGU...
type GGUFTensorInfo (line 179) | type GGUFTensorInfo struct
method Get (line 896) | func (ti GGUFTensorInfo) Get(name string) (info GGUFTensorInfo, found ...
method GetFileType (line 904) | func (ti GGUFTensorInfo) GetFileType() GGUFFileType {
method Match (line 909) | func (ti GGUFTensorInfo) Match(nameRegex *regexp.Regexp) bool {
method Search (line 914) | func (ti GGUFTensorInfo) Search(nameRegex *regexp.Regexp) (infos []GGU...
method Index (line 923) | func (ti GGUFTensorInfo) Index(names []string) (infos map[string]GGUFT...
method Elements (line 936) | func (ti GGUFTensorInfo) Elements(filter ...GGUFTensorInfoFilter) uint...
method Bytes (line 957) | func (ti GGUFTensorInfo) Bytes(filter ...GGUFTensorInfoFilter) uint64 {
method Count (line 1001) | func (ti GGUFTensorInfo) Count() uint64 {
type GGUFTensorInfos (line 206) | type GGUFTensorInfos
method Get (line 1007) | func (tis GGUFTensorInfos) Get(name string) (info GGUFTensorInfo, foun...
method GetFileType (line 1017) | func (tis GGUFTensorInfos) GetFileType() GGUFFileType {
method Match (line 1031) | func (tis GGUFTensorInfos) Match(nameRegex *regexp.Regexp) bool {
method Search (line 1041) | func (tis GGUFTensorInfos) Search(nameRegex *regexp.Regexp) (infos []G...
method Index (line 1052) | func (tis GGUFTensorInfos) Index(names []string) (infos map[string]GGU...
method Elements (line 1071) | func (tis GGUFTensorInfos) Elements() uint64 {
method Bytes (line 1080) | func (tis GGUFTensorInfos) Bytes() uint64 {
method Count (line 1089) | func (tis GGUFTensorInfos) Count() uint64 {
method Layers (line 1094) | func (tis GGUFTensorInfos) Layers(ignores ...string) GGUFLayerTensorIn...
method layers (line 1109) | func (tis GGUFTensorInfos) layers() GGUFLayerTensorInfos {
function ParseGGUFFile (line 213) | func ParseGGUFFile(path string, opts ...GGUFReadOption) (*GGUFFile, erro...
type _GGUFFileReadSeeker (line 267) | type _GGUFFileReadSeeker struct
function _validateCountWithRemaining (line 273) | func _validateCountWithRemaining(f _GGUFFileReadSeeker, count uint64, ve...
function parseGGUFFile (line 313) | func parseGGUFFile(fs []_GGUFFileReadSeeker, o _GGUFReadOptions) (_ *GGU...
type GGUFTensorInfoFilter (line 476) | type GGUFTensorInfoFilter
type IGGUFTensorInfos (line 480) | type IGGUFTensorInfos interface
type GGUFLayerTensorInfos (line 503) | type GGUFLayerTensorInfos
method Get (line 1255) | func (ltis GGUFLayerTensorInfos) Get(name string) (info GGUFTensorInfo...
method GetFileType (line 1273) | func (ltis GGUFLayerTensorInfos) GetFileType() GGUFFileType {
method Match (line 1292) | func (ltis GGUFLayerTensorInfos) Match(nameRegex *regexp.Regexp) bool {
method Search (line 1309) | func (ltis GGUFLayerTensorInfos) Search(nameRegex *regexp.Regexp) (inf...
method Index (line 1325) | func (ltis GGUFLayerTensorInfos) Index(names []string) (infos map[stri...
method Elements (line 1353) | func (ltis GGUFLayerTensorInfos) Elements(filter ...GGUFTensorInfoFilt...
method Bytes (line 1362) | func (ltis GGUFLayerTensorInfos) Bytes(filter ...GGUFTensorInfoFilter)...
method Count (line 1371) | func (ltis GGUFLayerTensorInfos) Count() uint64 {
method Cut (line 1385) | func (ltis GGUFLayerTensorInfos) Cut(names []string) (before, after GG...
type GGUFNamedTensorInfos (line 507) | type GGUFNamedTensorInfos struct
function ValueNumeric (line 643) | func ValueNumeric[T constraints.Integer | constraints.Float](kv GGUFMeta...
function ValuesNumeric (line 829) | func ValuesNumeric[T constraints.Integer | constraints.Float](av GGUFMet...
type _GGUFReader (line 1447) | type _GGUFReader struct
method ReadUint8 (line 1454) | func (rd _GGUFReader) ReadUint8() (v uint8, err error) {
method ReadInt8 (line 1462) | func (rd _GGUFReader) ReadInt8() (v int8, err error) {
method ReadUint16 (line 1470) | func (rd _GGUFReader) ReadUint16() (v uint16, err error) {
method ReadInt16 (line 1478) | func (rd _GGUFReader) ReadInt16() (v int16, err error) {
method ReadUint32 (line 1486) | func (rd _GGUFReader) ReadUint32() (v uint32, err error) {
method ReadUint64FromUint32 (line 1494) | func (rd _GGUFReader) ReadUint64FromUint32() (uint64, error) {
method ReadInt32 (line 1499) | func (rd _GGUFReader) ReadInt32() (v int32, err error) {
method ReadFloat32 (line 1507) | func (rd _GGUFReader) ReadFloat32() (v float32, err error) {
method ReadBool (line 1515) | func (rd _GGUFReader) ReadBool() (v bool, err error) {
method ReadString (line 1523) | func (rd _GGUFReader) ReadString() (v string, err error) {
method SkipReadingString (line 1543) | func (rd _GGUFReader) SkipReadingString() (err error) {
method ReadArray (line 1560) | func (rd _GGUFReader) ReadArray(key string) (v GGUFMetadataKVArrayValu...
method ReadUint64 (line 1634) | func (rd _GGUFReader) ReadUint64() (v uint64, err error) {
method ReadInt64 (line 1642) | func (rd _GGUFReader) ReadInt64() (v int64, err error) {
method ReadFloat64 (line 1650) | func (rd _GGUFReader) ReadFloat64() (v float64, err error) {
method ReadValue (line 1658) | func (rd _GGUFReader) ReadValue(vk string, vt GGUFMetadataValueType) (...
type _GGUFMetadataReader (line 1700) | type _GGUFMetadataReader struct
method Read (line 1704) | func (rd _GGUFMetadataReader) Read() (kv GGUFMetadataKV, err error) {
type _GGUFTensorInfoReader (line 1729) | type _GGUFTensorInfoReader struct
method Read (line 1733) | func (rd _GGUFTensorInfoReader) Read() (ti GGUFTensorInfo, err error) {
FILE: file_architecture.go
type GGUFArchitecture (line 12) | type GGUFArchitecture struct
method DiffusionHasConditioners (line 329) | func (ga GGUFArchitecture) DiffusionHasConditioners() bool {
method DiffusionHasAutoencoder (line 334) | func (ga GGUFArchitecture) DiffusionHasAutoencoder() bool {
type GGUFArchitectureDiffusionConditioners (line 305) | type GGUFArchitectureDiffusionConditioners
method String (line 338) | func (gacs GGUFArchitectureDiffusionConditioners) String() string {
type GGUFArchitectureDiffusionConditioner (line 308) | type GGUFArchitectureDiffusionConditioner struct
method String (line 349) | func (gac GGUFArchitectureDiffusionConditioner) String() string {
type GGUFArchitectureDiffusionAutoencoder (line 317) | type GGUFArchitectureDiffusionAutoencoder struct
method String (line 353) | func (gaa GGUFArchitectureDiffusionAutoencoder) String() string {
method Architecture (line 358) | func (gf *GGUFFile) Architecture() (ga GGUFArchitecture) {
method diffuserArchitecture (line 403) | func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) {
method clipArchitecture (line 589) | func (gf *GGUFFile) clipArchitecture() (ga GGUFArchitecture) {
method adapterArchitecture (line 781) | func (gf *GGUFFile) adapterArchitecture(arch string) (ga GGUFArchitectur...
method imatrixArchitecture (line 816) | func (gf *GGUFFile) imatrixArchitecture(_ string) (ga GGUFArchitecture) {
method transformerArchitecture (line 823) | func (gf *GGUFFile) transformerArchitecture(arch string) (ga GGUFArchite...
FILE: file_architecture_test.go
function TestGGUFFile_Architecture (line 11) | func TestGGUFFile_Architecture(t *testing.T) {
function BenchmarkGGUFFile_Architecture (line 27) | func BenchmarkGGUFFile_Architecture(b *testing.B) {
FILE: file_estimate__llamacpp.go
type LLaMACppRunEstimate (line 17) | type LLaMACppRunEstimate struct
method SummarizeItem (line 1524) | func (e LLaMACppRunEstimate) SummarizeItem(mmap bool, nonUMARamFootpri...
method Summarize (line 1625) | func (e LLaMACppRunEstimate) Summarize(mmap bool, nonUMARamFootprint, ...
type LLaMACppRunDeviceUsage (line 74) | type LLaMACppRunDeviceUsage struct
type LLaMACppParameterUsage (line 110) | type LLaMACppParameterUsage struct
type LLaMACppWeightMemoryUsage (line 124) | type LLaMACppWeightMemoryUsage struct
method Sum (line 1648) | func (u LLaMACppWeightMemoryUsage) Sum() GGUFBytesScalar {
type LLaMACppKVCacheMemoryUsage (line 136) | type LLaMACppKVCacheMemoryUsage struct
method Sum (line 1652) | func (u LLaMACppKVCacheMemoryUsage) Sum() GGUFBytesScalar {
type LLaMACppComputationMemoryUsage (line 144) | type LLaMACppComputationMemoryUsage struct
method Sum (line 1656) | func (u LLaMACppComputationMemoryUsage) Sum() GGUFBytesScalar {
method EstimateLLaMACppRun (line 157) | func (gf *GGUFFile) EstimateLLaMACppRun(opts ...GGUFRunEstimateOption) (...
method estimateLLaMACppRunInModel (line 249) | func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOption...
method estimateLLaMACppRunInProjector (line 913) | func (gf *GGUFFile) estimateLLaMACppRunInProjector(o *_GGUFRunEstimateOp...
method estimateLLaMACppRunInAdapter (line 1281) | func (gf *GGUFFile) estimateLLaMACppRunInAdapter(o *_GGUFRunEstimateOpti...
method estimateLLaMACppRunInIMatrix (line 1404) | func (gf *GGUFFile) estimateLLaMACppRunInIMatrix(_ *_GGUFRunEstimateOpti...
type LLaMACppRunEstimateSummary (line 1436) | type LLaMACppRunEstimateSummary struct
type LLaMACppRunEstimateSummaryItem (line 1484) | type LLaMACppRunEstimateSummaryItem struct
type LLaMACppRunEstimateMemory (line 1499) | type LLaMACppRunEstimateMemory struct
function ClipAligning (line 1662) | func ClipAligning(x, n uint64) uint64 {
FILE: file_estimate__llamacpp_test.go
function TestGGUFFile_EstimateLLaMACppRun (line 10) | func TestGGUFFile_EstimateLLaMACppRun(t *testing.T) {
function TestGGUFFile_EstimateLLaMACppRun_ContextSize (line 68) | func TestGGUFFile_EstimateLLaMACppRun_ContextSize(t *testing.T) {
function TestGGUFFile_EstimateLLaMACppRun_OffloadLayers (line 97) | func TestGGUFFile_EstimateLLaMACppRun_OffloadLayers(t *testing.T) {
FILE: file_estimate__stablediffusioncpp.go
type StableDiffusionCppRunEstimate (line 16) | type StableDiffusionCppRunEstimate struct
method SummarizeItem (line 445) | func (e StableDiffusionCppRunEstimate) SummarizeItem(
method Summarize (line 552) | func (e StableDiffusionCppRunEstimate) Summarize(
type StableDiffusionCppRunDeviceUsage (line 52) | type StableDiffusionCppRunDeviceUsage struct
method EstimateStableDiffusionCppRun (line 74) | func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimat...
type StableDiffusionCppRunEstimateSummary (line 387) | type StableDiffusionCppRunEstimateSummary struct
type StableDiffusionCppRunEstimateSummaryItem (line 416) | type StableDiffusionCppRunEstimateSummaryItem struct
type StableDiffusionCppRunEstimateMemory (line 427) | type StableDiffusionCppRunEstimateMemory struct
function normalizeArchitecture (line 572) | func normalizeArchitecture(arch string) string {
FILE: file_estimate__stablediffusioncpp_test.go
function TestGGUFFile_EstimateStableDiffusionRun (line 10) | func TestGGUFFile_EstimateStableDiffusionRun(t *testing.T) {
FILE: file_estimate_option.go
type _GGUFRunEstimateOptions (line 12) | type _GGUFRunEstimateOptions struct
type GGUFRunOverriddenTensor (line 61) | type GGUFRunOverriddenTensor struct
method ParseBufferType (line 126) | func (odt *GGUFRunOverriddenTensor) ParseBufferType() (GGUFRunOverridd...
type GGUFRunDeviceMetric (line 89) | type GGUFRunDeviceMetric struct
type GGUFRunEstimateOption (line 101) | type GGUFRunEstimateOption
type GGUFRunOverriddenTensorBufferType (line 105) | type GGUFRunOverriddenTensorBufferType
constant _ (line 108) | _ GGUFRunOverriddenTensorBufferType = iota
constant GGUFRunOverriddenTensorBufferTypeCPU (line 109) | GGUFRunOverriddenTensorBufferTypeCPU
constant GGUFRunOverriddenTensorBufferTypeGPU (line 110) | GGUFRunOverriddenTensorBufferTypeGPU
constant GGUFRunOverriddenTensorBufferTypeRPC (line 111) | GGUFRunOverriddenTensorBufferTypeRPC
constant GGUFRunOverriddenTensorBufferTypeUnknown (line 112) | GGUFRunOverriddenTensorBufferTypeUnknown
function WithParallelSize (line 152) | func WithParallelSize(size int32) GGUFRunEstimateOption {
function WithFlashAttention (line 162) | func WithFlashAttention() GGUFRunEstimateOption {
function WithMainGPUIndex (line 174) | func WithMainGPUIndex(di int) GGUFRunEstimateOption {
function WithRPCServers (line 181) | func WithRPCServers(srvs []string) GGUFRunEstimateOption {
function WithTensorSplitFraction (line 197) | func WithTensorSplitFraction(fractions []float64) GGUFRunEstimateOption {
function WithOverriddenTensors (line 215) | func WithOverriddenTensors(tensors []GGUFRunOverriddenTensor) GGUFRunEst...
function WithDeviceMetrics (line 233) | func WithDeviceMetrics(metrics []GGUFRunDeviceMetric) GGUFRunEstimateOpt...
function WithLLaMACppContextSize (line 243) | func WithLLaMACppContextSize(size int32) GGUFRunEstimateOption {
function WithLLaMACppRoPE (line 253) | func WithLLaMACppRoPE(
function WithinLLaMACppMaxContextSize (line 277) | func WithinLLaMACppMaxContextSize() GGUFRunEstimateOption {
function WithLLaMACppLogicalBatchSize (line 284) | func WithLLaMACppLogicalBatchSize(size int32) GGUFRunEstimateOption {
function WithLLaMACppPhysicalBatchSize (line 294) | func WithLLaMACppPhysicalBatchSize(size int32) GGUFRunEstimateOption {
function WithLLaMACppCacheKeyType (line 315) | func WithLLaMACppCacheKeyType(t GGMLType) GGUFRunEstimateOption {
function WithLLaMACppCacheValueType (line 324) | func WithLLaMACppCacheValueType(t GGMLType) GGUFRunEstimateOption {
function WithoutLLaMACppOffloadKVCache (line 333) | func WithoutLLaMACppOffloadKVCache() GGUFRunEstimateOption {
function WithLLaMACppOffloadLayers (line 340) | func WithLLaMACppOffloadLayers(layers uint64) GGUFRunEstimateOption {
type LLaMACppSplitMode (line 347) | type LLaMACppSplitMode
constant LLaMACppSplitModeLayer (line 350) | LLaMACppSplitModeLayer LLaMACppSplitMode = iota
constant LLaMACppSplitModeRow (line 351) | LLaMACppSplitModeRow
constant LLaMACppSplitModeNone (line 352) | LLaMACppSplitModeNone
constant _LLAMACppSplitModeMax (line 353) | _LLAMACppSplitModeMax
function WithLLaMACppSplitMode (line 357) | func WithLLaMACppSplitMode(mode LLaMACppSplitMode) GGUFRunEstimateOption {
function WithLLaMACppFullSizeSWACache (line 366) | func WithLLaMACppFullSizeSWACache() GGUFRunEstimateOption {
function WithLLaMACppVisualMaxImageSize (line 373) | func WithLLaMACppVisualMaxImageSize(size uint32) GGUFRunEstimateOption {
function WithLLaMACppMaxProjectedCache (line 383) | func WithLLaMACppMaxProjectedCache(cacheSize uint32) GGUFRunEstimateOpti...
function WithLLaMACppDrafter (line 393) | func WithLLaMACppDrafter(dft *LLaMACppRunEstimate) GGUFRunEstimateOption {
function WithLLaMACppProjector (line 400) | func WithLLaMACppProjector(prj *LLaMACppRunEstimate) GGUFRunEstimateOpti...
function WithLLaMACppAdapters (line 407) | func WithLLaMACppAdapters(adp []LLaMACppRunEstimate) GGUFRunEstimateOpti...
function WithStableDiffusionCppOffloadLayers (line 417) | func WithStableDiffusionCppOffloadLayers(layers uint64) GGUFRunEstimateO...
function WithStableDiffusionCppBatchCount (line 424) | func WithStableDiffusionCppBatchCount(count int32) GGUFRunEstimateOption {
function WithStableDiffusionCppHeight (line 434) | func WithStableDiffusionCppHeight(height uint32) GGUFRunEstimateOption {
function WithStableDiffusionCppWidth (line 444) | func WithStableDiffusionCppWidth(width uint32) GGUFRunEstimateOption {
function WithoutStableDiffusionCppOffloadConditioner (line 454) | func WithoutStableDiffusionCppOffloadConditioner() GGUFRunEstimateOption {
function WithoutStableDiffusionCppOffloadAutoencoder (line 461) | func WithoutStableDiffusionCppOffloadAutoencoder() GGUFRunEstimateOption {
function WithStableDiffusionCppAutoencoderTiling (line 468) | func WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption {
function WithStableDiffusionCppFreeComputeMemoryImmediately (line 475) | func WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimat...
function WithStableDiffusionCppUpscaler (line 482) | func WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) ...
function WithStableDiffusionCppControlNet (line 489) | func WithStableDiffusionCppControlNet(cn *StableDiffusionCppRunEstimate)...
FILE: file_from_distro.go
function ParseGGUFFileFromOllama (line 21) | func ParseGGUFFileFromOllama(ctx context.Context, model string, opts ......
function ParseGGUFFileFromOllamaModel (line 29) | func ParseGGUFFileFromOllamaModel(ctx context.Context, model *OllamaMode...
FILE: file_from_remote.go
function ParseGGUFFileFromHuggingFace (line 18) | func ParseGGUFFileFromHuggingFace(ctx context.Context, repo, file string...
function ParseGGUFFileFromModelScope (line 25) | func ParseGGUFFileFromModelScope(ctx context.Context, repo, file string,...
function ParseGGUFFileRemote (line 33) | func ParseGGUFFileRemote(ctx context.Context, url string, opts ...GGUFRe...
function parseGGUFFileFromRemote (line 113) | func parseGGUFFileFromRemote(ctx context.Context, cli *http.Client, url ...
FILE: file_metadata.go
type GGUFMetadata (line 13) | type GGUFMetadata struct
type GGUFFileType (line 80) | type GGUFFileType
method GGMLType (line 248) | func (t GGUFFileType) GGMLType() GGMLType {
constant GGUFFileTypeMostlyF32 (line 90) | GGUFFileTypeMostlyF32 GGUFFileType = iota
constant GGUFFileTypeMostlyF16 (line 91) | GGUFFileTypeMostlyF16
constant GGUFFileTypeMostlyQ4_0 (line 92) | GGUFFileTypeMostlyQ4_0
constant GGUFFileTypeMostlyQ4_1 (line 93) | GGUFFileTypeMostlyQ4_1
constant GGUFFileTypeMostlyQ4_1_SOME_F16 (line 94) | GGUFFileTypeMostlyQ4_1_SOME_F16
constant GGUFFileTypeMostlyQ4_2 (line 95) | GGUFFileTypeMostlyQ4_2
constant GGUFFileTypeMostlyQ4_3 (line 96) | GGUFFileTypeMostlyQ4_3
constant GGUFFileTypeMostlyQ8_0 (line 97) | GGUFFileTypeMostlyQ8_0
constant GGUFFileTypeMostlyQ5_0 (line 98) | GGUFFileTypeMostlyQ5_0
constant GGUFFileTypeMostlyQ5_1 (line 99) | GGUFFileTypeMostlyQ5_1
constant GGUFFileTypeMostlyQ2_K (line 100) | GGUFFileTypeMostlyQ2_K
constant GGUFFileTypeMostlyQ3_K_S (line 101) | GGUFFileTypeMostlyQ3_K_S
constant GGUFFileTypeMostlyQ3_K_M (line 102) | GGUFFileTypeMostlyQ3_K_M
constant GGUFFileTypeMostlyQ3_K_L (line 103) | GGUFFileTypeMostlyQ3_K_L
constant GGUFFileTypeMostlyQ4_K_S (line 104) | GGUFFileTypeMostlyQ4_K_S
constant GGUFFileTypeMostlyQ4_K_M (line 105) | GGUFFileTypeMostlyQ4_K_M
constant GGUFFileTypeMostlyQ5_K_S (line 106) | GGUFFileTypeMostlyQ5_K_S
constant GGUFFileTypeMostlyQ5_K_M (line 107) | GGUFFileTypeMostlyQ5_K_M
constant GGUFFileTypeMostlyQ6_K (line 108) | GGUFFileTypeMostlyQ6_K
constant GGUFFileTypeMostlyIQ2_XXS (line 109) | GGUFFileTypeMostlyIQ2_XXS
constant GGUFFileTypeMostlyIQ2_XS (line 110) | GGUFFileTypeMostlyIQ2_XS
constant GGUFFileTypeMostlyQ2_K_S (line 111) | GGUFFileTypeMostlyQ2_K_S
constant GGUFFileTypeMostlyIQ3_XS (line 112) | GGUFFileTypeMostlyIQ3_XS
constant GGUFFileTypeMostlyIQ3_XXS (line 113) | GGUFFileTypeMostlyIQ3_XXS
constant GGUFFileTypeMostlyIQ1_S (line 114) | GGUFFileTypeMostlyIQ1_S
constant GGUFFileTypeMostlyIQ4_NL (line 115) | GGUFFileTypeMostlyIQ4_NL
constant GGUFFileTypeMostlyIQ3_S (line 116) | GGUFFileTypeMostlyIQ3_S
constant GGUFFileTypeMostlyIQ3_M (line 117) | GGUFFileTypeMostlyIQ3_M
constant GGUFFileTypeMostlyIQ2_S (line 118) | GGUFFileTypeMostlyIQ2_S
constant GGUFFileTypeMostlyIQ2_M (line 119) | GGUFFileTypeMostlyIQ2_M
constant GGUFFileTypeMostlyIQ4_XS (line 120) | GGUFFileTypeMostlyIQ4_XS
constant GGUFFileTypeMostlyIQ1_M (line 121) | GGUFFileTypeMostlyIQ1_M
constant GGUFFileTypeMostlyBF16 (line 122) | GGUFFileTypeMostlyBF16
constant GGUFFileTypeMostlyQ4_0_4_4 (line 123) | GGUFFileTypeMostlyQ4_0_4_4
constant GGUFFileTypeMostlyQ4_0_4_8 (line 124) | GGUFFileTypeMostlyQ4_0_4_8
constant GGUFFileTypeMostlyQ4_0_8_8 (line 125) | GGUFFileTypeMostlyQ4_0_8_8
constant GGUFFileTypeMostlyTQ1_0 (line 126) | GGUFFileTypeMostlyTQ1_0
constant GGUFFileTypeMostlyTQ2_0 (line 127) | GGUFFileTypeMostlyTQ2_0
constant GGUFFileTypeMostlyMXFP4 (line 128) | GGUFFileTypeMostlyMXFP4
constant _GGUFFileTypeCount (line 129) | _GGUFFileTypeCount
method Metadata (line 158) | func (gf *GGUFFile) Metadata() (gm GGUFMetadata) {
method extractFileType (line 335) | func (gf *GGUFFile) extractFileType(arch string) (fileType GGUFFileType,...
function GetFileType (line 447) | func GetFileType(cm map[GGMLType]int) GGUFFileType {
FILE: file_metadata_test.go
function TestGGUFFile_Metadata (line 14) | func TestGGUFFile_Metadata(t *testing.T) {
function BenchmarkGGUFFile_Metadata (line 30) | func BenchmarkGGUFFile_Metadata(b *testing.B) {
function TestGGUFFile_extractFileType (line 51) | func TestGGUFFile_extractFileType(t *testing.T) {
FILE: file_option.go
type _GGUFReadOptions (line 14) | type _GGUFReadOptions struct
type GGUFReadOption (line 35) | type GGUFReadOption
function UseDebug (line 39) | func UseDebug() GGUFReadOption {
function SkipLargeMetadata (line 47) | func SkipLargeMetadata() GGUFReadOption {
function UseMMap (line 54) | func UseMMap() GGUFReadOption {
function UseBearerAuth (line 61) | func UseBearerAuth(token string) GGUFReadOption {
function UseHeaders (line 68) | func UseHeaders(headers map[string]string) GGUFReadOption {
function UseProxy (line 75) | func UseProxy(url *url.URL) GGUFReadOption {
function SkipProxy (line 82) | func SkipProxy() GGUFReadOption {
function SkipTLSVerification (line 89) | func SkipTLSVerification() GGUFReadOption {
function SkipDNSCache (line 96) | func SkipDNSCache() GGUFReadOption {
function UseBufferSize (line 103) | func UseBufferSize(size int) GGUFReadOption {
function SkipRangeDownloadDetection (line 114) | func SkipRangeDownloadDetection() GGUFReadOption {
function UseCache (line 121) | func UseCache() GGUFReadOption {
function SkipCache (line 129) | func SkipCache() GGUFReadOption {
function DefaultCachePath (line 137) | func DefaultCachePath() string {
function UseCachePath (line 146) | func UseCachePath(path string) GGUFReadOption {
function UseCacheExpiration (line 159) | func UseCacheExpiration(expiration time.Duration) GGUFReadOption {
FILE: file_test.go
function TestParseGGUFFile (line 14) | func TestParseGGUFFile(t *testing.T) {
function BenchmarkParseGGUFFileMMap (line 46) | func BenchmarkParseGGUFFileMMap(b *testing.B) {
function BenchmarkParseGGUFFileSkipLargeMetadata (line 78) | func BenchmarkParseGGUFFileSkipLargeMetadata(b *testing.B) {
function TestParseGGUFFileRemote (line 110) | func TestParseGGUFFileRemote(t *testing.T) {
function BenchmarkParseGGUFFileRemoteWithBufferSize (line 141) | func BenchmarkParseGGUFFileRemoteWithBufferSize(b *testing.B) {
function TestParseGGUFFileFromHuggingFace (line 183) | func TestParseGGUFFileFromHuggingFace(t *testing.T) {
function TestParseGGUFFileFromModelScope (line 212) | func TestParseGGUFFileFromModelScope(t *testing.T) {
function TestParseGGUFFileFromOllama (line 241) | func TestParseGGUFFileFromOllama(t *testing.T) {
function FuzzParseGGUFFile (line 265) | func FuzzParseGGUFFile(f *testing.F) {
function TestParseGGUFFileWithFuzzInput (line 294) | func TestParseGGUFFileWithFuzzInput(t *testing.T) {
FILE: file_tokenizer.go
type GGUFTokenizer (line 4) | type GGUFTokenizer struct
method Tokenizer (line 53) | func (gf *GGUFFile) Tokenizer() (gt GGUFTokenizer) {
FILE: file_tokenizer_test.go
function TestGGUFFile_Tokenizer (line 11) | func TestGGUFFile_Tokenizer(t *testing.T) {
function BenchmarkGGUFFile_Tokenizer (line 27) | func BenchmarkGGUFFile_Tokenizer(b *testing.B) {
FILE: filename.go
type GGUFFilename (line 15) | type GGUFFilename struct
method String (line 65) | func (gn GGUFFilename) String() string {
method IsShard (line 103) | func (gn GGUFFilename) IsShard() bool {
function ParseGGUFFilename (line 30) | func ParseGGUFFilename(name string) *GGUFFilename {
function IsShardGGUFFilename (line 110) | func IsShardGGUFFilename(name string) bool {
function CompleteShardGGUFFilename (line 139) | func CompleteShardGGUFFilename(name string) []string {
function parseInt (line 174) | func parseInt(v string) int {
FILE: filename_test.go
function TestParseGGUFFilename (line 11) | func TestParseGGUFFilename(t *testing.T) {
function TestGGUFFilenameString (line 92) | func TestGGUFFilenameString(t *testing.T) {
function TestIsShardGGUFFilename (line 147) | func TestIsShardGGUFFilename(t *testing.T) {
function TestCompleteShardGGUFFilename (line 181) | func TestCompleteShardGGUFFilename(t *testing.T) {
FILE: gen.regression.go
type LinearRegression (line 20) | type LinearRegression struct
method Fit (line 25) | func (lr *LinearRegression) Fit(xs, ys []float64) {
method Predict (line 48) | func (lr *LinearRegression) Predict(x float64) (y float64) {
type PolynomialRegression (line 52) | type PolynomialRegression struct
method Fit (line 57) | func (pr *PolynomialRegression) Fit(xs, ys []float64) {
method Predict (line 81) | func (pr *PolynomialRegression) Predict(x float64) (y float64) {
function DiffusionModelMemoryUsageRegression (line 89) | func DiffusionModelMemoryUsageRegression(output string) {
function main (line 493) | func main() {
FILE: ggml.go
type GGMLType (line 13) | type GGMLType
method Trait (line 118) | func (t GGMLType) Trait() (GGMLTypeTrait, bool) {
method IsQuantized (line 124) | func (t GGMLType) IsQuantized() bool {
method RowSizeOf (line 140) | func (t GGMLType) RowSizeOf(dimensions []uint64) uint64 {
type GGMLTypeTrait (line 17) | type GGMLTypeTrait struct
constant GGMLTypeF32 (line 30) | GGMLTypeF32 GGMLType = iota
constant GGMLTypeF16 (line 31) | GGMLTypeF16
constant GGMLTypeQ4_0 (line 32) | GGMLTypeQ4_0
constant GGMLTypeQ4_1 (line 33) | GGMLTypeQ4_1
constant GGMLTypeQ4_2 (line 34) | GGMLTypeQ4_2
constant GGMLTypeQ4_3 (line 35) | GGMLTypeQ4_3
constant GGMLTypeQ5_0 (line 36) | GGMLTypeQ5_0
constant GGMLTypeQ5_1 (line 37) | GGMLTypeQ5_1
constant GGMLTypeQ8_0 (line 38) | GGMLTypeQ8_0
constant GGMLTypeQ8_1 (line 39) | GGMLTypeQ8_1
constant GGMLTypeQ2_K (line 40) | GGMLTypeQ2_K
constant GGMLTypeQ3_K (line 41) | GGMLTypeQ3_K
constant GGMLTypeQ4_K (line 42) | GGMLTypeQ4_K
constant GGMLTypeQ5_K (line 43) | GGMLTypeQ5_K
constant GGMLTypeQ6_K (line 44) | GGMLTypeQ6_K
constant GGMLTypeQ8_K (line 45) | GGMLTypeQ8_K
constant GGMLTypeIQ2_XXS (line 46) | GGMLTypeIQ2_XXS
constant GGMLTypeIQ2_XS (line 47) | GGMLTypeIQ2_XS
constant GGMLTypeIQ3_XXS (line 48) | GGMLTypeIQ3_XXS
constant GGMLTypeIQ1_S (line 49) | GGMLTypeIQ1_S
constant GGMLTypeIQ4_NL (line 50) | GGMLTypeIQ4_NL
constant GGMLTypeIQ3_S (line 51) | GGMLTypeIQ3_S
constant GGMLTypeIQ2_S (line 52) | GGMLTypeIQ2_S
constant GGMLTypeIQ4_XS (line 53) | GGMLTypeIQ4_XS
constant GGMLTypeI8 (line 54) | GGMLTypeI8
constant GGMLTypeI16 (line 55) | GGMLTypeI16
constant GGMLTypeI32 (line 56) | GGMLTypeI32
constant GGMLTypeI64 (line 57) | GGMLTypeI64
constant GGMLTypeF64 (line 58) | GGMLTypeF64
constant GGMLTypeIQ1_M (line 59) | GGMLTypeIQ1_M
constant GGMLTypeBF16 (line 60) | GGMLTypeBF16
constant GGMLTypeQ4_0_4_4 (line 61) | GGMLTypeQ4_0_4_4
constant GGMLTypeQ4_0_4_8 (line 62) | GGMLTypeQ4_0_4_8
constant GGMLTypeQ4_0_8_8 (line 63) | GGMLTypeQ4_0_8_8
constant GGMLTypeTQ1_0 (line 64) | GGMLTypeTQ1_0
constant GGMLTypeTQ2_0 (line 65) | GGMLTypeTQ2_0
constant GGMLTypeIQ4_NL_4_4 (line 66) | GGMLTypeIQ4_NL_4_4
constant GGMLTypeIQ4_NL_4_8 (line 67) | GGMLTypeIQ4_NL_4_8
constant GGMLTypeIQ4_NL_8_8 (line 68) | GGMLTypeIQ4_NL_8_8
constant GGMLTypeMXFP4 (line 69) | GGMLTypeMXFP4
constant _GGMLTypeCount (line 70) | _GGMLTypeCount
function GGMLMemoryPadding (line 160) | func GGMLMemoryPadding(size uint64) uint64 {
function GGMLPadding (line 167) | func GGMLPadding(size, align uint64) uint64 {
constant GGMLTensorSize (line 175) | GGMLTensorSize = 368
constant GGMLObjectSize (line 179) | GGMLObjectSize = 32
function GGMLTensorOverhead (line 184) | func GGMLTensorOverhead() uint64 {
constant GGMLComputationGraphSize (line 191) | GGMLComputationGraphSize = 80
constant GGMLComputationBitsetSize (line 195) | GGMLComputationBitsetSize = 4
function GGMLComputationGraphOverhead (line 200) | func GGMLComputationGraphOverhead(nodes uint64, grads bool) uint64 {
function GGMLHashSize (line 221) | func GGMLHashSize(base uint64) uint64 {
function GGMLBitsetSize (line 243) | func GGMLBitsetSize(n uint64) uint64 {
FILE: ollama_model.go
constant OllamaDefaultScheme (line 22) | OllamaDefaultScheme = "https"
constant OllamaDefaultRegistry (line 23) | OllamaDefaultRegistry = "registry.ollama.ai"
constant OllamaDefaultNamespace (line 24) | OllamaDefaultNamespace = "library"
constant OllamaDefaultTag (line 25) | OllamaDefaultTag = "latest"
type OllamaModel (line 31) | type OllamaModel struct
method String (line 147) | func (om *OllamaModel) String() string {
method GetLayer (line 167) | func (om *OllamaModel) GetLayer(mediaType string) (OllamaModelLayer, b...
method SearchLayers (line 177) | func (om *OllamaModel) SearchLayers(mediaTypeRegex *regexp.Regexp) []O...
method WebPageURL (line 188) | func (om *OllamaModel) WebPageURL() *url.URL {
method Complete (line 197) | func (om *OllamaModel) Complete(ctx context.Context, cli *http.Client)...
method Params (line 234) | func (om *OllamaModel) Params(ctx context.Context, cli *http.Client) (...
method Template (line 276) | func (om *OllamaModel) Template(ctx context.Context, cli *http.Client)...
method System (line 298) | func (om *OllamaModel) System(ctx context.Context, cli *http.Client) (...
method License (line 320) | func (om *OllamaModel) License(ctx context.Context, cli *http.Client) ...
method Messages (line 352) | func (om *OllamaModel) Messages(ctx context.Context, cli *http.Client)...
type OllamaModelLayer (line 54) | type OllamaModelLayer struct
method BlobURL (line 384) | func (ol *OllamaModelLayer) BlobURL() *url.URL {
method FetchBlob (line 398) | func (ol *OllamaModelLayer) FetchBlob(ctx context.Context, cli *http.C...
method FetchBlobFunc (line 409) | func (ol *OllamaModelLayer) FetchBlobFunc(ctx context.Context, cli *ht...
function ParseOllamaModel (line 72) | func ParseOllamaModel(model string, opts ...OllamaModelOption) *OllamaMo...
FILE: ollama_model_option.go
type _OllamaModelOptions (line 9) | type _OllamaModelOptions struct
type OllamaModelOption (line 15) | type OllamaModelOption
function SetOllamaModelBaseURL (line 20) | func SetOllamaModelBaseURL(baseURL string) OllamaModelOption {
function SetOllamaModelDefaultScheme (line 42) | func SetOllamaModelDefaultScheme(scheme string) OllamaModelOption {
function SetOllamaModelDefaultRegistry (line 52) | func SetOllamaModelDefaultRegistry(registry string) OllamaModelOption {
function SetOllamaModelDefaultNamespace (line 62) | func SetOllamaModelDefaultNamespace(namespace string) OllamaModelOption {
function SetOllamaModelDefaultTag (line 72) | func SetOllamaModelDefaultTag(tag string) OllamaModelOption {
FILE: ollama_model_test.go
function TestParseOllamaModel (line 9) | func TestParseOllamaModel(t *testing.T) {
FILE: ollama_registry_authenticate.go
constant httpHeaderWWWAuthenticate (line 31) | httpHeaderWWWAuthenticate = "WWW-Authenticate"
constant httpHeaderAuthorization (line 32) | httpHeaderAuthorization = "Authorization"
function OllamaUserAgent (line 38) | func OllamaUserAgent() string {
function OllamaRegistryAuthorizeRetry (line 46) | func OllamaRegistryAuthorizeRetry(resp *http.Response, cli *http.Client)...
function OllamaRegistryAuthorize (line 75) | func OllamaRegistryAuthorize(ctx context.Context, cli *http.Client, auth...
function OllamaSingKeyLoad (line 173) | func OllamaSingKeyLoad() (ssh.Signer, error) {
FILE: scalar.go
constant _Ki (line 10) | _Ki = 1 << ((iota + 1) * 10)
constant _Mi (line 11) | _Mi
constant _Gi (line 12) | _Gi
constant _Ti (line 13) | _Ti
constant _Pi (line 14) | _Pi
constant _K (line 18) | _K = 1e3
constant _M (line 19) | _M = 1e6
constant _G (line 20) | _G = 1e9
constant _T (line 21) | _T = 1e12
constant _P (line 22) | _P = 1e15
constant _Thousand (line 26) | _Thousand = 1e3
constant _Million (line 27) | _Million = 1e6
constant _Billion (line 28) | _Billion = 1e9
constant _Trillion (line 29) | _Trillion = 1e12
constant _Quadrillion (line 30) | _Quadrillion = 1e15
type SizeScalar (line 35) | type SizeScalar
method String (line 107) | func (s SizeScalar) String() string {
type FLOPSScalar (line 38) | type FLOPSScalar
method String (line 144) | func (s FLOPSScalar) String() string {
type BytesPerSecondScalar (line 41) | type BytesPerSecondScalar
method String (line 188) | func (s BytesPerSecondScalar) String() string {
function ParseSizeScalar (line 88) | func ParseSizeScalar(s string) (_ SizeScalar, err error) {
function ParseFLOPSScalar (line 124) | func ParseFLOPSScalar(s string) (_ FLOPSScalar, err error) {
function ParseBytesPerSecondScalar (line 161) | func ParseBytesPerSecondScalar(s string) (_ BytesPerSecondScalar, err er...
type GGUFBytesScalar (line 206) | type GGUFBytesScalar
method String (line 242) | func (s GGUFBytesScalar) String() string {
type GGUFParametersScalar (line 209) | type GGUFParametersScalar
method String (line 263) | func (s GGUFParametersScalar) String() string {
type GGUFBitsPerWeightScalar (line 212) | type GGUFBitsPerWeightScalar
method String (line 279) | func (s GGUFBitsPerWeightScalar) String() string {
type GGUFTokensPerSecondScalar (line 215) | type GGUFTokensPerSecondScalar
method String (line 286) | func (s GGUFTokensPerSecondScalar) String() string {
function ParseGGUFBytesScalar (line 219) | func ParseGGUFBytesScalar(s string) (_ GGUFBytesScalar, err error) {
FILE: scalar_test.go
function TestParseSizeScalar (line 9) | func TestParseSizeScalar(t *testing.T) {
function TestParseFLOPSScalar (line 32) | func TestParseFLOPSScalar(t *testing.T) {
function TestParseBytesPerSecondScalar (line 55) | func TestParseBytesPerSecondScalar(t *testing.T) {
function TestParseGGUFBytesScalar (line 94) | func TestParseGGUFBytesScalar(t *testing.T) {
FILE: util/anyx/any.go
function Number (line 12) | func Number[T constraints.Integer | constraints.Float](v any) T {
function Bool (line 71) | func Bool(v any) bool {
function String (line 89) | func String(v any) string {
FILE: util/bytex/pool.go
constant defaultSize (line 8) | defaultSize = 32 * 1024
function GetBytes (line 25) | func GetBytes(size ...uint64) Bytes {
function WithBytes (line 52) | func WithBytes(fn func(Bytes) error, size ...uint64) error {
function GetBuffer (line 64) | func GetBuffer(size ...uint64) BytesBuffer {
function WithBuffer (line 71) | func WithBuffer(fn func(BytesBuffer) error, size ...uint64) error {
function Put (line 82) | func Put[T Bytes | BytesBuffer](buf T) {
FILE: util/funcx/error.go
function NoError (line 5) | func NoError[T any](t T, _ error) T {
function NoError2 (line 11) | func NoError2[T, U any](t T, u U, _ error) (T, U) {
function NoError3 (line 17) | func NoError3[T, U, V any](t T, u U, v V, _ error) (T, U, V) {
function NoError4 (line 23) | func NoError4[T, U, V, W any](t T, u U, v V, w W, _ error) (T, U, V, W) {
function MustNoError (line 30) | func MustNoError[T any](t T, e error) T {
function MustNoError2 (line 40) | func MustNoError2[T, U any](t T, u U, e error) (T, U) {
function MustNoError3 (line 50) | func MustNoError3[T, U, V any](t T, u U, v V, e error) (T, U, V) {
function MustNoError4 (line 60) | func MustNoError4[T, U, V, W any](t T, u U, v V, w W, e error) (T, U, V,...
FILE: util/httpx/client.go
function Client (line 34) | func Client(opts ...*ClientOption) *http.Client {
function NewGetRequestWithContext (line 103) | func NewGetRequestWithContext(ctx context.Context, uri string) (*http.Re...
function NewGetRequest (line 109) | func NewGetRequest(uri string) (*http.Request, error) {
function NewHeadRequestWithContext (line 115) | func NewHeadRequestWithContext(ctx context.Context, uri string) (*http.R...
function NewHeadRequest (line 121) | func NewHeadRequest(uri string) (*http.Request, error) {
function NewPostRequestWithContext (line 127) | func NewPostRequestWithContext(ctx context.Context, uri string, body io....
function NewPostRequest (line 133) | func NewPostRequest(uri string, body io.Reader) (*http.Request, error) {
function NewPutRequestWithContext (line 139) | func NewPutRequestWithContext(ctx context.Context, uri string, body io.R...
function NewPutRequest (line 145) | func NewPutRequest(uri string, body io.Reader) (*http.Request, error) {
function NewPatchRequestWithContext (line 151) | func NewPatchRequestWithContext(ctx context.Context, uri string, body io...
function NewPatchRequest (line 157) | func NewPatchRequest(uri string, body io.Reader) (*http.Request, error) {
function NewDeleteRequestWithContext (line 163) | func NewDeleteRequestWithContext(ctx context.Context, uri string) (*http...
function NewDeleteRequest (line 169) | func NewDeleteRequest(uri string) (*http.Request, error) {
function NewConnectRequestWithContext (line 175) | func NewConnectRequestWithContext(ctx context.Context, uri string) (*htt...
function NewConnectRequest (line 181) | func NewConnectRequest(uri string) (*http.Request, error) {
function NewOptionsRequestWithContext (line 187) | func NewOptionsRequestWithContext(ctx context.Context, uri string) (*htt...
function NewOptionsRequest (line 193) | func NewOptionsRequest(uri string) (*http.Request, error) {
function NewTraceRequestWithContext (line 199) | func NewTraceRequestWithContext(ctx context.Context, uri string) (*http....
function NewTraceRequest (line 205) | func NewTraceRequest(uri string) (*http.Request, error) {
function Error (line 211) | func Error(rw http.ResponseWriter, code int) {
function Close (line 216) | func Close(resp *http.Response) {
function BodyBytes (line 223) | func BodyBytes(resp *http.Response) []byte {
function BodyString (line 233) | func BodyString(resp *http.Response) string {
function Do (line 243) | func Do(cli *http.Client, req *http.Request, respFunc func(*http.Respons...
FILE: util/httpx/client_helper.go
type JSONFormatter (line 19) | type JSONFormatter struct
method Match (line 24) | func (j *JSONFormatter) Match(mediatype string) bool {
method Format (line 29) | func (j *JSONFormatter) Format(w io.Writer, src []byte) error {
type RoundTripperChain (line 46) | type RoundTripperChain struct
method RoundTrip (line 51) | func (c RoundTripperChain) RoundTrip(req *http.Request) (*http.Respons...
type RoundTripperFunc (line 63) | type RoundTripperFunc
method RoundTrip (line 65) | func (fn RoundTripperFunc) RoundTrip(req *http.Request) (*http.Respons...
FILE: util/httpx/client_options.go
type ClientOption (line 11) | type ClientOption struct
method WithTransport (line 31) | func (o *ClientOption) WithTransport(opt *TransportOption) *ClientOpti...
method WithTimeout (line 44) | func (o *ClientOption) WithTimeout(timeout time.Duration) *ClientOption {
method WithDebug (line 53) | func (o *ClientOption) WithDebug() *ClientOption {
method WithRetryIf (line 65) | func (o *ClientOption) WithRetryIf(retryIf RetryFunc) *ClientOption {
method WithRetryBackoff (line 74) | func (o *ClientOption) WithRetryBackoff(waitMin, waitMax time.Duration...
method WithUserAgent (line 83) | func (o *ClientOption) WithUserAgent(ua string) *ClientOption {
method WithBearerAuth (line 91) | func (o *ClientOption) WithBearerAuth(token string) *ClientOption {
method WithBasicAuth (line 99) | func (o *ClientOption) WithBasicAuth(username, password string) *Clien...
method WithHeader (line 107) | func (o *ClientOption) WithHeader(key, value string) *ClientOption {
method WithHeaders (line 115) | func (o *ClientOption) WithHeaders(headers map[string]string) *ClientO...
method WithRoundTripper (line 125) | func (o *ClientOption) WithRoundTripper(rt func(req *http.Request) err...
method If (line 135) | func (o *ClientOption) If(condition bool, then func(*ClientOption) *Cl...
function ClientOptions (line 21) | func ClientOptions() *ClientOption {
type RetryFunc (line 61) | type RetryFunc
function DefaultRetry (line 144) | func DefaultRetry(resp *http.Response, respErr error) bool {
function createRetryBackoff (line 177) | func createRetryBackoff(waitMin, waitMax time.Duration, attemptMax int) ...
FILE: util/httpx/file.go
type SeekerFile (line 16) | type SeekerFile struct
method Close (line 91) | func (f *SeekerFile) Close() error {
method Len (line 98) | func (f *SeekerFile) Len() int64 {
method ReadAt (line 102) | func (f *SeekerFile) ReadAt(p []byte, off int64) (int, error) {
method Read (line 165) | func (f *SeekerFile) Read(p []byte) (int, error) {
method sync (line 171) | func (f *SeekerFile) sync(off int64, reset bool) error {
method skip (line 203) | func (f *SeekerFile) skip(dif int64) error {
function OpenSeekerFile (line 26) | func OpenSeekerFile(cli *http.Client, req *http.Request, opts ...*Seeker...
type _WriterOnly (line 220) | type _WriterOnly struct
method Write (line 224) | func (w _WriterOnly) Write(p []byte) (int, error) {
FILE: util/httpx/file_options.go
type SeekerFileOption (line 3) | type SeekerFileOption struct
method WithBufferSize (line 18) | func (o *SeekerFileOption) WithBufferSize(bufSize int) *SeekerFileOpti...
method WithSize (line 29) | func (o *SeekerFileOption) WithSize(size int) *SeekerFileOption {
method WithoutRangeDownloadDetect (line 46) | func (o *SeekerFileOption) WithoutRangeDownloadDetect() *SeekerFileOpt...
method If (line 56) | func (o *SeekerFileOption) If(condition bool, then func(*SeekerFileOpt...
function SeekerFileOptions (line 9) | func SeekerFileOptions() *SeekerFileOption {
FILE: util/httpx/proxy.go
function init (line 14) | func init() {
function ProxyFromEnvironment (line 27) | func ProxyFromEnvironment(r *http.Request) (*url.URL, error) {
FILE: util/httpx/resolver.go
function DNSCacheDialContext (line 8) | func DNSCacheDialContext(dialer *net.Dialer) func(context.Context, strin...
FILE: util/httpx/transport.go
function Transport (line 16) | func Transport(opts ...*TransportOption) *http.Transport {
FILE: util/httpx/transport_options.go
type TransportOption (line 11) | type TransportOption struct
method WithProxy (line 41) | func (o *TransportOption) WithProxy(proxy func(*http.Request) (*url.UR...
method WithoutProxy (line 50) | func (o *TransportOption) WithoutProxy() *TransportOption {
method WithKeepalive (line 59) | func (o *TransportOption) WithKeepalive(timeoutAndKeepalive ...time.Du...
method WithoutKeepalive (line 77) | func (o *TransportOption) WithoutKeepalive() *TransportOption {
method WithInsecureVerify (line 88) | func (o *TransportOption) WithInsecureVerify() *TransportOption {
method WithoutInsecureVerify (line 97) | func (o *TransportOption) WithoutInsecureVerify() *TransportOption {
method TimeoutForDial (line 110) | func (o *TransportOption) TimeoutForDial(timeout time.Duration) *Trans...
method TimeoutForResponseHeader (line 123) | func (o *TransportOption) TimeoutForResponseHeader(timeout time.Durati...
method TimeoutForTLSHandshake (line 136) | func (o *TransportOption) TimeoutForTLSHandshake(timeout time.Duration...
method TimeoutForIdleConn (line 149) | func (o *TransportOption) TimeoutForIdleConn(timeout time.Duration) *T...
method WithTLSClientConfig (line 158) | func (o *TransportOption) WithTLSClientConfig(config *tls.Config) *Tra...
method WithoutDNSCache (line 167) | func (o *TransportOption) WithoutDNSCache() *TransportOption {
method WithDialer (line 176) | func (o *TransportOption) WithDialer(dialer *net.Dialer) *TransportOpt...
method Customize (line 186) | func (o *TransportOption) Customize(fn func(*http.Transport)) *Transpo...
method If (line 197) | func (o *TransportOption) If(condition bool, then func(*TransportOptio...
function TransportOptions (line 16) | func TransportOptions() *TransportOption {
FILE: util/json/common.go
function MustMarshal (line 19) | func MustMarshal(v any) []byte {
function MustUnmarshal (line 30) | func MustUnmarshal(data []byte, v any) {
function MustMarshalIndent (line 39) | func MustMarshalIndent(v any, prefix, indent string) []byte {
function ShouldMarshal (line 50) | func ShouldMarshal(v any) []byte {
function ShouldUnmarshal (line 57) | func ShouldUnmarshal(data []byte, v any) {
function ShouldMarshalIndent (line 63) | func ShouldMarshalIndent(v any, prefix, indent string) []byte {
FILE: util/json/jsoniter.go
function init (line 15) | func init() {
FILE: util/osx/env.go
function ExistEnv (line 8) | func ExistEnv(key string) bool {
function Getenv (line 16) | func Getenv(key string, def ...string) string {
function ExpandEnv (line 27) | func ExpandEnv(key string, def ...string) string {
FILE: util/osx/file.go
function InlineTilde (line 11) | func InlineTilde(path string) string {
function Open (line 25) | func Open(path string) (*os.File, error) {
function Exists (line 32) | func Exists(path string, checks ...func(os.FileInfo) bool) bool {
function ExistsDir (line 55) | func ExistsDir(path string) bool {
function ExistsLink (line 62) | func ExistsLink(path string) bool {
function ExistsFile (line 69) | func ExistsFile(path string) bool {
function ExistsSocket (line 76) | func ExistsSocket(path string) bool {
function ExistsDevice (line 83) | func ExistsDevice(path string) bool {
function Close (line 90) | func Close(c io.Closer) {
function WriteFile (line 99) | func WriteFile(name string, data []byte, perm os.FileMode) error {
function CreateFile (line 112) | func CreateFile(name string, perm os.FileMode) (*os.File, error) {
function OpenFile (line 125) | func OpenFile(name string, flag int, perm os.FileMode) (*os.File, error) {
FILE: util/osx/file_mmap.go
type MmapFile (line 26) | type MmapFile struct
method Close (line 61) | func (f *MmapFile) Close() error {
method Bytes (line 71) | func (f *MmapFile) Bytes() []byte {
method Len (line 75) | func (f *MmapFile) Len() int64 {
method ReadAt (line 81) | func (f *MmapFile) ReadAt(p []byte, off int64) (_ int, err error) {
function OpenMmapFile (line 31) | func OpenMmapFile(path string) (*MmapFile, error) {
function OpenMmapFileWithSize (line 35) | func OpenMmapFileWithSize(path string, size int) (*MmapFile, error) {
FILE: util/osx/file_mmap_js.go
function mmap (line 21) | func mmap(f *os.File, length int) ([]byte, error) {
function munmap (line 25) | func munmap(b []byte) (err error) {
FILE: util/osx/file_mmap_unix.go
function mmap (line 24) | func mmap(f *os.File, length int) ([]byte, error) {
function munmap (line 28) | func munmap(b []byte) (err error) {
FILE: util/osx/file_mmap_windows.go
function mmap (line 9) | func mmap(f *os.File, size int) ([]byte, error) {
function munmap (line 28) | func munmap(b []byte) error {
FILE: util/osx/file_mmap_windows_386.go
constant maxMapSize (line 16) | maxMapSize = 0x7FFFFFFF
FILE: util/osx/file_mmap_windows_non386.go
constant maxMapSize (line 18) | maxMapSize = 0xFFFFFFFFFFFF
FILE: util/osx/homedir.go
function UserHomeDir (line 11) | func UserHomeDir() string {
FILE: util/ptr/pointer.go
function Int (line 9) | func Int(v int) *int {
function IntDeref (line 13) | func IntDeref(v *int, def int) int {
function Int8 (line 17) | func Int8(v int8) *int8 {
function Int8Deref (line 21) | func Int8Deref(v *int8, def int8) int8 {
function Int16 (line 25) | func Int16(v int16) *int16 {
function Int16Deref (line 29) | func Int16Deref(v *int16, def int16) int16 {
function Int32 (line 33) | func Int32(v int32) *int32 {
function Int32Deref (line 37) | func Int32Deref(v *int32, def int32) int32 {
function Int64 (line 41) | func Int64(v int64) *int64 {
function Int64Deref (line 45) | func Int64Deref(v *int64, def int64) int64 {
function Uint (line 49) | func Uint(v uint) *uint {
function UintDeref (line 53) | func UintDeref(v *uint, def uint) uint {
function Uint8 (line 57) | func Uint8(v uint8) *uint8 {
function Uint8Deref (line 61) | func Uint8Deref(v *uint8, def uint8) uint8 {
function Uint16 (line 65) | func Uint16(v uint16) *uint16 {
function Uint16Deref (line 69) | func Uint16Deref(v *uint16, def uint16) uint16 {
function Uint32 (line 73) | func Uint32(v uint32) *uint32 {
function Uint32Deref (line 77) | func Uint32Deref(v *uint32, def uint32) uint32 {
function Uint64 (line 81) | func Uint64(v uint64) *uint64 {
function Uint64Deref (line 85) | func Uint64Deref(v *uint64, def uint64) uint64 {
function Float32 (line 89) | func Float32(v float32) *float32 {
function Float32Deref (line 93) | func Float32Deref(v *float32, def float32) float32 {
function Float64 (line 97) | func Float64(v float64) *float64 {
function Float64Deref (line 101) | func Float64Deref(v *float64, def float64) float64 {
function String (line 105) | func String(v string) *string {
function StringDeref (line 109) | func StringDeref(v *string, def string) string {
function Bool (line 113) | func Bool(v bool) *bool {
function BoolDeref (line 117) | func BoolDeref(v *bool, def bool) bool {
function Duration (line 121) | func Duration(v time.Duration) *time.Duration {
function DurationDeref (line 125) | func DurationDeref(v *time.Duration, def time.Duration) time.Duration {
function Time (line 129) | func Time(v time.Time) *time.Time {
function TimeDeref (line 133) | func TimeDeref(v *time.Time, def time.Time) time.Time {
type Pointerable (line 137) | type Pointerable interface
function Ref (line 141) | func Ref[T Pointerable](v T) *T {
function To (line 145) | func To[T Pointerable](v T) *T {
function Deref (line 149) | func Deref[T Pointerable](ptr *T, def T) T {
function Equal (line 157) | func Equal[T Pointerable](a, b *T) bool {
FILE: util/signalx/handler.go
function Handler (line 12) | func Handler() context.Context {
FILE: util/slicex/search.go
function UpperBound (line 6) | func UpperBound[T constraints.Integer | constraints.Float](s []T, e T) i...
FILE: util/stringx/bytes.go
function FromBytes (line 6) | func FromBytes(b *[]byte) string {
function ToBytes (line 12) | func ToBytes(s *string) (bs []byte) {
FILE: util/stringx/random.go
function RandomBytes (line 17) | func RandomBytes(n int) []byte {
function RandomHex (line 30) | func RandomHex(n int) string { return hex.EncodeToString(RandomBytes(n)) }
function RandomString (line 34) | func RandomString(n int, letters ...string) string {
function RandomBase64 (line 59) | func RandomBase64(n int) string {
FILE: util/stringx/strings.go
function CutFromLeft (line 11) | func CutFromLeft(s, sep string) (before, after string, found bool) {
function CutFromRight (line 21) | func CutFromRight(s, sep string) (before, after string, found bool) {
function ReplaceAllFunc (line 30) | func ReplaceAllFunc(s string, f func(rune) rune) string {
function HasSuffixes (line 39) | func HasSuffixes(s string, suffixes ...string) bool {
FILE: util/stringx/sum.go
function SumByFNV64a (line 10) | func SumByFNV64a(s string, ss ...string) string {
function SumBytesByFNV64a (line 23) | func SumBytesByFNV64a(bs []byte, bss ...[]byte) string {
function SumBySHA256 (line 36) | func SumBySHA256(s string, ss ...string) string {
function SumBytesBySHA256 (line 49) | func SumBytesBySHA256(bs []byte, bss ...[]byte) string {
function SumBySHA224 (line 62) | func SumBySHA224(s string, ss ...string) string {
function SumBytesBySHA224 (line 75) | func SumBytesBySHA224(bs []byte, bss ...[]byte) string {
FILE: zz_generated.diffusion_model_memory_usage.regression.go
function GuessSD1DiffusionModelMemoryUsage (line 7) | func GuessSD1DiffusionModelMemoryUsage(width, height uint32, flashAttent...
function GuessSD2DiffusionModelMemoryUsage (line 21) | func GuessSD2DiffusionModelMemoryUsage(width, height uint32, flashAttent...
function GuessSDXLDiffusionModelMemoryUsage (line 40) | func GuessSDXLDiffusionModelMemoryUsage(width, height uint32, flashAtten...
function GuessSDXLRefinerDiffusionModelMemoryUsage (line 59) | func GuessSDXLRefinerDiffusionModelMemoryUsage(width, height uint32, fla...
function GuessSD3MediumDiffusionModelMemoryUsage (line 78) | func GuessSD3MediumDiffusionModelMemoryUsage(width, height uint32, flash...
function GuessSD35MediumDiffusionModelMemoryUsage (line 92) | func GuessSD35MediumDiffusionModelMemoryUsage(width, height uint32, flas...
function GuessSD35LargeDiffusionModelMemoryUsage (line 106) | func GuessSD35LargeDiffusionModelMemoryUsage(width, height uint32, flash...
function GuessFLUXDiffusionModelMemoryUsage (line 120) | func GuessFLUXDiffusionModelMemoryUsage(width, height uint32, flashAtten...
FILE: zz_generated.ggmltype.stringer.go
function _ (line 7) | func _() {
constant _GGMLType_name (line 54) | _GGMLType_name = "F32F16Q4_0Q4_1Q4_2Q4_3Q5_0Q5_1Q8_0Q8_1Q2_KQ3_KQ4_KQ5_K...
method String (line 58) | func (i GGMLType) String() string {
FILE: zz_generated.gguffiletype.stringer.go
function _ (line 7) | func _() {
constant _GGUFFileType_name (line 53) | _GGUFFileType_name = "MOSTLY_F32MOSTLY_F16MOSTLY_Q4_0MOSTLY_Q4_1MOSTLY_Q...
method String (line 57) | func (i GGUFFileType) String() string {
FILE: zz_generated.ggufmagic.stringer.go
function _ (line 7) | func _() {
constant _GGUFMagic_name_0 (line 19) | _GGUFMagic_name_0 = "GGUF"
constant _GGUFMagic_name_1 (line 20) | _GGUFMagic_name_1 = "GGUF"
constant _GGUFMagic_name_2 (line 21) | _GGUFMagic_name_2 = "GGJT"
constant _GGUFMagic_name_3 (line 22) | _GGUFMagic_name_3 = "GGMF"
constant _GGUFMagic_name_4 (line 23) | _GGUFMagic_name_4 = "GGML"
method String (line 26) | func (i GGUFMagic) String() string {
FILE: zz_generated.ggufmetadatavaluetype.stringer.go
function _ (line 7) | func _() {
constant _GGUFMetadataValueType_name (line 27) | _GGUFMetadataValueType_name = "Uint8Int8Uint16Int16Uint32Int32Float32Boo...
method String (line 31) | func (i GGUFMetadataValueType) String() string {
FILE: zz_generated.ggufversion.stringer.go
function _ (line 7) | func _() {
constant _GGUFVersion_name (line 16) | _GGUFVersion_name = "V1V2V3"
method String (line 20) | func (i GGUFVersion) String() string {
Condensed preview — 85 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (662K chars).
[
{
"path": ".gitattributes",
"chars": 99,
"preview": "* text=auto eol=lf\n\n**/go.sum linguist-generated=true\n**/zz_generated.*.go linguist-generated=true\n"
},
{
"path": ".github/workflows/ci.yml",
"chars": 1325,
"preview": "name: ci\n\npermissions:\n contents: read\n pull-requests: read\n actions: read\n\ndefaults:\n run:\n shell: bash\n\non:\n p"
},
{
"path": ".github/workflows/cmd.yml",
"chars": 3241,
"preview": "name: cmd\n\npermissions:\n contents: write\n actions: read\n id-token: write\n\ndefaults:\n run:\n shell: bash\n\non:\n pus"
},
{
"path": ".github/workflows/prune.yml",
"chars": 700,
"preview": "name: prune\n\npermissions:\n contents: write\n pull-requests: write\n actions: write\n issues: write\n\ndefaults:\n run:\n "
},
{
"path": ".github/workflows/sync.yml",
"chars": 2829,
"preview": "name: sync\n\npermissions:\n contents: read\n pull-requests: read\n actions: read\n\ndefaults:\n run:\n shell: bash\n\non:\n "
},
{
"path": ".gitignore",
"chars": 223,
"preview": "# Files\n.DS_Store\n*.lock\n*.test\n*.out\n*.swp\n*.swo\n*.db\n*.exe\n*.exe~\n*.dll\n*.so\n*.dylib\n*.log\ngo.work\ngo.work.*\n\n# Dirs\n/"
},
{
"path": ".golangci.yaml",
"chars": 2680,
"preview": "version: \"1\"\n\nrun:\n timeout: 10m\n tests: true\n modules-download-mode: readonly\n go: \"1.22\"\n\n# output configuration o"
},
{
"path": "Dockerfile",
"chars": 151,
"preview": "FROM scratch\nARG TARGETOS\nARG TARGETARCH\nCOPY --chmod=755 .dist/gguf-parser-${TARGETOS}-${TARGETARCH} /bin/gguf-parser\nE"
},
{
"path": "LICENSE",
"chars": 1078,
"preview": "MIT License\n\nCopyright (c) 2024 gguf-parser-go authors\n\nPermission is hereby granted, free of charge, to any person obta"
},
{
"path": "Makefile",
"chars": 5080,
"preview": ".SILENT:\n.DEFAULT_GOAL := ci\n\nSHELL := /bin/bash\n\nSRCDIR := $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST))"
},
{
"path": "README.md",
"chars": 105656,
"preview": "# GGUF Parser\n\n> tl;dr, Review/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files and estima"
},
{
"path": "cache.go",
"chars": 2139,
"preview": "package gguf_parser\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-parser-go/util"
},
{
"path": "cmd/gguf-parser/README.md",
"chars": 19700,
"preview": "# GGUF Parser\n\nReview/Check [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) files,\nestimate the memor"
},
{
"path": "cmd/gguf-parser/go.mod",
"chars": 1169,
"preview": "module github.com/gpustack/gguf-parser-go/cmd/gguf-parser\n\ngo 1.22.0\n\ntoolchain go1.22.9\n\nreplace github.com/gpustack/gg"
},
{
"path": "cmd/gguf-parser/go.sum",
"chars": 4812,
"preview": "github.com/cpuguy83/go-md2man/v2 v2.0.5 h1:ZtcqGrnekaHpVLArFSe4HK5DoKx1T0rq2DwVB0alcyc=\ngithub.com/cpuguy83/go-md2man/v2"
},
{
"path": "cmd/gguf-parser/main.go",
"chars": 69908,
"preview": "package main\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"net\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"regexp\"\n\t\"strconv\"\n\t\"strings\"\n\t\"sync\"\n\t\"time\"\n\n\t"
},
{
"path": "file.go",
"chars": 50171,
"preview": "package gguf_parser\n\nimport (\n\t\"bytes\"\n\t\"encoding/binary\"\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"regexp\"\n\t\"strings\"\n\n\t\"golang.org/x/ex"
},
{
"path": "file_architecture.go",
"chars": 46308,
"preview": "package gguf_parser\n\nimport (\n\t\"regexp\"\n\t\"slices\"\n\t\"strings\"\n)\n\n// Types for the architecture metadata of a GGUF file.\nt"
},
{
"path": "file_architecture_test.go",
"chars": 784,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_Archi"
},
{
"path": "file_estimate__llamacpp.go",
"chars": 62450,
"preview": "package gguf_parser\n\nimport (\n\t\"math\"\n\t\"regexp\"\n\t\"slices\"\n\t\"strings\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/anyx\"\n\t\""
},
{
"path": "file_estimate__llamacpp_test.go",
"chars": 3190,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_EstimateLLa"
},
{
"path": "file_estimate__stablediffusioncpp.go",
"chars": 19765,
"preview": "package gguf_parser\n\nimport (\n\t\"math\"\n\t\"strings\"\n\n\t\"golang.org/x/exp/maps\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/pt"
},
{
"path": "file_estimate__stablediffusioncpp_test.go",
"chars": 1848,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_EstimateSta"
},
{
"path": "file_estimate_option.go",
"chars": 15832,
"preview": "package gguf_parser\n\nimport (\n\t\"regexp\"\n\t\"slices\"\n\t\"strconv\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/ptr\"\n)\n\ntype (\n\t"
},
{
"path": "file_from_distro.go",
"chars": 2981,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"errors\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"path/filepath\"\n\t\"time\"\n\n\t\"github.com/gpustack/gg"
},
{
"path": "file_from_remote.go",
"chars": 4234,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"path/filepath\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/gpus"
},
{
"path": "file_metadata.go",
"chars": 18226,
"preview": "package gguf_parser\n\nimport (\n\t\"regexp\"\n\t\"slices\"\n\t\"sort\"\n\t\"strings\"\n\n\t\"golang.org/x/exp/maps\"\n)\n\n// GGUFMetadata repres"
},
{
"path": "file_metadata_test.go",
"chars": 4061,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"os\"\n\t\"strings\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n\t\"github"
},
{
"path": "file_option.go",
"chars": 3974,
"preview": "package gguf_parser\n\nimport (\n\t\"net/url\"\n\t\"path/filepath\"\n\t\"runtime\"\n\t\"strings\"\n\t\"time\"\n\n\t\"github.com/gpustack/gguf-pars"
},
{
"path": "file_test.go",
"chars": 6651,
"preview": "package gguf_parser\n\nimport (\n\t\"bytes\"\n\t\"context\"\n\t\"encoding/binary\"\n\t\"os\"\n\t\"testing\"\n\t\"time\"\n\n\t\"github.com/davecgh/go-s"
},
{
"path": "file_tokenizer.go",
"chars": 3726,
"preview": "package gguf_parser\n\n// GGUFTokenizer represents the tokenizer metadata of a GGUF file.\ntype GGUFTokenizer struct {\n\t/* "
},
{
"path": "file_tokenizer_test.go",
"chars": 772,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"testing\"\n\n\t\"github.com/davecgh/go-spew/spew\"\n)\n\nfunc TestGGUFFile_Token"
},
{
"path": "filename.go",
"chars": 4682,
"preview": "package gguf_parser\n\nimport (\n\t\"fmt\"\n\t\"regexp\"\n\t\"strconv\"\n\t\"strings\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/funcx\"\n\t"
},
{
"path": "filename_test.go",
"chars": 6904,
"preview": "package gguf_parser\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n\n\t\"github.com/gpustack/gguf-parser-go/ut"
},
{
"path": "gen.go",
"chars": 138,
"preview": "//go:generate go generate -tags stringer gen.stringer.go\n//go:generate go generate -tags regression gen.regression.go\npa"
},
{
"path": "gen.regression.go",
"chars": 17904,
"preview": "//go:build regression\n\n//go:generate go run -tags regression gen.regression.go\npackage main\n\nimport (\n\t\"fmt\"\n\t\"strconv\"\n"
},
{
"path": "gen.stringer.go",
"chars": 868,
"preview": "//go:build stringer\n\n//go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMagic -output zz_genera"
},
{
"path": "ggml.go",
"chars": 8880,
"preview": "package gguf_parser\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"slices\"\n)\n\n// Types for GGMLType.\ntype (\n\t// GGMLType is a type of GGML"
},
{
"path": "go.mod",
"chars": 756,
"preview": "module github.com/gpustack/gguf-parser-go\n\ngo 1.22.0\n\ntoolchain go1.22.9\n\nrequire (\n\tgithub.com/davecgh/go-spew v1.1.1\n\t"
},
{
"path": "go.sum",
"chars": 3649,
"preview": "github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=\ngithub.com/davecgh/go-spew v1.1"
},
{
"path": "ollama_model.go",
"chars": 10650,
"preview": "package gguf_parser\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"regexp\"\n\t\"strings\"\n\n\t\"golang.org/x/sync/errgrou"
},
{
"path": "ollama_model_option.go",
"chars": 1734,
"preview": "package gguf_parser\n\nimport (\n\t\"net/url\"\n\t\"strings\"\n)\n\ntype (\n\t_OllamaModelOptions struct {\n\t\tDefaultScheme string\n\t\t"
},
{
"path": "ollama_model_test.go",
"chars": 1858,
"preview": "package gguf_parser\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestParseOllamaModel(t *testing."
},
{
"path": "ollama_registry_authenticate.go",
"chars": 5729,
"preview": "package gguf_parser\n\nimport (\n\t\"bytes\"\n\t\"context\"\n\t\"crypto/ed25519\"\n\t\"crypto/rand\"\n\t\"encoding/base64\"\n\t\"encoding/json\"\n\t"
},
{
"path": "scalar.go",
"chars": 6656,
"preview": "package gguf_parser\n\nimport (\n\t\"errors\"\n\t\"strconv\"\n\t\"strings\"\n)\n\nconst (\n\t_Ki = 1 << ((iota + 1) * 10)\n\t_Mi\n\t_Gi\n\t_Ti\n\t_"
},
{
"path": "scalar_test.go",
"chars": 2410,
"preview": "package gguf_parser\n\nimport (\n\t\"testing\"\n\n\t\"github.com/stretchr/testify/assert\"\n)\n\nfunc TestParseSizeScalar(t *testing.T"
},
{
"path": "util/anyx/any.go",
"chars": 2404,
"preview": "package anyx\n\nimport (\n\t\"encoding/json\"\n\t\"fmt\"\n\t\"strconv\"\n\n\t\"golang.org/x/exp/constraints\"\n)\n\n// Number converts any typ"
},
{
"path": "util/bytex/pool.go",
"chars": 1702,
"preview": "package bytex\n\nimport (\n\t\"bytes\"\n\t\"sync\"\n)\n\nconst defaultSize = 32 * 1024\n\ntype (\n\tBytes = []byte\n\tBytesBuffer = *"
},
{
"path": "util/funcx/error.go",
"chars": 1709,
"preview": "package funcx\n\n// NoError ignores the given error,\n// it is usually a nice helper for chain function calling.\nfunc NoErr"
},
{
"path": "util/httpx/client.go",
"chars": 8254,
"preview": "package httpx\n\nimport (\n\t\"context\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"time\"\n\n\t\"github.com/henvic/httpretty\"\n\n\t\"github.com/gpusta"
},
{
"path": "util/httpx/client_helper.go",
"chars": 1660,
"preview": "package httpx\n\nimport (\n\t\"bytes\"\n\t\"errors\"\n\t\"io\"\n\t\"net/http\"\n\t\"regexp\"\n\n\t\"github.com/henvic/httpretty\"\n\n\t\"github.com/gpu"
},
{
"path": "util/httpx/client_options.go",
"chars": 5429,
"preview": "package httpx\n\nimport (\n\t\"math\"\n\t\"net/http\"\n\t\"strconv\"\n\t\"strings\"\n\t\"time\"\n)\n\ntype ClientOption struct {\n\t*TransportOptio"
},
{
"path": "util/httpx/file.go",
"chars": 4827,
"preview": "package httpx\n\nimport (\n\t\"errors\"\n\t\"fmt\"\n\t\"io\"\n\t\"net/http\"\n\t\"strings\"\n\t\"syscall\"\n\n\t\"github.com/smallnest/ringbuffer\"\n\n\t\""
},
{
"path": "util/httpx/file_options.go",
"chars": 1809,
"preview": "package httpx\n\ntype SeekerFileOption struct {\n\tbufSize int\n\tsize int\n\tskipRangeDownlo"
},
{
"path": "util/httpx/proxy.go",
"chars": 789,
"preview": "package httpx\n\nimport (\n\t\"net\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"strings\"\n\n\t\"github.com/gpustack/gguf-parser-go/util/osx\"\n)\n\nvar "
},
{
"path": "util/httpx/resolver.go",
"chars": 789,
"preview": "package httpx\n\nimport (\n\t\"context\"\n\t\"net\"\n)\n\nfunc DNSCacheDialContext(dialer *net.Dialer) func(context.Context, string, "
},
{
"path": "util/httpx/transport.go",
"chars": 717,
"preview": "package httpx\n\nimport (\n\t\"net/http\"\n)\n\n// DefaultTransport is similar to the default http.DefaultTransport used by the p"
},
{
"path": "util/httpx/transport_options.go",
"chars": 5119,
"preview": "package httpx\n\nimport (\n\t\"crypto/tls\"\n\t\"net\"\n\t\"net/http\"\n\t\"net/url\"\n\t\"time\"\n)\n\ntype TransportOption struct {\n\tdialer "
},
{
"path": "util/json/common.go",
"chars": 1435,
"preview": "package json\n\nimport (\n\tstdjson \"encoding/json\"\n\t\"fmt\"\n)\n\ntype RawMessage = stdjson.RawMessage\n\nvar (\n\tMarshalIndent = s"
},
{
"path": "util/json/jsoniter.go",
"chars": 1010,
"preview": "//go:build !stdjson\n\npackage json\n\nimport (\n\tstdjson \"encoding/json\"\n\t\"strconv\"\n\t\"unsafe\"\n\n\tjsoniter \"github.com/json-it"
},
{
"path": "util/json/stdjson.go",
"chars": 157,
"preview": "//go:build stdjson\n\npackage json\n\nimport (\n\t\"encoding/json\"\n)\n\nvar (\n\tMarshal = json.Marshal\n\tUnmarshal = json.Unmar"
},
{
"path": "util/osx/env.go",
"chars": 721,
"preview": "package osx\n\nimport (\n\t\"os\"\n)\n\n// ExistEnv checks if the environment variable named by the key exists.\nfunc ExistEnv(key"
},
{
"path": "util/osx/file.go",
"chars": 3138,
"preview": "package osx\n\nimport (\n\t\"io\"\n\t\"os\"\n\t\"path/filepath\"\n\t\"strings\"\n)\n\n// InlineTilde replaces the leading ~ with the home dir"
},
{
"path": "util/osx/file_mmap.go",
"chars": 2036,
"preview": "// Copyright 2018 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may n"
},
{
"path": "util/osx/file_mmap_js.go",
"chars": 800,
"preview": "// Copyright 2022 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may n"
},
{
"path": "util/osx/file_mmap_unix.go",
"chars": 932,
"preview": "// Copyright 2017 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may n"
},
{
"path": "util/osx/file_mmap_windows.go",
"chars": 846,
"preview": "package osx\n\nimport (\n\t\"os\"\n\t\"syscall\"\n\t\"unsafe\"\n)\n\nfunc mmap(f *os.File, size int) ([]byte, error) {\n\tlow, high := uint"
},
{
"path": "util/osx/file_mmap_windows_386.go",
"chars": 643,
"preview": "// Copyright 2018 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may n"
},
{
"path": "util/osx/file_mmap_windows_non386.go",
"chars": 677,
"preview": "// Copyright 2018 The Prometheus Authors\n// Licensed under the Apache License, Version 2.0 (the \"License\");\n// you may n"
},
{
"path": "util/osx/homedir.go",
"chars": 318,
"preview": "package osx\n\nimport (\n\t\"os\"\n\t\"path/filepath\"\n\t\"time\"\n)\n\n// UserHomeDir is similar to os.UserHomeDir,\n// but returns the "
},
{
"path": "util/ptr/pointer.go",
"chars": 2397,
"preview": "package ptr\n\nimport (\n\t\"time\"\n\n\t\"golang.org/x/exp/constraints\"\n)\n\nfunc Int(v int) *int {\n\treturn Ref(v)\n}\n\nfunc IntDeref"
},
{
"path": "util/signalx/handler.go",
"chars": 559,
"preview": "package signalx\n\nimport (\n\t\"context\"\n\t\"os\"\n\t\"os/signal\"\n)\n\nvar registered = make(chan struct{})\n\n// Handler registers fo"
},
{
"path": "util/signalx/handler_unix.go",
"chars": 123,
"preview": "//go:build !windows\n\npackage signalx\n\nimport (\n\t\"os\"\n\t\"syscall\"\n)\n\nvar sigs = []os.Signal{syscall.SIGINT, syscall.SIGTER"
},
{
"path": "util/signalx/handler_windows.go",
"chars": 85,
"preview": "package signalx\n\nimport (\n\t\"os\"\n\t\"syscall\"\n)\n\nvar sigs = []os.Signal{syscall.SIGINT}\n"
},
{
"path": "util/slicex/search.go",
"chars": 332,
"preview": "package slicex\n\nimport \"golang.org/x/exp/constraints\"\n\n// UpperBound returns an index of the first element that is great"
},
{
"path": "util/stringx/bytes.go",
"chars": 364,
"preview": "package stringx\n\nimport \"unsafe\"\n\n// FromBytes converts a byte slice to a string.\nfunc FromBytes(b *[]byte) string {\n\tre"
},
{
"path": "util/stringx/random.go",
"chars": 1534,
"preview": "package stringx\n\n// Borrowed from github.com/thanhpk/randstr.\n\nimport (\n\t\"bytes\"\n\t\"crypto/rand\"\n\t\"encoding/binary\"\n\t\"enc"
},
{
"path": "util/stringx/strings.go",
"chars": 1371,
"preview": "package stringx\n\nimport \"strings\"\n\n// CutFromLeft is the same as strings.Cut,\n// which starts from left to right,\n// sli"
},
{
"path": "util/stringx/sum.go",
"chars": 1728,
"preview": "package stringx\n\nimport (\n\t\"crypto/sha256\"\n\t\"encoding/hex\"\n\t\"hash/fnv\"\n)\n\n// SumByFNV64a sums up the string(s) by FNV-64"
},
{
"path": "zz_generated.diffusion_model_memory_usage.regression.go",
"chars": 4401,
"preview": "package gguf_parser\n\nimport \"math\"\n\n// GuessSD1DiffusionModelMemoryUsage returns the memory usage in bytes for the given"
},
{
"path": "zz_generated.ggmltype.stringer.go",
"chars": 2039,
"preview": "// Code generated by \"stringer -linecomment -type GGMLType -output zz_generated.ggmltype.stringer.go -trimprefix GGMLTyp"
},
{
"path": "zz_generated.gguffiletype.stringer.go",
"chars": 2772,
"preview": "// Code generated by \"stringer -linecomment -type GGUFFileType -output zz_generated.gguffiletype.stringer.go -trimprefix"
},
{
"path": "zz_generated.ggufmagic.stringer.go",
"chars": 1056,
"preview": "// Code generated by \"stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFM"
},
{
"path": "zz_generated.ggufmetadatavaluetype.stringer.go",
"chars": 1450,
"preview": "// Code generated by \"stringer -linecomment -type GGUFMetadataValueType -output zz_generated.ggufmetadatavaluetype.strin"
},
{
"path": "zz_generated.ggufversion.stringer.go",
"chars": 766,
"preview": "// Code generated by \"stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix G"
}
]
About this extraction
This page contains the full source code of the gpustack/gguf-parser-go GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 85 files (593.2 KB), approximately 183.0k tokens, and a symbol index with 687 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.