Showing preview only (827K chars total). Download the full file or copy to clipboard to get everything.
Repository: huggingface/quanto
Branch: main
Commit: ef3aafb30e6b
Files: 207
Total size: 766.4 KB
Directory structure:
gitextract_e7pf933s/
├── .github/
│ ├── CODEOWNERS
│ ├── PULL_REQUEST_TEMPLATE.md
│ └── workflows/
│ ├── check-commits.yml
│ ├── linux-cpu-tests.yml
│ ├── linux-cuda-tests.yml
│ ├── linux-examples.yml
│ ├── python-quality.yml
│ ├── security.yml
│ └── stale.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── bench/
│ ├── generation/
│ │ ├── README.md
│ │ ├── evaluate_configurations.py
│ │ ├── evaluate_many_models.sh
│ │ ├── evaluate_model.py
│ │ ├── gen_barchart.py
│ │ ├── metrics/
│ │ │ ├── __init__.py
│ │ │ ├── latency.py
│ │ │ ├── perplexity.py
│ │ │ └── prediction.py
│ │ └── setup/
│ │ ├── __init__.py
│ │ ├── awq.py
│ │ ├── bnb.py
│ │ ├── hqq.py
│ │ └── quanto.py
│ ├── kernels/
│ │ ├── benchmark.py
│ │ ├── benchmark_marlin_fp8.py
│ │ └── benchmark_w4a16.py
│ └── torch_kernels/
│ ├── README.md
│ ├── test_int_mm.py
│ ├── test_int_mm_inductor.py
│ ├── test_weight_int4pack_mm.py
│ └── test_weight_int8pack_mm.py
├── examples/
│ ├── nlp/
│ │ ├── text-classification/
│ │ │ └── sst2/
│ │ │ └── quantize_sst2_model.py
│ │ └── text-generation/
│ │ └── quantize_causal_lm_model.py
│ ├── speech/
│ │ └── speech_recognition/
│ │ ├── quantize_asr_model.py
│ │ └── requirements.txt
│ └── vision/
│ ├── StableDiffusion/
│ │ ├── README.md
│ │ ├── quantize_StableDiffusion.py
│ │ └── requirements.txt
│ ├── image-classification/
│ │ ├── mnist/
│ │ │ └── quantize_mnist_model.py
│ │ └── pets/
│ │ └── quantize_vit_model.py
│ ├── object-detection/
│ │ └── quantize_owl_model.py
│ └── text-to-image/
│ └── quantize_pixart_sigma.py
├── external/
│ ├── awq/
│ │ ├── conftest.py
│ │ ├── pack_intweight.py
│ │ ├── packing_utils.py
│ │ ├── test_awq_kernels.py
│ │ ├── test_awq_packing.py
│ │ └── test_awq_quantize.py
│ └── smoothquant/
│ ├── README.md
│ └── smoothquant.py
├── optimum/
│ └── quanto/
│ ├── __init__.py
│ ├── calibrate.py
│ ├── library/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── extensions/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── cpp/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.cpp
│ │ │ │ └── unpack.h
│ │ │ ├── cuda/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── awq/
│ │ │ │ │ ├── dequantize.cuh
│ │ │ │ │ └── v2/
│ │ │ │ │ ├── gemm_cuda.cu
│ │ │ │ │ ├── gemm_cuda.h
│ │ │ │ │ ├── gemv_cuda.cu
│ │ │ │ │ ├── gemv_cuda.h
│ │ │ │ │ └── semaphore.h
│ │ │ │ ├── marlin/
│ │ │ │ │ ├── COPYRIGHT
│ │ │ │ │ ├── fp8_marlin.cu
│ │ │ │ │ ├── fp8_marlin.cuh
│ │ │ │ │ ├── gptq_marlin.cuh
│ │ │ │ │ ├── gptq_marlin_dtypes.cuh
│ │ │ │ │ ├── gptq_marlin_repack.cu
│ │ │ │ │ ├── gptq_marlin_repack.cuh
│ │ │ │ │ ├── marlin_cuda.cpp
│ │ │ │ │ ├── marlin_cuda.h
│ │ │ │ │ ├── marlin_cuda_kernel.cu
│ │ │ │ │ └── marlin_cuda_kernel.cuh
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.cu
│ │ │ │ └── unpack.h
│ │ │ ├── extension.py
│ │ │ ├── hip/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.cu
│ │ │ │ └── unpack.h
│ │ │ ├── mps/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.h
│ │ │ │ └── unpack.mm
│ │ │ └── xpu/
│ │ │ ├── __init__.py
│ │ │ ├── pybind_module.cpp
│ │ │ ├── unpack.h
│ │ │ └── unpack.sycl
│ │ ├── qbytes_mm.py
│ │ ├── quantize.py
│ │ └── unpack.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── diffusers_models.py
│ │ ├── shared_dict.py
│ │ └── transformers_models.py
│ ├── nn/
│ │ ├── __init__.py
│ │ ├── qconv2d.py
│ │ ├── qlayernorm.py
│ │ ├── qlinear.py
│ │ └── qmodule.py
│ ├── quantize.py
│ ├── subpackage/
│ │ ├── __init__.py
│ │ └── commands/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── quantize.py
│ └── tensor/
│ ├── __init__.py
│ ├── activations/
│ │ ├── __init__.py
│ │ ├── qbytes.py
│ │ ├── qbytes_ops.py
│ │ └── quantization.py
│ ├── core.py
│ ├── function.py
│ ├── grouped.py
│ ├── optimizers/
│ │ ├── __init__.py
│ │ ├── absmax_optimizer.py
│ │ ├── affine_optimizer.py
│ │ ├── hqq_optimizer.py
│ │ ├── max_optimizer.py
│ │ ├── optimizer.py
│ │ └── symmetric_optimizer.py
│ ├── packed.py
│ ├── qbits.py
│ ├── qbytes.py
│ ├── qtensor.py
│ ├── qtype.py
│ └── weights/
│ ├── __init__.py
│ ├── awq/
│ │ ├── __init__.py
│ │ ├── packed.py
│ │ └── qbits.py
│ ├── marlin/
│ │ ├── __init__.py
│ │ ├── fp8/
│ │ │ ├── __init__.py
│ │ │ ├── packed.py
│ │ │ └── qbits.py
│ │ ├── int4/
│ │ │ ├── __init__.py
│ │ │ ├── packed.py
│ │ │ └── qbits.py
│ │ └── permutations.py
│ ├── packing.py
│ ├── qbits.py
│ ├── qbytes.py
│ ├── quantization.py
│ ├── reordering.py
│ └── tinygemm/
│ ├── __init__.py
│ ├── packed.py
│ └── qbits.py
├── pyproject.toml
├── setup.sh
└── tests/
├── cli/
│ ├── cli_helpers.py
│ └── test_quantize_cli.py
├── conftest.py
├── helpers.py
├── library/
│ ├── test_extensions.py
│ ├── test_mm.py
│ ├── test_quantize.py
│ └── test_unpack.py
├── models/
│ ├── conftest.py
│ ├── test_quantized_model_for_causal_lm.py
│ └── test_quantized_model_for_pixart.py
├── nn/
│ ├── test_calibrate.py
│ ├── test_qattention.py
│ ├── test_qconv2d.py
│ ├── test_qlayernorm.py
│ ├── test_qlinear.py
│ └── test_qmodule.py
├── quantize/
│ ├── test_quantize_mlp.py
│ ├── test_quantize_patterns.py
│ └── test_requantize.py
└── tensor/
├── activations/
│ ├── test_activations_compile.py
│ ├── test_activations_dispatch.py
│ └── test_activations_quantize.py
├── ops/
│ ├── test_linear_dispatch.py
│ └── test_mm_dispatch.py
├── optimizers/
│ └── test_hqq_optimizer.py
├── test_absmax.py
├── test_packed_tensor.py
└── weights/
├── optimized/
│ ├── test_awq_packed_tensor.py
│ ├── test_awq_weight_qbits_tensor.py
│ ├── test_marlin_fp8_packed_tensor.py
│ ├── test_marlin_int4_packed_tensor.py
│ ├── test_marlin_int4_weight_qbits_tensor.py
│ ├── test_marlin_qbytes_tensor.py
│ ├── test_tinygemm_packed_tensor.py
│ └── test_tinygemm_weight_qbits_tensor.py
├── test_weight_qbits_tensor.py
├── test_weight_qbits_tensor_dispatch.py
├── test_weight_qbits_tensor_instantiate.py
├── test_weight_qbits_tensor_quantize.py
├── test_weight_qbytes_tensor_backward.py
├── test_weight_qbytes_tensor_dispatch.py
├── test_weight_qbytes_tensor_instantiate.py
├── test_weight_qbytes_tensor_quantize.py
├── test_weight_qbytes_tensor_serialization.py
└── weight_helpers.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/CODEOWNERS
================================================
* @dacorvo @sunmarc
================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# What does this PR do?
<!--
Congratulations! You've made it this far! You're not quite done yet though.
Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.
Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
-->
<!-- Remove if not applicable -->
Fixes # (issue)
## Before submitting
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md#create-a-pull-request),
Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [ ] Did you run all tests locally and make sure they pass?
- [ ] Did you write any new necessary tests?
## Who can review?
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.
================================================
FILE: .github/workflows/check-commits.yml
================================================
name: Check Commits
on: [workflow_call]
jobs:
build:
name: Check commits
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: huggingface/action-check-commits@v1.0.0
with:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
max-commits: "10"
min-words: "3"
forbidden-words: "fixup"
================================================
FILE: .github/workflows/linux-cpu-tests.yml
================================================
name: Linux CPU tests
on:
push:
branches:
- main
paths:
- "optimum/quanto/**"
- "tests/**"
- "pyproject.toml"
pull_request:
types: [assigned, opened, synchronize, reopened]
paths:
- "optimum/quanto/**"
- "tests/**"
- "pyproject.toml"
jobs:
check-commits:
uses: ./.github/workflows/check-commits.yml
python-quality:
uses: ./.github/workflows/python-quality.yml
test-ubuntu-cpu:
needs: [check-commits, python-quality]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.11"]
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2
with:
python-version: ${{ matrix.python-version }}
- name: Build and install quanto
run: |
pip install --upgrade pip
pip install -e .[dev]
- name: Run base tests
run: |
python -m pytest tests --ignore=tests/models --ignore=tests/cli
- name: Run models tests
run: |
pip install accelerate transformers diffusers
python -m pytest tests/models
- name: Run CLI tests
run: |
pip install optimum
python -m pytest tests/cli
run_staging_tests:
needs: [check-commits, python-quality]
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.9", "3.11"]
steps:
- uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2
with:
python-version: ${{ matrix.python-version }}
- name: Build and install quanto
run: |
pip install --upgrade pip
pip install -e .[dev]
- name: Run models hub tests
run: |
pip install accelerate transformers diffusers
HUGGINGFACE_CO_STAGING=true python -m pytest tests/models -k "hub"
================================================
FILE: .github/workflows/linux-cuda-tests.yml
================================================
name: Linux CUDA tests
on:
push:
branches:
- main
paths:
- "optimum/quanto/**"
- "tests/**"
- "pyproject.toml"
pull_request:
types: [assigned, opened, synchronize, reopened]
paths:
- "optimum/quanto/**"
- "tests/**"
- "pyproject.toml"
jobs:
check-commits:
uses: ./.github/workflows/check-commits.yml
python-quality:
uses: ./.github/workflows/python-quality.yml
test-ubuntu-cuda:
needs: [check-commits, python-quality]
runs-on:
group: aws-g5-4xlarge-plus
strategy:
fail-fast: false
matrix:
cuda-version: ["11.8", "12.4", "12.6"]
container:
image: pytorch/pytorch:2.6.0-cuda${{ matrix.cuda-version }}-cudnn9-devel
options: --gpus 0
steps:
- uses: actions/checkout@v2
- name: Check CUDA installation
run: |
nvcc -V
- name: Build and install quanto
run: |
pip install --upgrade pip
pip install -e .[dev]
- name: Run base tests
run: |
python -m pytest tests --ignore=tests/models --ignore=tests/cli
- name: Run models tests
run: |
pip install accelerate transformers diffusers
python -m pytest tests/models
- name: Run CLI tests
run: |
pip install optimum
python -m pytest tests/cli
================================================
FILE: .github/workflows/linux-examples.yml
================================================
name: Linux examples (CPU, CUDA)
on:
push:
branches:
- main
paths:
- "optimum/quanto/**"
- "examples/**"
- "pyproject.toml"
pull_request:
types: [assigned, opened, synchronize, reopened]
paths:
- "optimum/quanto/**"
- "examples/**"
- "pyproject.toml"
jobs:
check-commits:
uses: ./.github/workflows/check-commits.yml
python-quality:
uses: ./.github/workflows/python-quality.yml
run-examples:
needs: [check-commits, python-quality]
runs-on:
group: aws-g5-4xlarge-plus
strategy:
fail-fast: false
matrix:
device: ["cpu", "cuda"]
container:
image: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
options: --gpus 0
steps:
- uses: actions/checkout@v2
- name: Check CUDA installation
run: |
nvcc -V
- name: Build and install packages
run: |
pip install --upgrade pip
pip install -e .[examples]
# Run examples
- name: Run MNIST classification example
run: |
for w in int4 int8 float8; do \
for a in none int8 float8; do \
python examples/vision/image-classification/mnist/quantize_mnist_model.py \
--weights $w --activations $a --device ${{ matrix.device }}; \
done; \
done
- name: Run OWL detection example
run: |
for w in int4 int8 float8; do \
python examples/vision/object-detection/quantize_owl_model.py \
--image http://images.cocodataset.org/val2017/000000039769.jpg \
--texts "a photo of a cat" "a remote" \
--weights $w --device ${{ matrix.device }}; \
done
- name: Run text-classification example
run: |
for w in int4 int8; do \
for a in none int8; do \
python examples/nlp/text-classification/sst2/quantize_sst2_model.py \
--weights $w --activations $a --device ${{ matrix.device }}; \
done; \
done
- name: Run text-to-image example
if: ${{ matrix.device == 'cuda'}}
run: |
for w in int4 int8 fp8; do \
python examples/vision/text-to-image/quantize_pixart_sigma.py \
--qtype $w --device ${{ matrix.device }}; \
done
================================================
FILE: .github/workflows/python-quality.yml
================================================
name: Python code quality
on: [workflow_call]
jobs:
check_code_quality:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: 3.9
- name: Install dependencies
run: |
pip install --upgrade pip
pip install .[dev]
- run: ruff format bench examples optimum tests --diff
- run: ruff check --show-fixes bench examples optimum tests
================================================
FILE: .github/workflows/security.yml
================================================
name: Security Checks
on:
push:
permissions:
contents: read
jobs:
secrets:
runs-on: ubuntu-latest
steps:
- shell: bash
env:
REF_NAME: ${{ github.ref_name }}
HEAD_REF: ${{ github.event.pull_request.head.ref }}
run: |
if [ "${{ github.event_name }}" == "push" ]; then
echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV
echo "branch=$REF_NAME" >> $GITHUB_ENV
fi
if [ "${{ github.event_name }}" == "pull_request" ]; then
echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV
echo "branch=$HEAD_REF" >> $GITHUB_ENV
fi
- name: Checkout code
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
ref: ${{env.branch}}
fetch-depth: ${{env.depth}}
- name: Scan for secrets
uses: trufflesecurity/trufflehog@6bd2d14f7a4bc1e569fa3550efa7ec632a4fa67b # main
================================================
FILE: .github/workflows/stale.yml
================================================
name: 'Close stale issues and PRs'
on:
schedule:
- cron: '30 1 * * *'
workflow_dispatch:
permissions:
issues: write
pull-requests: write
jobs:
stale:
runs-on: ubuntu-latest
steps:
- uses: actions/stale@v9
with:
stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
stale-pr-message: 'This PR is stale because it has been open 15 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.'
close-pr-message: 'This PR was closed because it has been stalled for 5 days with no activity.'
days-before-issue-stale: 30
days-before-pr-stale: 15
days-before-issue-close: 5
days-before-pr-close: 5
================================================
FILE: .gitignore
================================================
__pycache__
.pytest_cache
*.egg-info
dist
.venv
build/
================================================
FILE: CONTRIBUTING.md
================================================
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Contribute to optimum-quanto
Everyone is welcome to contribute, and we value everybody's contribution. Code
contributions are not the only way to help the community. Answering questions, helping
others, and improving the documentation are also immensely valuable.
It also helps us if you spread the word! Reference the library in blog posts
about the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply ⭐️ the repository to say thank you.
However you choose to contribute, please be mindful and respect our
[code of conduct](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md).
**This guide is directly inspired by [transformers guide to contributing](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md).**
## Ways to contribute
There are several ways you can contribute:
* Fix outstanding issues with the existing code.
* Submit issues related to bugs or desired new features.
* Implement new kernels.
> All contributions are equally valuable to the community. 🥰
## Fixing outstanding issues
If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md/#create-a-pull-request) and open a Pull Request!
## Submitting a bug-related issue or feature request
Do your best to follow these guidelines when submitting a bug-related issue or a feature
request. It will make it easier for us to come back to you quickly and with good
feedback.
### Did you find a bug?
The `optimum-quanto` backend will become more robust and reliable thanks to users who will report the problems they encounter.
Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.
Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
* Your **OS type and version** and **Python** and **PyTorch** versions.
* A short, self-contained, code snippet that allows us to reproduce the bug in
less than 30s.
* The *full* traceback if an exception is raised.
* Attach any other additional information, like screenshots, you think may help.
### Do you want a new feature?
If there is a new feature you'd like to see, please open an issue and describe:
1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it a feature related to something you need for a project? Is it something you worked on and think it could benefit the community?
Whatever it is, we'd love to hear about it!
2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you.
3. Provide a *code snippet* that demonstrates the feature's usage.
4. If the feature is related to a paper, please include a link.
If your issue is well written we're already 80% of the way there by the time you create it.
## Do you want to implement a new kernel?
With the constant evolution of hardware backends, there is always a need for updating the kernels for better performance.
If you would like to see a new kernel added, please open an issue and describe:
* The hardware configuration(s) it will apply to.
* If any, a short description of the novel techniques that should be used to implement the kernel.
If you are willing to contribute the kernel yourself, let us know so we can help you add it to `optimum-quanto`!
## Create a Pull Request
Before writing any code, we strongly advise you to search through the existing PRs or
issues to make sure nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.
You will need basic `git` proficiency to contribute. While `git` is not the easiest tool to use, it has the greatest manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference.
You'll need **Python 3.8** or above to contribute. Follow the steps below to start contributing:
1. Fork the [repository](https://github.com/huggingface/optimum-quanto) by
clicking on the **[Fork](https://github.com/huggingface/optimum-quanto/fork)** button on the repository's page. This creates a copy of the code
under your GitHub user account.
2. Clone your fork to your local disk, and add the base repository as a remote:
```bash
git clone git@github.com:<your Github handle>/optimum-quanto.git
cd optimum-quanto
git remote add upstream https://github.com/huggingface/optimum-quanto.git
```
3. Create a new branch to hold your development changes:
```bash
git checkout -b a-descriptive-name-for-my-changes
```
🚨 **Do not** work on the `main` branch!
4. Set up a development environment by running the following command in a virtual environment:
```bash
pip install -e ".[dev]"
```
If `optimum-quanto` was already installed in the virtual environment, remove
it with `pip uninstall optimum-quanto` before reinstalling it in editable
mode with the `-e` flag.
5. Develop the features in your branch.
As you work on your code, you should make sure the test suite
passes. Run the tests impacted by your changes like this:
```bash
pytest tests/<TEST_TO_RUN>.py
```
`optimum-quanto` relies on `black` and `ruff` to format its source code
consistently. After you make changes, apply automatic style corrections and code verifications
that can't be automated in one go with:
```bash
make style
```
Once you're happy with your changes, add the changed files with `git add` and
record your changes locally with `git commit`:
```bash
git add modified_file.py
git commit
```
This repository uses a `rebase` strategy when merging pull-requests, meaning that your commits will **not** be squashed automatically.
We therefore request you to keep a tidy queue of commits in your pull-request, clearly communicating the changes you made in each commit.
**This is enforced by the continuous integration, so your pull-request will not be reviewed if your commit queue is not clean.**
Although this is not mandatory, we kindly ask you to consider using [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/#summary)
(here the full [specification](https://www.conventionalcommits.org/en/v1.0.0/))!
This article gives a brief [rationale](https://julien.ponge.org/blog/the-power-of-conventional-commits/) of why this will make our life and yours easier.
To keep your copy of the code up to date with the original
repository, rebase your branch on `upstream/main` *before* you open a pull request or if requested by a maintainer:
```bash
git fetch upstream
git rebase upstream/main
```
Before submitting, clean up your commit history to make it more readable for the reviewer (like squashing temporary commits and editing commit messages to clearly explain what you changed).
Push your changes to your branch:
```bash
git push -u origin a-descriptive-name-for-my-changes
```
If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.
6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md/#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
7. It's ok if maintainers request changes; it happens to our core contributors
too! So everyone can see the changes in the pull request, work in your local
branch and push the changes to your fork. They will automatically appear in
the pull request.
### Pull request checklist
☐ The pull request title should summarize your contribution.<br>
☐ If your pull request addresses an issue, please mention the issue number in the pull
request description to make sure they are linked (and people viewing the issue know you
are working on it).<br>
☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
☐ Make sure existing tests pass.<br>
☐ If adding a new feature, also add tests for it.<br>
☐ All public methods must have informative docstrings.<br>
### Tests
An extensive test suite is included to test the library behavior in the [tests](https://github.com/huggingface/optimum-quanto/tree/main/tests) folder.
From the root of the repository, specify a *path to a subfolder or a test file* to run the test.
```bash
python -m pytest -sv ./tests/<subfolder>/<test>.py
```
You can run all tests by typing:
```bash
make test
```
### Style guide
For documentation strings, `optimum-quanto` follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
Check `transformers` [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
for more information.
================================================
FILE: LICENSE
================================================
Copyright 2023 - The Hugging Face team. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: Makefile
================================================
# Developer shortcuts: `check` (lint report), `style` (auto-fix and format), `test` (pytest).
.PHONY: check test style
# Directories handed to ruff by the `check` and `style` targets.
check_dirs := optimum tests bench examples
# Report lint issues and print the formatting diff for ${check_dirs}.
check:
ruff check --show-fixes ${check_dirs}
ruff format ${check_dirs} --diff
# Apply ruff's automatic lint fixes, then reformat ${check_dirs}.
style:
ruff check ${check_dirs} --fix
ruff format ${check_dirs}
# Run the test suite located under `tests` with pytest.
test:
python -m pytest -sv tests
================================================
FILE: README.md
================================================
# Optimum Quanto
> This project is currently in maintenance mode. We accept pull requests only for minor bug fixes, documentation improvements, and other maintenance tasks. Major new features or breaking changes are unlikely to be merged. For production-ready quantization features or active development, consider alternative projects such as [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) or [torchAO](https://github.com/pytorch/ao).
🤗 Optimum Quanto is a pytorch quantization backend for [optimum](https://huggingface.co/docs/optimum/en/index).
It has been designed with versatility and simplicity in mind:
- all features are available in eager mode (works with non-traceable models),
- quantized models can be placed on any device (including CUDA and MPS),
- automatically inserts quantization and dequantization stubs,
- automatically inserts quantized functional operations,
- automatically inserts quantized modules (see below the list of supported modules),
- provides a seamless workflow from a float model to a dynamic to a static quantized model,
- serialization compatible with pytorch `weights_only` and 🤗 `safetensors`,
- accelerated matrix multiplications on CUDA devices (int8-int8, fp16-int4, bf16-int8, bf16-int4),
- supports int2, int4, int8 and float8 weights,
- supports int8 and float8 activations.
Features yet to be implemented:
- dynamic activations smoothing,
- kernels for all mixed matrix multiplications on all devices,
- compatibility with [torch compiler](https://pytorch.org/docs/stable/torch.compiler.html) (aka dynamo).
## Performances
In a nutshell:
- accuracy: models compiled with `int8`/`float8` weights and `float8` activations are very close to the full-precision models,
- latency: whenever optimized kernels are available, the inference latency of quantized models is comparable to that of the full-precision models when quantizing only the model weights,
- device memory: approximately divided by float bits / integer bits.
The paragraph below is just an example. Please refer to the `bench` folder for detailed results per use-case of model.
### meta-llama/Meta-Llama-3.1-8B
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/optimum-quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png" alt="meta-llama/Meta-Llama-3.1-8B WikiText perplexity">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/optimum-quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png" alt="meta-llama/Meta-Llama-3.1-8B Latency">
</div>
</center>
</div>
## Installation
Optimum Quanto is available as a pip package.
```sh
pip install optimum-quanto
```
## Quantization workflow for Hugging Face models
`optimum-quanto` provides helper classes to quantize, save and reload Hugging Face quantized models.
### LLM models
The first step is to quantize the model
```python
from transformers import AutoModelForCausalLM
from optimum.quanto import QuantizedModelForCausalLM, qint4
model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3-8B')
qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')
```
Note: the model quantized weights will be frozen. If you want to keep them unfrozen to train them you need to use `optimum.quanto.quantize` directly.
The quantized model can be saved using `save_pretrained`:
```python
qmodel.save_pretrained('./Llama-3-8B-quantized')
```
It can later be reloaded using `from_pretrained`:
```python
from optimum.quanto import QuantizedModelForCausalLM
qmodel = QuantizedModelForCausalLM.from_pretrained('Llama-3-8B-quantized')
```
### Diffusers models
You can quantize any of the submodels inside a diffusers pipeline and seamlessly include them later in another pipeline.
Here we quantize the `transformer` of a `Pixart` pipeline.
```python
from diffusers import PixArtTransformer2DModel
from optimum.quanto import QuantizedPixArtTransformer2DModel, qfloat8
model = PixArtTransformer2DModel.from_pretrained("PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", subfolder="transformer")
qmodel = QuantizedPixArtTransformer2DModel.quantize(model, weights=qfloat8)
qmodel.save_pretrained("./pixart-sigma-fp8")
```
Later, we can reload the quantized model and recreate the pipeline:
```python
import torch
from diffusers import PixArtSigmaPipeline
from optimum.quanto import QuantizedPixArtTransformer2DModel
transformer = QuantizedPixArtTransformer2DModel.from_pretrained("./pixart-sigma-fp8")
transformer.to(device="cuda")
pipe = PixArtSigmaPipeline.from_pretrained(
"PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
transformer=None,
torch_dtype=torch.float16,
).to("cuda")
pipe.transformer = transformer
```
## Quantization workflow for vanilla pytorch models (low-level API)
One thing to keep in mind when using the low-level quanto API is that by default model
weights are dynamically quantized: an explicit call must be made to 'freeze' the quantized weights.
A typical quantization workflow would consist of the following steps:
**1. Quantize**
The first step converts a standard float model into a dynamically quantized model.
```python
from optimum.quanto import quantize, qint8
quantize(model, weights=qint8, activations=qint8)
```
At this stage, only the inference of the model is modified to dynamically quantize the weights.
**2. Calibrate (optional if activations are not quantized)**
Quanto supports a calibration mode that records the activation ranges while representative samples are passed through the quantized model.
```python
from optimum.quanto import Calibration
with Calibration(momentum=0.9):
model(samples)
```
This automatically activates the quantization of the activations in the quantized modules.
**3. Tune, aka Quantization-Aware-Training (optional)**
If the performance of the model degrades too much, one can tune it for a few epochs to recover the float model performance.
```python
import torch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data).dequantize()
loss = torch.nn.functional.nll_loss(output, target)
loss.backward()
optimizer.step()
```
**4. Freeze integer weights**
When freezing a model, its float weights are replaced by quantized integer weights.
```python
from optimum.quanto import freeze
freeze(model)
```
**5. Serialize quantized model**
Quantized model weights can be serialized to a `state_dict`, and saved to a file.
Both `pickle` and `safetensors` (recommended) are supported.
```python
from safetensors.torch import save_file
save_file(model.state_dict(), 'model.safetensors')
```
In order to be able to reload these weights, you also need to store the quantized
model quantization map.
```python
import json
from optimum.quanto import quantization_map
with open('quantization_map.json', 'w') as f:
json.dump(quantization_map(model), f)
```
**6. Reload a quantized model**
A serialized quantized model can be reloaded from a `state_dict` and a `quantization_map` using the `requantize` helper.
Note that you need first to instantiate an empty model.
```python
import json
import torch
from safetensors.torch import load_file
from optimum.quanto import requantize
state_dict = load_file('model.safetensors')
with open('quantization_map.json', 'r') as f:
quantization_map = json.load(f)
# Create an empty model from your modeling code and requantize it
with torch.device('meta'):
new_model = ...
requantize(new_model, state_dict, quantization_map, device=torch.device('cuda'))
```
Please refer to the [examples](https://github.com/huggingface/quanto/tree/main/examples) for instantiations of that workflow.
## Design overview
### Tensors
At the heart of quanto is a Tensor subclass that corresponds to:
- the projection of a source Tensor into the optimal range for a given destination type,
- the mapping of projected values to the destination type.
For floating-point destination types, the mapping is done by the native pytorch cast (i.e. `Tensor.to()`).
For integer destination types, the mapping is a simple rounding operation (i.e. `torch.round()`).
The goal of the projection is to increase the accuracy of the conversion by minimizing the number of:
- saturated values (i.e. mapped to the destination type min/max),
- zeroed values (because they are below the smallest number that can be represented by the destination type)
The projection is symmetric per-tensor or per-channel for `int8` and `float8`, and group-wise affine (with a shift or 'zero-point') for lower bitwidth.
One of the benefits of using a lower-bitwidth representation is that you will be able to take advantage of accelerated operations
for the destination type, which are typically faster than their higher precision equivalents.
Quanto does not support the conversion of a Tensor using mixed destination types.
### Modules
Quanto provides a generic mechanism to replace `torch` modules by `optimum-quanto` modules that are able to process quanto tensors.
`optimum-quanto` modules dynamically convert their weights until a model is frozen, which slows down inference a bit but is
required if the model needs to be tuned.
Weights are usually quantized per-channel along the first dimension (output features).
Biases are not converted to preserve the accuracy of a typical `addmm` operation.
Explanation: to be consistent with the unquantized arithmetic operations, biases would need to be quantized with a scale that
is equal to the product of the input and weight scales, which leads to a ridiculously small scale, and conversely
requires a very high bitwidth to avoid clipping. Typically, with `int8` inputs and weights, biases would need to be quantized
with at least `12` bits, i.e. in `int16`. Since most biases are today `float16`, this is a waste of time.
Activations are dynamically quantized per-tensor using static scales (defaults to the range `[-1, 1]`).
To preserve accuracy, the model needs to be calibrated to evaluate the best activation scales (using a momentum).
The following modules can be quantized:
- [Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) (QLinear).
Weights are always quantized, and biases are not quantized. Inputs and outputs can be quantized.
- [Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) (QConv2D).
Weights are always quantized, and biases are not quantized. Inputs and outputs can be quantized.
- [LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html),
Weights and biases are __not__ quantized. Outputs can be quantized.
## Pitfalls to avoid when quantizing activations
Activations are always quantized per-tensor because most linear algebra operations in a model graph are not compatible
with per-axis inputs: you simply cannot add numbers that are not expressed in the same base (`you cannot add apples and oranges`).
Weights involved in matrix multiplications are, on the contrary, always quantized along their first axis, because all output features
are evaluated independently from one another.
The outputs of a quantized matrix multiplication will anyway always be dequantized, even if activations are quantized, because:
- the resulting accumulated values are expressed with a much higher bitwidth (typically `int32` or `float32`) than the activation bitwidth (typically `int8` or `float8`),
- they might be combined with a `float` bias.
Quantizing activations per-tensor to `int8` can lead to serious quantization errors if the corresponding tensors contain large outlier values.
Typically, this will lead to quantized tensors with most values set to zero (except the outliers).
A possible solution to work around that issue is to 'smooth' the activations statically as illustrated by [SmoothQuant](https://github.com/mit-han-lab/smoothquant).
You can find a script to smooth some model architectures under [external/smoothquant](external/smoothquant).
A better option is to represent activations using `float8`.
================================================
FILE: bench/generation/README.md
================================================
# Quanto generation benchmark
This repository contains scripts to evaluate the performances of quantized models using three metrics:
- `latency.py` evaluates the latency per generated token,
- `prediction.py` evaluates the accuracy when predicting the last token of prompts from the [Lambada dataset](https://huggingface.co/datasets/lambada),
- `perplexity.py` evaluates the perplexity of the model on the [WikiText dataset](https://huggingface.co/datasets/wikitext), as defined in the [transformers documentation](https://huggingface.co/docs/transformers/en/perplexity).
An `evaluate_configurations.py` utility script is also provided to evaluate a metric on a specific model for several quantization configurations, and output the result to a `png` barchart and/or a `json` file. The `evaluate_model.py` script evaluates a single configuration.
Note: the language modeling head (lm_head) of the tested models is not quantized.
The paragraphs below display results for some popular models on a NVIDIA A10 GPU.
## meta-llama/Meta-Llama-3.1-8B
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png" alt="meta-llama/Meta-llama-3.1-8B Lambada prediction accuracy">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png" alt="meta-llama/Meta-Llama-3.1-8B WikiText perplexity">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png" alt="meta-llama/Meta-Llama-3.1-8B Latency">
</div>
</center>
</div>
## mistralai/Mistral-7B-Instruct-v0.3
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png" alt="mistralai/Mistral-7B-Instruct-v0.3 Lambada prediction accuracy">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png" alt="mistralai/Mistral-7B-Instruct-v0.3 WikiText perplexity">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png" alt="mistralai/Mistral-7B-Instruct-v0.3 Latency">
</div>
</center>
</div>
## google/gemma-2b
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/google-gemma-2b_bf16_Accuracy.png" alt="google-gemma-2b Lambada prediction accuracy">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/google-gemma-2b_bf16_Perplexity.png" alt="google-gemma-2b WikiText perplexity">
</div>
</center>
</div>
<div class="row"><center>
<div class="column">
<img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/google-gemma-2b_bf16_Latency__ms_.png" alt="google-gemma-2b Latency">
</div>
</center>
</div>
================================================
FILE: bench/generation/evaluate_configurations.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import torch
from evaluate_model import evaluate
from gen_barchart import gen_barchart
from transformers import AutoConfig
from optimum.quanto import qtype
def evaluate_model_configurations(
    model_id: str, metric: str, device: torch.device, batch_size: int = 32, dtype: torch.dtype = torch.float16
):
    """Evaluate `metric` on the float model and on every quanto weights/activations pair.

    Args:
        model_id: identifier of the model, forwarded to `evaluate`.
        metric: name of the metric, forwarded to `evaluate`.
        device: device the evaluation runs on.
        batch_size: batch size forwarded to `evaluate`.
        dtype: float dtype of the model; torch.float16 is labelled "f16",
            anything else is labelled "bf16".

    Returns:
        A dict mapping configuration names ("W<weights>A<activations>", e.g. "Wi4Af8")
        to the value returned by `evaluate`, with the float baseline inserted first.
    """
    # Short tags used to build the configuration names
    tags = {
        "none": "f16" if dtype == torch.float16 else "bf16",
        "int4": "i4",
        "int8": "i8",
        "float8": "f8",
    }
    # The float16/bfloat16 baseline first, then the weights x activations grid
    configurations = [("none", "none")]
    configurations += [(w, a) for w in ("int4", "int8", "float8") for a in ("none", "float8")]

    results = {}
    for w, a in configurations:
        config_name = f"W{tags[w]}A{tags[a]}"
        print(f"{model_id}[{config_name}]:")
        results[config_name] = evaluate(model_id, metric, "quanto", w, a, batch_size, device, dtype)
    return results
def main():
    """Command-line entry point: benchmark one model across all quanto configurations.

    Parses the arguments, seeds torch, resolves the device and the float dtype, runs
    `evaluate_model_configurations`, then optionally dumps the results to
    "<model name>-<metric>.json" (--json) and/or renders a bar chart (--png).
    """
    parser = argparse.ArgumentParser(description="Evaluate quantized model predictions on Lambada Dataset")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the trained Model.",
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    parser.add_argument("--metric", type=str, default="prediction", choices=["latency", "prediction", "perplexity"])
    parser.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
    # Only these two values are meaningful: without `choices`, any typo was silently
    # interpreted as bfloat16.
    parser.add_argument(
        "--dtype",
        type=str,
        default=None,
        choices=["fp16", "bf16"],
        help="Use the following dtype to load the model.",
    )
    parser.add_argument("--json", action="store_true", help="Dump the results to a json file.")
    parser.add_argument("--png", action="store_true", help="Generate a PNG.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Pick the first available accelerator unless a device was explicitly requested
    if args.device is not None:
        device = torch.device(args.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    else:
        device = torch.device("cpu")

    if args.dtype is None:
        config = AutoConfig.from_pretrained(args.model)
        dtype = getattr(config, "torch_dtype", None)
        if dtype is None:
            # The attribute can be present but unset: fall back to float16 in that
            # case too, otherwise the results would be labelled as bfloat16.
            dtype = torch.float16
    else:
        dtype = {"fp16": torch.float16, "bf16": torch.bfloat16}[args.dtype]

    results = evaluate_model_configurations(args.model, args.metric, device, batch_size=args.batch_size, dtype=dtype)

    if args.json:
        model_name = args.model.split("/")[-1]
        json_path = f"{model_name}-{args.metric}.json"
        with open(json_path, "w") as fp:
            json.dump({model_name: results}, fp, indent=4)

    if args.png:
        # (chart title, vertical axis label) per metric; `choices` guarantees a match
        captions = {
            "latency": (f"{args.model}: Mean latency per token", "Latency (ms)"),
            "prediction": (f"{args.model}: Prediction accuracy on Lambada dataset", "Accuracy"),
            "perplexity": (f"{args.model}: Perplexity evaluated on WikiText dataset", "Perplexity"),
        }
        title, label = captions[args.metric]
        gen_barchart(args.model, title, label, results, dtype)


if __name__ == "__main__":
    main()
================================================
FILE: bench/generation/evaluate_many_models.sh
================================================
#!/bin/bash
# Run the prediction, perplexity and latency benchmarks for a fixed list of models,
# producing a PNG chart and a JSON file for each (model, metric) pair.

# Absolute path to this script, e.g. /home/user/bin/foo.sh
SCRIPT=$(readlink -f "$0")
# Absolute path this script is in, thus /home/user/bin
SCRIPT_PATH=$(dirname "$SCRIPT")

models=(
    google/gemma-2b
    meta-llama/Meta-Llama-3.1-8B
    mistralai/Mistral-7B-Instruct-v0.3
)

# Every expansion is quoted so that a checkout path containing spaces or glob
# characters is passed to python as a single argument.
for m in "${models[@]}"; do
    for metric in prediction perplexity latency; do
        python "${SCRIPT_PATH}/evaluate_configurations.py" --model "$m" --metric "$metric" --png --json --batch_size 16
    done
done
================================================
FILE: bench/generation/evaluate_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import importlib
import torch
from datasets import load_dataset
from metrics.latency import latency
from metrics.perplexity import perplexity
from metrics.prediction import prediction_accuracy
if importlib.util.find_spec("awq") is not None:
from setup.awq import setup as awq_setup
if importlib.util.find_spec("bitsandbytes") is not None:
from setup.bnb import setup as bnb_setup
if importlib.util.find_spec("hqq") is not None:
from setup.hqq import setup as hqq_setup
from setup.quanto import setup as quanto_setup
from transformers import AutoConfig
@torch.no_grad()
def calibrate(model, tokenizer, batch_size, batches):
    """Run validation samples of the lambada dataset through `model`.

    The model is switched to eval mode and fed tokenized, padded batches until at
    least `batch_size * batches` samples have been seen (or the split is exhausted).
    The model outputs are discarded.

    Args:
        model: the model to run; must expose a `device` attribute.
        tokenizer: callable turning a list of texts into `input_ids` / `attention_mask`.
        batch_size: number of texts per forward pass.
        batches: number of batches to process.
    """
    target = batch_size * batches
    dataset = load_dataset("lambada", split=["validation"])[0]
    model.eval()
    seen = 0
    for records in dataset.iter(batch_size=batch_size):
        encoded = tokenizer(records["text"], return_tensors="pt", padding=True)
        ids = encoded.input_ids.to(model.device)
        mask = encoded.attention_mask.to(model.device)
        model(ids, attention_mask=mask)
        # The last batch may be smaller, so count the actual number of rows
        seen += ids.size(0)
        if seen >= target:
            return
def evaluate(
    model_id: str,
    metric: str,
    quantizer: str,
    weights: str,
    activations: str,
    batch_size: int,
    device: torch.device,
    dtype: torch.dtype = None,
):
    """Set up a (possibly quantized) model with the requested backend and evaluate one metric.

    Args:
        model_id: identifier of the model to load.
        metric: one of "latency", "prediction" or "perplexity".
        quantizer: one of "quanto", "awq", "bnb" or "hqq". The awq, bnb and hqq setup
            helpers are only imported when the corresponding package is installed.
        weights: weights quantization type, or "none".
        activations: activations quantization type, or "none".
        batch_size: batch size used for the quanto setup and the prediction metric.
        device: device used for the setup and the latency metric.
        dtype: float dtype used by the quanto setup; when None it is read from the
            `torch_dtype` attribute of the model config (torch.float16 if absent).

    Returns:
        The value returned by the selected metric function.

    Raises:
        ValueError: if `metric` or `quantizer` is not supported.
    """
    # Fail fast: an unknown metric used to fall through and return None, but only
    # after the whole model had been loaded.
    if metric not in ("latency", "prediction", "perplexity"):
        raise ValueError(f"Unsupported metric {metric}")
    if quantizer == "quanto":
        if dtype is None:
            config = AutoConfig.from_pretrained(model_id)
            dtype = getattr(config, "torch_dtype", torch.float16)
        model, tokenizer = quanto_setup(model_id, weights, activations, batch_size, device, dtype)
    elif quantizer == "awq":
        model, tokenizer = awq_setup(model_id, weights, activations, group_size=128)
    elif quantizer == "bnb":
        model, tokenizer = bnb_setup(model_id, weights, activations, device)
    elif quantizer == "hqq":
        model, tokenizer = hqq_setup(model_id, weights, activations, device)
    else:
        raise ValueError(f"Unsupported quantizer {quantizer}")
    # Report the actual dtype of the loaded model for the non-quantized parts
    dtype = next(model.parameters()).dtype
    weights = dtype if weights == "none" else weights
    activations = dtype if activations == "none" else activations
    print(f"Evaluating {model_id} {metric} with {weights} weights and {activations} activations.")
    if metric == "latency":
        return latency(model, tokenizer, device, batch_size=1, prompt_length=512, nb_tokens=512, iterations=3)
    if metric == "prediction":
        return prediction_accuracy(model, tokenizer, batch_size)
    return perplexity(model, tokenizer)
def main():
    """Command-line entry point: evaluate a single metric for a single configuration.

    Parses the arguments, seeds torch, resolves the device (explicit `--device`,
    otherwise the first available of cuda, mps, xpu, then cpu) and calls `evaluate`.
    """
    parser = argparse.ArgumentParser(description="Evaluate quantized model metrics")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the trained Model.",
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    parser.add_argument("--metric", type=str, default="prediction", choices=["latency", "prediction", "perplexity"])
    parser.add_argument("--quantizer", type=str, default="quanto", choices=["quanto", "awq", "bnb", "hqq"])
    parser.add_argument(
        "--weights",
        type=str,
        default="none",
        choices=["none", "int4", "int8", "float8"],
    )
    parser.add_argument(
        "--activations",
        type=str,
        default="none",
        choices=["none", "int8", "float8"],
    )
    parser.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
    parser.add_argument(
        "--dtype",
        type=str,
        default="none",
        choices=["none", "fp16", "bf16"],
    )
    options = parser.parse_args()

    torch.manual_seed(options.seed)

    if options.device is not None:
        device = torch.device(options.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    else:
        device = torch.device("cpu")

    # "none" is absent from the table, so it resolves to None: `evaluate` then
    # takes the dtype from the model configuration.
    named_dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16}
    dtype = named_dtypes.get(options.dtype)

    evaluate(
        options.model,
        options.metric,
        options.quantizer,
        options.weights,
        options.activations,
        options.batch_size,
        device,
        dtype,
    )


if __name__ == "__main__":
    main()
================================================
FILE: bench/generation/gen_barchart.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import matplotlib.pyplot as plt
import numpy as np
import torch
def save_bar_chart(title, labels, ylabel, series, save_path):
    """Draw a grouped bar chart and save it to `save_path`.

    Args:
        title: chart title.
        labels: one tick label per group of bars.
        ylabel: label of the vertical axis.
        series: mapping from a legend entry to its list of values (one per group).
        save_path: destination passed to `plt.savefig`.
    """
    bar_width = 0.15
    positions = np.arange(len(labels))  # left edge reference of each group

    fig, ax = plt.subplots(layout="constrained")
    fig.set_figwidth(10)

    tallest = 0
    for rank, (name, values) in enumerate(series.items()):
        tallest = max(tallest, max(values))
        # Each series is shifted by one bar width inside its group
        bars = ax.bar(positions + bar_width * rank, values, bar_width, label=name)
        ax.bar_label(bars, padding=5)

    # Titles, axis label, ticks and legend
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(positions + bar_width, labels)
    ax.legend(loc="upper left", ncols=4)
    # Keep 20% of headroom above the tallest bar
    ax.set_ylim(0, tallest * 1.2)

    plt.savefig(save_path)
def gen_barchart(model_id, title, label, results, dtype):
    """Build the per-weight-type series for one model and save them as a bar chart.

    Args:
        model_id: Model identifier; ``/`` is replaced by ``-`` to build the file name.
        title: Chart title.
        label: Vertical axis label, also used (sanitized) in the file name.
        results: Mapping from ``W<weights>A<activations>`` keys to numeric results.
        dtype: ``torch.float16`` selects the ``f16`` keys, anything else selects ``bf16``.
    """
    dtype_str = "bf16"
    if dtype is torch.float16:
        dtype_str = "f16"
    activations = (dtype_str, "f8")

    # The unquantized reference is repeated so that it appears in every activation group.
    baseline = round(results[f"W{dtype_str}A{dtype_str}"], 2)
    series = {f"Weights {dtype_str}": [baseline for _ in activations]}
    for w in ("i4", "i8", "f8"):
        series[f"Weights {w}"] = [round(results[f"W{w}A{a}"], 2) for a in activations]

    model_name = model_id.replace("/", "-")
    # Spaces and parentheses are turned into underscores to get a file-name friendly metric.
    metric_name = label.translate(str.maketrans(" ()", "___"))
    save_bar_chart(
        title=title,
        labels=[f"Activations {a}" for a in activations],
        series=series,
        ylabel=label,
        save_path=f"{model_name}_{dtype_str}_{metric_name}.png",
    )
def main():
    """Parse the command line and generate one bar chart per model found in the benchmark file.

    The benchmark file is a JSON object mapping model identifiers to their results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", type=str, help="A benchmark result file (.json).")
    parser.add_argument("--title", type=str, required=True, help="The graph title.")
    parser.add_argument("--label", type=str, required=True, help="The graph vertical label.")
    parser.add_argument(
        "--dtype",
        type=str,
        default="fp16",
        choices=["fp16", "bf16"],
        help="The dtype of the unquantized reference results.",
    )
    args = parser.parse_args()
    # gen_barchart requires the reference dtype to select the matching result keys.
    dtype = {"fp16": torch.float16, "bf16": torch.bfloat16}[args.dtype]
    with open(args.benchmark) as f:
        benchmark = json.load(f)
    for model_id, results in benchmark.items():
        gen_barchart(model_id, args.title, args.label, results, dtype)
# Only generate the charts when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
================================================
FILE: bench/generation/metrics/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: bench/generation/metrics/latency.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import time
import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import GenerationConfig
def latency(model, tokenizer, device, batch_size=1, prompt_length=512, nb_tokens=512, iterations=10):
    """Measure the mean per-token generation latency of ``model`` on ``device``.

    A random prompt of shape ``(batch_size, prompt_length)`` is created once, then
    ``model.generate`` is timed ``iterations`` times with greedy decoding while being forced
    to emit exactly ``nb_tokens`` new tokens.

    As a side effect, ``model.generation_config.eos_token_id`` is set to ``None`` when the
    model has a generation config.

    Args:
        model: Object exposing ``generate`` and ``config.vocab_size``.
        tokenizer: Only its ``pad_token_id`` is read.
        device: Device object whose ``type`` selects the synchronization and timing backend.
        batch_size: Number of prompts generated in parallel.
        prompt_length: Number of tokens in each random prompt.
        nb_tokens: Number of new tokens generated per call.
        iterations: Number of timed ``generate`` calls.

    Returns:
        The mean duration of a ``generate`` call divided by ``nb_tokens``, in milliseconds.
    """

    def synchronize(device):
        # Wait for pending device work so the measured interval covers the whole generation.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()
        else:
            torch.cpu.synchronize()

    def timing_event(device):
        if device.type == "cuda":
            return torch.cuda.Event(enable_timing=True)
        elif device.type == "mps":
            return torch.mps.Event(enable_timing=True)
        elif device.type == "xpu":
            return torch.xpu.Event(enable_timing=True)

        class CPUEvent:
            """Host-clock stand-in offering the ``record``/``elapsed_time`` calls used below."""

            def __init__(self):
                self.time = None

            def record(self):
                # perf_counter is monotonic and high resolution, unlike time.time which can
                # jump when the system clock is adjusted.
                self.time = time.perf_counter()

            def elapsed_time(self, other):
                assert self.time is not None
                assert other.time is not None
                # Seconds to milliseconds.
                return (other.time - self.time) * 1000

        return CPUEvent()

    generation_config = GenerationConfig(
        max_new_tokens=nb_tokens,
        min_new_tokens=nb_tokens,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        num_beams=1,
        do_sample=False,
        eos_token_id=None,  # This is required for min_new_tokens to actually have an effect.
    )
    if getattr(model, "generation_config", None) is not None:
        # greedy_search falls back on this eos_token_id that we need to set to None as well
        # for min_new_tokens to have an effect.
        model.generation_config.eos_token_id = None

    synchronize(device)
    # Only the cuda and xpu backends are queried for peak memory statistics here.
    memory_backend = {"cuda": torch.cuda, "xpu": torch.xpu}.get(device.type)
    if memory_backend is not None:
        memory_backend.reset_peak_memory_stats()

    memory = get_device_memory(device)
    if memory is not None:
        print(f"Device memory: {memory / (2**30):.4f} GB")

    latencies = []
    input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device)
    masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device)

    for _ in tqdm(range(iterations)):
        start_event = timing_event(device)
        end_event = timing_event(device)
        synchronize(device)
        start_event.record()
        _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config)
        end_event.record()
        synchronize(device)
        latencies.append(start_event.elapsed_time(end_event))

    if memory_backend is not None:
        peak_memory = memory_backend.max_memory_allocated()
        print(f"Peak memory during benchmark: {peak_memory / (2**30):.4f} GB")

    mean_latency = np.mean(latencies) / generation_config.min_new_tokens
    print(f"Average latency per token: {mean_latency} ms")
    return mean_latency
def get_device_memory(device):
    """Return the memory currently allocated on ``device`` after freeing cached blocks.

    A garbage collection is always triggered first. For ``cuda``, ``mps`` and ``xpu`` devices
    the backend cache is emptied before reading the allocated amount; any other device type
    yields ``None``.
    """
    gc.collect()
    backend = device.type
    if backend == "cuda":
        torch.cuda.empty_cache()
        allocated = torch.cuda.memory_allocated()
    elif backend == "mps":
        torch.mps.empty_cache()
        allocated = torch.mps.current_allocated_memory()
    elif backend == "xpu":
        torch.xpu.empty_cache()
        allocated = torch.xpu.memory_allocated()
    else:
        allocated = None
    return allocated
================================================
FILE: bench/generation/metrics/perplexity.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm
class Perplexity:
    """
    A class for calculating the perplexity of a language model.
    """

    def __init__(self, model, tokenizer, dataset_path="wikitext", dataset_name=None, split="test", text_column="text"):
        """
        Calculate perplexity using the same method as seen in llama.cpp.

        The dataset is loaded and concatenated into a single string at construction time.

        Parameters
        ----------
        model : AutoModelForCausalLM
            The language model for which the perplexity is calculated.
        tokenizer : AutoTokenizer
            The tokenizer corresponding to the model.
        dataset_path : str, optional
            The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
        dataset_name : str, optional
            The name of the dataset. Default is None, in which case 'wikitext-2-raw-v1' is
            used when dataset_path is 'wikitext'.
        split : str, optional
            The split of the dataset to use. Default is 'test'.
        text_column : str, optional
            The name of the column in the dataset that contains the text data. Default is 'text'.
        """
        self._model = model
        self._tokenizer = tokenizer
        self._dataset_path = dataset_path
        self._dataset_name = dataset_name
        self._split = split
        self._text_column = text_column
        self._text = self._prepare_data()

    def _prepare_data(self):
        """
        Prepares the dataset by loading and formatting.

        Returns
        -------
        str
            The formatted dataset as a single string.
        """
        # Only fall back on the default wikitext configuration when the caller did not pick one.
        if self._dataset_path == "wikitext" and self._dataset_name is None:
            self._dataset_name = "wikitext-2-raw-v1"

        # Load the dataset
        data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
        # Format the text column of the dataset: empty rows become a blank line.
        text_list = [" \n" if s == "" else s for s in data[self._text_column]]
        return "".join(text_list)

    @staticmethod
    def softmax(logits):
        """
        Static method for applying the softmax function.

        Parameters
        ----------
        logits : np.ndarray
            The input to the softmax function.

        Returns
        -------
        np.ndarray
            The output of the softmax function.
        """
        # Subtracting the maximum keeps the exponentials from overflowing.
        e_x = np.exp(logits - np.max(logits))
        return e_x / e_x.sum(axis=0)

    def calculate_perplexity(self, n_ctx=512, n_batch=512):
        """
        Calculates the perplexity of the language model.

        Parameters
        ----------
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.

        Returns
        -------
        list
            The running perplexity after each processed context window.
        """
        # Tokenize the whole text at once.
        # NOTE(review): model_max_length is raised presumably to avoid length warnings or
        # truncation on the full corpus; the tokenizer is modified in place.
        self._tokenizer.model_max_length = sys.maxsize
        tokens = self._tokenizer(self._text, truncation=False, return_tensors="pt").input_ids.to(self._model.device)

        nll = 0.0  # Negative log likelihood
        count = 0  # Counter for processed tokens
        all_perplexity = []

        with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
            for i in progress:
                # Process each window of tokens
                nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)

                # Calculate and display the current perplexity
                curr_ppl = np.exp(nll / count)
                all_perplexity.append(curr_ppl)
                progress.set_description(f"Perplexity: {curr_ppl:.4f}")

        return all_perplexity

    def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
        """
        Processes each batch of tokens.

        Parameters
        ----------
        i : int
            The batch index.
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.
        tokens : torch.Tensor
            The tokenized text.
        nll : float
            The current negative log likelihood.
        count : int
            The current count of processed tokens.

        Returns
        -------
        float
            The updated negative log likelihood.
        int
            The updated count of processed tokens.

        Raises
        ------
        IndexError
            If the first batch of logits does not cover all scored positions.
        """
        start = i * n_ctx
        end = start + n_ctx

        num_batches = (n_ctx + n_batch - 1) // n_batch
        bos_token_id = self._tokenizer.bos_token_id

        logits = []
        for j in range(num_batches):
            batch_start = start + j * n_batch
            batch_size = min(end - batch_start, n_batch)

            token_org = tokens[0][batch_start].item()

            # Replace the first token with the BOS token (skipped for tokenizers without one,
            # since None cannot be stored in the token tensor).
            if j == 0 and bos_token_id is not None:
                tokens[0][batch_start] = bos_token_id

            # Compute the logits for the current batch of tokens
            batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)

            # Restore the original token so the shared tensor is left untouched.
            tokens[0][batch_start] = token_org

            logits.append(batch_logits)

        # We rely on the fact that attention in the forward pass only looks at previous
        # tokens here, so the logits returned for each token are an accurate representation
        # of what the model would have predicted at that point.
        #
        # Example, we have a context window of 512, we will compute perplexity for each of the
        # last 256 tokens. Then, we split the input up into context window size chunks to
        # process the entire prompt.
        first = min(512, n_ctx // 2)
        last = n_ctx - 1
        if first < last:
            # NOTE(review): only the logits of the first batch are read, so n_batch values
            # smaller than the scored range are not supported.
            if logits[0].shape[1] < last:
                raise IndexError(
                    f"The first batch only holds {logits[0].shape[1]} positions but positions up to {last - 1}"
                    " must be scored: n_batch must cover the context window."
                )
            # Upcast before leaving torch: half precision loses accuracy in the softmax and
            # bfloat16 tensors cannot be converted to numpy at all.
            window = logits[0][0, first:last].float()
            # The token following each scored position is the prediction target.
            targets = tokens[0, start + first + 1 : start + last + 1].to(device=window.device, dtype=torch.long)
            # log_softmax stays finite where an explicit softmax would underflow to zero.
            log_probs = torch.log_softmax(window, dim=-1)
            token_log_probs = log_probs.gather(1, targets.unsqueeze(1))

            # Update the negative log likelihood and the count of processed tokens
            nll += -token_log_probs.sum().item()
            count += targets.shape[0]

        return nll, count

    def _compute_batch_logits(self, tokens, batch_start, batch_size):
        """
        Computes the logits for a batch of tokens.

        Parameters
        ----------
        tokens : torch.Tensor
            The tokenized text.
        batch_start : int
            The start index of the batch.
        batch_size : int
            The size of the batch.

        Returns
        -------
        torch.Tensor
            The logits for the batch of tokens.
        """
        # Compute the logits without keeping track of gradients
        with torch.no_grad():
            outputs = self._model(tokens[:, batch_start : batch_start + batch_size])
        return outputs.logits.detach()
def perplexity(
    model,
    tokenizer,
    stride: int = 512,
):
    """Evaluate ``model`` with the default ``Perplexity`` dataset.

    ``stride`` is used as the context size, and the mean of the running perplexity values
    returned by ``Perplexity.calculate_perplexity`` is reported.
    """
    print("Evaluating perplexity")
    running_scores = Perplexity(model, tokenizer).calculate_perplexity(n_ctx=stride)
    return np.mean(running_scores)
================================================
FILE: bench/generation/metrics/prediction.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
from datasets import load_dataset
@torch.no_grad()
def prediction_accuracy(model, tokenizer, batch_size, samples=None):
    """Measure how often ``model`` predicts the last token of the lambada test sequences.

    Each batch is tokenized with padding, the last column of ``input_ids`` is used as the
    label and the remaining columns are fed to the model.
    NOTE(review): taking the last column as the label presumably requires a left-padding
    tokenizer; confirm against the callers.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; must expose ``device``.
        tokenizer: Tokenizer used to encode each batch of texts.
        batch_size: Number of sequences per batch.
        samples: Stop once at least this many sequences were evaluated (``None`` for all).

    Returns:
        The fraction of sequences whose last token was predicted correctly.
    """
    test_dataset = load_dataset("lambada", split=["test"])[0]
    model.eval()

    total = 0
    hit = 0
    start = time.time()
    for batch in test_dataset.iter(batch_size=batch_size):
        encoded = tokenizer(batch["text"], return_tensors="pt", padding=True)
        input_ids = encoded.input_ids.to(model.device)
        attention_mask = encoded.attention_mask.to(model.device)

        # The task is to predict the last token of the input from all the previous ones.
        context_ids, labels = input_ids[:, :-1], input_ids[:, -1]
        outputs = model(context_ids, attention_mask=attention_mask[:, :-1])
        preds = outputs.logits[:, -1, :].argmax(dim=-1)

        total += labels.size(0)
        hit += (preds == labels).sum().item()
        if samples is not None and total >= samples:
            break
    end = time.time()

    acc = hit / total
    print(f"{total} sequences evaluated in {end - start:.2f} s. accuracy = {acc:.2f}")
    return acc
================================================
FILE: bench/generation/setup/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
================================================
FILE: bench/generation/setup/awq.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
def prepare_inputs_for_generation(input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs):
    """Build the keyword arguments of one forward pass during generation.

    Args:
        input_ids: Token ids; sliced along dimension 1 to drop tokens already in the cache.
        past_key_values: Optional cache; ``past_key_values[0][0].shape[2]`` is read as the
            number of tokens already processed.
        attention_mask: Optional mask, also used to derive ``position_ids`` when they are
            not passed explicitly.
        inputs_embeds: Optional embeddings, only used when there is no cache yet.
        **kwargs: ``position_ids`` and ``use_cache`` are read from here.

    Returns:
        A dict holding either ``inputs_embeds`` or ``input_ids``, plus ``position_ids``,
        ``past_key_values``, ``use_cache`` and ``attention_mask``.
    """
    if past_key_values is not None:
        past_length = past_key_values[0][0].shape[2]

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
        # input)
        if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        # Masked (padding) positions get a dummy position of 1.
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs
def setup(model_id: str, weights: str, activations: str, group_size: int = 64, version="GEMV_FAST"):
if activations != "none":
raise ValueError("Activation quantization is not supported by HQQ")
if weights != "int4":
raise ValueError("AWQ only supports int4 weights.")
quant_config = {"zero_point": True, "q_group_size": group_size, "w_bit": 4, "version": version}
# Load model
model = AutoAWQForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
# Quantize
model.quantize(tokenizer, quant_config=quant_config)
# We need to save otherwise it doesn't work
quant_path = model_id.replace("/", "-") + f"_{group_size}_{version}"
model.save_quantized(quant_path)
# Reload model
model = AutoAWQForCausalLM.from_quantized(quant_path)
# Hack: force transformers 4.36.2 behaviour
model.model.prepare_inputs_for_generation = prepare_inputs_for_generation
# Hack because AWQ models are not transformers models
model.device = next(model.parameters()).device
return model, tokenizer
================================================
FILE: bench/generation/setup/bnb.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
def setup(
model_id: str,
weights: str,
activations: str,
device: torch.device,
):
if activations != "none":
raise ValueError("Activation quantization is not supported by BitsAndBytes")
if weights == "int4":
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="fp4")
elif weights == "int8":
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
else:
raise ValueError("BitsAndBytes only supports int4 and int8 weights.")
dtype = torch.float32 if device.type == "cpu" else torch.float16
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
quantization_config.bnb_4bit_compute_dtype = dtype
model = AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype=dtype, low_cpu_mem_usage=True, quantization_config=quantization_config
)
return model, tokenizer
================================================
FILE: bench/generation/setup/hqq.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from hqq.core.quantize import BaseQuantizeConfig
from hqq.engine.hf import HQQModelForCausalLM
from transformers import AutoTokenizer
def setup(model_id: str, weights: str, activations: str, device: torch.device, group_size: int = 64):
if activations != "none":
raise ValueError("Activation quantization is not supported by HQQ")
if weights == "int4":
quant_config = BaseQuantizeConfig(nbits=4, group_size=group_size)
elif weights == "int8":
quant_config = BaseQuantizeConfig(nbits=8, group_size=group_size)
else:
raise ValueError("HQQ only supports int4 and int8 weights.")
# Load model
model = HQQModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
# Quantize
model.quantize_model(quant_config=quant_config, compute_dtype=torch.float16, device=device)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = "left"
return model, tokenizer
================================================
FILE: bench/generation/setup/quanto.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.quanto import Calibration, freeze, qfloat8, qint4, qint8, quantize
@torch.no_grad()
def calibrate(model, tokenizer, batch_size, batches):
    """Run ``model`` on lambada validation texts until enough samples have been seen.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; must expose ``device``.
        tokenizer: Tokenizer used to encode each batch of texts with padding.
        batch_size: Number of sequences per forward pass.
        batches: Number of batches; the loop stops after ``batch_size * batches`` samples.
    """
    target_samples = batch_size * batches
    cal_dataset = load_dataset("lambada", split=["validation"])[0]
    model.eval()

    seen = 0
    for batch in cal_dataset.iter(batch_size=batch_size):
        encoded = tokenizer(batch["text"], return_tensors="pt", padding=True)
        input_ids = encoded.input_ids.to(model.device)
        attention_mask = encoded.attention_mask.to(model.device)
        # The outputs are discarded: only the forward pass itself matters here.
        model(input_ids, attention_mask=attention_mask)
        seen += input_ids.size(0)
        if seen >= target_samples:
            break
def setup(
    model_id: str,
    weights: str,
    activations: str,
    batch_size: int,
    device: torch.device,
    dtype: torch.dtype,
):
    """Load ``model_id`` on ``device`` and quantize it with quanto when requested.

    Args:
        model_id: Identifier of the model to load.
        weights: Weight quantization keyword understood by ``keyword_to_qtype``.
        activations: Activation quantization keyword understood by ``keyword_to_qtype``.
        batch_size: Batch size used for calibration when activations are quantized.
        device: Device the model is moved to.
        dtype: ``torch_dtype`` the model is loaded with.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    weights = keyword_to_qtype(weights)
    activations = keyword_to_qtype(activations)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, low_cpu_mem_usage=True).to(device)

    # Nothing to quantize: return the model as loaded.
    if weights is None and activations is None:
        return model, tokenizer

    print("Quantizing")
    start = time.time()
    # Quantize the inner ``model`` attribute when there is one, the whole model otherwise.
    quantization_root = getattr(model, "model", model)
    quantize(quantization_root, weights=weights, activations=activations)
    if activations is not None:
        print("Calibrating")
        with Calibration():
            calibrate(model, tokenizer, batch_size, batches=4)
    print("Freezing")
    freeze(model)
    print(f"Finished: {time.time() - start:.2f}")
    return model, tokenizer
def keyword_to_qtype(k):
    """Translate a command-line keyword into the matching quanto qtype.

    ``"none"`` maps to ``None``; an unknown keyword raises ``KeyError``.
    """
    qtypes = {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8}
    return qtypes[k]
================================================
FILE: bench/kernels/benchmark.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
from contextlib import nullcontext
import numpy as np
import torch
from tqdm.auto import tqdm
from optimum.quanto.library import disable_extensions
def get_unpack_bench(bits, device):
    """Return a zero-argument callable running ``torch.ops.quanto.unpack`` on random data.

    A ``10240 x 10240`` uint8 tensor with values in ``[0, 2**bits)`` is created once on
    ``device`` and reused by every call of the returned function.
    """
    packed = torch.randint(0, 2**bits, [10240, 10240], dtype=torch.uint8).to(device)
    return lambda: torch.ops.quanto.unpack(packed, bits)
def timing(get_bench_func, device, iterations=10):
    """Time a benchmark function with quanto extensions disabled, then enabled.

    Args:
        get_bench_func: Factory called with ``device`` that returns the zero-argument
            function to benchmark.
        device: Device object whose ``type`` selects the synchronization and timing backend.
        iterations: Number of timed runs for each of the two configurations.

    Returns:
        A ``(disabled_ms, enabled_ms)`` tuple: the mean duration in milliseconds measured
        under ``disable_extensions()`` and under a null context respectively.
    """

    def synchronize(device):
        # Wait for pending device work so the measured interval covers the whole call.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()
        else:
            torch.cpu.synchronize()

    def timing_event(device):
        if device.type == "cuda":
            return torch.cuda.Event(enable_timing=True)
        elif device.type == "mps":
            return torch.mps.Event(enable_timing=True)
        elif device.type == "xpu":
            return torch.xpu.Event(enable_timing=True)

        class CPUEvent:
            """Host-clock stand-in offering the ``record``/``elapsed_time`` calls used below."""

            def __init__(self):
                self.time = None

            def record(self):
                # perf_counter is monotonic and high resolution, unlike time.time which can
                # jump when the system clock is adjusted.
                self.time = time.perf_counter()

            def elapsed_time(self, other):
                assert self.time is not None
                assert other.time is not None
                # Seconds to milliseconds.
                return (other.time - self.time) * 1000

        return CPUEvent()

    synchronize(device)

    bench_func = get_bench_func(device)
    # Warmup to load library
    bench_func()
    latencies = np.empty((iterations, 2))
    for i in tqdm(range(iterations)):
        # Column 0: extensions disabled, column 1: default behaviour.
        for j, context in enumerate([disable_extensions(), nullcontext()]):
            start_event = timing_event(device)
            end_event = timing_event(device)
            synchronize(device)
            start_event.record()
            with context:
                bench_func()
            end_event.record()
            synchronize(device)
            latencies[i, j] = start_event.elapsed_time(end_event)
    return np.mean(latencies[:, 0]), np.mean(latencies[:, 1])
# Registry of the available kernel benchmarks: maps a kernel name to a factory that takes
# the target device and returns the zero-argument function to time.
GET_BENCH_FUNCTIONS = {
"unpack_2bit": lambda device: get_unpack_bench(2, device),
"unpack_4bit": lambda device: get_unpack_bench(4, device),
}
def main():
    """Benchmark the selected kernel(s) and print the timings with and without extensions.

    When no device is given, the first available of cuda, mps and xpu is used, falling back
    on cpu. When no kernel is given, every entry of ``GET_BENCH_FUNCTIONS`` is benchmarked.
    """
    parser = argparse.ArgumentParser(description="Kernel benchmark")
    parser.add_argument("--kernel", type=str, default=None, help="The kernel to benchmark. None to test all of them")
    parser.add_argument("--device", type=str, default=None, help="The device to use for benchmark.")
    parser.add_argument("--it", type=int, default=10, help="The number of benchmark iterations")
    args = parser.parse_args()

    if args.device is not None:
        device = torch.device(args.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    else:
        device = torch.device("cpu")

    kernels = [args.kernel] if args.kernel is not None else GET_BENCH_FUNCTIONS.keys()
    for kernel in kernels:
        python_ms, ext_ms = timing(GET_BENCH_FUNCTIONS[kernel], device, iterations=args.it)
        ratio = python_ms / ext_ms
        print(f"\n{kernel}[{device.type}]: python = {python_ms:.3f} ms, ext = {ext_ms:.3f} ms, ratio = {ratio:.1f}x")
# Only run the benchmark when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
================================================
FILE: bench/kernels/benchmark_marlin_fp8.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from typing import Optional
import numpy as np
import torch
from optimum.quanto.tensor.weights.marlin.packed import pack_fp8_as_int32
# Default problem sizes swept when no --m is given on the command line: every combination
# of m (rows of A), n (out_features) and k (in_features) is benchmarked.
M_SHAPES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
N_SHAPES = [4096]
K_SHAPES = [4096]
def run_benchmark(
    m: Optional[int],
    n: Optional[int],
    k: Optional[int],
    n_runs: int,
    n_warmup: int,
    dtype: torch.dtype = torch.float16,
):
    """Compare the Marlin FP8 GEMM kernel with a dequantize + ``torch.matmul`` baseline.

    Both computations are timed on cuda with device events at every run; the first
    ``n_warmup`` runs are excluded from the reported means.

    Args:
        m: Number of input rows (tokens).
        n: Number of output features.
        k: Number of input features.
        n_runs: Total number of runs, warmup included.
        n_warmup: Number of initial runs excluded from the statistics.
        dtype: Dtype of the inputs and scales.

    Returns:
        A ``(mean_latency_torch, mean_latency_marlin_fp8)`` tuple, in milliseconds.

    Raises:
        ValueError: If one of ``m``, ``n``, ``k`` is ``None``, or if no run would be
            measured because ``n_warmup >= n_runs``.
    """
    # Validate explicitly: an assert would be stripped under -O and only m used to be checked.
    if m is None or n is None or k is None:
        raise ValueError(f"m, n and k must all be specified, got m={m}, n={n}, k={k}.")
    if n_warmup >= n_runs:
        raise ValueError(f"n_runs ({n_runs}) must be greater than n_warmup ({n_warmup}) to measure anything.")

    print(f"\n----------- m={m}, n={n}, k={k}")
    n_tokens = m
    in_features = k
    out_features = n

    device = torch.device("cuda")
    inputs = torch.rand(n_tokens, in_features, dtype=dtype, device=device)

    other_shape = (in_features, out_features)
    other_data = torch.rand(other_shape, dtype=dtype, device=device).to(torch.float8_e4m3fn)
    other_data_int32 = pack_fp8_as_int32(other_data)
    perm = torch.empty(0, dtype=torch.int, device=device)

    other_data_repack = torch.ops.quanto.gptq_marlin_repack(
        b_q_weight=other_data_int32, perm=perm, size_k=in_features, size_n=out_features, num_bits=8
    )
    # A single random scale, repeated for every output feature.
    other_scale = torch.rand(1, dtype=dtype, device=device)
    other_scale = other_scale.repeat(1, out_features)

    workspace = torch.zeros(out_features // 64 * 16, dtype=torch.int, device=device)

    latencies_marlin_fp8 = []
    latencies_torch = []
    with torch.no_grad():
        for i in range(n_runs):
            # Marlin FP8 kernel.
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            torch.cuda.synchronize(device)
            start_event.record()

            _ = torch.ops.quanto.fp8_marlin_gemm(
                a=inputs,
                b_q_weight=other_data_repack,
                b_scales=other_scale,
                workspace=workspace,
                num_bits=8,
                size_m=n_tokens,
                size_n=out_features,
                size_k=in_features,
            )
            end_event.record()
            torch.cuda.synchronize(device)

            latency_ms = start_event.elapsed_time(end_event)
            if i >= n_warmup:
                latencies_marlin_fp8.append(latency_ms)

            # Baseline: the dequantization is part of the timed region.
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            torch.cuda.synchronize(device)
            start_event.record()
            other = other_data.to(dtype) * other_scale
            _ = torch.matmul(inputs, other)
            end_event.record()
            torch.cuda.synchronize(device)

            latency_ms = start_event.elapsed_time(end_event)
            if i >= n_warmup:
                latencies_torch.append(latency_ms)

    mean_latency_torch = np.mean(latencies_torch)
    mean_latency_marlin_fp8 = np.mean(latencies_marlin_fp8)
    print("mean_latency_torch:", mean_latency_torch)
    print("mean_latency_marlin_fp8:", mean_latency_marlin_fp8)

    return mean_latency_torch, mean_latency_marlin_fp8
# Script entry point: benchmark either the shape given on the command line or the whole
# default sweep, then print the results as CSV.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Marlin FP8 kernel benchmark")
    parser.add_argument("--nruns", type=int, default=20, help="The number of benchmark iterations")
    parser.add_argument("--nwarmup", type=int, default=2, help="The number of warmup iterations (deducted from nruns)")
    parser.add_argument("--m", type=int, help="m dimension of A=m*k", default=None)
    parser.add_argument("--n", type=int, help="n dimension of B=k*n (out_features)", default=None)
    parser.add_argument(
        "--k", type=int, help="k dimension of A=m*k and B=k*n (in_features), hidden_size", default=None
    )
    args = parser.parse_args()

    if args.m is not None:
        # A single user-provided shape.
        shapes = [(args.m, args.n, args.k)]
    else:
        # Every combination of the default sizes, with m varying slowest.
        shapes = [(m, n, k) for m in M_SHAPES for n in N_SHAPES for k in K_SHAPES]

    rows = ["m,n_out,k_in,torch_latency_ms,marlin_fp8_latency_ms"]
    for m, n, k in shapes:
        mean_latency_torch, mean_latency_marlin_fp8 = run_benchmark(m, n, k, args.nruns, args.nwarmup)
        rows.append(f"{m},{n},{k},{mean_latency_torch:.4f},{mean_latency_marlin_fp8:.4f}")
    result = "\n".join(rows) + "\n"

    print("\nResults:")
    print(result)
================================================
FILE: bench/kernels/benchmark_w4a16.py
================================================
# From: https://github.com/IST-DASLab/marlin/blob/master/bench.py
import argparse
import time
import torch
from optimum.quanto.tensor.weights.awq import AWQPackedTensor, AWQPacking
from optimum.quanto.tensor.weights.marlin import marlin_permute
from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4PackedTensor
def benchmark(f, warmup=1, iter=10):
    """Return the average duration in seconds of one call to `f` on the CUDA device.

    Args:
        f: a zero-argument callable submitting the work to benchmark.
        warmup: the number of untimed calls issued before the measurement (may be 0).
        iter: the number of timed calls the duration is averaged over.

    Returns:
        The elapsed time between the end of the warmup and the completion of all
        timed calls, divided by `iter`.
    """
    for _ in range(warmup):
        f()
    # Start the clock only once all warmup work has actually completed. Doing this outside
    # of the loop also covers warmup=0, which previously left the start time undefined.
    torch.cuda.synchronize()
    tick = time.perf_counter()
    for _ in range(iter):
        # We do not synchronize here in order to hide the kernel launch overhead during benchmarking as this will
        # also happen during realistic model inference as many launches are submitted to the kernel queue.
        f()
    torch.cuda.synchronize()
    res = (time.perf_counter() - tick) / iter
    # Make sure there is enough time to "cool down" the GPU in between benchmarks to avoid throttling for later
    # runs when we execute many benchmarks consecutively
    time.sleep(1.0)
    return res
def get_problem(m, n, k, groupsize=128):
    """Create the random operands of an (m, k) x (k, n) benchmark problem on "cuda:0".

    Returns:
        A tuple (A, B_ref, B_awq, B_marlin, s, s_marlin, sz, sz_marlin) holding the fp16
        activations, a random fp16 dense weight, the AWQ and Marlin packed versions of a
        random 4-bit weight, and the per-group scales and scaled zero-points in their
        original and marlin-permuted layouts.
    """
    device = torch.device("cuda:0")
    n_groups = k // groupsize
    activations = torch.rand((m, k), dtype=torch.half, device=device)
    # A single random 4-bit weight (values in [0, 16)) is packed for both kernels
    qweight = torch.randint(0, 2**4, (n, k), dtype=torch.uint8, device=device)
    awq_weight = AWQPackedTensor.pack(qweight, packing=AWQPacking.V2)._data
    marlin_weight = MarlinInt4PackedTensor.pack(qweight)._data
    # The dense reference weight is drawn independently of the quantized one
    dense_weight = torch.rand((k, n), dtype=torch.half, device=device)
    scales = torch.rand((n_groups, n), dtype=torch.half, device=device) / 2**4
    marlin_scales = marlin_permute(scales)
    zeropoints = torch.randint(-(2 ** (4 - 1)), 2 ** (4 - 1), (n_groups, n), dtype=torch.int8, device=device)
    scaled_zeropoints = -zeropoints * scales
    marlin_scaled_zeropoints = marlin_permute(scaled_zeropoints)
    # Make sure all operands are materialized before any benchmark starts
    torch.cuda.synchronize()
    return (
        activations,
        dense_weight,
        awq_weight,
        marlin_weight,
        scales,
        marlin_scales,
        scaled_zeropoints,
        marlin_scaled_zeropoints,
    )
def benchmark_dense(A, B, m, n, k):
    """Benchmark the dense `torch.matmul(A, B)` and derive its throughput figures.

    Returns:
        A dict with the average duration ("s") and the corresponding "TFLOP/s" and "GB/s".
    """
    elapsed = benchmark(lambda: torch.matmul(A, B))
    n_ops = 2 * A.numel() * n
    # Inputs and output are counted at 2 bytes per element
    n_bytes = 2 * A.numel() + 2 * B.numel() + 2 * (m * n)
    return {
        "s": elapsed,
        "TFLOP/s": n_ops / elapsed / 10**12,
        "GB/s": n_bytes / elapsed / 10**9,
    }
def benchmark_awq(A, B, s, sz, m, n, k):
    """Benchmark the quanto AWQ fp16 x int4 gemm kernel and derive its throughput figures.

    Returns:
        A dict with the average duration ("s") and the corresponding "TFLOP/s" and "GB/s".
    """

    def awq_gemm():
        # The kernel is always invoked for 4-bit weights with groups of 128
        return torch.ops.quanto.gemm_f16i4_awq(A, B, s, sz, rows=m, out_cols=n, in_cols=k, bits=4, group_size=128)

    elapsed = benchmark(awq_gemm)
    n_ops = 2 * (m * k) * n
    n_bytes = 2 * A.numel() + 2 * B.numel() + 2 * (m * n) + 2 * s.numel() + 2 * sz.numel()
    return {
        "s": elapsed,
        "TFLOP/s": n_ops / elapsed / 10**12,
        "GB/s": n_bytes / elapsed / 10**9,
    }
def benchmark_marlin(A, B, s, sz, m, n, k):
    """Benchmark the quanto Marlin fp16 x int4 gemm kernel and derive its throughput figures.

    Returns:
        A dict with the average duration ("s") and the corresponding "TFLOP/s" and "GB/s".
    """
    # Zero-initialized integer buffer handed to the kernel as its workspace
    workspace = torch.zeros(n // 128 * 16, dtype=torch.int, device=torch.device("cuda:0"))

    def marlin_gemm():
        return torch.ops.quanto.gemm_f16i4_marlin(A, B, s, sz, workspace)

    elapsed = benchmark(marlin_gemm)
    n_ops = 2 * (m * k) * n
    # The packed weights are counted at 4 bytes per element, everything else at 2
    n_bytes = 2 * A.numel() + 4 * B.numel() + 2 * (m * n) + 2 * s.numel() + 2 * sz.numel()
    return {
        "s": elapsed,
        "TFLOP/s": n_ops / elapsed / 10**12,
        "GB/s": n_bytes / elapsed / 10**9,
    }
# Matrix multiplication shapes benchmarked for each model configuration.
# Each entry is a (k, n) pair: run_benchmark unpacks it as `k, n = layer` before
# building the corresponding problem.
MODELS = {
    "Llama7B": [(4096, 3 * 4096), (4096, 4096), (4096, 2 * 10752), (10752, 4096)],
    "Llama13B": [(5120, 3 * 5120), (5120, 5120), (5120, 2 * 13568), (13568, 5120)],
    "Llama33B": [(6656, 3 * 6656), (6656, 6656), (6656, 2 * 17664), (17664, 6656)],
    "Llama65B": [(8192, 3 * 8192), (8192, 8192), (8192, 2 * 21760), (21760, 8192)],
    "Falcon180B": [
        # Note that parallel attention and FC allows layer fusions
        (14848, 14848 * 5 + 1024),
        (14848 * 5, 14848),
    ],
}
def _new_totals():
    """Return zeroed accumulators for the duration and the duration-weighted metrics."""
    return {"s": 0, "TFLOP/s": 0, "GB/s": 0, "speedup": 0}


def _add_layer_result(totals, res):
    """Add one layer result to `totals`, weighting each metric by the layer duration."""
    totals["s"] += res["s"]
    for key in totals:
        if key != "s":
            totals[key] += res[key] * res["s"]


def _normalize_totals(totals):
    """Turn the duration-weighted sums of `totals` into duration-weighted averages."""
    for key in totals:
        if key != "s":
            totals[key] /= totals["s"]


def _print_totals(kernel, tokens, totals):
    """Print the summary line of a kernel for a given number of input tokens."""
    print(
        "%s, tokens=%04d: s=%.5f, TFLOP/s=%07.3f, GB/s=%08.3f, speedup=%.2f"
        % (kernel, tokens, totals["s"], totals["TFLOP/s"], totals["GB/s"], totals["speedup"])
    )


def run_benchmark(model, tokens=None):
    """Benchmark the AWQ and Marlin kernels on all the layers of a model configuration.

    For each number of input tokens, every layer of the model is benchmarked with a dense
    matmul (the reference for the speedup), the AWQ kernel and the Marlin kernel. The
    summary printed for each kernel is the total duration over the layers, together with
    the duration-weighted averages of the other metrics.

    Args:
        model: a key of MODELS.
        tokens: the number of input tokens, a list/tuple of them, or None to use a
            predefined range.

    Raises:
        KeyError: if `model` is not a key of MODELS.
    """
    if tokens is None:
        tokens = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
    elif not isinstance(tokens, (list, tuple)):
        tokens = [tokens]
    groupsize = 128
    layers = MODELS[model]
    print(model)
    for m in tokens:
        tot_awq = _new_totals()
        tot_marlin = _new_totals()
        for k, n in layers:
            A, B_ref, B_awq, B_marlin, s, s_marlin, sz, sz_marlin = get_problem(m, n, k, groupsize)
            res_d = benchmark_dense(A, B_ref, m, n, k)

            res_awq = benchmark_awq(A, B_awq, s, sz, m, n, k)
            res_awq["speedup"] = res_d["s"] / res_awq["s"]
            _add_layer_result(tot_awq, res_awq)

            res_marlin = benchmark_marlin(A, B_marlin, s_marlin, sz_marlin, m, n, k)
            res_marlin["speedup"] = res_d["s"] / res_marlin["s"]
            _add_layer_result(tot_marlin, res_marlin)
        _normalize_totals(tot_awq)
        _normalize_totals(tot_marlin)
        # Both lines report the same quantity, so they share the same "tokens=" label
        _print_totals("AWQ", m, tot_awq)
        _print_totals("Marlin", m, tot_marlin)
def main():
    """Parse the command line and benchmark the selected model configuration(s)."""
    parser = argparse.ArgumentParser(description="W4A16 Matrix Multiplication Kernel benchmark")
    parser.add_argument(
        "--model",
        type=str,
        default=None,
        # Reject unknown names at parsing time instead of failing later with a KeyError
        choices=list(MODELS),
        help="The model configuration to benchmark. None to test all of them.",
    )
    parser.add_argument(
        "--tokens",
        type=int,
        default=None,
        help="The numbers of input tokens used to benchmark. None to test a predefined range.",
    )
    args = parser.parse_args()
    models = MODELS if args.model is None else [args.model]
    for model in models:
        run_benchmark(model, args.tokens)
        print()


if __name__ == "__main__":
    main()
================================================
FILE: bench/torch_kernels/README.md
================================================
This directory contains a few scripts to test PyTorch kernels that are relevant for quantization.
================================================
FILE: bench/torch_kernels/test_int_mm.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import timeit
import torch
def main():
    """Compare the average latency of `torch._int_mm` with a floating point `torch.matmul`.

    The device is selected from the command line, or automatically (cuda, mps, xpu, then
    cpu) when none is specified. Both matmuls use the same operands, converted to float
    for the second measurement.
    """
    parser = argparse.ArgumentParser(description="Torch integer matmul benchmark")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--device", type=str, default=None, help="The device to use for the test.")
    parser.add_argument("--it", type=int, default=100, help="Number of iterations for average")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    def synchronize():
        # Accelerators execute kernels asynchronously: wait for all queued work to complete
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize(device)

    def avg_time(f, it):
        def run():
            for _ in range(it):
                f()
            # Include the completion of the queued kernels in the measurement,
            # otherwise only their submission would be timed
            synchronize()

        synchronize()
        return timeit.Timer(run).timeit(1) / it

    # Restrictions for accelerated integer matmul:
    # - input matrices must be 2D
    # - the collapsing dimension must be a multiple of 8
    A = torch.randint(1, 10, [2400, 3200]).type(torch.int8).to(device)
    B = torch.randint(1, 10, [3200, 4800]).type(torch.int8).to(device)

    print(f"Evaluating integer matmul on {device.type}:")
    # Warmup (slow)
    torch._int_mm(A, B)
    # Average on several calls
    t = avg_time(lambda: torch._int_mm(A, B), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")

    # Convert inputs to float
    def to_float(x):
        if x.device.type == "cpu":
            # matrix multiplication is not supported for float16 on CPU
            return x.to(torch.float32)
        return x.to(torch.float16)

    A = to_float(A)
    B = to_float(B)
    print(f"Evaluating {A.dtype} matmul on {device.type}:")
    # Warmup (slow)
    torch.matmul(A, B)
    # Average on several calls
    t = avg_time(lambda: torch.matmul(A, B), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")


if __name__ == "__main__":
    main()
================================================
FILE: bench/torch_kernels/test_int_mm_inductor.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import timeit
import torch
def mm(a, b):
    """Integer matrix multiplication, wrapped in a function so it can be compiled."""
    return torch._int_mm(a, b)


def avg_time(f, it):
    """Return the average duration in seconds of `it` calls to `f`, including kernel completion."""

    def run():
        for _ in range(it):
            f()
        # CUDA kernels are launched asynchronously: wait for them to complete,
        # otherwise only their submission would be timed
        torch.cuda.synchronize()

    torch.cuda.synchronize()
    return timeit.Timer(run).timeit(1) / it


A = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda()
B = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda()
it = 100

# Warmup (slow)
mm(A, B)
# Get a reference
print(avg_time(lambda: mm(A, B), it))

cmm = torch.compile(mm, backend="inductor")
# First invocation will trigger the actual compilation
cmm(A, B)
# Now compare execution time
print(avg_time(lambda: cmm(A, B), it))
================================================
FILE: bench/torch_kernels/test_weight_int4pack_mm.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import timeit
import torch
def _group_quantize_tensor(w, n_bit=4, q_group_size=16):
assert w.dim() == 2
w = w.transpose(0, 1).contiguous()
assert q_group_size > 1
assert w.shape[-1] % q_group_size == 0
to_quant = w.reshape(-1, q_group_size)
assert torch.isnan(to_quant).sum() == 0
max_val = to_quant.amax(dim=1, keepdim=True)
min_val = to_quant.amin(dim=1, keepdim=True)
max_int = 2**n_bit - 1
min_int = 0
scales = (max_val - min_val).clamp(min=1e-6) / max_int
assert torch.isnan(scales).sum() == 0
zeros = min_val + scales * (2 ** (n_bit - 1))
assert torch.isnan(zeros).sum() == 0
out = to_quant.sub(min_val).div(scales).round().clamp_(min_int, max_int)
assert torch.isnan(out).sum() == 0
out = out.to(dtype=torch.int32).reshape(w.shape)
# Scales and zeros for the same q-group should be contiguous, so we can
# load as a 32-bit word
scales = scales.view(w.shape[0], -1)
zeros = zeros.view(w.shape[0], -1)
scales_and_zeros = (
torch.cat(
[
scales.reshape(scales.size(0), scales.size(1), 1),
zeros.reshape(zeros.size(0), zeros.size(1), 1),
],
2,
)
.transpose(0, 1)
.contiguous()
)
return out, scales_and_zeros
def main():
    """Compare the average latency of the torch int4 weight matmul with a float `torch.mm`.

    The device is selected from the command line, or automatically (cuda, mps, xpu, then
    cpu) when none is specified. The relative error of the quantized matmul with respect
    to the float one is printed before the latencies.
    """
    parser = argparse.ArgumentParser(description="Torch quantized int4 weight matmul benchmark")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "bf16"], help="floating point type")
    parser.add_argument("--device", type=str, default=None, help="The device to use for the test.")
    parser.add_argument("--it", type=int, default=10, help="Number of iterations for average")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    def synchronize():
        # Accelerators execute kernels asynchronously: wait for all queued work to complete
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize(device)

    def avg_time(f, it):
        def run():
            for _ in range(it):
                f()
            # Include the completion of the queued kernels in the measurement,
            # otherwise only their submission would be timed
            synchronize()

        synchronize()
        return timeit.Timer(run).timeit(1) / it

    dtype = {"fp16": torch.float16, "bf16": torch.bfloat16}[args.dtype]
    A = torch.rand([2400, 3200], dtype=dtype, device=device)
    B = torch.rand([3200, 4800], dtype=dtype, device=device)
    group_size = 128
    B_int32, B_scale_and_zeros = _group_quantize_tensor(B, n_bit=4, q_group_size=group_size)
    # Packing and matmul ops are device-specific: select them once
    if device.type == "cpu":
        B_packed = torch._convert_weight_to_int4pack_for_cpu(B_int32, innerKTiles=2)
        int4pack_mm = torch._weight_int4pack_mm_for_cpu
    else:
        # Two consecutive 4-bit values are combined into a single byte before packing
        B_uint8 = (B_int32[::, ::2] << 4 | B_int32[::, 1::2]).to(torch.uint8)
        B_packed = torch._convert_weight_to_int4pack(B_uint8, innerKTiles=2)
        int4pack_mm = torch._weight_int4pack_mm

    # Check quantized mm is close to float mm
    qout = int4pack_mm(A, B_packed, group_size, B_scale_and_zeros)
    out = torch.mm(A, B)
    mean_err = ((qout - out).abs() / out.abs()).mean()
    print(mean_err)

    print(f"Evaluating quantized int4 matmul on {device.type}:")
    # Warmup (slow)
    int4pack_mm(A, B_packed, group_size, B_scale_and_zeros)
    # Average on several calls
    t = avg_time(lambda: int4pack_mm(A, B_packed, group_size, B_scale_and_zeros), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")

    print(f"Evaluating {A.dtype} matmul on {device.type}:")
    # Warmup (slow)
    torch.mm(A, B)
    # Average on several calls
    t = avg_time(lambda: torch.mm(A, B), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")


if __name__ == "__main__":
    main()
================================================
FILE: bench/torch_kernels/test_weight_int8pack_mm.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import timeit
import torch
def main():
    """Compare the average latency of the torch int8 weight matmul with a bfloat16 matmul.

    The device is selected from the command line, or automatically (cuda, mps, xpu, then
    cpu) when none is specified. The float reference multiplies by the weights converted
    to bfloat16, then applies the scale to the outputs.
    """
    parser = argparse.ArgumentParser(description="Torch quantized int8 weight matmul benchmark")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--device", type=str, default=None, help="The device to use for the test.")
    parser.add_argument("--it", type=int, default=10, help="Number of iterations for average")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    def synchronize():
        # Accelerators execute kernels asynchronously: wait for all queued work to complete
        if device.type == "cuda":
            torch.cuda.synchronize(device)
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize(device)

    def avg_time(f, it):
        def run():
            for _ in range(it):
                f()
            # Include the completion of the queued kernels in the measurement,
            # otherwise only their submission would be timed
            synchronize()

        synchronize()
        return timeit.Timer(run).timeit(1) / it

    A = torch.rand([2400, 3200], dtype=torch.bfloat16, device=device)
    B = torch.randint(-128, 127, [4800, 3200], dtype=torch.int8, device=device)
    B_scale = torch.rand([4800], dtype=torch.bfloat16, device=device)

    print(f"Evaluating quantized int8 matmul on {device.type}:")
    # Warmup (slow)
    torch._weight_int8pack_mm(A, B, B_scale)
    # Average on several calls
    t = avg_time(lambda: torch._weight_int8pack_mm(A, B, B_scale), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")

    # Convert weights to float
    B = B.to(torch.bfloat16).t()
    print(f"Evaluating {A.dtype} matmul on {device.type}:")
    # Warmup (slow)
    torch.matmul(A, B) * B_scale
    # Average on several calls
    t = avg_time(lambda: torch.matmul(A, B) * B_scale, args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")


if __name__ == "__main__":
    main()
================================================
FILE: examples/nlp/text-classification/sst2/quantize_sst2_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import io
import time
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset
from optimum.quanto import Calibration, freeze, qint4, qint8, quantize
def evaluate_model(model, tokenizer, dataset, device, batch_size):
    """Run sentiment analysis on the dataset sentences and print the duration and accuracy.

    Predictions labelled "NEGATIVE" map to class 0 and all others to class 1 before being
    compared to the dataset "label" column.
    """
    classifier = pipeline("sentiment-analysis", model, tokenizer=tokenizer, device=device)
    results = classifier(KeyDataset(dataset, "sentence"), batch_size=batch_size)
    # NOTE(review): the timer brackets the iteration over the results, presumably because
    # the pipeline yields them lazily when fed a dataset - confirm.
    start = time.time()
    pred_labels = []
    for result in results:
        pred_labels.append(0 if result["label"] == "NEGATIVE" else 1)
    end = time.time()
    n_correct = np.sum(np.equal(pred_labels, dataset["label"]))
    accuracy = n_correct / len(pred_labels)
    print(f"{len(pred_labels)} sentences evaluated in {end - start:.2f} s. accuracy = {accuracy}")
def keyword_to_itype(k):
    """Return the quanto type matching a command-line keyword (None for "none").

    Raises:
        KeyError: if the keyword is not supported.
    """
    itypes = {
        "none": None,
        "int8": qint8,
        "int4": qint4,
    }
    return itypes[k]
def main():
    """Quantize a sequence classification model and evaluate it on SST2.

    The model is evaluated in float, then quantized (with an activations calibration pass
    if required), frozen and evaluated again. Finally, its state dict is serialized to
    memory and reloaded into a freshly quantized model that is evaluated in turn.
    """
    parser = argparse.ArgumentParser(description="Transformers SST2 Example")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="distilbert-base-uncased-finetuned-sst-2-english",
        help="The name of the trained Model.",
    )
    parser.add_argument("--samples", type=int, default=872, help="The number of sst2 samples to use for evaluation.")
    parser.add_argument("--batch_size", type=int, default=100, help="The batch size to use for evaluation.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8"])
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    model = AutoModelForSequenceClassification.from_pretrained(args.model).to(device)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    dataset = load_dataset("sst2", split=f"validation[:{args.samples}]")

    print("Float model")
    evaluate_model(model, tokenizer, dataset, device, args.batch_size)

    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        print("Calibrating ...")
        with Calibration():
            evaluate_model(model, tokenizer, dataset, device, args.batch_size)
    freeze(model)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    evaluate_model(model, tokenizer, dataset, device, args.batch_size)

    # Round-trip the quantized state dict through an in-memory buffer
    b = io.BytesIO()
    torch.save(model.state_dict(), b)
    b.seek(0)
    state_dict = torch.load(b)
    model_reloaded = AutoModelForSequenceClassification.from_pretrained(args.model).to(device)
    quantize(model_reloaded, weights=weights, activations=activations)
    model_reloaded.load_state_dict(state_dict)
    print("Serialized quantized model")
    # Evaluate the reloaded model (and not the original one) to validate the serialization
    evaluate_model(model_reloaded, tokenizer, dataset, device, args.batch_size)


if __name__ == "__main__":
    main()
================================================
FILE: examples/nlp/text-generation/quantize_causal_lm_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from optimum.quanto import Calibration, QuantizedModelForCausalLM, qfloat8, qint4, qint8
@torch.no_grad()
def generate(model, tokenizer, device, prompt, max_new_tokens):
    """Sample a completion of `prompt` with the model and print it with the elapsed time."""
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    start = time.time()
    # The transfer of the inputs to the device is part of the measured duration
    generation_kwargs = {
        "input_ids": encoded.input_ids.to(device),
        "max_new_tokens": max_new_tokens,
        "attention_mask": encoded.attention_mask.to(device),
        "do_sample": True,
        "top_k": 50,
        "top_p": 0.9,
    }
    outputs = model.generate(**generation_kwargs)
    elapsed = time.time() - start
    # Only the first sequence is decoded
    generated_text = tokenizer.decode(outputs[0])
    print(f"Generated '{generated_text}' in [{elapsed:.2f} s]")
@torch.no_grad()
def calibrate(model, tokenizer, dataset, device, batch_size, samples=None):
    """Feed batches of the dataset "text" column to the model.

    Args:
        samples: stop once at least that many samples have been processed, or go through
            the whole dataset if None.
    """
    model.eval()
    n_seen = 0
    for batch in dataset.iter(batch_size=batch_size):
        encoded = tokenizer(batch["text"], return_tensors="pt", padding=True)
        input_ids = encoded.input_ids.to(device)
        model(input_ids, attention_mask=encoded.attention_mask.to(device))
        n_seen += input_ids.size(0)
        if samples is None:
            continue
        if n_seen >= samples:
            break
def keyword_to_itype(k):
    """Return the quanto type matching a command-line keyword (None for "none").

    Raises:
        KeyError: if the keyword is not supported.
    """
    itypes = {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8}
    return itypes[k]
def main():
    """Quantize a causal language model, calibrate it if required and generate text.

    When activations are quantized, the model is calibrated on shuffled samples of the
    lambada validation set before generating a completion of the prompt.
    """
    parser = argparse.ArgumentParser(description="Transformers Causal LM Example")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the trained Model.",
    )
    parser.add_argument("--prompt", type=str, default="One of my fondest memory is", help="The generation prompt.")
    parser.add_argument("--max_new_tokens", type=int, default=20, help="The maximum number of tokens to generate.")
    parser.add_argument("--batch_size", type=int, default=32, help="The batch_size for evaluation (and calibration).")
    parser.add_argument("--validation_batch", type=int, default=4, help="The number of batch to use for calibration.")
    parser.add_argument(
        "--load_dtype",
        type=str,
        default="float16",
        choices=["float16", "float32", "bfloat16"],
        help="Precision to load the initial model",
    )
    parser.add_argument(
        "--weights",
        type=str,
        default="int8",
        choices=["int4", "int8", "float8"],
    )
    parser.add_argument(
        "--activations",
        type=str,
        default="int8",
        choices=["none", "int8", "float8"],
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    # With "store_false", args.no_streamline is True unless the flag is passed: it can
    # therefore be used directly as the value of the streamline option.
    parser.add_argument(
        "--no-streamline",
        action="store_false",
        help="Do not remove consecutive quantize/dequantize (not recommended).",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Provide detailed feedback on the console during calibration."
    )
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    torch_dtype = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }[args.load_dtype]
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch_dtype, low_cpu_mem_usage=True).to(
        device
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    cal_dataset = load_dataset("lambada", split=["validation"])[0]

    print(f"{args.model} (w: {args.weights}, a: {args.activations})")
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    qmodel = QuantizedModelForCausalLM.quantize(model, weights=weights, activations=activations)
    if activations is not None:
        print("Calibrating ...")
        # Dataset.shuffle returns a new dataset: its result must be kept, otherwise
        # calibration would always use the first samples in their original order.
        cal_dataset = cal_dataset.shuffle(seed=args.seed)
        with Calibration(streamline=args.no_streamline, debug=args.debug):
            cal_samples = args.batch_size * args.validation_batch
            calibrate(qmodel, tokenizer, cal_dataset, device, args.batch_size, samples=cal_samples)
    generate(qmodel, tokenizer, device, args.prompt, args.max_new_tokens)


if __name__ == "__main__":
    main()
================================================
FILE: examples/speech/speech_recognition/quantize_asr_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# REQUIRES: librosa, soundfile
import argparse
import io
import time
from functools import partial
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from evaluate import load
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from optimum.quanto import Calibration, freeze, qint4, qint8, quantize
def map_to_feats(batch, processor):
    """Add the model "input_features" and the normalized "reference" text to a sample."""
    audio = batch["audio"]
    processed = processor(audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt")
    batch["input_features"] = processed.input_features
    batch["reference"] = processor.tokenizer.normalize(batch["text"])
    return batch
def transcribe_batch(batch, model, processor):
    """Transcribe a batch of input features and store the normalized text as "prediction"."""
    with torch.no_grad():
        features = np.array(batch["input_features"], dtype=np.float32)
        # Drop the dimension of size one at index 1 before feeding the model
        features = torch.from_numpy(features).squeeze(1)
        predicted_ids = model.generate(features.to(model.device))
    transcriptions = list(map(processor.decode, predicted_ids))
    batch["prediction"] = list(map(processor.tokenizer.normalize, transcriptions))
    return batch
def evaluate_model(model, processor, dataset, metric: evaluate.EvaluationModule, batch_size=10):
    """Transcribe the whole dataset and print the duration and the metric as a percentage."""
    transcribe = partial(transcribe_batch, model=model, processor=processor)
    start = time.time()
    result = dataset.map(transcribe, batched=True, batch_size=batch_size)
    elapsed = time.time() - start
    raw_score = metric.compute(references=result["reference"], predictions=result["prediction"])
    score = 100 * raw_score
    print(score)
    print(f"{len(result)} sentences evaluated in {elapsed:.2f} s. {metric.name} = {score}")
def keyword_to_itype(k):
    """Return the quanto type matching a command-line keyword (None for "none").

    Raises:
        KeyError: if the keyword is not supported.
    """
    itypes = {
        "none": None,
        "int8": qint8,
        "int4": qint4,
    }
    return itypes[k]
def main():
    """Quantize a Whisper model and evaluate its word error rate on a librispeech subset.

    The model is evaluated in float, then quantized (with an activations calibration pass
    if required), frozen and evaluated again. Finally, its state dict is serialized to
    memory and reloaded into a freshly quantized model that is evaluated in turn.
    """
    parser = argparse.ArgumentParser(description="Transformers Whisper Example")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="openai/whisper-medium",
        help="The name of the trained Model.",
    )
    # NOTE(review): this argument is parsed but never read below - confirm whether the
    # dataset should be truncated accordingly.
    parser.add_argument(
        "--samples", type=int, default=872, help="The number of librispeech samples to use for evaluation."
    )
    parser.add_argument("--batch_size", type=int, default=10, help="The batch size to use for evaluation.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8"])
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print("USING CUDA")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")
            print("USING CPU")
    else:
        device = torch.device(args.device)

    model = WhisperForConditionalGeneration.from_pretrained(args.model).to(device)
    model.config.forced_decoder_ids = None
    processor = WhisperProcessor.from_pretrained(args.model)
    dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    processed_dataset = dataset.map(lambda x: map_to_feats(x, processor))
    wer = load("wer")

    print("Float model:")
    evaluate_model(model, processor, processed_dataset, wer, args.batch_size)

    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        print("Calibrating ...")
        with Calibration():
            evaluate_model(model, processor, processed_dataset, wer, args.batch_size)
    freeze(model)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    evaluate_model(model, processor, processed_dataset, wer, args.batch_size)

    # Round-trip the quantized state dict through an in-memory buffer
    b = io.BytesIO()
    torch.save(model.state_dict(), b)
    b.seek(0)
    state_dict = torch.load(b)
    model_reloaded = WhisperForConditionalGeneration.from_pretrained(args.model).to(device)
    quantize(model_reloaded, weights=weights, activations=activations)
    model_reloaded.load_state_dict(state_dict)
    print("Serialized quantized model")
    # Evaluate the reloaded model (and not the original one) to validate the serialization
    evaluate_model(model_reloaded, processor, processed_dataset, wer, args.batch_size)


if __name__ == "__main__":
    main()
================================================
FILE: examples/speech/speech_recognition/requirements.txt
================================================
transformers
evaluate
librosa
soundfile
jiwer
================================================
FILE: examples/vision/StableDiffusion/README.md
================================================
# Quantize Stable Diffusion examples
## Running locally with PyTorch
### Installing the dependencies
Before running the scripts, make sure to install the library's training dependencies:
**Important**
To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
```bash
git clone https://github.com/huggingface/quanto
cd quanto
pip install -e .
```
Then `cd` into the `examples/vision/StableDiffusion` folder and run:
```bash
pip install -r requirements.txt
```
**Now, we can launch the image generation script:**
```bash
python quantize_StableDiffusion.py --batch_size=1 --torch_dtype="fp32"
```
The script accepts, among others, the following flags:
* `batch_size`: the number of images generated per prompt.
* `torch_dtype` {fp32,fp16,bf16}
* `unet_qtype` {fp8,int8,int4,none}
Our experiments were conducted on a single 24GB A10 GPU.
```bash
fp16-fp16
batch_size: 1, torch_dtype: fp16, unet_dtype: none in 3.307 seconds.Memory: 3.192GB.
```
```bash
bf16-int8
batch_size: 1, torch_dtype: bf16, unet_dtype: int8 in 3.918 seconds.Memory: 2.644GB.
```
```bash
fp16-int8
batch_size: 1, torch_dtype: fp16, unet_dtype: int8 in 3.920 seconds.Memory: 2.634GB.
```
Both quantized configurations produce high-quality images with fast generation.
================================================
FILE: examples/vision/StableDiffusion/quantize_StableDiffusion.py
================================================
import argparse
import gc
import torch
import torch.utils.benchmark as benchmark
from diffusers import DiffusionPipeline
from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize
# Checkpoint identifier handed to `DiffusionPipeline.from_pretrained`.
CKPT = "runwayml/stable-diffusion-v1-5"
# Number of denoising steps (same value as the --num_inference_steps default).
NUM_INFERENCE_STEPS = 50
# Number of untimed pipeline runs executed before the timed benchmark.
WARM_UP_ITERS = 5
# Same text as the --prompt command-line default.
PROMPT = "ghibli style, a fantasy landscape with castles"
# Maps the --torch_dtype command-line choices to torch dtypes.
TORCH_DTYPES = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
# Maps the --unet_qtype command-line choices to quanto qtypes; None disables quantization.
UNET_QTYPES = {
    "fp8": qfloat8,
    "int8": qint8,
    "int4": qint4,
    "none": None,
}
def load_pipeline(torch_dtype, unet_dtype=None, device="cpu"):
    """Load the diffusion pipeline and optionally quantize its UNet weights.

    Args:
        torch_dtype: dtype forwarded to `DiffusionPipeline.from_pretrained`.
        unet_dtype: quanto qtype applied to the UNet weights; a falsy value
            leaves the UNet untouched.
        device: device the pipeline is moved to before any quantization.

    Returns:
        The loaded pipeline with its progress bar disabled.
    """
    pipeline = DiffusionPipeline.from_pretrained(
        CKPT,
        torch_dtype=torch_dtype,
        use_safetensors=True,
    )
    pipeline = pipeline.to(device)

    if unet_dtype:
        # Quantize first, then freeze to replace the float weights by quantized ones.
        quantize(pipeline.unet, weights=unet_dtype)
        freeze(pipeline.unet)

    pipeline.set_progress_bar_config(disable=True)
    return pipeline
def run_inference(pipe, batch_size=1):
    """Run one generation pass through `pipe` and discard the result.

    The prompt and the number of inference steps are read from the
    module-level `args` namespace produced by the command-line parser.

    Args:
        pipe: callable pipeline accepting `prompt`, `num_inference_steps`,
            `num_images_per_prompt` and `generator` keyword arguments.
        batch_size: number of images to generate per prompt.
    """
    pipe(
        prompt=args.prompt,
        num_inference_steps=args.num_inference_steps,
        # Honor the parameter instead of silently reading args.batch_size.
        num_images_per_prompt=batch_size,
        # Re-seed on every call so that each run performs the same work.
        generator=torch.manual_seed(0),
    )
def benchmark_fn(f, *args, **kwargs):
    """Time `f(*args, **kwargs)` with `torch.utils.benchmark`.

    Returns:
        The mean of the `blocked_autorange` measurement, formatted as a
        string with three decimals.
    """
    timer = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"args": args, "kwargs": kwargs, "f": f},
    )
    measurement = timer.blocked_autorange()
    return f"{measurement.mean:.3f}"
def bytes_to_giga_bytes(bytes):
    """Convert a byte count to GiB, formatted as a string with three decimals."""
    kib = bytes / 1024
    mib = kib / 1024
    gib = mib / 1024
    return f"{gib:.3f}"
def get_device_memory(device):
    """Return the memory currently allocated on `device`.

    The garbage collector is run and the backend cache is emptied before the
    allocation counter is read.

    Args:
        device: a `torch.device`; only its `type` is inspected.

    Returns:
        The value reported by the cuda, mps or xpu backend, or None for any
        other device type.
    """
    gc.collect()
    backend = device.type
    if backend == "cuda":
        torch.cuda.empty_cache()
        return torch.cuda.memory_allocated()
    if backend == "mps":
        torch.mps.empty_cache()
        return torch.mps.current_allocated_memory()
    if backend == "xpu":
        torch.xpu.empty_cache()
        return torch.xpu.memory_allocated()
    return None
if __name__ == "__main__":
    # Script entry point: load the (optionally quantized) pipeline, benchmark it,
    # report latency and peak memory, then generate and save one image.
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt", type=str, default="ghibli style, a fantasy landscape with castles")
    # NOTE(review): --output_path is parsed but never read below; the image is
    # always written to the working directory under `img_name`.
    parser.add_argument("--output_path", type=str, default=None)
    parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of inference steps")
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--torch_dtype", type=str, default="fp32", choices=list(TORCH_DTYPES.keys()))
    parser.add_argument("--unet_qtype", type=str, default=None, choices=list(UNET_QTYPES.keys()))
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    # `args` must stay a module-level name: run_inference() reads it.
    args = parser.parse_args()

    if args.device is None:
        # Pick the first available accelerator, falling back to the CPU.
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    pipeline = load_pipeline(
        TORCH_DTYPES[args.torch_dtype], UNET_QTYPES[args.unet_qtype] if args.unet_qtype else None, device
    )

    # Untimed warm-up runs before measuring.
    for _ in range(WARM_UP_ITERS):
        run_inference(pipeline, args.batch_size)

    elapsed = benchmark_fn(run_inference, pipeline, args.batch_size)
    if device.type == "cuda":
        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
    elif device.type == "xpu":
        memory = bytes_to_giga_bytes(torch.xpu.max_memory_allocated())  # in GBs.
    else:
        # No peak-memory counter is queried for the other backends.
        memory = 0
    # Return value intentionally unused: called for its gc / cache-emptying effect.
    get_device_memory(device)
    print(
        f"batch_size: {args.batch_size}, torch_dtype: {args.torch_dtype}, unet_dtype: {args.unet_qtype} in {elapsed} seconds."
    )
    print(f"Memory: {memory}GB.")

    img_name = f"bs@{args.batch_size}-dtype@{args.torch_dtype}-unet_dtype@{args.unet_qtype}.png"
    image = pipeline(
        prompt=args.prompt,
        # Use the requested step count so the saved image matches the benchmarked setup.
        num_inference_steps=args.num_inference_steps,
        num_images_per_prompt=args.batch_size,
    ).images[0]
    image.save(img_name)
================================================
FILE: examples/vision/StableDiffusion/requirements.txt
================================================
quanto
diffusers
torch
transformers
accelerate
wandb
================================================
FILE: examples/vision/image-classification/mnist/quantize_mnist_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import time
from tempfile import NamedTemporaryFile
import torch
import torch.nn.functional as F
from accelerate import init_empty_weights
from safetensors.torch import load_file, save_file
from torchvision import datasets, transforms
from transformers import AutoConfig, AutoModel
from optimum.quanto import (
Calibration,
QTensor,
freeze,
qfloat8,
qint4,
qint8,
quantization_map,
quantize,
requantize,
)
def test(model, device, test_loader):
    """Evaluate `model` on `test_loader` and print timing, average loss and accuracy.

    Args:
        model: module called on each batch; a `QTensor` output is dequantized.
        device: device the model and every batch are moved to.
        test_loader: iterable of `(data, target)` batches exposing `.dataset`.
    """
    model.to(device)
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        started = time.time()
        for samples, labels in test_loader:
            samples = samples.to(device)
            labels = labels.to(device)
            scores = model(samples)
            if isinstance(scores, QTensor):
                scores = scores.dequantize()
            # Accumulate the summed loss of the batch.
            loss_sum += F.nll_loss(scores, labels, reduction="sum").item()
            # Index of the highest score for each sample.
            predicted = scores.argmax(dim=1, keepdim=True)
            hits += predicted.eq(labels.view_as(predicted)).sum().item()
        finished = time.time()

    total = len(test_loader.dataset)
    loss_sum /= total

    print(
        "\nTest set evaluated in {:.2f} s: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            finished - started, loss_sum, hits, total, 100.0 * hits / total
        )
    )
def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Run one optimization pass over `train_loader`, logging the loss periodically.

    Args:
        log_interval: a progress line is printed every `log_interval` batches.
        model: module to train; a `QTensor` output is dequantized before the loss.
        device: device the model and every batch are moved to.
        train_loader: iterable of `(data, target)` batches exposing `.dataset`.
        optimizer: optimizer stepped once per batch.
        epoch: epoch number, only used in the printed progress line.
    """
    model.to(device)
    model.train()
    for step, (samples, labels) in enumerate(train_loader):
        samples = samples.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        scores = model(samples)
        if isinstance(scores, QTensor):
            scores = scores.dequantize()
        loss = F.nll_loss(scores, labels)
        loss.backward()
        optimizer.step()
        if step % log_interval != 0:
            continue
        print(
            "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epoch,
                step * len(samples),
                len(train_loader.dataset),
                100.0 * step / len(train_loader),
                loss.item(),
            )
        )
def keyword_to_itype(k):
    """Translate a command-line keyword into the matching quanto qtype.

    Args:
        k: one of "none", "int4", "int8" or "float8".

    Returns:
        The corresponding qtype, or None for "none".

    Raises:
        KeyError: if `k` is not one of the supported keywords.
    """
    qtype_by_keyword = {
        "none": None,
        "int4": qint4,
        "int8": qint8,
        "float8": qfloat8,
    }
    return qtype_by_keyword[k]
def main():
    """Quantize an MNIST classifier and evaluate it at every stage.

    The stages are: float evaluation, quantization (with activation
    calibration when requested), one tuning epoch, freezing, and finally a
    save/reload round-trip through a safetensors file.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size", type=int, default=250, metavar="N", help="input batch size for testing (default: 250)"
    )
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--model", type=str, default="dacorvo/mnist-mlp", help="The name of the trained Model.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8", "float8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8", "float8"])
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        # Pick the first available accelerator, falling back to the CPU.
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    dataset_kwargs = {"batch_size": args.batch_size}
    if torch.cuda.is_available() or torch.xpu.is_available():
        backend_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
        dataset_kwargs.update(backend_kwargs)

    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
            # Flatten each image into a vector.
            transforms.Lambda(lambda x: torch.flatten(x)),
        ]
    )
    dataset1 = datasets.MNIST("./data", train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **dataset_kwargs)
    dataset2 = datasets.MNIST("./data", train=False, download=True, transform=transform)
    test_loader = torch.utils.data.DataLoader(dataset2, **dataset_kwargs)
    model = AutoModel.from_pretrained(args.model, trust_remote_code=True)
    model.eval()
    print("Float model")
    test(model, device, test_loader)
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        # Run the evaluation loop under Calibration() before evaluating the quantized model.
        print("Calibrating ...")
        with Calibration():
            test(model, device, test_loader)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    test(model, device, test_loader)
    print("Tuning quantized model for one epoch")
    optimizer = torch.optim.Adadelta(model.parameters(), lr=0.5)
    train(50, model, device, train_loader, optimizer, 1)
    print("Quantized tuned model")
    test(model, device, test_loader)
    print("Quantized frozen model")
    freeze(model)
    test(model, device, test_loader)
    # Serialize model to a state_dict, save it to disk and reload it
    with NamedTemporaryFile() as tmp_file:
        save_file(model.state_dict(), tmp_file.name)
        state_dict = load_file(tmp_file.name)
    # Create an empty model. The previous full `from_pretrained` load was dropped:
    # its result was overwritten here before ever being used.
    config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
    with init_empty_weights():
        model_reloaded = AutoModel.from_config(config, trust_remote_code=True)
    # Requantize it using the serialized state_dict
    requantize(model_reloaded, state_dict, quantization_map(model), device)
    print("Serialized quantized model")
    test(model_reloaded, device, test_loader)


if __name__ == "__main__":
    main()
================================================
FILE: examples/vision/image-classification/pets/quantize_vit_model.py
================================================
import argparse
import time
from tempfile import NamedTemporaryFile
import torch
import torch.nn.functional as F
from accelerate import init_empty_weights
from datasets import load_dataset
from safetensors.torch import load_file, save_file
from transformers import (
ViTConfig,
ViTForImageClassification,
ViTImageProcessor,
)
from optimum.quanto import (
Calibration,
QTensor,
freeze,
qfloat8,
qint4,
qint8,
quantization_map,
quantize,
requantize,
)
def test(model, device, test_loader):
    """Evaluate `model` on `test_loader` and print timing, average loss and accuracy.

    The model output exposes raw `.logits`, so the loss is computed with
    `F.cross_entropy` (log-softmax followed by NLL). Applying `F.nll_loss`
    directly to logits does not yield a meaningful loss.

    Args:
        model: module whose output has a `.logits` attribute; a `QTensor`
            is dequantized before use.
        device: device the model and every batch are moved to.
        test_loader: iterable of dict batches with "pixel_values" and "labels"
            entries, exposing `.dataset`.
    """
    model.to(device)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        start = time.time()
        for batch in test_loader:
            data, target = batch["pixel_values"], batch["labels"]
            data, target = data.to(device), target.to(device)
            output = model(data).logits
            if isinstance(output, QTensor):
                output = output.dequantize()
            # sum up batch loss (cross-entropy, since `output` holds raw logits)
            test_loss += F.cross_entropy(output, target, reduction="sum").item()
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max logit
            correct += pred.eq(target.view_as(pred)).sum().item()
        end = time.time()

    test_loss /= len(test_loader.dataset)

    print(
        "\nTest set evaluated in {:.2f} s: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            end - start, test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )
def keyword_to_itype(k):
    """Translate a command-line keyword into the matching quanto qtype.

    Args:
        k: one of "none", "int4", "int8" or "float8".

    Returns:
        The corresponding qtype, or None for "none".

    Raises:
        KeyError: if `k` is not one of the supported keywords.
    """
    qtype_by_keyword = {
        "none": None,
        "int4": qint4,
        "int8": qint8,
        "float8": qfloat8,
    }
    return qtype_by_keyword[k]
def main():
    """Quantize a ViT image classifier and evaluate it at every stage.

    The stages are: float evaluation, quantization (with activation
    calibration when requested), freezing, and finally a save/reload
    round-trip through a safetensors file.
    """
    parser = argparse.ArgumentParser(description="ViT PETS Example")
    parser.add_argument("--model", type=str, default="super-j/vit-base-pets")
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8", "float8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8", "float8"])
    args = parser.parse_args()

    dataset_kwargs = {}

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
            cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
            dataset_kwargs.update(cuda_kwargs)
        elif all([torch.backends.mps.is_available(), args.weights != "float8", args.activations != "float8"]):
            # MPS is only auto-selected when no float8 qtype was requested.
            device = torch.device("mps")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    # load the processor and model
    model_name = args.model
    processor = ViTImageProcessor.from_pretrained(model_name)
    model = ViTForImageClassification.from_pretrained(model_name)

    def transform(data_batch):
        # Take a list of PIL images and turn them to pixel values
        inputs = processor(data_batch["image"], return_tensors="pt")

        # Don't forget to include the labels!
        inputs["labels"] = data_batch["label"]
        return inputs

    ds = load_dataset("rokmr/pets")
    prepared_ds = ds.with_transform(transform)
    test_loader = torch.utils.data.DataLoader(prepared_ds["test"], **dataset_kwargs)

    print("Model before quantization...")
    test(model, device, test_loader)
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        # Run the evaluation loop under Calibration() before evaluating the quantized model.
        print("Calibrating ...")
        with Calibration():
            test(model, device, test_loader)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    test(model, device, test_loader)
    print("Quantized frozen model")
    freeze(model)
    test(model, device, test_loader)
    # Serialize model to a state_dict, save it to disk and reload it
    with NamedTemporaryFile() as tmp_file:
        save_file(model.state_dict(), tmp_file.name)
        state_dict = load_file(tmp_file.name)
    # Create an empty model. The previous full `from_pretrained` load was dropped:
    # its result was overwritten here before ever being used.
    config = ViTConfig.from_pretrained(model_name)
    with init_empty_weights():
        # NOTE(review): this still calls from_pretrained inside init_empty_weights();
        # confirm whether building the model from `config` alone was intended.
        model_reloaded = ViTForImageClassification.from_pretrained(model_name, config=config)
    # Requantize it using the serialized state_dict
    requantize(model_reloaded, state_dict, quantization_map(model), device)
    print("Serialized quantized model")
    test(model_reloaded, device, test_loader)


if __name__ == "__main__":
    main()
================================================
FILE: examples/vision/object-detection/quantize_owl_model.py
================================================
import argparse
import gc
import numpy as np
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize
def detect(model, processor, image, texts):
    """Run text-conditioned object detection on `image` and print the detections.

    Only the predictions for the first image / first list of text queries are
    reported.

    Args:
        model: detection model; called with the processor outputs.
        processor: processor used both to build the model inputs and to
            post-process the raw outputs (score threshold 0.2).
        image: input image handed to the processor.
        texts: list of text-query lists; `texts[0]` is used to name the labels.
    """
    inputs = processor(text=texts, images=image, return_tensors="pt")
    inputs = inputs.to(model.device)

    # forward pass
    with torch.no_grad():
        outputs = model(**inputs)

    def get_preprocessed_image(pixel_values):
        # Undo the CLIP normalization and rebuild a PIL image from the model input.
        array = pixel_values.squeeze().cpu().numpy()
        std = np.array(OPENAI_CLIP_STD)[:, None, None]
        mean = np.array(OPENAI_CLIP_MEAN)[:, None, None]
        restored = (array * std) + mean
        restored = (restored * 255).astype(np.uint8)
        # Move the leading axis last before handing the array to PIL.
        restored = np.moveaxis(restored, 0, -1)
        return Image.fromarray(restored)

    # Boxes are expressed relative to the preprocessed model input, so the
    # target size is taken from that image, reversed from PIL's (width, height).
    unnormalized_image = get_preprocessed_image(inputs.pixel_values)
    target_sizes = torch.Tensor([unnormalized_image.size[::-1]])

    # Convert outputs (bounding boxes and class logits) to final boxes and scores
    results = processor.post_process_object_detection(outputs=outputs, threshold=0.2, target_sizes=target_sizes)

    # Predictions for the first image and its text queries
    first = results[0]
    text = texts[0]
    boxes = first["boxes"]
    scores = first["scores"]
    labels = first["labels"]

    if len(boxes) == 0:
        print("None of the specified labels were detected")
        return

    for box, score, label in zip(boxes, scores, labels):
        box = [round(coordinate, 2) for coordinate in box.tolist()]
        print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
def get_device_memory(device):
    """Return the memory currently allocated on `device`.

    The garbage collector is run and the backend cache is emptied before the
    allocation counter is read.

    Args:
        device: a `torch.device`; only its `type` is inspected.

    Returns:
        The value reported by the cuda, mps or xpu backend, or None for any
        other device type.
    """
    gc.collect()
    backend = device.type
    if backend == "cuda":
        torch.cuda.empty_cache()
        return torch.cuda.memory_allocated()
    if backend == "mps":
        torch.mps.empty_cache()
        return torch.mps.current_allocated_memory()
    if backend == "xpu":
        torch.xpu.empty_cache()
        return torch.xpu.memory_allocated()
    return None
def keyword_to_qtype(k):
    """Translate a command-line keyword into the matching quanto qtype.

    Args:
        k: one of "none", "int4", "int8" or "float8".

    Returns:
        The corresponding qtype, or None for "none".

    Raises:
        KeyError: if `k` is not one of the supported keywords.
    """
    qtype_by_keyword = {
        "none": None,
        "int4": qint4,
        "int8": qint8,
        "float8": qfloat8,
    }
    return qtype_by_keyword[k]
def main():
    """Load an OWLv2 detector, optionally quantize it, and detect the given texts in an image.

    The image may be a local path or an http(s) URL. Download failures are
    reported as HTTP errors instead of surfacing later as an image decoding
    error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="google/owlv2-base-patch16")
    parser.add_argument("--image", type=str, required=True)
    parser.add_argument("--texts", type=str, nargs="+", required=True)
    parser.add_argument("--weights", type=str, default="none", choices=["none", "int4", "int8", "float8"])
    parser.add_argument("--exclude-heads", action="store_true", help="Do not quantize detection heads")
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    args = parser.parse_args()

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            # MPS backend does not support torch.float64 that is required for owl models
            device = torch.device("cpu")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    processor = AutoProcessor.from_pretrained(args.model)
    model = Owlv2ForObjectDetection.from_pretrained(args.model, low_cpu_mem_usage=True).to(device)

    weights_qtype = keyword_to_qtype(args.weights)
    if weights_qtype is not None:
        if args.exclude_heads:
            # Only quantize the `owlv2` submodule, leaving the detection heads untouched.
            quantize(model.owlv2, weights=weights_qtype)
        else:
            quantize(model, weights=weights_qtype)
        freeze(model)

    memory = get_device_memory(device)
    if memory is not None:
        memory_gb = memory / 2**30
        print(f"{device.type} device memory: {memory_gb:.2f} GB.")

    image_path = args.image
    if image_path.startswith("http"):
        # Bound the wait and fail loudly on HTTP errors (e.g. 404) rather than
        # passing an error page to PIL.
        response = requests.get(args.image, stream=True, timeout=30)
        response.raise_for_status()
        image_path = response.raw
    image = Image.open(image_path)

    # The processor expects one list of text queries per image.
    texts = [args.texts]
    detect(model, processor, image, texts)


if __name__ == "__main__":
    main()
================================================
FILE: examples/vision/text-to-image/quantize_pixart_sigma.py
================================================
import argparse
import gc
import torch
from diffusers import DiffusionPipeline
from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize
# Number of denoising steps used when generating the image.
NUM_INFERENCE_STEPS = 50
# Maps the --torch_dtype command-line choices to torch dtypes.
TORCH_DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16}
# Maps the --qtype command-line choices to quanto qtypes; None disables quantization.
QTYPES = {
    "fp8": qfloat8,
    "int8": qint8,
    "int4": qint4,
    "none": None,
}
def load_pipeline(model_id, torch_dtype, qtype=None, device="cpu"):
    """Load a diffusion pipeline and optionally quantize its transformer and text encoder.

    Args:
        model_id: identifier forwarded to `DiffusionPipeline.from_pretrained`.
        torch_dtype: dtype forwarded to `DiffusionPipeline.from_pretrained`.
        qtype: quanto qtype applied to the weights; a falsy value skips
            quantization entirely.
        device: device the pipeline is moved to before any quantization.

    Returns:
        The loaded pipeline with its progress bar disabled.
    """
    pipeline = DiffusionPipeline.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        use_safetensors=True,
    )
    pipeline = pipeline.to(device)

    if qtype:
        # Quantize and freeze the transformer first, then the text encoder.
        quantize(pipeline.transformer, weights=qtype)
        freeze(pipeline.transformer)
        quantize(pipeline.text_encoder, weights=qtype)
        freeze(pipeline.text_encoder)

    pipeline.set_progress_bar_config(disable=True)
    return pipeline
def get_device_memory(device):
    """Return the memory currently allocated on `device`.

    The garbage collector is run and the backend cache is emptied before the
    allocation counter is read.

    Args:
        device: a `torch.device`; only its `type` is inspected.

    Returns:
        The value reported by the cuda, mps or xpu backend, or None for any
        other device type.
    """
    gc.collect()
    backend = device.type
    if backend == "cuda":
        torch.cuda.empty_cache()
        return torch.cuda.memory_allocated()
    if backend == "mps":
        torch.mps.empty_cache()
        return torch.mps.current_allocated_memory()
    if backend == "xpu":
        torch.xpu.empty_cache()
        return torch.xpu.memory_allocated()
    return None
if __name__ == "__main__":
    # Script entry point: load the (optionally quantized) pipeline, report the
    # device memory it occupies, then generate and save one image.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS")
    parser.add_argument("--prompt", type=str, default="ghibli style, a fantasy landscape with castles")
    parser.add_argument("--torch_dtype", type=str, default="fp16", choices=list(TORCH_DTYPES.keys()))
    parser.add_argument("--qtype", type=str, default=None, choices=list(QTYPES.keys()))
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    args = parser.parse_args()

    if args.device is None:
        # Pick the first available accelerator, falling back to the CPU.
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    # `torch.device.type` is lowercase ("cuda"), so the comparison must be too;
    # the previous "CUDA" literal could never match. The check runs before the
    # pipeline is loaded so an unsupported combination fails fast.
    if args.qtype == "int4" and device.type == "cuda":
        raise ValueError("This example does not work (yet) for int4 on CUDA")

    pipeline = load_pipeline(
        args.model_id, TORCH_DTYPES[args.torch_dtype], QTYPES[args.qtype] if args.qtype else None, device
    )

    print(f"torch_dtype: {args.torch_dtype}, qtype: {args.qtype}.")
    memory = get_device_memory(device)
    if memory is not None:
        memory_gb = memory / 2**30
        print(f"{device.type} device memory: {memory_gb:.2f} GB.")

    img_name = f"pixart-sigma-dtype@{args.torch_dtype}-qtype@{args.qtype}.png"
    image = pipeline(
        prompt=args.prompt,
        num_inference_steps=NUM_INFERENCE_STEPS,
        num_images_per_prompt=1,
        # Fixed seed so repeated runs generate the same image.
        generator=torch.manual_seed(0),
    ).images[0]
    image.save(img_name)
================================================
FILE: external/awq/conftest.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
# Device types the `device` fixture is parametrized over: always the CPU, plus
# at most one accelerator (CUDA takes precedence over MPS).
devices = ["cpu"]
if torch.cuda.is_available():
    devices += ["cuda"]
elif torch.backends.mps.is_available():
    devices += ["mps"]
@pytest.fixture(scope="module", params=devices)
def device(request):
    """Module-scoped fixture yielding one `torch.device` per entry of `devices`."""
    device_type = request.param
    return torch.device(device_type)
def pytest_configure(config):
    """Register the custom `skip_device` marker with pytest."""
    # register additional markers
    marker_doc = "skip_device(type): mark test to be skipped for the specified device type"
    config.addinivalue_line("markers", marker_doc)
def pytest_runtest_call(item):
    """Skip a test when its `device` fixture matches one of its `skip_device` marks.

    Tests that do not request the `device` fixture are left untouched.
    """
    fixture_name = "device"
    if fixture_name not in item.fixturenames:
        return

    # TODO: should be able to recover the fixture id instead of the actual value
    fixture_arg = item.funcargs[fixture_name].type

    # Collect the device types listed by every skip_device marker on the test.
    skip_marks = set()
    for mark in item.iter_markers(name=f"skip_{fixture_name}"):
        skip_marks.add(mark.args[0])

    if fixture_arg in skip_marks:
        pytest.skip(f"Test skipped for {fixture_name} {fixture_arg}")
================================================
FILE: external/awq/pack_intweight.py
================================================
# MIT License
#
# Copyright (c) 2023 MIT HAN Lab
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch
def pack_intweight(unpacked_qweight, interleave, kstride):
    """Reorder and pack a [N, K] tensor of 4-bit values into int16 words.

    The weights are permuted inside each group of 32 columns, rows are then
    interleaved by blocks of `interleave` rows and `kstride` columns, and
    finally four values are combined into each 16-bit word.

    Args:
        unpacked_qweight: 2-D integer tensor of shape [N, K]; K must be
            divisible by 32 and by `kstride`, N by `interleave`.
        interleave: number of rows interleaved together. Exactly four lanes
            are combined per word, so this is presumably always 4.
        kstride: number of columns per interleaving block.

    Returns:
        An int16 tensor of shape [N // interleave, K] on the input device.
    """
    # unpacked_qweight: [N, K]
    n_rows = unpacked_qweight.shape[0]
    n_cols = unpacked_qweight.shape[1]
    n_groups = n_cols // 32

    packed = unpacked_qweight.cpu().numpy().reshape(n_rows, n_groups, 32)
    # np.arange(32).reshape(4, 4, 2).transpose(1, 0, 2) => [0, 1, 8, 9, 16, 17, 24, 25, ...]
    packed = packed.reshape(n_rows, n_groups, 4, 4, 2).transpose(0, 1, 3, 2, 4)
    packed = packed.reshape(n_rows, n_groups, 32)

    # reorder each 8 weights for fast dequantization
    # [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7]
    packed = packed.reshape(n_rows, n_groups, 4, 4, 2).transpose(0, 1, 2, 4, 3)
    packed = packed.reshape(n_rows, n_cols)

    # interleaving every `interleave` rows
    row_blocks = n_rows // interleave
    col_blocks = n_cols // kstride
    packed = packed.reshape(row_blocks, interleave, col_blocks, kstride)
    packed = packed.transpose(0, 2, 1, 3)
    packed = packed.reshape(row_blocks, col_blocks, kstride, interleave)

    # Combine the four lanes of the last axis into one word, 4 bits apart.
    lane0 = packed[..., 0]
    lane1 = packed[..., 1] << 4
    lane2 = packed[..., 2] << 8
    lane3 = packed[..., 3] << 12
    packed = lane0 | lane1 | lane2 | lane3

    # reshape to (N // interleave, K) and store as 16-bit words
    packed = packed.reshape(row_blocks, n_cols)
    qweight = torch.tensor(packed.astype("int16"))
    return qweight.to(unpacked_qweight.device).contiguous()
================================================
FILE: external/awq/packing_utils.py
================================================
import torch
# Position -> source-index permutation applied within each group of 8 values.
AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
# Inverse permutation of AWQ_ORDER, used to restore the natural order.
AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]
def pack_awq(intweight: torch.Tensor, reorder=False):
    """Pack 4-bit values column-wise, eight per int32 word.

    Args:
        intweight: 2-D integer tensor; every group of 8 consecutive columns
            is packed into one output column (trailing columns that do not
            fill a whole group are ignored).
        reorder: when True, the values of each group are packed in the
            [0, 2, 4, 6, 1, 3, 5, 7] order instead of their natural order.

    Returns:
        An int32 tensor of shape [rows, columns // 8] on the input device.
    """
    bits = 4
    pack_num = 32 // bits
    n_packed_cols = intweight.shape[1] // pack_num
    qweight = torch.zeros(intweight.shape[0], n_packed_cols, dtype=torch.int32, device=intweight.device)
    # The permutation does not depend on the column: build it once, outside the loop.
    if reorder:
        order_map = [0, 2, 4, 6, 1, 3, 5, 7]
    else:
        order_map = [0, 1, 2, 3, 4, 5, 6, 7]
    for col in range(n_packed_cols):
        for i in range(pack_num):
            qweight_col = intweight[:, col * pack_num + order_map[i]]
            # The i-th selected value lands in bits [4 * i, 4 * i + 3] of the word.
            qweight[:, col] |= qweight_col << (i * bits)
    return qweight
def unpack_awq(qweight: torch.Tensor, bits: int):
    """Unpack column-wise packed words into one int8 value per packed field.

    Each word is right-shifted by 0, `bits`, 2 * `bits`, ... and cast to int8.
    The result is NOT masked to `bits` bits: callers are expected to apply
    the mask themselves.

    Args:
        qweight: 2-D tensor of packed words.
        bits: width in bits of each packed value.

    Returns:
        An int8 tensor with the same number of rows and `32 // bits` times
        as many columns as `qweight`.
    """
    bit_offsets = torch.arange(0, 32, bits, device=qweight.device)

    # unpacking columnwise: broadcast every word against all bit offsets
    shifted = torch.bitwise_right_shift(qweight[:, :, None], bit_offsets[None, None, :])
    unpacked = shifted.to(torch.int8)  # smallest dtype available

    n_rows = unpacked.shape[0]
    return unpacked.view(n_rows, -1)
def reverse_awq_order(iweights: torch.Tensor, bits: int):
    """Undo the AWQ column permutation on unpacked values.

    Columns are processed in groups of `32 // bits` and reordered inside each
    group according to `AWQ_REVERSE_ORDER`.

    Args:
        iweights: 2-D tensor of unpacked values; the number of columns must
            be a multiple of `32 // bits`.
        bits: width in bits of each packed value.

    Returns:
        A tensor of the same shape with its columns permuted.
    """
    n_cols = iweights.shape[-1]
    group_size = 32 // bits

    # Build the permuted list of column indices, one group at a time.
    column_index = torch.arange(n_cols, dtype=torch.int32, device=iweights.device)
    column_index = column_index.view(-1, group_size)
    column_index = column_index[:, AWQ_REVERSE_ORDER]
    column_index = column_index.view(-1)

    return iweights[:, column_index]
def pack_exllama(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
    """Pack unpacked weights row-wise and unpacked zero-points column-wise.

    Args:
        iweights: 2-D tensor of unpacked weights; the number of rows must be
            a multiple of `32 // bits`.
        izeros: 2-D tensor of unpacked zero-points; the number of columns
            must be a multiple of `32 // bits`.
        bits: width in bits of each value.

    Returns:
        A `(qweight, qzeros)` tuple of int32 tensors.
    """
    per_word = 32 // bits
    bit_offsets = torch.arange(0, 32, bits, device=iweights.device)

    # packing rowwise: every `per_word` consecutive rows end up in one word
    grouped_weights = iweights.view(iweights.shape[0] // per_word, per_word, -1)
    shifted_weights = torch.bitwise_left_shift(grouped_weights, bit_offsets[None, :, None])
    qweight = shifted_weights.sum(dim=1).to(torch.int32)

    # packing columnwise: every `per_word` consecutive columns end up in one word
    grouped_zeros = izeros.view(-1, izeros.shape[1] // per_word, per_word)
    shifted_zeros = torch.bitwise_left_shift(grouped_zeros, bit_offsets[None, None, :])
    qzeros = shifted_zeros.sum(dim=-1).to(torch.int32)

    return qweight, qzeros
def unpack_reorder_pack(qweight, qzeros, bits):
    """Convert AWQ-packed weights and zero-points to the exllama packing.

    `unpack_awq` and `reverse_awq_order` each process a single tensor, so
    they are applied separately to the weights and to the zero-points.

    Args:
        qweight: AWQ-packed weights.
        qzeros: AWQ-packed zero-points.
        bits: width in bits of each packed value.

    Returns:
        A `(qweight, qzeros)` tuple repacked by `pack_exllama`.
    """
    # Unpack the qweight and qzeros tensors, then restore their natural order
    iweight = reverse_awq_order(unpack_awq(qweight, bits), bits)
    izeros = reverse_awq_order(unpack_awq(qzeros, bits), bits)
    # overflow checks: unpacking does not mask, so keep only the low `bits` bits
    iweight = torch.bitwise_and(iweight, (2**bits) - 1)
    izeros = torch.bitwise_and(izeros, (2**bits) - 1)
    # Subtract 1 from the izeros tensor (exllama adds 1 during inference)
    # We can remove it if we remove the +1 in the exllama code
    izeros = izeros - 1
    # Pack the qweight and qzeros tensors
    qweight, qzeros = pack_exllama(iweight, izeros, bits)
    return qweight, qzeros
def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
    """Dequantize AWQ-packed weights to floating point.

    `unpack_awq` and `reverse_awq_order` each process a single tensor, so
    they are applied separately to the weights and to the zero-points.

    Args:
        qweight: AWQ-packed weights.
        qzeros: AWQ-packed zero-points, one row per group.
        scales: scales, one row per group. Presumably float16, per the
            "fp16 weights" step below; verify against callers.
        bits: width in bits of each packed value.
        group_size: number of consecutive weight rows sharing one row of
            scales and zero-points.

    Returns:
        The dequantized weights, `(iweight - izeros) * scales`.
    """
    # Unpack the qweight and qzeros tensors, then restore their natural order
    iweight = reverse_awq_order(unpack_awq(qweight, bits), bits)
    izeros = reverse_awq_order(unpack_awq(qzeros, bits), bits)
    # overflow checks: unpacking does not mask, so keep only the low `bits` bits
    iweight = torch.bitwise_and(iweight, (2**bits) - 1)
    izeros = torch.bitwise_and(izeros, (2**bits) - 1)
    # fp16 weights: expand per-group scales and zero-points to one row per weight row
    scales = scales.repeat_interleave(group_size, dim=0)
    izeros = izeros.repeat_interleave(group_size, dim=0)
    iweight = (iweight - izeros) * scales
    return iweight
================================================
FILE: external/awq/test_awq_kernels.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from pack import pack_awq
from optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup
def assert_similar(a, b, atol=None, rtol=None):
    """Check that the cosine similarity of the flattened inputs is close to 1.0.

    Args:
        a, b: tensors with identical dtype and shape.
        atol: absolute tolerance; defaults to the resolution of `a.dtype`.
        rtol: relative tolerance; defaults to a per-dtype value defined for
            float32, float16 and bfloat16.

    Raises:
        AssertionError: if dtypes or shapes differ.
        ValueError: if the similarity is not within tolerance of 1.0.
    """
    assert a.dtype == b.dtype
    assert a.shape == b.shape
    if atol is None:
        # We use torch finfo resolution
        atol = torch.finfo(a.dtype).resolution
    if rtol is None:
        # Please refer to that discussion for default rtol values based on the float type:
        # https://scicomp.stackexchange.com/questions/43111/float-equality-tolerance-for-single-and-half-precision
        default_rtol = {torch.float32: 1e-5, torch.float16: 1e-3, torch.bfloat16: 1e-1}
        rtol = default_rtol[a.dtype]
    sim = torch.nn.functional.cosine_similarity(a.flatten(), b.flatten(), dim=0)
    expected = torch.tensor(1.0, dtype=sim.dtype)
    if torch.allclose(sim, expected, atol=atol, rtol=rtol):
        return
    max_deviation = torch.min(sim)
    raise ValueError(f"Alignment {max_deviation:.8f} deviates too much from 1.0 with atol={atol}, rtol={rtol}")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features, out_features", [(256, 256), (512, 256)])
@pytest.mark.parametrize("kernel", ["gemv", "gemm"])
def test_standalone_kernel(in_features, out_features, kernel):
    """Compare the quanto gemv / gemm kernels with a float16 `torch.matmul`.

    Random 4-bit weights, scales and zero-points are generated, packed for the
    kernel, and the kernel output is checked against a matmul on the
    dequantized weights.
    """
    bits = 4
    group_size = 128  # Hard-coded in kernels
    interleave = 4  # Hard-coded in kernels
    kstride = 64  # Hard-coded in kernels
    device = torch.device("cuda")
    if kernel == "gemv":
        batch_size, tokens = 4, 1
    else:
        batch_size, tokens = 10, 128
    input_shape = (batch_size, tokens, in_features)
    # FIXME: does not work if inputs are negative !!??
    inputs = torch.rand(input_shape, dtype=torch.float16, device=device)
    data_qmax = 2**bits
    other_shape = (out_features, in_features)
    other_data = torch.randint(0, data_qmax, other_shape, dtype=torch.uint8, device=device)
    packed_other_data = pack_awq(other_data.to(torch.int32), interleave=interleave, kstride=kstride)
    # The GEMM kernel works on transposed scales
    scales_shape = (in_features // group_size, out_features)
    other_scales = torch.rand(scales_shape, dtype=torch.float16, device=device) / data_qmax
    # The GEMM kernel works on transposed, negated and scaled zeropoints
    zp_min = -(2 ** (bits - 1))
    zp_max = 2 ** (bits - 1)
    other_zeropoints = torch.randint(zp_min, zp_max, scales_shape, dtype=torch.int8, device=device)
    # Negate and scale
    other_scaled_zeropoints = -other_zeropoints * other_scales
    # Evaluate mm outputs using the selected kernel (both take the same arguments)
    kernel_op = torch.ops.quanto.gemv if kernel == "gemv" else torch.ops.quanto.gemm
    awq_outputs = kernel_op(
        inputs,
        packed_other_data,
        other_scales,
        other_scaled_zeropoints,
        rows=inputs.numel() // inputs.shape[-1],
        out_cols=out_features,
        in_cols=in_features,
        bits=4,
        group_size=group_size,
    )
    # Transpose other data and reshape it to align it with transposed scales and zeros
    other_data_t = other_data.t().reshape(group_size, in_features // group_size, out_features)
    # Dequantize transposed other
    other_t = (other_data_t - other_zeropoints) * other_scales
    # Reshape it as expected by the matmul
    other_t = other_t.reshape(in_features, out_features)
    # Evaluate the matrix multiplication using pytorch float16 mm
    pt_outputs = torch.matmul(inputs, other_t)
    # Verify the results are similar
    assert_similar(awq_outputs, pt_outputs, rtol=5e-3)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features, out_features", [(256, 256), (512, 256)])
@pytest.mark.parametrize("kernel", ["gemm", "gemv"])
def test_integrated_kernel(in_features, out_features, kernel):
    """Compare the AWQ `gemm`/`gemv` kernels against a matmul on quanto-quantized weights.

    Random float16 weights are quantized to 4-bit with quanto, then repacked
    (data, scale and zeropoint) into the layout passed to the AWQ kernels. The
    kernel outputs must be similar to the outputs of a matmul evaluated
    directly on the quanto quantized tensor.
    """
    # The three values below are hard-coded in the kernels
    group_size = 128
    interleave = 4
    kstride = 64
    device = torch.device('cuda')
    if kernel == "gemv":
        batch_size, tokens = 4, 1
    else:
        batch_size, tokens = 10, 128
    # Inputs and weights are uniformly sampled in [-1, 1)
    inputs = torch.rand((batch_size, tokens, in_features), dtype=torch.float16, device=device) * 2 - 1
    other_shape = (out_features, in_features)
    other = torch.rand(other_shape, dtype=torch.float16, device=device) * 2 - 1
    # Quantize using quanto
    scale, zeropoint = MaxOptimizer()(other, bits=4, axis=0, group_size=group_size)
    quanto_base = AffineQuantizer.apply(other, qint4, 0, group_size, scale, zeropoint)
    # Reference outputs: matmul on the quanto quantized weights
    quanto_outputs = torch.matmul(inputs, quanto_base.t())
    # Extract quantized data, unpack and ungroup to recover original shape
    quanto_data = ungroup(quanto_base._data.unpack(), axis=0, orig_shape=other_shape)
    # Pack data for AWQ kernel
    awq_data = pack_awq(quanto_data.to(torch.int32), interleave=interleave, kstride=kstride)
    # Scale and zeropoint are reshaped and transposed as expected by the AWQ kernel
    # (! buffers must be contiguous)
    grouped_shape = (out_features, in_features // group_size)
    awq_scale = scale.reshape(grouped_shape).t().contiguous()
    awq_zeropoint = zeropoint.reshape(grouped_shape).t().contiguous()
    # Negate and rescale
    awq_scaled_zeropoint = - awq_zeropoint * awq_scale
    # Both AWQ kernels share the same signature: only the operator differs
    awq_op = torch.ops.quanto.gemv if kernel == "gemv" else torch.ops.quanto.gemm
    awq_outputs = awq_op(inputs,
                         awq_data,
                         awq_scale,
                         awq_scaled_zeropoint,
                         rows=inputs.numel() // inputs.shape[-1],
                         out_cols=out_features,
                         in_cols=in_features,
                         bits=4,
                         group_size=group_size)
    # Verify the results are similar
    assert_similar(awq_outputs, quanto_outputs, rtol=5e-3)
================================================
FILE: external/awq/test_awq_packing.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pytest
import torch
from pack_intweight import pack_intweight
from packing_utils import pack_awq, reverse_awq_order, unpack_awq
from optimum.quanto import AWQPackedTensor, AWQPacking
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("out_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("reorder", [True, False])
@pytest.mark.parametrize("random", [True, False])
def test_awq_pack(in_features, out_features, reorder, random):
    """This test verifies two things:

    - that we are able to replicate awq packing,
    - that we can unpack awq packed tensors and recover the original tensor.

    Args:
        in_features: number of columns of the tensor to pack.
        out_features: number of rows of the tensor to pack.
        reorder: forwarded to both the AWQ and the quanto packing functions.
        random: if True the tensor holds random 4-bit values, otherwise it
            holds the sequence 0, 1, 2, ... taken modulo 2**bits.
    """
    bits = 4
    qmax = 2**bits
    shape = (out_features, in_features)
    device = torch.device('cuda')
    if random:
        t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device)
    else:
        numel = np.prod(shape)
        t = torch.tensor(range(numel), dtype=torch.int32)
        t = (t % qmax).reshape(shape).to(torch.uint8).to(device)
    packed = pack_awq(t.to(torch.int32), reorder=reorder)
    # Sanity check: verify we can recover the Tensor using AWQ unpacking
    unpacked = unpack_awq(packed, bits=bits)
    if reorder:
        unpacked = reverse_awq_order(unpacked, bits=bits)
    # Only keep the `bits` lowest bits of each unpacked value
    unpacked = torch.bitwise_and(unpacked, qmax - 1)
    assert torch.equal(t, unpacked)
    # Compare with quanto packing
    repacked = AWQPackedTensor.pack(t, packing=AWQPacking.V1, reorder=reorder)
    assert torch.equal(packed, repacked._data)
    unpacked = repacked.unpack()
    assert torch.equal(unpacked, t)
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("out_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("random", [True, False])
def test_awq_pack_v2(in_features, out_features, random):
    """Check the quanto AWQ V2 packing against `pack_intweight`.

    Two properties are verified:
    - the quanto packed data is equal to the data packed by `pack_intweight`,
    - unpacking the quanto packed tensor gives back the original tensor.
    """
    bits = 4
    interleave = 4
    kstride = 64
    qmax = 2**bits
    shape = (out_features, in_features)
    device = torch.device('cuda')
    if not random:
        # Deterministic content: 0, 1, 2, ... wrapped to the 4-bit range
        numel = np.prod(shape)
        sequence = torch.tensor(range(numel), dtype=torch.int32)
        t = (sequence % qmax).reshape(shape).to(torch.uint8).to(device)
    else:
        t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device)
    # Reference packing
    expected = pack_intweight(t.to(torch.int32), interleave=interleave, kstride=kstride)
    # Compare with quanto packing
    repacked = AWQPackedTensor.pack(t, packing=AWQPacking.V2)
    assert torch.equal(expected, repacked._data)
    # Round-trip: unpacking must recover the original values
    assert torch.equal(repacked.unpack(), t)
================================================
FILE: external/awq/test_awq_quantize.py
================================================
import pytest
import torch
from optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup
def awq_quantize(base, scales, zeros, group_size):
    """Quantize a 2D tensor column by column, following the AutoAWQ reference code.

    Args:
        base: the 2D tensor to quantize; groups are formed along its last dimension.
        scales: per-row scales, with one column for each group of `group_size` columns of `base`.
        zeros: zero-points, multiplied element-wise with `scales`.
        group_size: the number of consecutive columns of `base` sharing a scale and zero-point.

    Returns:
        A `torch.uint8` tensor holding one quantized column per column of `base`.
    """
    _, in_features = base.shape
    scale_zeros = scales * zeros

    def quantize_column(idx):
        # From https://github.com/casper-hansen/AutoAWQ/blob/main/awq/modules/linear/gemv_fast.py#L165
        group = idx // group_size
        shifted = base[:, idx] + scale_zeros[:, group]
        return torch.round(shifted / scales[:, group]).to(torch.uint8)[:, None]

    return torch.cat([quantize_column(idx) for idx in range(in_features)], dim=1)
@pytest.mark.parametrize("in_features, out_features", [(256, 512), (1024, 1024)])
def test_awq_quantize(in_features, out_features):
    """Verify that AWQ quantization is equivalent to quanto affine quantization.

    The same random float16 tensor is quantized with quanto and with the AWQ
    reference code (using the quanto scale and zeropoint), and the proportion
    of differing quantized values must stay below a small threshold.
    """
    shape = (out_features, in_features)
    base = torch.rand(shape, dtype=torch.float16)
    group_size = 128
    # Quantize using quanto
    scale, zeropoint = MaxOptimizer()(base, bits=4, axis=0, group_size=group_size)
    quanto_base = AffineQuantizer.apply(base, qint4, 0, group_size, scale, zeropoint)
    # Extract quantized data, unpack and ungroup to recover original shape
    quanto_data = ungroup(quanto_base._data.unpack(), axis=0, orig_shape=shape)
    # Reshape scale and zeropoint as expected by awq
    awq_shape = (out_features, in_features // group_size)
    scale = scale.reshape(awq_shape)
    zeropoint = zeropoint.reshape(awq_shape)
    # Compare with awq quantization
    awq_data = awq_quantize(base, scale, zeropoint, group_size)
    # FIX: AWQ does not clamp values before packing
    qmax = 2 ** 4 - 1
    awq_data = torch.clamp(awq_data, 0, qmax)
    mismatches = quanto_data != awq_data
    n = torch.sum(mismatches).item()
    # `rate` is a fraction in [0, 1]: it is formatted as a percentage for display only
    rate = n / base.numel()
    print(f"Mismatches: {n}/{base.numel()} ({rate:.6%})")
    # Extract mismatches
    display = 10
    quanto_values = torch.masked_select(quanto_data, mismatches)[:display]
    awq_values = torch.masked_select(awq_data, mismatches)[:display]
    print(f"First {display} mismatches")
    print(quanto_values.tolist())
    print(awq_values.tolist())
    # Due to a slightly different order of operations (zero is multiplied by scale before subtracting it),
    # there are some mismatches
    assert rate < 5e-4
================================================
FILE: external/smoothquant/README.md
================================================
# SmoothQuant original conversion script
This converts an OPT, Bloom, Llama or Mistral [🤗 transformers](https://github.com/huggingface/transformers) model to a "smoothed" version, as described in
[SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438).
```bash
$ python smoothquant.py --model facebook/opt-1.3b --save-path smoothed-models/facebook/opt-1.3b
```
Note: due to hard-coded assumptions on the model architecture in the script, OPT models are only supported when they apply the layer_norm
before the attention (`do_layer_norm_before=true` in `config.json`). This means all OPT models except `facebook/opt-350m`.
================================================
FILE: external/smoothquant/smoothquant.py
================================================
import argparse
import functools
import os
import torch
import torch.nn as nn
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.bloom.modeling_bloom import BloomBlock
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralRMSNorm
from transformers.models.opt.modeling_opt import OPTDecoderLayer
def get_act_scales(model, tokenizer, dataset, num_samples=512, seq_len=512):
    """Collect the per-channel absolute maximum of the inputs of every `nn.Linear` of a model.

    A forward hook is registered on each `nn.Linear` module, then the model is
    evaluated on the first `num_samples` samples of the dataset. The hooks are
    always removed before returning, even if a forward pass fails.

    Args:
        model: the model to evaluate. It is switched to eval mode.
        tokenizer: a callable converting a text to an object exposing `input_ids`.
        dataset: an indexable collection of samples exposing a "text" entry.
        num_samples (`int`): the number of samples to evaluate.
        seq_len (`int`): the maximum length passed to the tokenizer (inputs are truncated).

    Returns:
        A dict mapping each `nn.Linear` module name to a float32 CPU tensor with
        one value per input channel (the last dimension of the module inputs).
    """
    model.eval()
    device = next(model.parameters()).device
    act_scales = {}

    def stat_tensor(name, tensor):
        hidden_dim = tensor.shape[-1]
        # reshape (instead of view) also accepts non-contiguous activations
        tensor = tensor.reshape(-1, hidden_dim).abs().detach()
        coming_max = torch.max(tensor, dim=0)[0].float().cpu()
        if name in act_scales:
            # Keep a running maximum over all evaluated samples
            act_scales[name] = torch.max(act_scales[name], coming_max)
        else:
            act_scales[name] = coming_max

    def stat_input_hook(m, x, y, name):
        # Forward hooks receive the module inputs as a tuple: only the first one is tracked
        if isinstance(x, tuple):
            x = x[0]
        stat_tensor(name, x)

    hooks = [
        m.register_forward_hook(functools.partial(stat_input_hook, name=name))
        for name, m in model.named_modules()
        if isinstance(m, nn.Linear)
    ]
    try:
        # Only statistics are collected: do not record the autograd graph
        with torch.no_grad():
            for i in tqdm(range(num_samples)):
                input_ids = tokenizer(
                    dataset[i]["text"], return_tensors="pt", max_length=seq_len, truncation=True
                ).input_ids.to(device)
                model(input_ids)
    finally:
        for h in hooks:
            h.remove()
    return act_scales
@torch.no_grad()
def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5):
    """Move a per-channel scaling from a normalization layer to the linear layers it feeds.

    The normalization weight (and bias, if it has one) is divided in-place by
    the smoothing scales, and the weights of each linear layer are multiplied
    in-place by the same scales along their input features.

    Args:
        ln: the normalization layer (`nn.LayerNorm`, `LlamaRMSNorm` or `MistralRMSNorm`).
        fcs: a `nn.Linear` or a list of `nn.Linear` fed by `ln`.
        act_scales: a tensor with one value per input feature of the linear layers.
        alpha (`float`): exponent applied to the activation scales; the weight scales use `1 - alpha`.
    """
    linears = fcs if isinstance(fcs, list) else [fcs]
    assert isinstance(ln, (nn.LayerNorm, LlamaRMSNorm, MistralRMSNorm))
    for linear in linears:
        assert isinstance(linear, nn.Linear)
        assert ln.weight.numel() == linear.in_features == act_scales.numel()
    # Scales are evaluated on the device and dtype of the first linear weights
    reference = linears[0].weight
    device, dtype = reference.device, reference.dtype
    act_scales = act_scales.to(device=device, dtype=dtype)
    # Maximum absolute weight per input feature, across all linear layers
    per_linear_max = [linear.weight.abs().max(dim=0, keepdim=True)[0] for linear in linears]
    weight_scales = torch.cat(per_linear_max, dim=0)
    weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5)
    scales = (act_scales.pow(alpha) / weight_scales.pow(1 - alpha)).clamp(min=1e-5).to(device).to(dtype)
    ln.weight.div_(scales)
    bias = getattr(ln, 'bias', None)
    if bias is not None:
        bias.div_(scales)
    for linear in linears:
        linear.weight.mul_(scales.view(1, -1))
@torch.no_grad()
def smooth_lm(model, scales, alpha=0.5):
    """Smooth in-place every supported decoder block of a model.

    For each OPT, Bloom, Llama or Mistral decoder block, the attention
    projections and the first MLP projection(s) are smoothed together with the
    normalization layer that precedes them.

    Args:
        model: the model to smooth.
        scales: a dict of activation scales indexed by linear module name.
        alpha (`float`): the smoothing exponent forwarded to `smooth_ln_fcs`.
    """
    for name, module in model.named_modules():
        if isinstance(module, OPTDecoderLayer):
            attn = module.self_attn
            smooth_ln_fcs(
                module.self_attn_layer_norm,
                [attn.q_proj, attn.k_proj, attn.v_proj],
                scales[name + ".self_attn.q_proj"],
                alpha,
            )
            smooth_ln_fcs(module.final_layer_norm, module.fc1, scales[name + ".fc1"], alpha)
        elif isinstance(module, BloomBlock):
            # Bloom uses a single fused query/key/value projection
            smooth_ln_fcs(
                module.input_layernorm,
                module.self_attention.query_key_value,
                scales[name + ".self_attention.query_key_value"],
                alpha,
            )
            smooth_ln_fcs(
                module.post_attention_layernorm,
                module.mlp.dense_h_to_4h,
                scales[name + ".mlp.dense_h_to_4h"],
                alpha,
            )
        elif isinstance(module, (LlamaDecoderLayer, MistralDecoderLayer)):
            attn = module.self_attn
            smooth_ln_fcs(
                module.input_layernorm,
                [attn.q_proj, attn.k_proj, attn.v_proj],
                scales[name + ".self_attn.q_proj"],
                alpha,
            )
            smooth_ln_fcs(
                module.post_attention_layernorm,
                [module.mlp.gate_proj, module.mlp.up_proj],
                scales[name + ".mlp.gate_proj"],
                alpha,
            )
def main():
    """Smooth a causal language model from the command line and save it.

    The model and its tokenizer are loaded, activation scales are collected on
    samples of the lambada validation split, the model is smoothed in-place,
    then the model and tokenizer are saved to the selected path.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="facebook/opt-125m", help="model name")
    parser.add_argument("--save-path", type=str, default=None, help="smoothed model model save path")
    parser.add_argument("--num-samples", type=int, default=512)
    parser.add_argument("--seq-len", type=int, default=512)
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    args = parser.parse_args()

    # An explicit device takes precedence, otherwise prefer cuda, then mps, then cpu
    if args.device is not None:
        device = torch.device(args.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    else:
        device = torch.device("cpu")

    dataset = load_dataset("lambada", split=f"validation[:{args.num_samples}]").shuffle()
    tokenizer = AutoTokenizer.from_pretrained(args.model, model_max_length=args.seq_len)
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype="auto").to(device)

    act_scales = get_act_scales(model, tokenizer, dataset, args.num_samples, args.seq_len)
    smooth_lm(model, act_scales, 0.5)

    # Without an explicit save path, store the model under a directory named after it
    if args.save_path is not None:
        save_path = args.save_path
    else:
        save_path = os.path.join("smoothed_models", args.model)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)


if __name__ == "__main__":
    main()
================================================
FILE: optimum/quanto/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Version string of the package
__version__ = "0.2.7dev"
from .calibrate import *
from .library import *
from .models import *
from .nn import *
from .quantize import *
from .tensor import *
================================================
FILE: optimum/quanto/calibrate.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
from torch.nn.modules.module import (
register_module_forward_hook,
register_module_forward_pre_hook,
)
from torch.overrides import TorchFunctionMode
from .nn import QModuleMixin
from .tensor import ActivationQBytesTensor, QTensor, axis_to_dim, dtype_info, qint8, qtype
# Names exported by a star-import of this module
__all__ = ["Calibration", "absmax_scale"]
def _updated_scale(scale, new_scale, momentum):
if torch.all(scale == 1):
return new_scale
return momentum * scale + new_scale * (1.0 - momentum)
def absmax_scale(base: torch.Tensor, qtype: qtype = qint8, axis: Optional[int] = None) -> torch.Tensor:
    """Evaluate the quantization scale using the absmax algorithm.

    Absolute Maximum quantization is a symmetrical quantization algorithm: the
    scale is the maximum absolute value of the base divided by the highest
    value that can be represented by the target qtype.

    Args:
        base (`torch.Tensor`): the base tensor on which the scale will be applied.
        qtype (`quanto.qtype`): the target qtype for quantization.
        axis (`int`): the index of the axis to preserve, or -1 for the last one.
            Defaults to None to reduce all axis.

    Returns:
        `torch.Tensor`: a scale tensor of the same dtype as the base.
    """
    magnitudes = torch.abs(base)
    if axis is not None:
        # Reduce every dimension but the preserved axis, keeping the rank unchanged
        qranges = torch.amax(magnitudes, dim=axis_to_dim(magnitudes, axis), keepdim=True)
    else:
        # Per-tensor scale: a single maximum over all values
        qranges = torch.max(magnitudes)
    return qranges / dtype_info(qtype.dtype).max
class Calibration(TorchFunctionMode):
    """A custom torch function mode to calibrate quantized modules.

    In order to improve the accuracy of the quantized activations, the input and output
    scales of each quantized module are evaluated per-batch using the absmax algorithm and aggregated using a
    momentum.

    The function mode also tracks the calls to each torch function down the model graph, and applies optional
    optimizations:
    - streamline: do not quantize activations that are immediately consumed by an incompatible function (like `add` or `silu`).

    Args:
        momentum (`float`): the momentum to use when updating scales.
        streamline (`bool`): if True, avoid quantizing activations when they are consumed by an incompatible function. Defaults to True.
        debug (`bool`): provide very verbose feedback on the console during calibration.
    """

    def __init__(self, *args, momentum: float = 0.9, streamline=True, debug=False, **kwargs):
        super().__init__(*args, **kwargs)
        self.momentum = momentum
        self.streamline = streamline
        if streamline:
            # For each tagged module, whether one of its consumers produced quantized activations
            self.modules_qactivations = {}
            # Handles of the per-module hooks tagging the outputs of quantized modules
            self.streamline_hooks = {}
        self.debug = debug

    def __torch_function__(self, func, types, args=(), kwargs=None):
        """Evaluate a torch function and record if it requires quantized activations.

        The function is always evaluated normally. In streamline mode, when one
        of the argument types is a QTensor, each argument tagged with a source
        module updates the record of that module according to the output type.
        """
        kwargs = kwargs if kwargs is not None else {}
        qinput = QTensor in types
        output = func(*args, **kwargs)
        if self.streamline and qinput:
            for arg in args:
                module = getattr(arg, "src_module", None)
                if module is None:
                    continue
                if isinstance(output, ActivationQBytesTensor):
                    # Quantized activations are required for that module
                    self.modules_qactivations[module] = True
                elif isinstance(output, torch.Tensor):
                    # Quantized activations are not required for that module unless another function requires them
                    self.modules_qactivations.setdefault(module, False)
        return output

    def __enter__(self):
        """Activate the mode and register the global calibration hooks.

        Returns:
            The calibration object itself, so that it can be bound by `with ... as`.
        """
        super().__enter__()
        self.pre_handle = register_module_forward_pre_hook(self.calibrate_input)
        self.post_handle = register_module_forward_hook(self.calibrate_output)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Deactivate the mode and remove every hook registered during calibration."""
        try:
            super().__exit__(exc_type, exc_val, exc_tb)
        finally:
            # Hooks are global or attached to user modules: always remove them
            self.pre_handle.remove()
            self.post_handle.remove()
            if self.streamline:
                for handle in self.streamline_hooks.values():
                    handle.remove()
                # Forget removed handles so that hooks are registered again if the mode is re-entered
                self.streamline_hooks.clear()

    def calibrate_input(self, module: torch.nn.Module, input, momentum: Optional[float] = None):
        """Calibrate a module input scale

        This is registered as a global hook that is called before any module forward pre hook.

        Args:
            module (`torch.nn.Module`): the module about to be evaluated.
            input: the tuple of positional inputs of the module.
            momentum (`float`): the momentum used to update the input scale.
                Defaults to None to use the momentum of the calibration object.

        Returns:
            The first input for quantized modules with quantized activations, None otherwise.
        """
        if momentum is None:
            # When called as a hook no momentum is passed: use the configured one
            momentum = self.momentum
        if isinstance(module, QModuleMixin) and module.activation_qtype is not None:
            # NOTE(review): only the first positional input is considered and returned - confirm
            # that modules calibrated this way never receive more than one positional input.
            input = input[0]
            if isinstance(input, ActivationQBytesTensor):
                # Just adopt the maximum scale of the input
                module.input_scale = torch.max(input._scale)
            else:
                # Evaluate the best scale
                input_scale = absmax_scale(input, module.activation_qtype)
                module.input_scale = _updated_scale(module.input_scale, input_scale, momentum)
            if self.streamline and module not in self.streamline_hooks:
                # Add a hook to tag the module outputs (after the module quantization hook in QModuleMixin)
                self.streamline_hooks[module] = module.register_forward_hook(self.tag_outputs)
            return input

    def calibrate_output(
        self,
        module: torch.nn.Module,
        input: torch.Tensor,
        output: torch.Tensor,
    ):
        """Calibrate a module output scale

        This is registered as a global hook that is called before any module forward hook.

        When the module is a QModuleMixin, its outputs are not quantized yet because they
        are only quantized in the QModuleMixin.quantize_output forward hook.

        For any other module, the direct children are inspected instead: in streamline mode,
        output quantization is disabled for quantized children whose outputs were never
        recorded as requiring quantized activations, and in debug mode the activation
        quantization status of each quantized child is printed.
        """
        if isinstance(module, (QModuleMixin)) and module.activation_qtype is not None:
            # Evaluate the optimal scale per-tensor and update output scale
            output_scale = absmax_scale(output, module.activation_qtype, axis=None)
            module.output_scale = _updated_scale(module.output_scale, output_scale, self.momentum)
            return output
        else:
            if self.streamline:
                for name, child in module.named_children():
                    if isinstance(child, QModuleMixin) and child.activation_qtype is not None:
                        qactivations_required = self.modules_qactivations.get(child, False)
                        if not qactivations_required:
                            # Disable output quantization for this child as its outputs are only consumed by incompatible functions.
                            child.disable_output_quantization()
            if self.debug:
                for name, child in module.named_children():
                    if isinstance(child, QModuleMixin):
                        classname = child.__class__.__name__
                        trace = f"{name}({classname}) activations are"
                        if child.activation_qtype is None:
                            trace += " not quantized."
                        else:
                            trace += f" quantized to {child.activation_qtype} with scale {child.output_scale}."
                        print(trace)

    def tag_outputs(
        self,
        module: torch.nn.Module,
        input: torch.Tensor,
        output: torch.Tensor,
    ):
        """Mark outputs as generated by a module

        This is called as a module forward hook that is called after the QModuleMixin.quantize_output
        forward hook.

        This is useful in streamline mode to identify the module that generated a specific QTensor.
        """
        output.src_module = module
================================================
FILE: optimum/quanto/library/README.md
================================================
# Quanto operations library
This contains the `quanto::` operations, available in python under `torch.ops.quanto`.
To add a new operation:
- add a definition for the operation in `library/ops.py`,
- provide a default implementation using pytorch operators only under `library/python`,
- provide optimized kernels for all devices under `library/ext`.
================================================
FILE: optimum/quanto/library/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .extensions import *
from .qbytes_mm import *
from .quantize import *
from .unpack import *
================================================
FILE: optimum/quanto/library/extensions/README.md
================================================
# Quanto library extensions
This folder contains device-specific `quanto::` operations.
Implementations can be provided as part of:
- the generic C++ pytorch extension under `cpp`,
- the CUDA extension under `cuda`,
- the Metal Performance Shader extension under `mps`,
- the XPU SYCL extension under `xpu`.
To provide a device-specific implementation of an operation that already has a default implementation (such as unpack), use the following syntax:
```python
@torch.library.impl("quanto::unpack", ["CPU", "CUDA"])
def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
    return ext.unpack(packed, bits)
```
To declare a new device-specific operation, you need to add it to the library:
```python
torch.library.define(
"quanto::gemm_f16i4",
"(Tensor input,"
" Tensor other,"
" Tensor other_scale,"
" Tensor other_shift,"
" int group_size)"
" -> Tensor",
)
```
Then you can provide its implementation:
```python
@torch.library.impl("quanto::gemm_f16i4", ["CUDA"])
def gemm_f16i4(
input: torch.Tensor,
other: torch.Tensor,
scales: torch.Tensor,
shift: torch.Tensor,
group_size: int,
) -> torch.Tensor:
...
```
Please refer to each extension folder for examples.
================================================
FILE: optimum/quanto/library/extensions/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHO
gitextract_e7pf933s/
├── .github/
│ ├── CODEOWNERS
│ ├── PULL_REQUEST_TEMPLATE.md
│ └── workflows/
│ ├── check-commits.yml
│ ├── linux-cpu-tests.yml
│ ├── linux-cuda-tests.yml
│ ├── linux-examples.yml
│ ├── python-quality.yml
│ ├── security.yml
│ └── stale.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── bench/
│ ├── generation/
│ │ ├── README.md
│ │ ├── evaluate_configurations.py
│ │ ├── evaluate_many_models.sh
│ │ ├── evaluate_model.py
│ │ ├── gen_barchart.py
│ │ ├── metrics/
│ │ │ ├── __init__.py
│ │ │ ├── latency.py
│ │ │ ├── perplexity.py
│ │ │ └── prediction.py
│ │ └── setup/
│ │ ├── __init__.py
│ │ ├── awq.py
│ │ ├── bnb.py
│ │ ├── hqq.py
│ │ └── quanto.py
│ ├── kernels/
│ │ ├── benchmark.py
│ │ ├── benchmark_marlin_fp8.py
│ │ └── benchmark_w4a16.py
│ └── torch_kernels/
│ ├── README.md
│ ├── test_int_mm.py
│ ├── test_int_mm_inductor.py
│ ├── test_weight_int4pack_mm.py
│ └── test_weight_int8pack_mm.py
├── examples/
│ ├── nlp/
│ │ ├── text-classification/
│ │ │ └── sst2/
│ │ │ └── quantize_sst2_model.py
│ │ └── text-generation/
│ │ └── quantize_causal_lm_model.py
│ ├── speech/
│ │ └── speech_recognition/
│ │ ├── quantize_asr_model.py
│ │ └── requirements.txt
│ └── vision/
│ ├── StableDiffusion/
│ │ ├── README.md
│ │ ├── quantize_StableDiffusion.py
│ │ └── requirements.txt
│ ├── image-classification/
│ │ ├── mnist/
│ │ │ └── quantize_mnist_model.py
│ │ └── pets/
│ │ └── quantize_vit_model.py
│ ├── object-detection/
│ │ └── quantize_owl_model.py
│ └── text-to-image/
│ └── quantize_pixart_sigma.py
├── external/
│ ├── awq/
│ │ ├── conftest.py
│ │ ├── pack_intweight.py
│ │ ├── packing_utils.py
│ │ ├── test_awq_kernels.py
│ │ ├── test_awq_packing.py
│ │ └── test_awq_quantize.py
│ └── smoothquant/
│ ├── README.md
│ └── smoothquant.py
├── optimum/
│ └── quanto/
│ ├── __init__.py
│ ├── calibrate.py
│ ├── library/
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── extensions/
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── cpp/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.cpp
│ │ │ │ └── unpack.h
│ │ │ ├── cuda/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── awq/
│ │ │ │ │ ├── dequantize.cuh
│ │ │ │ │ └── v2/
│ │ │ │ │ ├── gemm_cuda.cu
│ │ │ │ │ ├── gemm_cuda.h
│ │ │ │ │ ├── gemv_cuda.cu
│ │ │ │ │ ├── gemv_cuda.h
│ │ │ │ │ └── semaphore.h
│ │ │ │ ├── marlin/
│ │ │ │ │ ├── COPYRIGHT
│ │ │ │ │ ├── fp8_marlin.cu
│ │ │ │ │ ├── fp8_marlin.cuh
│ │ │ │ │ ├── gptq_marlin.cuh
│ │ │ │ │ ├── gptq_marlin_dtypes.cuh
│ │ │ │ │ ├── gptq_marlin_repack.cu
│ │ │ │ │ ├── gptq_marlin_repack.cuh
│ │ │ │ │ ├── marlin_cuda.cpp
│ │ │ │ │ ├── marlin_cuda.h
│ │ │ │ │ ├── marlin_cuda_kernel.cu
│ │ │ │ │ └── marlin_cuda_kernel.cuh
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.cu
│ │ │ │ └── unpack.h
│ │ │ ├── extension.py
│ │ │ ├── hip/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.cu
│ │ │ │ └── unpack.h
│ │ │ ├── mps/
│ │ │ │ ├── README.md
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pybind_module.cpp
│ │ │ │ ├── unpack.h
│ │ │ │ └── unpack.mm
│ │ │ └── xpu/
│ │ │ ├── __init__.py
│ │ │ ├── pybind_module.cpp
│ │ │ ├── unpack.h
│ │ │ └── unpack.sycl
│ │ ├── qbytes_mm.py
│ │ ├── quantize.py
│ │ └── unpack.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── diffusers_models.py
│ │ ├── shared_dict.py
│ │ └── transformers_models.py
│ ├── nn/
│ │ ├── __init__.py
│ │ ├── qconv2d.py
│ │ ├── qlayernorm.py
│ │ ├── qlinear.py
│ │ └── qmodule.py
│ ├── quantize.py
│ ├── subpackage/
│ │ ├── __init__.py
│ │ └── commands/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── quantize.py
│ └── tensor/
│ ├── __init__.py
│ ├── activations/
│ │ ├── __init__.py
│ │ ├── qbytes.py
│ │ ├── qbytes_ops.py
│ │ └── quantization.py
│ ├── core.py
│ ├── function.py
│ ├── grouped.py
│ ├── optimizers/
│ │ ├── __init__.py
│ │ ├── absmax_optimizer.py
│ │ ├── affine_optimizer.py
│ │ ├── hqq_optimizer.py
│ │ ├── max_optimizer.py
│ │ ├── optimizer.py
│ │ └── symmetric_optimizer.py
│ ├── packed.py
│ ├── qbits.py
│ ├── qbytes.py
│ ├── qtensor.py
│ ├── qtype.py
│ └── weights/
│ ├── __init__.py
│ ├── awq/
│ │ ├── __init__.py
│ │ ├── packed.py
│ │ └── qbits.py
│ ├── marlin/
│ │ ├── __init__.py
│ │ ├── fp8/
│ │ │ ├── __init__.py
│ │ │ ├── packed.py
│ │ │ └── qbits.py
│ │ ├── int4/
│ │ │ ├── __init__.py
│ │ │ ├── packed.py
│ │ │ └── qbits.py
│ │ └── permutations.py
│ ├── packing.py
│ ├── qbits.py
│ ├── qbytes.py
│ ├── quantization.py
│ ├── reordering.py
│ └── tinygemm/
│ ├── __init__.py
│ ├── packed.py
│ └── qbits.py
├── pyproject.toml
├── setup.sh
└── tests/
├── cli/
│ ├── cli_helpers.py
│ └── test_quantize_cli.py
├── conftest.py
├── helpers.py
├── library/
│ ├── test_extensions.py
│ ├── test_mm.py
│ ├── test_quantize.py
│ └── test_unpack.py
├── models/
│ ├── conftest.py
│ ├── test_quantized_model_for_causal_lm.py
│ └── test_quantized_model_for_pixart.py
├── nn/
│ ├── test_calibrate.py
│ ├── test_qattention.py
│ ├── test_qconv2d.py
│ ├── test_qlayernorm.py
│ ├── test_qlinear.py
│ └── test_qmodule.py
├── quantize/
│ ├── test_quantize_mlp.py
│ ├── test_quantize_patterns.py
│ └── test_requantize.py
└── tensor/
├── activations/
│ ├── test_activations_compile.py
│ ├── test_activations_dispatch.py
│ └── test_activations_quantize.py
├── ops/
│ ├── test_linear_dispatch.py
│ └── test_mm_dispatch.py
├── optimizers/
│ └── test_hqq_optimizer.py
├── test_absmax.py
├── test_packed_tensor.py
└── weights/
├── optimized/
│ ├── test_awq_packed_tensor.py
│ ├── test_awq_weight_qbits_tensor.py
│ ├── test_marlin_fp8_packed_tensor.py
│ ├── test_marlin_int4_packed_tensor.py
│ ├── test_marlin_int4_weight_qbits_tensor.py
│ ├── test_marlin_qbytes_tensor.py
│ ├── test_tinygemm_packed_tensor.py
│ └── test_tinygemm_weight_qbits_tensor.py
├── test_weight_qbits_tensor.py
├── test_weight_qbits_tensor_dispatch.py
├── test_weight_qbits_tensor_instantiate.py
├── test_weight_qbits_tensor_quantize.py
├── test_weight_qbytes_tensor_backward.py
├── test_weight_qbytes_tensor_dispatch.py
├── test_weight_qbytes_tensor_instantiate.py
├── test_weight_qbytes_tensor_quantize.py
├── test_weight_qbytes_tensor_serialization.py
└── weight_helpers.py
SYMBOL INDEX (641 symbols across 138 files)
FILE: bench/generation/evaluate_configurations.py
function evaluate_model_configurations (line 26) | def evaluate_model_configurations(
function main (line 64) | def main():
FILE: bench/generation/evaluate_model.py
function calibrate (line 36) | def calibrate(model, tokenizer, batch_size, batches):
function evaluate (line 51) | def evaluate(
function main (line 86) | def main():
FILE: bench/generation/gen_barchart.py
function save_bar_chart (line 23) | def save_bar_chart(title, labels, ylabel, series, save_path):
function gen_barchart (line 50) | def gen_barchart(model_id, title, label, results, dtype):
function main (line 76) | def main():
FILE: bench/generation/metrics/latency.py
function latency (line 24) | def latency(model, tokenizer, device, batch_size=1, prompt_length=512, n...
function get_device_memory (line 108) | def get_device_memory(device):
FILE: bench/generation/metrics/perplexity.py
class Perplexity (line 23) | class Perplexity:
method __init__ (line 28) | def __init__(self, model, tokenizer, dataset_path="wikitext", dataset_...
method _prepare_data (line 55) | def _prepare_data(self):
method softmax (line 74) | def softmax(logits):
method calculate_perplexity (line 91) | def calculate_perplexity(self, n_ctx=512, n_batch=512):
method _process_batch (line 128) | def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
method _compute_batch_logits (line 197) | def _compute_batch_logits(self, tokens, batch_start, batch_size):
function perplexity (line 221) | def perplexity(
FILE: bench/generation/metrics/prediction.py
function prediction_accuracy (line 22) | def prediction_accuracy(model, tokenizer, batch_size, samples=None):
FILE: bench/generation/setup/awq.py
function prepare_inputs_for_generation (line 19) | def prepare_inputs_for_generation(input_ids, past_key_values=None, atten...
function setup (line 69) | def setup(model_id: str, weights: str, activations: str, group_size: int...
FILE: bench/generation/setup/bnb.py
function setup (line 19) | def setup(
FILE: bench/generation/setup/hqq.py
function setup (line 21) | def setup(model_id: str, weights: str, activations: str, device: torch.d...
FILE: bench/generation/setup/quanto.py
function calibrate (line 25) | def calibrate(model, tokenizer, batch_size, batches):
function setup (line 40) | def setup(
function keyword_to_qtype (line 71) | def keyword_to_qtype(k):
FILE: bench/kernels/benchmark.py
function get_unpack_bench (line 26) | def get_unpack_bench(bits, device):
function timing (line 36) | def timing(get_bench_func, device, iterations=10):
function main (line 95) | def main():
FILE: bench/kernels/benchmark_marlin_fp8.py
function run_benchmark (line 28) | def run_benchmark(
function shape_generator (line 132) | def shape_generator():
function shape_generator (line 137) | def shape_generator():
FILE: bench/kernels/benchmark_w4a16.py
function benchmark (line 12) | def benchmark(f, warmup=1, iter=10):
function get_problem (line 28) | def get_problem(m, n, k, groupsize=128):
function benchmark_dense (line 44) | def benchmark_dense(A, B, m, n, k):
function benchmark_awq (line 53) | def benchmark_awq(A, B, s, sz, m, n, k):
function benchmark_marlin (line 64) | def benchmark_marlin(A, B, s, sz, m, n, k):
function run_benchmark (line 87) | def run_benchmark(model, tokens=None):
function main (line 130) | def main():
FILE: bench/torch_kernels/test_int_mm.py
function main (line 21) | def main():
FILE: bench/torch_kernels/test_int_mm_inductor.py
function mm (line 20) | def mm(a, b):
FILE: bench/torch_kernels/test_weight_int4pack_mm.py
function _group_quantize_tensor (line 21) | def _group_quantize_tensor(w, n_bit=4, q_group_size=16):
function main (line 64) | def main():
FILE: bench/torch_kernels/test_weight_int8pack_mm.py
function main (line 21) | def main():
FILE: examples/nlp/text-classification/sst2/quantize_sst2_model.py
function evaluate_model (line 28) | def evaluate_model(model, tokenizer, dataset, device, batch_size):
function keyword_to_itype (line 38) | def keyword_to_itype(k):
function main (line 42) | def main():
FILE: examples/nlp/text-generation/quantize_causal_lm_model.py
function generate (line 26) | def generate(model, tokenizer, device, prompt, max_new_tokens):
function calibrate (line 43) | def calibrate(model, tokenizer, dataset, device, batch_size, samples=None):
function keyword_to_itype (line 56) | def keyword_to_itype(k):
function main (line 65) | def main():
FILE: examples/speech/speech_recognition/quantize_asr_model.py
function map_to_feats (line 31) | def map_to_feats(batch, processor):
function transcribe_batch (line 42) | def transcribe_batch(batch, model, processor):
function evaluate_model (line 51) | def evaluate_model(model, processor, dataset, metric: evaluate.Evaluatio...
function keyword_to_itype (line 61) | def keyword_to_itype(k):
function main (line 65) | def main():
FILE: examples/vision/StableDiffusion/quantize_StableDiffusion.py
function load_pipeline (line 25) | def load_pipeline(torch_dtype, unet_dtype=None, device="cpu"):
function run_inference (line 36) | def run_inference(pipe, batch_size=1):
function benchmark_fn (line 45) | def benchmark_fn(f, *args, **kwargs):
function bytes_to_giga_bytes (line 50) | def bytes_to_giga_bytes(bytes):
function get_device_memory (line 54) | def get_device_memory(device):
FILE: examples/vision/image-classification/mnist/quantize_mnist_model.py
function test (line 39) | def test(model, device, test_loader):
function train (line 65) | def train(log_interval, model, device, train_loader, optimizer, epoch):
function keyword_to_itype (line 89) | def keyword_to_itype(k):
function main (line 93) | def main():
FILE: examples/vision/image-classification/pets/quantize_vit_model.py
function test (line 29) | def test(model, device, test_loader):
function keyword_to_itype (line 56) | def keyword_to_itype(k):
function main (line 60) | def main():
FILE: examples/vision/object-detection/quantize_owl_model.py
function detect (line 14) | def detect(model, processor, image, texts):
function get_device_memory (line 52) | def get_device_memory(device):
function keyword_to_qtype (line 66) | def keyword_to_qtype(k):
function main (line 70) | def main():
FILE: examples/vision/text-to-image/quantize_pixart_sigma.py
function load_pipeline (line 21) | def load_pipeline(model_id, torch_dtype, qtype=None, device="cpu"):
function get_device_memory (line 34) | def get_device_memory(device):
FILE: external/awq/conftest.py
function device (line 27) | def device(request):
function pytest_configure (line 31) | def pytest_configure(config):
function pytest_runtest_call (line 36) | def pytest_runtest_call(item):
FILE: external/awq/pack_intweight.py
function pack_intweight (line 25) | def pack_intweight(unpacked_qweight, interleave, kstride):
FILE: external/awq/packing_utils.py
function pack_awq (line 8) | def pack_awq(intweight: torch.Tensor, reorder=False):
function unpack_awq (line 23) | def unpack_awq(qweight: torch.Tensor, bits: int):
function reverse_awq_order (line 35) | def reverse_awq_order(iweights: torch.Tensor, bits: int):
function pack_exllama (line 50) | def pack_exllama(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
function unpack_reorder_pack (line 72) | def unpack_reorder_pack(qweight, qzeros, bits):
function dequantize_gemm (line 91) | def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
FILE: external/awq/test_awq_kernels.py
function assert_similar (line 21) | def assert_similar(a, b, atol=None, rtol=None):
function test_standalone_kernel (line 41) | def test_standalone_kernel(in_features, out_features, kernel):
function test_integrated_kernel (line 103) | def test_integrated_kernel(in_features, out_features, kernel):
FILE: external/awq/test_awq_packing.py
function test_awq_pack (line 28) | def test_awq_pack(in_features, out_features, reorder, random):
function test_awq_pack_v2 (line 64) | def test_awq_pack_v2(in_features, out_features, random):
FILE: external/awq/test_awq_quantize.py
function awq_quantize (line 7) | def awq_quantize(base, scales, zeros, group_size):
function test_awq_quantize (line 24) | def test_awq_quantize(in_features, out_features):
FILE: external/smoothquant/smoothquant.py
function get_act_scales (line 16) | def get_act_scales(model, tokenizer, dataset, num_samples=512, seq_len=5...
function smooth_ln_fcs (line 53) | def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5):
function smooth_lm (line 77) | def smooth_lm(model, scales, alpha=0.5):
function main (line 111) | def main():
FILE: optimum/quanto/calibrate.py
function _updated_scale (line 31) | def _updated_scale(scale, new_scale, momentum):
function absmax_scale (line 37) | def absmax_scale(base: torch.Tensor, qtype: qtype = qint8, axis: Optiona...
class Calibration (line 64) | class Calibration(TorchFunctionMode):
method __init__ (line 81) | def __init__(self, *args, momentum: float = 0.9, streamline=True, debu...
method __torch_function__ (line 90) | def __torch_function__(self, func, types, args=(), kwargs=None):
method __enter__ (line 107) | def __enter__(self):
method __exit__ (line 112) | def __exit__(self, exc_type, exc_val, exc_tb):
method calibrate_input (line 120) | def calibrate_input(self, module: torch.nn.Module, input, momentum: fl...
method calibrate_output (line 139) | def calibrate_output(
method tag_outputs (line 176) | def tag_outputs(
FILE: optimum/quanto/library/extensions/__init__.py
function _is_xpu_available (line 34) | def _is_xpu_available():
FILE: optimum/quanto/library/extensions/cpp/__init__.py
function unpack_cpp (line 35) | def unpack_cpp(t: torch.Tensor, bits: int):
FILE: optimum/quanto/library/extensions/cpp/pybind_module.cpp
function PYBIND11_MODULE (line 24) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: optimum/quanto/library/extensions/cpp/unpack.cpp
function unpack_4bit (line 19) | static torch::Tensor unpack_4bit(torch::Tensor &t) {
function unpack_2bit (line 27) | static torch::Tensor unpack_2bit(torch::Tensor &t) {
function unpack (line 37) | torch::Tensor unpack(torch::Tensor &t, int bits) {
FILE: optimum/quanto/library/extensions/cuda/__init__.py
function get_max_cuda_arch (line 25) | def get_max_cuda_arch():
function unpack_cuda (line 78) | def unpack_cuda(t: torch.Tensor, bits: int):
function gemm_f16i4_awq (line 98) | def gemm_f16i4_awq(
function fp8_marlin_gemm (line 139) | def fp8_marlin_gemm(
function gptq_marlin_repack (line 162) | def gptq_marlin_repack(
function gemm_f16i4_marlin (line 177) | def gemm_f16i4_marlin(
FILE: optimum/quanto/library/extensions/cuda/awq/v2/semaphore.h
function class (line 44) | class Semaphore
FILE: optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp
function mul (line 28) | void mul(
FILE: optimum/quanto/library/extensions/cuda/pybind_module.cpp
function PYBIND11_MODULE (line 30) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: optimum/quanto/library/extensions/extension.py
class Extension (line 13) | class Extension(object):
method __init__ (line 14) | def __init__(
method lib (line 30) | def lib(self):
function register_extension (line 60) | def register_extension(extension: Extension):
function get_extension (line 65) | def get_extension(extension_type: str):
function is_extension_available (line 77) | def is_extension_available(extension_type: str):
FILE: optimum/quanto/library/extensions/hip/__init__.py
function unpack_hip (line 35) | def unpack_hip(t: torch.Tensor, bits: int):
FILE: optimum/quanto/library/extensions/hip/pybind_module.cpp
function PYBIND11_MODULE (line 19) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: optimum/quanto/library/extensions/mps/__init__.py
function unpack_mps (line 35) | def unpack_mps(t: torch.Tensor, bits: int):
FILE: optimum/quanto/library/extensions/mps/pybind_module.cpp
function PYBIND11_MODULE (line 19) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: optimum/quanto/library/extensions/xpu/__init__.py
function unpack_xpu (line 41) | def unpack_xpu(t: torch.Tensor, bits: int):
function gemm_f16i4_awq (line 61) | def gemm_f16i4_awq(
FILE: optimum/quanto/library/extensions/xpu/pybind_module.cpp
function PYBIND11_MODULE (line 25) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
FILE: optimum/quanto/library/qbytes_mm.py
function qbytes_mm (line 25) | def qbytes_mm(activations: torch.Tensor, weights: torch.Tensor, output_s...
function qbytes_int_mm (line 36) | def qbytes_int_mm(activations: torch.Tensor, weights: torch.Tensor, outp...
function qbytes_int8pack_mm (line 53) | def qbytes_int8pack_mm(activations: torch.Tensor, weights: torch.Tensor,...
function qbytes_mm_impl_default (line 67) | def qbytes_mm_impl_default(
function qbytes_mm_impl_cuda (line 74) | def qbytes_mm_impl_cuda(activations: torch.Tensor, weights: torch.Tensor...
function qbytes_mm_impl_cpu (line 92) | def qbytes_mm_impl_cpu(activations: torch.Tensor, weights: torch.Tensor,...
function qbytes_mm_impl_mps (line 109) | def qbytes_mm_impl_mps(activations: torch.Tensor, weights: torch.Tensor,...
FILE: optimum/quanto/library/quantize.py
function quantize_symmetric (line 28) | def quantize_symmetric(
function quantize_affine (line 65) | def quantize_affine(
FILE: optimum/quanto/library/unpack.py
function unpack (line 22) | def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
FILE: optimum/quanto/models/__init__.py
function is_transformers_available (line 21) | def is_transformers_available() -> bool:
function is_diffusers_available (line 25) | def is_diffusers_available() -> bool:
FILE: optimum/quanto/models/diffusers_models.py
class QuantizedDiffusersModel (line 44) | class QuantizedDiffusersModel(ModelHubMixin):
method __init__ (line 48) | def __init__(self, model: ModelMixin):
method __getattr__ (line 53) | def __getattr__(self, name: str) -> Any:
method forward (line 61) | def forward(self, *args, **kwargs):
method __call__ (line 64) | def __call__(self, *args, **kwargs):
method _qmap_name (line 68) | def _qmap_name():
method quantize (line 72) | def quantize(
method from_pretrained (line 119) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os....
method _save_pretrained (line 180) | def _save_pretrained(self, save_directory: Path) -> None:
class QuantizedPixArtTransformer2DModel (line 189) | class QuantizedPixArtTransformer2DModel(QuantizedDiffusersModel):
FILE: optimum/quanto/models/shared_dict.py
class ShardedStateDict (line 22) | class ShardedStateDict(Mapping):
method __init__ (line 30) | def __init__(self, base_dir: str, tensor_index: Dict[str, str]):
method __iter__ (line 35) | def __iter__(self):
method __len__ (line 38) | def __len__(self):
method __getitem__ (line 41) | def __getitem__(self, key: Any) -> Any:
method __contains__ (line 49) | def __contains__(self, key: object) -> bool:
method keys (line 52) | def keys(self):
FILE: optimum/quanto/models/transformers_models.py
class QuantizedTransformersModel (line 38) | class QuantizedTransformersModel(ModelHubMixin):
method __init__ (line 42) | def __init__(self, model: PreTrainedModel):
method __getattr__ (line 47) | def __getattr__(self, name: str) -> Any:
method forward (line 55) | def forward(self, *args, **kwargs):
method __call__ (line 58) | def __call__(self, *args, **kwargs):
method __repr__ (line 61) | def __repr__(self):
method _qmap_name (line 65) | def _qmap_name():
method quantize (line 69) | def quantize(
method from_pretrained (line 115) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os....
method _save_pretrained (line 165) | def _save_pretrained(self, save_directory: Path) -> None:
class QuantizedModelForCausalLM (line 182) | class QuantizedModelForCausalLM(QuantizedTransformersModel):
FILE: optimum/quanto/nn/qconv2d.py
class QConv2d (line 27) | class QConv2d(QModuleMixin, torch.nn.Conv2d):
method qcreate (line 29) | def qcreate(
method forward (line 54) | def forward(self, input: torch.Tensor) -> torch.Tensor:
FILE: optimum/quanto/nn/qlayernorm.py
class QLayerNorm (line 27) | class QLayerNorm(QModuleMixin, torch.nn.LayerNorm):
method qcreate (line 29) | def qcreate(
method forward (line 52) | def forward(self, input: torch.Tensor) -> torch.Tensor:
FILE: optimum/quanto/nn/qlinear.py
class QLinear (line 27) | class QLinear(QModuleMixin, torch.nn.Linear):
method qcreate (line 29) | def qcreate(
method forward (line 49) | def forward(self, input: torch.Tensor) -> torch.Tensor:
FILE: optimum/quanto/nn/qmodule.py
function register_qmodule (line 44) | def register_qmodule(module_cls):
function quantize_module (line 81) | def quantize_module(
class QModuleMixin (line 94) | class QModuleMixin(ABC):
method __init__ (line 95) | def __init__(
method disable_output_quantization (line 143) | def disable_output_quantization(self):
method _save_to_state_dict (line 147) | def _save_to_state_dict(self, destination, prefix, keep_vars):
method _load_from_state_dict (line 161) | def _load_from_state_dict(
method from_module (line 210) | def from_module(
method qcreate (line 235) | def qcreate(
method qweight (line 246) | def qweight(self):
method qforward (line 281) | def qforward(self, input: torch.Tensor) -> torch.Tensor:
method quantize_input (line 284) | def quantize_input(self, module: torch.nn.Module, input: torch.Tensor)...
method quantize_output (line 296) | def quantize_output(
method freeze (line 304) | def freeze(self):
method frozen (line 311) | def frozen(self):
FILE: optimum/quanto/quantize.py
function set_module_by_name (line 27) | def set_module_by_name(parent_module, name, child_module):
function _quantize_submodule (line 37) | def _quantize_submodule(
function quantize (line 55) | def quantize(
function requantize (line 101) | def requantize(
function freeze (line 143) | def freeze(model):
function quantization_map (line 149) | def quantization_map(model: torch.nn.Module) -> Dict[str, Dict[str, str]]:
FILE: optimum/quanto/subpackage/commands/base.py
class QuantoCommand (line 25) | class QuantoCommand(BaseOptimumCLICommand):
FILE: optimum/quanto/subpackage/commands/quantize.py
function parse_quantize_args (line 32) | def parse_quantize_args(parser: "ArgumentParser"):
class QuantizeCommand (line 95) | class QuantizeCommand(BaseOptimumCLICommand):
method parse_args (line 97) | def parse_args(parser: "ArgumentParser"):
method run (line 100) | def run(self):
FILE: optimum/quanto/tensor/activations/qbytes.py
class ActivationQBytesQuantizer (line 28) | class ActivationQBytesQuantizer(Function):
method forward (line 30) | def forward(ctx, base: torch.Tensor, qtype: qtype, scale: torch.Tensor...
method backward (line 41) | def backward(ctx, gO):
class ActivationQBytesTensor (line 46) | class ActivationQBytesTensor(QBytesTensor):
method __new__ (line 48) | def __new__(cls, qtype, size, stride, data, scale, requires_grad=False):
method __init__ (line 54) | def __init__(self, qtype, size, stride, data, scale, requires_grad=Fal...
method quantize (line 58) | def quantize(cls, base: torch.Tensor, qtype: qtype, scale: torch.Tenso...
method __tensor_flatten__ (line 61) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 71) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_dispatch__ (line 82) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
FILE: optimum/quanto/tensor/activations/qbytes_ops.py
function register_qbytestensor_op (line 34) | def register_qbytestensor_op(aten_ops: List[Callable]):
function get_qbytestensor_op_dispatch (line 52) | def get_qbytestensor_op_dispatch(aten_op):
function is_scalar (line 56) | def is_scalar(t):
function _to_copy (line 61) | def _to_copy(op, t, dtype=None, **kwargs):
function detach (line 70) | def detach(op, t):
function cat (line 78) | def cat(op, inputs, dim=0):
function lt (line 97) | def lt(op, input, other):
function clone (line 109) | def clone(op, t, memory_format=torch.preserve_format):
function copy_ (line 121) | def copy_(op, dest, src):
function div (line 129) | def div(op, input, other):
function neg (line 137) | def neg(op, input, *args, **kwargs):
function unary_type_agnostic_op (line 154) | def unary_type_agnostic_op(op, input, *args, **kwargs):
function is_same_size (line 164) | def is_same_size(op, input, other):
function cannot_mm (line 170) | def cannot_mm(t: QTensor):
function bmm (line 176) | def bmm(op, input, other):
function mul (line 190) | def mul(op, input, other):
function relu (line 200) | def relu(op, input):
function _softmax (line 209) | def _softmax(op, input, dim, half_to_float):
function stack (line 219) | def stack(op, inputs, dim=0):
function split (line 237) | def split(op, input, *args, **kwargs):
function transpose (line 248) | def transpose(op, input, *args):
function transpose2d (line 257) | def transpose2d(op, input):
function view (line 268) | def view(op, input, *shape):
function where (line 277) | def where(op, condition, input, other):
FILE: optimum/quanto/tensor/activations/quantization.py
function quantize_activation (line 24) | def quantize_activation(t: torch.Tensor, qtype: qtype, scale: torch.Tens...
FILE: optimum/quanto/tensor/core.py
function dtype_info (line 22) | def dtype_info(dtype):
function axis_to_dim (line 27) | def axis_to_dim(t, axis):
FILE: optimum/quanto/tensor/function.py
class QuantizedLinearFunction (line 21) | class QuantizedLinearFunction(torch.autograd.Function):
method forward (line 42) | def forward(ctx, input, other, bias=None):
method backward (line 49) | def backward(ctx, gO):
FILE: optimum/quanto/tensor/grouped.py
function grouped_shape (line 10) | def grouped_shape(shape: List, axis: int, group_size: int) -> List:
function group (line 17) | def group(base: torch.Tensor, axis: int, group_size: int):
function ungroup (line 39) | def ungroup(grouped: torch.Tensor, axis: int, orig_shape: torch.Size):
FILE: optimum/quanto/tensor/optimizers/absmax_optimizer.py
class AbsmaxOptimizer (line 26) | class AbsmaxOptimizer(SymmetricOptimizer):
method optimize (line 27) | def optimize(
FILE: optimum/quanto/tensor/optimizers/affine_optimizer.py
class AffineOptimizer (line 27) | class AffineOptimizer(Optimizer):
method __call__ (line 28) | def __call__(
method optimize (line 63) | def optimize(self, base: torch.Tensor, qtype: qtype, axis: int) -> Tup...
FILE: optimum/quanto/tensor/optimizers/hqq_optimizer.py
function shrink_lp_op (line 28) | def shrink_lp_op(x: torch.Tensor, beta: float, lp_norm: float) -> torch....
class HqqOptimizer (line 37) | class HqqOptimizer(MaxOptimizer):
method __init__ (line 46) | def __init__(
method optimize (line 60) | def optimize(
FILE: optimum/quanto/tensor/optimizers/max_optimizer.py
class MaxOptimizer (line 26) | class MaxOptimizer(AffineOptimizer):
method optimize (line 27) | def optimize(
FILE: optimum/quanto/tensor/optimizers/optimizer.py
class Optimizer (line 24) | class Optimizer(ABC):
method __call__ (line 25) | def __call__(
FILE: optimum/quanto/tensor/optimizers/symmetric_optimizer.py
class SymmetricOptimizer (line 26) | class SymmetricOptimizer(Optimizer):
method __call__ (line 27) | def __call__(self, base: torch.Tensor, qtype: qtype, axis: Optional[in...
method optimize (line 37) | def optimize(self, base: torch.Tensor, qmax: float, axis: Optional[int...
FILE: optimum/quanto/tensor/packed.py
function pack_weights (line 24) | def pack_weights(intweights: torch.Tensor, bits: int) -> torch.Tensor:
class PackedTensor (line 72) | class PackedTensor(torch.Tensor):
method __new__ (line 74) | def __new__(cls, data, bits, size, stride, requires_grad=False):
method __init__ (line 82) | def __init__(self, data, bits, size, stride, requires_grad=False):
method __repr__ (line 86) | def __repr__(self):
method pack (line 93) | def pack(cls, t, bits=4):
method unpack (line 101) | def unpack(self):
method bits (line 107) | def bits(self):
method dtype (line 111) | def dtype(self):
method load_from_state_dict (line 115) | def load_from_state_dict(state_dict, prefix, bits, size, stride, missi...
method __tensor_flatten__ (line 125) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 132) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_dispatch__ (line 145) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
method numpy (line 162) | def numpy(self):
FILE: optimum/quanto/tensor/qbits.py
class QBitsDequantizer (line 27) | class QBitsDequantizer(Function):
method forward (line 29) | def forward(ctx, t):
method backward (line 52) | def backward(ctx, gO):
class QBitsTensor (line 56) | class QBitsTensor(QTensor):
method __init__ (line 57) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
method __repr__ (line 64) | def __repr__(self):
method dequantize (line 67) | def dequantize(self):
FILE: optimum/quanto/tensor/qbytes.py
class QBytesDequantizer (line 23) | class QBytesDequantizer(Function):
method forward (line 25) | def forward(ctx, t):
method backward (line 34) | def backward(ctx, gO):
class QBytesTensor (line 39) | class QBytesTensor(QTensor):
method __init__ (line 40) | def __init__(self, qtype, axis, size, stride, data, scale, requires_gr...
method __repr__ (line 45) | def __repr__(self):
method dequantize (line 48) | def dequantize(self):
FILE: optimum/quanto/tensor/qtensor.py
function qfallback (line 21) | def qfallback(callable, *args, **kwargs):
class QTensor (line 32) | class QTensor(torch.Tensor):
method __init__ (line 33) | def __init__(self, qtype, axis):
method dequantize (line 37) | def dequantize(self):
method save_to_state_dict (line 40) | def save_to_state_dict(self, destination, prefix, keep_vars):
method axis (line 56) | def axis(self):
method qtype (line 60) | def qtype(self):
method numpy (line 63) | def numpy(self):
method equal (line 66) | def equal(self, other):
FILE: optimum/quanto/tensor/qtype.py
class qtype (line 21) | class qtype:
method __str__ (line 32) | def __str__(self):
method __hash__ (line 35) | def __hash__(self):
function qint (line 42) | def qint(bits):
function qfloat (line 55) | def qfloat(dtype: torch.dtype):
FILE: optimum/quanto/tensor/weights/awq/packed.py
function pack (line 33) | def pack(unpacked: torch.Tensor, reorder=False):
function reverse_awq_order (line 64) | def reverse_awq_order(t: torch.Tensor):
function unpack (line 80) | def unpack(packed: torch.Tensor, reorder=False):
function pack_v2 (line 100) | def pack_v2(unpacked: torch.Tensor) -> torch.Tensor:
function unpack_v2 (line 156) | def unpack_v2(packed):
class AWQPacking (line 204) | class AWQPacking(Enum):
class AWQPackedTensor (line 209) | class AWQPackedTensor(torch.Tensor):
method __new__ (line 211) | def __new__(cls, data, packing, reorder, size, stride, requires_grad=F...
method __init__ (line 220) | def __init__(self, data, packing, reorder, size, stride, requires_grad...
method __repr__ (line 225) | def __repr__(self):
method pack (line 229) | def pack(cls, t, packing=AWQPacking.V1, reorder=False):
method unpack (line 237) | def unpack(self):
method dtype (line 243) | def dtype(self):
method __tensor_flatten__ (line 246) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 258) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_dispatch__ (line 272) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
method numpy (line 293) | def numpy(self):
FILE: optimum/quanto/tensor/weights/awq/qbits.py
class AWQWeightQBitsDequantizer (line 30) | class AWQWeightQBitsDequantizer(Function):
method forward (line 32) | def forward(ctx, t):
method backward (line 49) | def backward(ctx, gO):
class AWQWeightQBitsLinearFunction (line 53) | class AWQWeightQBitsLinearFunction(QuantizedLinearFunction):
method forward (line 55) | def forward(ctx, input, other, bias):
class AWQWeightQBitsTensor (line 77) | class AWQWeightQBitsTensor(WeightQBitsTensor):
method __new__ (line 79) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale, s...
method __init__ (line 87) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
method dequantize (line 106) | def dequantize(self):
method weight_qbits_tensor (line 109) | def weight_qbits_tensor(self):
method __tensor_flatten__ (line 123) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 136) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_function__ (line 149) | def __torch_function__(cls, func, types, args=(), kwargs=None):
FILE: optimum/quanto/tensor/weights/marlin/fp8/packed.py
function pack_fp8_as_int32 (line 22) | def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
function unpack_int32_to_fp8 (line 51) | def unpack_int32_to_fp8(int32_tensor: torch.Tensor) -> torch.Tensor:
function get_scale_perms (line 71) | def get_scale_perms() -> torch.Tensor:
function get_row_permutation (line 78) | def get_row_permutation(n_rows: int) -> torch.Tensor:
function get_column_permutation (line 116) | def get_column_permutation(n_col: int) -> torch.Tensor:
class MarlinF8PackedTensor (line 160) | class MarlinF8PackedTensor(torch.Tensor):
method __new__ (line 161) | def __new__(cls, data, size, stride, requires_grad=False):
method __init__ (line 169) | def __init__(self, data, size, stride, requires_grad=False):
method __repr__ (line 172) | def __repr__(self):
method pack (line 176) | def pack(cls, tensor: torch.Tensor):
method unpack (line 189) | def unpack(self) -> torch.Tensor:
method dtype (line 220) | def dtype(self):
method __tensor_flatten__ (line 223) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 233) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_dispatch__ (line 245) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
FILE: optimum/quanto/tensor/weights/marlin/fp8/qbits.py
class MarlinF8QBytesLinearFunction (line 28) | class MarlinF8QBytesLinearFunction(QuantizedLinearFunction):
method forward (line 30) | def forward(ctx, input, other, bias=None):
class MarlinF8QBytesTensor (line 54) | class MarlinF8QBytesTensor(WeightQBytesTensor):
method __new__ (line 56) | def __new__(cls, qtype, axis, size, stride, data, scale, requires_grad...
method __init__ (line 63) | def __init__(self, qtype, axis, size, stride, data, scale, requires_gr...
method dequantize (line 88) | def dequantize(self):
method __repr__ (line 102) | def __repr__(self):
method weight_qbytes_tensor (line 105) | def weight_qbytes_tensor(self):
method __tensor_flatten__ (line 119) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 130) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_function__ (line 142) | def __torch_function__(cls, func, types, args=(), kwargs=None):
FILE: optimum/quanto/tensor/weights/marlin/int4/packed.py
function _get_perm (line 19) | def _get_perm():
function pack (line 59) | def pack(unpacked: torch.Tensor):
function unpack (line 78) | def unpack(packed, orig_shape):
class MarlinInt4PackedTensor (line 91) | class MarlinInt4PackedTensor(torch.Tensor):
method __new__ (line 93) | def __new__(cls, data, size, stride, requires_grad=False):
method __init__ (line 101) | def __init__(self, data, size, stride, requires_grad=False):
method __repr__ (line 104) | def __repr__(self):
method pack (line 108) | def pack(cls, t):
method unpack (line 112) | def unpack(self):
method dtype (line 116) | def dtype(self):
method __tensor_flatten__ (line 119) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 128) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_dispatch__ (line 139) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
method numpy (line 159) | def numpy(self):
FILE: optimum/quanto/tensor/weights/marlin/int4/qbits.py
class MarlinQBitsDequantizer (line 31) | class MarlinQBitsDequantizer(Function):
method forward (line 33) | def forward(ctx, t):
method backward (line 49) | def backward(ctx, gO):
class MarlinQBitsLinearFunction (line 53) | class MarlinQBitsLinearFunction(QuantizedLinearFunction):
method forward (line 55) | def forward(ctx, input, other, bias):
class MarlinInt4WeightQBitsTensor (line 72) | class MarlinInt4WeightQBitsTensor(WeightQBitsTensor):
method __new__ (line 74) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale, s...
method __init__ (line 82) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
method dequantize (line 103) | def dequantize(self):
method weight_qbits_tensor (line 106) | def weight_qbits_tensor(self):
method __tensor_flatten__ (line 121) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 134) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_function__ (line 147) | def __torch_function__(cls, func, types, args=(), kwargs=None):
FILE: optimum/quanto/tensor/weights/marlin/permutations.py
function _get_perms (line 28) | def _get_perms() -> Tuple[List[int], List[int]]:
function _get_inverted_perms (line 39) | def _get_inverted_perms() -> Tuple[List[int], List[int]]:
function marlin_permute (line 44) | def marlin_permute(t: torch.Tensor, reverse=False):
FILE: optimum/quanto/tensor/weights/packing.py
function unpack_int32_to_uint8 (line 18) | def unpack_int32_to_uint8(packed: torch.Tensor, bits: int):
FILE: optimum/quanto/tensor/weights/qbits.py
class WeightsQBitsQuantizer (line 34) | class WeightsQBitsQuantizer(Function):
method forward (line 36) | def forward(
method backward (line 60) | def backward(ctx, gO):
class WeightQBitsTensor (line 65) | class WeightQBitsTensor(QBitsTensor):
method create (line 67) | def create(qtype, axis, group_size, size, stride, data, scale, shift, ...
method __new__ (line 141) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale, s...
method __init__ (line 148) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
method quantize (line 154) | def quantize(
method load_from_state_dict (line 167) | def load_from_state_dict(state_dict, prefix, qtype, axis, group_size, ...
method optimize (line 201) | def optimize(self):
method save_to_state_dict (line 223) | def save_to_state_dict(self, destination, prefix, keep_vars):
method weight_qbits_tensor (line 230) | def weight_qbits_tensor(self):
method __tensor_flatten__ (line 237) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 250) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_function__ (line 263) | def __torch_function__(cls, func, types, args=(), kwargs=None):
method __torch_dispatch__ (line 290) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
FILE: optimum/quanto/tensor/weights/qbytes.py
class WeightQBytesQuantizer (line 31) | class WeightQBytesQuantizer(Function):
method forward (line 33) | def forward(
method backward (line 63) | def backward(ctx, gO):
class WeightQBytesLinearFunction (line 68) | class WeightQBytesLinearFunction(QuantizedLinearFunction):
method forward (line 70) | def forward(ctx, input, other, bias=None):
class WeightQBytesTensor (line 85) | class WeightQBytesTensor(QBytesTensor):
method create (line 87) | def create(
method __new__ (line 146) | def __new__(cls, qtype, axis, size, stride, data, scale, activation_qt...
method __init__ (line 152) | def __init__(self, qtype, axis, size, stride, data, scale, activation_...
method quantize (line 157) | def quantize(
method load_from_state_dict (line 169) | def load_from_state_dict(state_dict, prefix, qtype, axis, size, stride...
method optimize (line 191) | def optimize(self):
method save_to_state_dict (line 211) | def save_to_state_dict(self, destination, prefix, keep_vars):
method weight_qbytes_tensor (line 218) | def weight_qbytes_tensor(self):
method __tensor_flatten__ (line 225) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 237) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_function__ (line 250) | def __torch_function__(cls, func, types, args=(), kwargs=None):
method __torch_dispatch__ (line 277) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
FILE: optimum/quanto/tensor/weights/quantization.py
function quantize_weight (line 27) | def quantize_weight(
FILE: optimum/quanto/tensor/weights/reordering.py
function reorder (line 23) | def reorder(t: torch.Tensor, permutation: Union[torch.Tensor, List[int]]):
function reverse (line 38) | def reverse(permutation: Union[torch.Tensor, List[int]]):
FILE: optimum/quanto/tensor/weights/tinygemm/packed.py
class TinyGemmPackedTensor (line 25) | class TinyGemmPackedTensor(torch.Tensor):
method __new__ (line 27) | def __new__(cls, data, size, stride, requires_grad=False):
method __init__ (line 34) | def __init__(self, data, size, stride, requires_grad=False):
method __repr__ (line 37) | def __repr__(self):
method pack (line 41) | def pack(cls, t):
method unpack (line 66) | def unpack(self):
method dtype (line 98) | def dtype(self):
method __tensor_flatten__ (line 101) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 111) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_dispatch__ (line 123) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
method numpy (line 147) | def numpy(self):
FILE: optimum/quanto/tensor/weights/tinygemm/qbits.py
class TinyGemmQBitsDequantizer (line 30) | class TinyGemmQBitsDequantizer(Function):
method forward (line 32) | def forward(ctx, t):
method backward (line 38) | def backward(ctx, gO):
class TinyGemmQBitsLinearFunction (line 42) | class TinyGemmQBitsLinearFunction(QuantizedLinearFunction):
method forward (line 44) | def forward(ctx, input, other, bias):
class TinyGemmWeightQBitsTensor (line 65) | class TinyGemmWeightQBitsTensor(WeightQBitsTensor):
method __new__ (line 67) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale_sh...
method __init__ (line 82) | def __init__(self, qtype, axis, group_size, size, stride, data, scale_...
method dequantize (line 111) | def dequantize(self):
method weight_qbits_tensor (line 114) | def weight_qbits_tensor(self):
method __tensor_flatten__ (line 130) | def __tensor_flatten__(self):
method __tensor_unflatten__ (line 143) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
method __torch_function__ (line 156) | def __torch_function__(cls, func, types, args=(), kwargs=None):
FILE: tests/cli/test_quantize_cli.py
function test_export_decoder_cli (line 26) | def test_export_decoder_cli(weights):
FILE: tests/conftest.py
function device (line 29) | def device(request):
function pytest_configure (line 33) | def pytest_configure(config):
function pytest_runtest_call (line 38) | def pytest_runtest_call(item):
FILE: tests/helpers.py
function torch_min_version (line 33) | def torch_min_version(v):
function device_eq (line 46) | def device_eq(a, b):
function random_tensor (line 54) | def random_tensor(shape, dtype=torch.float32, device="cpu"):
function random_qactivation (line 65) | def random_qactivation(shape, qtype=qint8, dtype=torch.float32, device="...
function random_qweight (line 71) | def random_qweight(shape, qtype, dtype=torch.float32, axis=0, group_size...
function assert_similar (line 85) | def assert_similar(a, b, atol=None, rtol=None):
function get_device_memory (line 102) | def get_device_memory(device):
FILE: tests/library/test_extensions.py
function _is_xpu_available (line 10) | def _is_xpu_available():
function test_extension_available (line 32) | def test_extension_available(extension_name):
function test_extension_compilation (line 37) | def test_extension_compilation(extension_name):
FILE: tests/library/test_mm.py
function test_qbytes_mm (line 35) | def test_qbytes_mm(batch_size, input_features, input_dtype, weight_dtype...
function test_gemm_fp16_int4 (line 59) | def test_gemm_fp16_int4(batch_size, tokens, in_features, out_features):
function test_fp8_marlin (line 112) | def test_fp8_marlin(tokens, in_features, out_features, dtype):
function test_gemm_marlin_fp16_int4 (line 155) | def test_gemm_marlin_fp16_int4(batch_size, tokens, in_features, out_feat...
FILE: tests/library/test_quantize.py
function test_symmetric_quantize_int (line 41) | def test_symmetric_quantize_int(input_shape, dtype, qtype, axis, device):
function test_symmetric_quantize_float8 (line 63) | def test_symmetric_quantize_float8(input_shape, dtype, qtype, axis, devi...
function test_affine_quantize (line 78) | def test_affine_quantize(input_shape, dtype, qtype, axis, group_size, sh...
function test_affine_quantize_integer_tensor (line 107) | def test_affine_quantize_integer_tensor(dtype, qtype, device):
FILE: tests/library/test_unpack.py
function test_unpack (line 24) | def test_unpack(bits, shape, device):
FILE: tests/models/conftest.py
function staging (line 6) | def staging():
function skip_if_staging (line 25) | def skip_if_staging(request):
FILE: tests/models/test_quantized_model_for_causal_lm.py
function quantized_model_for_causal_lm (line 11) | def quantized_model_for_causal_lm(model_id, qtype, exclude, from_config=...
function compare_models (line 49) | def compare_models(a_model, b_model):
function test_quantized_model_for_causal_lm_base (line 79) | def test_quantized_model_for_causal_lm_base(model_id, qtype, exclude_lm_...
function test_quantized_model_for_causal_lm_sharded (line 92) | def test_quantized_model_for_causal_lm_sharded():
function test_causal_lm_base_push_to_hub (line 107) | def test_causal_lm_base_push_to_hub(staging, in_org):
function test_quantized_model_load_state_dict_non_strict (line 134) | def test_quantized_model_load_state_dict_non_strict(model_id, qtype):
FILE: tests/models/test_quantized_model_for_pixart.py
function quantized_model_for_pixart (line 11) | def quantized_model_for_pixart(qtype, exclude):
function compare_models (line 40) | def compare_models(a_model, b_model):
function test_quantized_model_for_pixart (line 80) | def test_quantized_model_for_pixart(qtype, exclude_proj_out):
function test_push_to_hub (line 94) | def test_push_to_hub(staging, in_org):
FILE: tests/nn/test_calibrate.py
function _test_calibrate_qlinear (line 23) | def _test_calibrate_qlinear(batch_size, tokens, embeddings, use_bias, ac...
function test_calibrate_qlinear_activations_int8 (line 45) | def test_calibrate_qlinear_activations_int8(batch_size, tokens, embeddin...
function test_calibrate_qlinear_activations_float8 (line 58) | def test_calibrate_qlinear_activations_float8(batch_size, tokens, embedd...
function _test_calibrate_custom_module (line 62) | def _test_calibrate_custom_module(activations, device):
function test_calibrate_custom_module_activations_int8 (line 88) | def test_calibrate_custom_module_activations_int8(device):
function test_calibrate_custom_module_activations_float8 (line 98) | def test_calibrate_custom_module_activations_float8(activations, device):
FILE: tests/nn/test_qattention.py
class RotaryEmbedding (line 27) | class RotaryEmbedding(nn.Module):
method __init__ (line 28) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
method _set_cos_sin_cache (line 42) | def _set_cos_sin_cache(self, seq_len, device, dtype):
method forward (line 52) | def forward(self, x, seq_len=None):
function rotate_half (line 63) | def rotate_half(x):
function apply_rotary_pos_emb (line 70) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
function repeat_kv (line 98) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
class Attention (line 110) | class Attention(nn.Module):
method __init__ (line 113) | def __init__(self, hidden_size=128, num_heads=4, max_position_embeddin...
method _shape (line 130) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
method forward (line 133) | def forward(
function _test_quantize_attention (line 174) | def _test_quantize_attention(device, dtype=torch.float32, weights=qint8,...
function test_quantize_attention_weights_only (line 193) | def test_quantize_attention_weights_only(weights, device):
function test_quantize_attention_weights_only_float8 (line 198) | def test_quantize_attention_weights_only_float8(device):
function test_quantize_attention_activations_int8 (line 203) | def test_quantize_attention_activations_int8(weights, device):
function test_quantize_attention_activations_float8 (line 214) | def test_quantize_attention_activations_float8(weights, activations, dev...
FILE: tests/nn/test_qconv2d.py
function _test_quantize_conv2d (line 31) | def _test_quantize_conv2d(batch_size, img_shape, out_channels, use_bias,...
function test_quantize_conv2d_float16_activations_int8 (line 59) | def test_quantize_conv2d_float16_activations_int8(batch_size, img_shape,...
function test_quantize_conv2d_float32_activations_int8 (line 68) | def test_quantize_conv2d_float32_activations_int8(batch_size, img_shape,...
function test_quantize_conv2d_float16_activations_float8 (line 83) | def test_quantize_conv2d_float16_activations_float8(
function test_quantize_conv2d_float32_activations_float8 (line 100) | def test_quantize_conv2d_float32_activations_float8(
function test_quantize_conv2d_float16_weight_only (line 111) | def test_quantize_conv2d_float16_weight_only(batch_size, img_shape, out_...
function test_quantize_conv2d_float32_weight_only (line 120) | def test_quantize_conv2d_float32_weight_only(batch_size, img_shape, out_...
function test_qconv2d_gradient (line 128) | def test_qconv2d_gradient(img_shape, out_channels, activations, weights,...
FILE: tests/nn/test_qlayernorm.py
function _test_quantize_layernorm (line 23) | def _test_quantize_layernorm(batch_size, tokens, embeddings, affine, dty...
function test_quantize_layernorm_float16_activations_int8 (line 47) | def test_quantize_layernorm_float16_activations_int8(batch_size, tokens,...
function test_quantize_layernorm_float32_activations_int8 (line 54) | def test_quantize_layernorm_float32_activations_int8(batch_size, tokens,...
function test_quantize_layernorm_float16_activations_float8 (line 67) | def test_quantize_layernorm_float16_activations_float8(batch_size, token...
function test_quantize_layernorm_float32_activations_float8 (line 80) | def test_quantize_layernorm_float32_activations_float8(batch_size, token...
function test_quantize_layernom_no_activation (line 84) | def test_quantize_layernom_no_activation():
FILE: tests/nn/test_qlinear.py
function _test_quantize_linear (line 37) | def _test_quantize_linear(batch_size, tokens, embeddings, use_bias, weig...
function test_quantize_linear_float16_activations_int8 (line 65) | def test_quantize_linear_float16_activations_int8(batch_size, tokens, em...
function test_quantize_linear_float32_activations_int8 (line 73) | def test_quantize_linear_float32_activations_int8(batch_size, tokens, em...
function test_quantize_linear_float16_activations_float8 (line 90) | def test_quantize_linear_float16_activations_float8(
function test_quantize_linear_float32_activations_float8 (line 107) | def test_quantize_linear_float32_activations_float8(
function test_quantize_linear_float16_weight_only (line 120) | def test_quantize_linear_float16_weight_only(batch_size, tokens, embeddi...
function test_quantize_linear_float32_weight_only (line 134) | def test_quantize_linear_float32_weight_only(batch_size, tokens, embeddi...
function test_qlinear_gradient (line 141) | def test_qlinear_gradient(tokens, embeddings, activations, weights, devi...
function test_move_qlinear (line 182) | def test_move_qlinear(dtype, use_bias, weights, device):
function test_qlinear_serialization (line 200) | def test_qlinear_serialization(features, use_bias, activations, weights,...
FILE: tests/nn/test_qmodule.py
function test_qmodule_freeze (line 26) | def test_qmodule_freeze(in_features, out_features, use_bias, dtype):
function test_qmodule_qtype_as_string (line 50) | def test_qmodule_qtype_as_string(weights, activations):
FILE: tests/quantize/test_quantize_mlp.py
class MLP (line 40) | class MLP(torch.nn.Module):
method __init__ (line 41) | def __init__(self, input_size, output_size, hidden_size):
method forward (line 47) | def forward(self, inputs):
function check_mlp (line 53) | def check_mlp(model, frozen):
function _test_quantize_mlp (line 63) | def _test_quantize_mlp(weights, activations, optimizer, frozen, device, ...
function test_quantize_mlp_weights_only (line 85) | def test_quantize_mlp_weights_only(weights, frozen, device):
function test_quantize_mlp_weights_only_float8 (line 92) | def test_quantize_mlp_weights_only_float8(weights, frozen, device):
function test_quantize_mlp_int8_activations (line 99) | def test_quantize_mlp_int8_activations(weights, frozen, device):
function test_quantize_mlp_float8_activations (line 111) | def test_quantize_mlp_float8_activations(weights, activations, frozen, d...
function test_quantized_mlp_device_memory (line 120) | def test_quantized_mlp_device_memory(weights, dtype, weights_only, device):
function test_quantize_mlp_weights_only_optimizers (line 140) | def test_quantize_mlp_weights_only_optimizers(weights, optimizer, frozen...
function test_quantize_mlp_wrong_optimizer (line 148) | def test_quantize_mlp_wrong_optimizer(weights, optimizer, device):
FILE: tests/quantize/test_quantize_patterns.py
class MLP (line 25) | class MLP(torch.nn.Module):
method __init__ (line 26) | def __init__(self, input_size, output_size, hidden_size):
method forward (line 32) | def forward(self, inputs):
class ClassificationModel (line 38) | class ClassificationModel(torch.nn.Module):
method __init__ (line 39) | def __init__(self, input_size, output_size, hidden_size, classes):
method forward (line 44) | def forward(self, inputs):
function has_children (line 49) | def has_children(module: torch.nn.Module):
function leaf_module_names (line 53) | def leaf_module_names(module: torch.nn.Module):
function parent_module_names (line 57) | def parent_module_names(module: torch.nn.Module):
function test_quantize_mlp_include_explicit_layers (line 61) | def test_quantize_mlp_include_explicit_layers():
function test_quantize_mlp_exclude_explicit_layers (line 74) | def test_quantize_mlp_exclude_explicit_layers():
function test_quantize_mlp_include_layer_patterns (line 87) | def test_quantize_mlp_include_layer_patterns():
function test_quantize_mlp_exclude_layer_patterns (line 100) | def test_quantize_mlp_exclude_layer_patterns():
FILE: tests/quantize/test_requantize.py
function save_and_reload_state_dict (line 28) | def save_and_reload_state_dict(state_dict, serialization):
function test_requantize_serialized_model (line 50) | def test_requantize_serialized_model(
function test_requantized_model_device_memory (line 78) | def test_requantized_model_device_memory(weights, dtype, serialization, ...
FILE: tests/tensor/activations/test_activations_compile.py
function compile_for_device (line 22) | def compile_for_device(f, device):
function test_compile_quantize_tensor (line 34) | def test_compile_quantize_tensor(input_shape, qtype, dtype, device):
function test_compile_qtensor_to (line 51) | def test_compile_qtensor_to(device):
FILE: tests/tensor/activations/test_activations_dispatch.py
function test_qactivation_mul_scalar (line 24) | def test_qactivation_mul_scalar(input_shape, scalar, device):
function test_qactivation_relu (line 40) | def test_qactivation_relu(batch_size, tokens, embeddings, device):
function test_qactivation_softmax (line 49) | def test_qactivation_softmax(batch_size, tokens, embeddings, device):
function test_qactivation_view (line 58) | def test_qactivation_view(input_shape, device):
function test_qactivation_cat (line 65) | def test_qactivation_cat(input_shape, device):
function test_qactivation_transpose_2d (line 75) | def test_qactivation_transpose_2d(device):
function test_qactivation_transpose (line 84) | def test_qactivation_transpose(device):
FILE: tests/tensor/activations/test_activations_quantize.py
function test_symmetric_quantize_int (line 33) | def test_symmetric_quantize_int(input_shape, dtype, qtype, device):
function test_symmetric_quantize_float8 (line 52) | def test_symmetric_quantize_float8(input_shape, dtype, qtype, device):
FILE: tests/tensor/ops/test_linear_dispatch.py
function test_qactivation_qweight_linear (line 28) | def test_qactivation_qweight_linear(
function test_linear_fp16_int4 (line 48) | def test_linear_fp16_int4(batch_size, tokens, embeddings, use_bias, devi...
function test_linear_bf16_int4 (line 63) | def test_linear_bf16_int4(batch_size, tokens, embeddings, use_bias, devi...
FILE: tests/tensor/ops/test_mm_dispatch.py
function test_qactivation_qweight_matmul (line 26) | def test_qactivation_qweight_matmul(dtype, in_features, hidden, out_feat...
function test_qactivation_qactivation_bmm (line 38) | def test_qactivation_qactivation_bmm(dtype, batch_size, a_shape, b_shape...
FILE: tests/tensor/optimizers/test_hqq_optimizer.py
function compare_quantized_tensor (line 28) | def compare_quantized_tensor(a, qtype, axis, group_size, scale, shift):
function test_hqq_optimizer (line 42) | def test_hqq_optimizer(input_shape, dtype, qtype, axis, group_size, devi...
FILE: tests/tensor/test_absmax.py
function test_absmax_scale (line 26) | def test_absmax_scale(input_shape, axis, dtype, qtype, device):
FILE: tests/tensor/test_packed_tensor.py
function test_pack_tensor (line 26) | def test_pack_tensor(shape, bits, device):
function test_packed_tensor_serialization (line 39) | def test_packed_tensor_serialization(bits, device):
FILE: tests/tensor/weights/optimized/test_awq_packed_tensor.py
function test_pack_awq_tensor (line 30) | def test_pack_awq_tensor(in_features, out_features, random, packing, reo...
function test_move_awq_tensor (line 51) | def test_move_awq_tensor(packing, reorder, device):
FILE: tests/tensor/weights/optimized/test_awq_weight_qbits_tensor.py
function test_awq_weight_qbits_tensor_from_qbits_tensor (line 30) | def test_awq_weight_qbits_tensor_from_qbits_tensor(in_features, out_feat...
function test_awq_weight_qbits_tensor_move (line 66) | def test_awq_weight_qbits_tensor_move(device):
function _test_awq_weight_qbits_tensor_linear (line 94) | def _test_awq_weight_qbits_tensor_linear(
function test_awq_weight_qbits_tensor_linear (line 124) | def test_awq_weight_qbits_tensor_linear(batch_size, tokens, in_features,...
FILE: tests/tensor/weights/optimized/test_marlin_fp8_packed_tensor.py
function get_fp8_tensor (line 25) | def get_fp8_tensor(shape, device, random=False):
function test_pack_marlin_fp8_tensor (line 44) | def test_pack_marlin_fp8_tensor(in_features, out_features, random):
function test_move_marlin_fp8_tensor (line 55) | def test_move_marlin_fp8_tensor():
FILE: tests/tensor/weights/optimized/test_marlin_int4_packed_tensor.py
function get_uint4_tensor (line 24) | def get_uint4_tensor(shape, device, random=False):
function test_pack_marlin_int4_tensor (line 39) | def test_pack_marlin_int4_tensor(in_features, out_features, random):
function test_move_marlin_int4_packed_tensor (line 50) | def test_move_marlin_int4_packed_tensor(device):
FILE: tests/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py
function test_marlin_int4_weight_qbits_tensor_from_qbits_tensor (line 31) | def test_marlin_int4_weight_qbits_tensor_from_qbits_tensor(in_features, ...
function test_marlin_int4_weight_qbits_tensor_move (line 67) | def test_marlin_int4_weight_qbits_tensor_move(device):
function _test_marlin_int4_weight_qbits_tensor_linear (line 96) | def _test_marlin_int4_weight_qbits_tensor_linear(
function test_marlin_int4_weight_qbits_tensor_linear (line 125) | def test_marlin_int4_weight_qbits_tensor_linear(batch_size, tokens, in_f...
function test_marlin_int4_weight_qbits_tensor_linear_failing (line 144) | def test_marlin_int4_weight_qbits_tensor_linear_failing(batch_size, toke...
FILE: tests/tensor/weights/optimized/test_marlin_qbytes_tensor.py
function test_pack_unpack (line 29) | def test_pack_unpack(in_features: int, out_features: int):
FILE: tests/tensor/weights/optimized/test_tinygemm_packed_tensor.py
function test_pack_tinygemm_tensor (line 29) | def test_pack_tinygemm_tensor(in_features, out_features, random, device):
function test_move_tinygemm_packed_tensor (line 53) | def test_move_tinygemm_packed_tensor(device):
FILE: tests/tensor/weights/optimized/test_tinygemm_weight_qbits_tensor.py
function test_tinygemm_weight_qbits_tensor_from_qbits_tensor (line 28) | def test_tinygemm_weight_qbits_tensor_from_qbits_tensor(in_features, out...
function test_tinygemm_weight_qbits_tensor_move (line 71) | def test_tinygemm_weight_qbits_tensor_move(device):
function test_tinygemm_weight_qbits_tensor_linear (line 101) | def test_tinygemm_weight_qbits_tensor_linear(batch_size, tokens, embeddi...
FILE: tests/tensor/weights/test_weight_qbits_tensor.py
function test_weight_qbits_tensor_serialization (line 26) | def test_weight_qbits_tensor_serialization(qtype, axis):
function test_weight_qbits_tensor_requires_grad (line 43) | def test_weight_qbits_tensor_requires_grad(qtype, axis, group_size, devi...
function test_weight_qbits_tensor_backward (line 54) | def test_weight_qbits_tensor_backward(qtype, axis, group_size, device):
FILE: tests/tensor/weights/test_weight_qbits_tensor_dispatch.py
function test_qbitstensor_to_device (line 25) | def test_qbitstensor_to_device(dtype, group_size, device):
function test_qbitstensor_detach (line 45) | def test_qbitstensor_detach():
function test_qbitstensor_equal (line 54) | def test_qbitstensor_equal(dtype, qtype, axis, device):
function test_weight_qbits_tensor_linear (line 68) | def test_weight_qbits_tensor_linear(dtype, batch_size, tokens, in_featur...
function test_weight_qbits_tensor_linear_gpu (line 82) | def test_weight_qbits_tensor_linear_gpu(dtype, batch_size, tokens, in_fe...
FILE: tests/tensor/weights/test_weight_qbits_tensor_instantiate.py
function random_data_scale_shift (line 23) | def random_data_scale_shift(input_shape, dtype, qtype, axis, group_size):
function test_weight_qbits_tensor_instantiate (line 40) | def test_weight_qbits_tensor_instantiate(input_shape, dtype, qtype, axis...
function test_weight_qbits_tensor_equal (line 56) | def test_weight_qbits_tensor_equal(input_shape, dtype, qtype, axis, grou...
FILE: tests/tensor/weights/test_weight_qbits_tensor_quantize.py
function test_weight_qbits_tensor_quantize (line 33) | def test_weight_qbits_tensor_quantize(input_shape, dtype, qtype, axis, g...
function test_weight_qbits_tensor_quantize_integer_tensor (line 58) | def test_weight_qbits_tensor_quantize_integer_tensor(dtype, qtype, device):
FILE: tests/tensor/weights/test_weight_qbytes_tensor_backward.py
function test_weight_qbytes_tensor_requires_grad (line 22) | def test_weight_qbytes_tensor_requires_grad(device):
function test_weight_qbytes_tensor_backward (line 30) | def test_weight_qbytes_tensor_backward(device):
function test_weight_qbytes_tensor_chained_backward (line 41) | def test_weight_qbytes_tensor_chained_backward(device):
FILE: tests/tensor/weights/test_weight_qbytes_tensor_dispatch.py
function test_weight_qytes_tensor_to_device (line 8) | def test_weight_qytes_tensor_to_device(device):
function test_weight_qbytes_tensor_equal (line 20) | def test_weight_qbytes_tensor_equal(dtype, qtype, axis, device):
function test_weight_qbytes_tensor_transpose_contiguous (line 30) | def test_weight_qbytes_tensor_transpose_contiguous(axis, qtype, device):
function test_weight_qbytes_tensor_transposed_stride (line 43) | def test_weight_qbytes_tensor_transposed_stride(axis, qtype, device):
FILE: tests/tensor/weights/test_weight_qbytes_tensor_instantiate.py
function random_data_scale (line 22) | def random_data_scale(input_shape, dtype, qtype):
function test_qbytestensor_instantiate (line 37) | def test_qbytestensor_instantiate(input_shape, dtype, qtype, device):
function test_qbytestensor_equal (line 53) | def test_qbytestensor_equal(input_shape, dtype, qtype, device):
FILE: tests/tensor/weights/test_weight_qbytes_tensor_quantize.py
function test_symmetric_quantize_int (line 38) | def test_symmetric_quantize_int(input_shape, dtype, qtype, axis, device):
function test_symmetric_quantize_float8 (line 62) | def test_symmetric_quantize_float8(input_shape, dtype, qtype, axis, devi...
function test_quantize_weight_axis_dim_1 (line 74) | def test_quantize_weight_axis_dim_1(axis, device):
FILE: tests/tensor/weights/test_weight_qbytes_tensor_serialization.py
function test_weights_qbytes_tensor_serialization (line 28) | def test_weights_qbytes_tensor_serialization(input_shape, qtype, dtype, ...
FILE: tests/tensor/weights/weight_helpers.py
function check_weight_qtensor_linear (line 19) | def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, r...
Condensed preview — 207 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (827K chars).
[
{
"path": ".github/CODEOWNERS",
"chars": 20,
"preview": "* @dacorvo @sunmarc\n"
},
{
"path": ".github/PULL_REQUEST_TEMPLATE.md",
"chars": 1525,
"preview": "# What does this PR do?\n\n<!--\nCongratulations! You've made it this far! You're not quite done yet though.\n\nOnce merged, "
},
{
"path": ".github/workflows/check-commits.yml",
"chars": 362,
"preview": "name: Check Commits\n\non: [workflow_call]\n\njobs:\n build:\n name: Check commits\n runs-on: ubuntu-latest\n steps:\n "
},
{
"path": ".github/workflows/linux-cpu-tests.yml",
"chars": 2207,
"preview": "name: Linux CPU tests\n\non:\n push:\n branches:\n - main\n paths:\n - \"optimum/quanto/**\"\n - \"tests/**\"\n"
},
{
"path": ".github/workflows/linux-cuda-tests.yml",
"chars": 1375,
"preview": "name: Linux CUDA tests\n\non:\n push:\n branches:\n - main\n paths:\n - \"optimum/quanto/**\"\n - \"tests/**\""
},
{
"path": ".github/workflows/linux-examples.yml",
"chars": 2355,
"preview": "name: Linux examples (CPU, CUDA)\n\non:\n push:\n branches:\n - main\n paths:\n - \"optimum/quanto/**\"\n - "
},
{
"path": ".github/workflows/python-quality.yml",
"chars": 501,
"preview": "name: Python code quality\n\non: [workflow_call]\n\njobs:\n check_code_quality:\n runs-on: ubuntu-latest\n\n steps:\n "
},
{
"path": ".github/workflows/security.yml",
"chars": 1036,
"preview": "name: Security Checks\n\non:\n push:\n\npermissions:\n contents: read\n\njobs:\n secrets:\n runs-on: ubuntu-latest\n steps"
},
{
"path": ".github/workflows/stale.yml",
"chars": 941,
"preview": "name: 'Close stale issues and PRs'\non:\n schedule:\n - cron: '30 1 * * *'\n workflow_dispatch:\n\npermissions:\n issues:"
},
{
"path": ".gitignore",
"chars": 54,
"preview": "__pycache__\n.pytest_cache\n*.egg-info\ndist\n.venv\nbuild/"
},
{
"path": "CONTRIBUTING.md",
"chars": 10213,
"preview": "<!---\nCopyright 2024 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"Li"
},
{
"path": "LICENSE",
"chars": 11419,
"preview": "Copyright 2023 - The Hugging Face team. All rights reserved.\n\n Apache License\n "
},
{
"path": "Makefile",
"chars": 252,
"preview": ".PHONY: check test style\n\ncheck_dirs := optimum tests bench examples\n\ncheck:\n\truff check --show-fixes ${check_dirs}\n\truf"
},
{
"path": "README.md",
"chars": 12178,
"preview": "# Optimum Quanto\n\n> This project is currently in maintenance mode. We accept pull requests only for minor bug fixes, doc"
},
{
"path": "bench/generation/README.md",
"chars": 3403,
"preview": "# Quanto generation benchmark\n\nThis repository contains scripts to evaluate the performances of quantized models using t"
},
{
"path": "bench/generation/evaluate_configurations.py",
"chars": 4421,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/evaluate_many_models.sh",
"chars": 658,
"preview": "#!/bin/bash\n# Absolute path to this script, e.g. /home/user/bin/foo.sh\nSCRIPT=$(readlink -f \"$0\")\n# Absolute path this s"
},
{
"path": "bench/generation/evaluate_model.py",
"chars": 5134,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/gen_barchart.py",
"chars": 2983,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/metrics/__init__.py",
"chars": 606,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/metrics/latency.py",
"chars": 4189,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/metrics/perplexity.py",
"chars": 7439,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/metrics/prediction.py",
"chars": 1696,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/setup/__init__.py",
"chars": 606,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/setup/awq.py",
"chars": 4238,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/setup/bnb.py",
"chars": 1676,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/setup/hqq.py",
"chars": 1654,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/generation/setup/quanto.py",
"chars": 2642,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/kernels/benchmark.py",
"chars": 4134,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/kernels/benchmark_marlin_fp8.py",
"chars": 5215,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/kernels/benchmark_w4a16.py",
"chars": 5850,
"preview": "# From: https://github.com/IST-DASLab/marlin/blob/master/bench.py\nimport argparse\nimport time\n\nimport torch\n\nfrom optimu"
},
{
"path": "bench/torch_kernels/README.md",
"chars": 88,
"preview": "This contains a few scripts to test pytorch kernels that are relevant for quantization.\n"
},
{
"path": "bench/torch_kernels/test_int_mm.py",
"chars": 2700,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/torch_kernels/test_int_mm_inductor.py",
"chars": 1111,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/torch_kernels/test_weight_int4pack_mm.py",
"chars": 4939,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "bench/torch_kernels/test_weight_int8pack_mm.py",
"chars": 2506,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "examples/nlp/text-classification/sst2/quantize_sst2_model.py",
"chars": 4158,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "examples/nlp/text-generation/quantize_causal_lm_model.py",
"chars": 5479,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "examples/speech/speech_recognition/quantize_asr_model.py",
"chars": 5079,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "examples/speech/speech_recognition/requirements.txt",
"chars": 46,
"preview": "transformers\nevaluate\nlibrosa\nsoundfile\njiwer\n"
},
{
"path": "examples/vision/StableDiffusion/README.md",
"chars": 1566,
"preview": "# Quantize Stable Diffusion examples\n\n## Running locally with PyTorch\n\n### Installing the dependencies\n\nBefore running t"
},
{
"path": "examples/vision/StableDiffusion/quantize_StableDiffusion.py",
"chars": 3910,
"preview": "import argparse\nimport gc\n\nimport torch\nimport torch.utils.benchmark as benchmark\nfrom diffusers import DiffusionPipelin"
},
{
"path": "examples/vision/StableDiffusion/requirements.txt",
"chars": 52,
"preview": "quanto\ndiffusers\ntorch\ntransformers\naccelerate\nwandb"
},
{
"path": "examples/vision/image-classification/mnist/quantize_mnist_model.py",
"chars": 6640,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "examples/vision/image-classification/pets/quantize_vit_model.py",
"chars": 4622,
"preview": "import argparse\nimport time\nfrom tempfile import NamedTemporaryFile\n\nimport torch\nimport torch.nn.functional as F\nfrom a"
},
{
"path": "examples/vision/object-detection/quantize_owl_model.py",
"chars": 4464,
"preview": "import argparse\nimport gc\n\nimport numpy as np\nimport requests\nimport torch\nfrom PIL import Image\nfrom transformers impor"
},
{
"path": "examples/vision/text-to-image/quantize_pixart_sigma.py",
"chars": 2940,
"preview": "import argparse\nimport gc\n\nimport torch\nfrom diffusers import DiffusionPipeline\n\nfrom optimum.quanto import freeze, qflo"
},
{
"path": "external/awq/conftest.py",
"chars": 1512,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "external/awq/pack_intweight.py",
"chars": 2711,
"preview": "# MIT License\n#\n# Copyright (c) 2023 MIT HAN Lab\n#\n# Permission is hereby granted, free of charge, to any person obtaini"
},
{
"path": "external/awq/packing_utils.py",
"chars": 3443,
"preview": "import torch\n\n\nAWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]\nAWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]\n\n\ndef pack_awq(intweigh"
},
{
"path": "external/awq/test_awq_kernels.py",
"chars": 8278,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "external/awq/test_awq_packing.py",
"chars": 3499,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "external/awq/test_awq_quantize.py",
"chars": 2401,
"preview": "import pytest\nimport torch\n\nfrom optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup\n\n\ndef awq_quantize("
},
{
"path": "external/smoothquant/README.md",
"chars": 662,
"preview": "# SmoothQuant original conversion script\n\nThis converts an OPT or Bloom [🤗 transformers](https://github.com/huggingface/"
},
{
"path": "external/smoothquant/smoothquant.py",
"chars": 5678,
"preview": "import argparse\nimport functools\nimport os\n\nimport torch\nimport torch.nn as nn\nfrom datasets import load_dataset\nfrom tq"
},
{
"path": "optimum/quanto/__init__.py",
"chars": 767,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/calibrate.py",
"chars": 8353,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/README.md",
"chars": 353,
"preview": "# Quanto operations library\n\nThis contains the `quanto::` operations, available in python under `torch.ops.quanto`.\n\nTo "
},
{
"path": "optimum/quanto/library/__init__.py",
"chars": 704,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/extensions/README.md",
"chars": 1231,
"preview": "# Quanto library extensions\n\nThis folder contains device-specific `quanto::` operations.\n\nImplementations can be provide"
},
{
"path": "optimum/quanto/library/extensions/__init__.py",
"chars": 1281,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/extensions/cpp/README.md",
"chars": 419,
"preview": "# Quanto generic C++ extension\n\nKernels in this extension must use only the C++ syntax.\n\nThey can use any pytorch operat"
},
{
"path": "optimum/quanto/library/extensions/cpp/__init__.py",
"chars": 1007,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/extensions/cpp/pybind_module.cpp",
"chars": 1175,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/cpp/unpack.cpp",
"chars": 1535,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/cpp/unpack.h",
"chars": 700,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/cuda/README.md",
"chars": 437,
"preview": "# Quanto generic CUDA extension\n\nKernels in this extension can use both the C++ and CUDA syntax.\n\nThey can use any pytor"
},
{
"path": "optimum/quanto/library/extensions/cuda/__init__.py",
"chars": 6156,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/extensions/cuda/awq/dequantize.cuh",
"chars": 3931,
"preview": "/*\nModified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/c"
},
{
"path": "optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.cu",
"chars": 50002,
"preview": "#include <cuda_fp16.h>\n#include \"semaphore.h\"\n#include \"gemm_cuda.h\"\n#include \"../dequantize.cuh\"\n#include <torch/extens"
},
{
"path": "optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.h",
"chars": 156,
"preview": "#include <torch/extension.h>\n\ntorch::Tensor awq_v2_gemm_f16i4(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Ten"
},
{
"path": "optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.cu",
"chars": 11626,
"preview": "/*\n * Modified from NVIDIA [TRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/d37b507f41a87457fe9f10f7459d08f5db23574"
},
{
"path": "optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.h",
"chars": 248,
"preview": "#pragma once\n#include <torch/extension.h>\n\ntorch::Tensor awq_v2_gemv_f16i4(\n torch::Tensor _in_feats,\n torch::Tens"
},
{
"path": "optimum/quanto/library/extensions/cuda/awq/v2/semaphore.h",
"chars": 3886,
"preview": "/***************************************************************************************************\n * Copyright (c) 20"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/COPYRIGHT",
"chars": 751,
"preview": "These kernels were vendored from VLLM. The Marlin kernels were developed\nby Elias Frantar and extended by Neural Magic.\n"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cu",
"chars": 51215,
"preview": "/*\n * Modified by Neural Magic\n * Copyright (C) Marlin.2024 Elias Frantar\n *\n * Licensed under the Apache License, Versi"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cuh",
"chars": 495,
"preview": "// #pragma once\n#include <torch/all.h>\n#include <stdint.h>\n\n\n// #ifndef _fp8_marlin_cuh\n// #define _fp8_marlin_cuh\n\n// #"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin.cuh",
"chars": 2051,
"preview": "#pragma once\n\n#include <torch/all.h>\n\n#include <ATen/cuda/CUDAContext.h>\n#include <c10/cuda/CUDAGuard.h>\n#include <cuda."
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_dtypes.cuh",
"chars": 1933,
"preview": "\n#ifndef _data_types_cuh\n#define _data_types_cuh\n#include \"gptq_marlin.cuh\"\n#include <cuda_fp16.h>\n#include <cuda_bf16.h"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cu",
"chars": 11605,
"preview": "#include \"gptq_marlin.cuh\"\n\nnamespace gptq_marlin {\n\nstatic constexpr int repack_stages = 8;\n\nstatic constexpr int repac"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cuh",
"chars": 342,
"preview": "#include <torch/library.h>\n#include <torch/all.h>\n#include <stdint.h>\n\n#ifndef _gptq_marlin_repack_cuh\n#define _gptq_mar"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp",
"chars": 2270,
"preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.h",
"chars": 922,
"preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu",
"chars": 35383,
"preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
},
{
"path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cuh",
"chars": 1005,
"preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
},
{
"path": "optimum/quanto/library/extensions/cuda/pybind_module.cpp",
"chars": 1734,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/cuda/unpack.cu",
"chars": 2937,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/cuda/unpack.h",
"chars": 700,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/extension.py",
"chars": 2671,
"preview": "import os\nimport shutil\nimport warnings\nfrom typing import List\n\nimport torch\nfrom torch.utils.cpp_extension import load"
},
{
"path": "optimum/quanto/library/extensions/hip/__init__.py",
"chars": 1014,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/extensions/hip/pybind_module.cpp",
"chars": 754,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/hip/unpack.cu",
"chars": 2937,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/hip/unpack.h",
"chars": 700,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/mps/README.md",
"chars": 379,
"preview": "# Quanto Metal Performance Shaders extension\n\nTo add a new implementation for an operation defined in `library./ops.py`:"
},
{
"path": "optimum/quanto/library/extensions/mps/__init__.py",
"chars": 1011,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/extensions/mps/pybind_module.cpp",
"chars": 754,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/mps/unpack.h",
"chars": 710,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/mps/unpack.mm",
"chars": 6396,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/xpu/__init__.py",
"chars": 2452,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n# Copyright 2024 Intel Corporation. All rights reserved.\n#\n#"
},
{
"path": "optimum/quanto/library/extensions/xpu/pybind_module.cpp",
"chars": 1230,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/xpu/unpack.h",
"chars": 700,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
},
{
"path": "optimum/quanto/library/extensions/xpu/unpack.sycl",
"chars": 4848,
"preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n// Copyright 2024 Intel Corporation. All rights reserved.\n/"
},
{
"path": "optimum/quanto/library/qbytes_mm.py",
"chars": 5306,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/quantize.py",
"chars": 3047,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/library/unpack.py",
"chars": 1887,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/models/__init__.py",
"chars": 1070,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/models/diffusers_models.py",
"chars": 8033,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/models/shared_dict.py",
"chars": 1761,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/models/transformers_models.py",
"chars": 8275,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/nn/__init__.py",
"chars": 702,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/nn/qconv2d.py",
"chars": 1766,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/nn/qlayernorm.py",
"chars": 1797,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/nn/qlinear.py",
"chars": 1550,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/nn/qmodule.py",
"chars": 12393,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/quantize.py",
"chars": 6438,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/subpackage/__init__.py",
"chars": 631,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/subpackage/commands/__init__.py",
"chars": 627,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/subpackage/commands/base.py",
"chars": 1152,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/subpackage/commands/quantize.py",
"chars": 4258,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/__init__.py",
"chars": 813,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/activations/__init__.py",
"chars": 50,
"preview": "from .qbytes import *\nfrom .quantization import *\n"
},
{
"path": "optimum/quanto/tensor/activations/qbytes.py",
"chars": 3537,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/activations/qbytes_ops.py",
"chars": 10495,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/activations/quantization.py",
"chars": 1352,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/core.py",
"chars": 928,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/function.py",
"chars": 2428,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/grouped.py",
"chars": 2183,
"preview": "import math\nfrom typing import List\n\nimport torch\n\n\n__all__ = [\"group\", \"ungroup\", \"grouped_shape\"]\n\n\ndef grouped_shape("
},
{
"path": "optimum/quanto/tensor/optimizers/__init__.py",
"chars": 789,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/optimizers/absmax_optimizer.py",
"chars": 1279,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/optimizers/affine_optimizer.py",
"chars": 2554,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/optimizers/hqq_optimizer.py",
"chars": 3202,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/optimizers/max_optimizer.py",
"chars": 1312,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/optimizers/optimizer.py",
"chars": 939,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/optimizers/symmetric_optimizer.py",
"chars": 1344,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/packed.py",
"chars": 6236,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/qbits.py",
"chars": 2190,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/qbytes.py",
"chars": 1559,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/qtensor.py",
"chars": 3251,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/qtype.py",
"chars": 1931,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/__init__.py",
"chars": 71,
"preview": "from .qbits import *\nfrom .qbytes import *\nfrom .quantization import *\n"
},
{
"path": "optimum/quanto/tensor/weights/awq/__init__.py",
"chars": 43,
"preview": "from .packed import *\nfrom .qbits import *\n"
},
{
"path": "optimum/quanto/tensor/weights/awq/packed.py",
"chars": 11394,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/awq/qbits.py",
"chars": 6998,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/marlin/__init__.py",
"chars": 67,
"preview": "from .fp8 import *\nfrom .int4 import *\nfrom .permutations import *\n"
},
{
"path": "optimum/quanto/tensor/weights/marlin/fp8/__init__.py",
"chars": 43,
"preview": "from .packed import *\nfrom .qbits import *\n"
},
{
"path": "optimum/quanto/tensor/weights/marlin/fp8/packed.py",
"chars": 9220,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/marlin/fp8/qbits.py",
"chars": 7032,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/marlin/int4/__init__.py",
"chars": 43,
"preview": "from .packed import *\nfrom .qbits import *\n"
},
{
"path": "optimum/quanto/tensor/weights/marlin/int4/packed.py",
"chars": 6050,
"preview": "import ast\nfrom copy import copy\n\nimport numpy as np\nimport torch\nfrom torch.utils import _pytree as pytree\n\nfrom ...pac"
},
{
"path": "optimum/quanto/tensor/weights/marlin/int4/qbits.py",
"chars": 6847,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/marlin/permutations.py",
"chars": 1668,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/packing.py",
"chars": 1502,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/qbits.py",
"chars": 13009,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/qbytes.py",
"chars": 13089,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/quantization.py",
"chars": 3076,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/reordering.py",
"chars": 1791,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/tinygemm/__init__.py",
"chars": 43,
"preview": "from .packed import *\nfrom .qbits import *\n"
},
{
"path": "optimum/quanto/tensor/weights/tinygemm/packed.py",
"chars": 6316,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "optimum/quanto/tensor/weights/tinygemm/qbits.py",
"chars": 7459,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "pyproject.toml",
"chars": 1997,
"preview": "[project]\nname = 'optimum-quanto'\ndescription = 'A pytorch quantization backend for optimum.'\nclassifiers = [\n 'Devel"
},
{
"path": "setup.sh",
"chars": 443,
"preview": "#!/bin/bash\n\nNIGHTLY=${1:-0}\nVENV=\".venv\"\nif [ ! -d \"${VENV}\" ]; then\n python3 -m venv ${VENV}\nfi\n. ${VENV}/bin/activ"
},
{
"path": "tests/cli/cli_helpers.py",
"chars": 777,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/cli/test_quantize_cli.py",
"chars": 1619,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/conftest.py",
"chars": 1566,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/helpers.py",
"chars": 4187,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/library/test_extensions.py",
"chars": 1146,
"preview": "import platform\n\nimport pytest\nimport torch\nfrom packaging import version\n\nfrom optimum.quanto.library.extensions import"
},
{
"path": "tests/library/test_mm.py",
"chars": 9560,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/library/test_quantize.py",
"chars": 4715,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/library/test_unpack.py",
"chars": 1134,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/models/conftest.py",
"chars": 845,
"preview": "import pytest\nfrom huggingface_hub.constants import _staging_mode\n\n\n@pytest.fixture\ndef staging():\n \"\"\"A pytest fixtu"
},
{
"path": "tests/models/test_quantized_model_for_causal_lm.py",
"chars": 5870,
"preview": "import uuid\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\nfrom huggingface_hub import delete_repo\n"
},
{
"path": "tests/models/test_quantized_model_for_pixart.py",
"chars": 4200,
"preview": "import uuid\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\nfrom huggingface_hub import delete_repo\n"
},
{
"path": "tests/nn/test_calibrate.py",
"chars": 4200,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/nn/test_qattention.py",
"chars": 9521,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/nn/test_qconv2d.py",
"chars": 7173,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/nn/test_qlayernorm.py",
"chars": 4134,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/nn/test_qlinear.py",
"chars": 11119,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/nn/test_qmodule.py",
"chars": 2227,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/quantize/test_quantize_mlp.py",
"chars": 5842,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/quantize/test_quantize_patterns.py",
"chars": 4065,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/quantize/test_requantize.py",
"chars": 4380,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/activations/test_activations_compile.py",
"chars": 2424,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/activations/test_activations_dispatch.py",
"chars": 3928,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/activations/test_activations_quantize.py",
"chars": 2274,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/ops/test_linear_dispatch.py",
"chars": 3748,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/ops/test_mm_dispatch.py",
"chars": 2119,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/optimizers/test_hqq_optimizer.py",
"chars": 2206,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/test_absmax.py",
"chars": 1723,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/test_packed_tensor.py",
"chars": 2005,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_awq_packed_tensor.py",
"chars": 2658,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_awq_weight_qbits_tensor.py",
"chars": 5034,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_marlin_fp8_packed_tensor.py",
"chars": 2547,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_marlin_int4_packed_tensor.py",
"chars": 2266,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py",
"chars": 6070,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_marlin_qbytes_tensor.py",
"chars": 1757,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_tinygemm_packed_tensor.py",
"chars": 2819,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/optimized/test_tinygemm_weight_qbits_tensor.py",
"chars": 5499,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/test_weight_qbits_tensor.py",
"chars": 2911,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/test_weight_qbits_tensor_dispatch.py",
"chars": 4444,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
},
{
"path": "tests/tensor/weights/test_weight_qbits_tensor_instantiate.py",
"chars": 3106,
"preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
}
]
// ... and 7 more files (download for full content)
About this extraction
This page contains the full source code of the huggingface/quanto GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 207 files (766.4 KB), approximately 208.4k tokens, and a symbol index with 641 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.