Full Code of huggingface/quanto for AI

main ef3aafb30e6b cached

207 files

766.4 KB

208.4k tokens

641 symbols

1 requests

Download .txt

Showing preview only (827K chars total). Download the full file or copy to clipboard to get everything.

Repository: huggingface/quanto
Branch: main
Commit: ef3aafb30e6b
Files: 207
Total size: 766.4 KB

Directory structure:
gitextract_e7pf933s/

├── .github/
│   ├── CODEOWNERS
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── check-commits.yml
│       ├── linux-cpu-tests.yml
│       ├── linux-cuda-tests.yml
│       ├── linux-examples.yml
│       ├── python-quality.yml
│       ├── security.yml
│       └── stale.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── bench/
│   ├── generation/
│   │   ├── README.md
│   │   ├── evaluate_configurations.py
│   │   ├── evaluate_many_models.sh
│   │   ├── evaluate_model.py
│   │   ├── gen_barchart.py
│   │   ├── metrics/
│   │   │   ├── __init__.py
│   │   │   ├── latency.py
│   │   │   ├── perplexity.py
│   │   │   └── prediction.py
│   │   └── setup/
│   │       ├── __init__.py
│   │       ├── awq.py
│   │       ├── bnb.py
│   │       ├── hqq.py
│   │       └── quanto.py
│   ├── kernels/
│   │   ├── benchmark.py
│   │   ├── benchmark_marlin_fp8.py
│   │   └── benchmark_w4a16.py
│   └── torch_kernels/
│       ├── README.md
│       ├── test_int_mm.py
│       ├── test_int_mm_inductor.py
│       ├── test_weight_int4pack_mm.py
│       └── test_weight_int8pack_mm.py
├── examples/
│   ├── nlp/
│   │   ├── text-classification/
│   │   │   └── sst2/
│   │   │       └── quantize_sst2_model.py
│   │   └── text-generation/
│   │       └── quantize_causal_lm_model.py
│   ├── speech/
│   │   └── speech_recognition/
│   │       ├── quantize_asr_model.py
│   │       └── requirements.txt
│   └── vision/
│       ├── StableDiffusion/
│       │   ├── README.md
│       │   ├── quantize_StableDiffusion.py
│       │   └── requirements.txt
│       ├── image-classification/
│       │   ├── mnist/
│       │   │   └── quantize_mnist_model.py
│       │   └── pets/
│       │       └── quantize_vit_model.py
│       ├── object-detection/
│       │   └── quantize_owl_model.py
│       └── text-to-image/
│           └── quantize_pixart_sigma.py
├── external/
│   ├── awq/
│   │   ├── conftest.py
│   │   ├── pack_intweight.py
│   │   ├── packing_utils.py
│   │   ├── test_awq_kernels.py
│   │   ├── test_awq_packing.py
│   │   └── test_awq_quantize.py
│   └── smoothquant/
│       ├── README.md
│       └── smoothquant.py
├── optimum/
│   └── quanto/
│       ├── __init__.py
│       ├── calibrate.py
│       ├── library/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── extensions/
│       │   │   ├── README.md
│       │   │   ├── __init__.py
│       │   │   ├── cpp/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.cpp
│       │   │   │   └── unpack.h
│       │   │   ├── cuda/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── awq/
│       │   │   │   │   ├── dequantize.cuh
│       │   │   │   │   └── v2/
│       │   │   │   │       ├── gemm_cuda.cu
│       │   │   │   │       ├── gemm_cuda.h
│       │   │   │   │       ├── gemv_cuda.cu
│       │   │   │   │       ├── gemv_cuda.h
│       │   │   │   │       └── semaphore.h
│       │   │   │   ├── marlin/
│       │   │   │   │   ├── COPYRIGHT
│       │   │   │   │   ├── fp8_marlin.cu
│       │   │   │   │   ├── fp8_marlin.cuh
│       │   │   │   │   ├── gptq_marlin.cuh
│       │   │   │   │   ├── gptq_marlin_dtypes.cuh
│       │   │   │   │   ├── gptq_marlin_repack.cu
│       │   │   │   │   ├── gptq_marlin_repack.cuh
│       │   │   │   │   ├── marlin_cuda.cpp
│       │   │   │   │   ├── marlin_cuda.h
│       │   │   │   │   ├── marlin_cuda_kernel.cu
│       │   │   │   │   └── marlin_cuda_kernel.cuh
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.cu
│       │   │   │   └── unpack.h
│       │   │   ├── extension.py
│       │   │   ├── hip/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.cu
│       │   │   │   └── unpack.h
│       │   │   ├── mps/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.h
│       │   │   │   └── unpack.mm
│       │   │   └── xpu/
│       │   │       ├── __init__.py
│       │   │       ├── pybind_module.cpp
│       │   │       ├── unpack.h
│       │   │       └── unpack.sycl
│       │   ├── qbytes_mm.py
│       │   ├── quantize.py
│       │   └── unpack.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── diffusers_models.py
│       │   ├── shared_dict.py
│       │   └── transformers_models.py
│       ├── nn/
│       │   ├── __init__.py
│       │   ├── qconv2d.py
│       │   ├── qlayernorm.py
│       │   ├── qlinear.py
│       │   └── qmodule.py
│       ├── quantize.py
│       ├── subpackage/
│       │   ├── __init__.py
│       │   └── commands/
│       │       ├── __init__.py
│       │       ├── base.py
│       │       └── quantize.py
│       └── tensor/
│           ├── __init__.py
│           ├── activations/
│           │   ├── __init__.py
│           │   ├── qbytes.py
│           │   ├── qbytes_ops.py
│           │   └── quantization.py
│           ├── core.py
│           ├── function.py
│           ├── grouped.py
│           ├── optimizers/
│           │   ├── __init__.py
│           │   ├── absmax_optimizer.py
│           │   ├── affine_optimizer.py
│           │   ├── hqq_optimizer.py
│           │   ├── max_optimizer.py
│           │   ├── optimizer.py
│           │   └── symmetric_optimizer.py
│           ├── packed.py
│           ├── qbits.py
│           ├── qbytes.py
│           ├── qtensor.py
│           ├── qtype.py
│           └── weights/
│               ├── __init__.py
│               ├── awq/
│               │   ├── __init__.py
│               │   ├── packed.py
│               │   └── qbits.py
│               ├── marlin/
│               │   ├── __init__.py
│               │   ├── fp8/
│               │   │   ├── __init__.py
│               │   │   ├── packed.py
│               │   │   └── qbits.py
│               │   ├── int4/
│               │   │   ├── __init__.py
│               │   │   ├── packed.py
│               │   │   └── qbits.py
│               │   └── permutations.py
│               ├── packing.py
│               ├── qbits.py
│               ├── qbytes.py
│               ├── quantization.py
│               ├── reordering.py
│               └── tinygemm/
│                   ├── __init__.py
│                   ├── packed.py
│                   └── qbits.py
├── pyproject.toml
├── setup.sh
└── tests/
    ├── cli/
    │   ├── cli_helpers.py
    │   └── test_quantize_cli.py
    ├── conftest.py
    ├── helpers.py
    ├── library/
    │   ├── test_extensions.py
    │   ├── test_mm.py
    │   ├── test_quantize.py
    │   └── test_unpack.py
    ├── models/
    │   ├── conftest.py
    │   ├── test_quantized_model_for_causal_lm.py
    │   └── test_quantized_model_for_pixart.py
    ├── nn/
    │   ├── test_calibrate.py
    │   ├── test_qattention.py
    │   ├── test_qconv2d.py
    │   ├── test_qlayernorm.py
    │   ├── test_qlinear.py
    │   └── test_qmodule.py
    ├── quantize/
    │   ├── test_quantize_mlp.py
    │   ├── test_quantize_patterns.py
    │   └── test_requantize.py
    └── tensor/
        ├── activations/
        │   ├── test_activations_compile.py
        │   ├── test_activations_dispatch.py
        │   └── test_activations_quantize.py
        ├── ops/
        │   ├── test_linear_dispatch.py
        │   └── test_mm_dispatch.py
        ├── optimizers/
        │   └── test_hqq_optimizer.py
        ├── test_absmax.py
        ├── test_packed_tensor.py
        └── weights/
            ├── optimized/
            │   ├── test_awq_packed_tensor.py
            │   ├── test_awq_weight_qbits_tensor.py
            │   ├── test_marlin_fp8_packed_tensor.py
            │   ├── test_marlin_int4_packed_tensor.py
            │   ├── test_marlin_int4_weight_qbits_tensor.py
            │   ├── test_marlin_qbytes_tensor.py
            │   ├── test_tinygemm_packed_tensor.py
            │   └── test_tinygemm_weight_qbits_tensor.py
            ├── test_weight_qbits_tensor.py
            ├── test_weight_qbits_tensor_dispatch.py
            ├── test_weight_qbits_tensor_instantiate.py
            ├── test_weight_qbits_tensor_quantize.py
            ├── test_weight_qbytes_tensor_backward.py
            ├── test_weight_qbytes_tensor_dispatch.py
            ├── test_weight_qbytes_tensor_instantiate.py
            ├── test_weight_qbytes_tensor_quantize.py
            ├── test_weight_qbytes_tensor_serialization.py
            └── weight_helpers.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/CODEOWNERS
================================================
# The `*` pattern matches every path, so these users own the whole repository.
* @dacorvo @sunmarc


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
# What does this PR do?

<!--
Congratulations! You've made it this far! You're not quite done yet though.

Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.

Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.

Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
-->

<!-- Remove if not applicable -->

Fixes # (issue)


## Before submitting
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md#create-a-pull-request),
      Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link
      to it if that's the case.
- [ ] Did you run all tests locally and make sure they pass?
- [ ] Did you write any new necessary tests?


## Who can review?

Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors who may be interested in your PR.


================================================
FILE: .github/workflows/check-commits.yml
================================================
# Reusable workflow (triggered only through `workflow_call`) that runs the
# huggingface/action-check-commits action against the checked-out ref.
name: Check Commits

on:
  workflow_call:

jobs:
  build:
    name: Check commits
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      # Judging by the input names, this caps the commit count, requires a
      # minimum number of words per message and rejects "fixup" commits.
      # NOTE(review): confirm exact semantics against the action's README.
      - uses: huggingface/action-check-commits@v1.0.0
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          max-commits: '10'
          min-words: '3'
          forbidden-words: 'fixup'


================================================
FILE: .github/workflows/linux-cpu-tests.yml
================================================
# CPU test workflow: triggered by pushes to main and by pull requests that
# touch the library sources, the tests, or pyproject.toml.
name: Linux CPU tests

on:
  push:
    branches: [main]
    paths: ["optimum/quanto/**", "tests/**", "pyproject.toml"]
  pull_request:
    types:
      - assigned
      - opened
      - synchronize
      - reopened
    paths: ["optimum/quanto/**", "tests/**", "pyproject.toml"]

jobs:
  # Gate jobs: both reusable workflows must succeed before any test job starts.
  check-commits:
    uses: ./.github/workflows/check-commits.yml
  python-quality:
    uses: ./.github/workflows/python-quality.yml

  # Base, models and CLI test suites, once per Python version in the matrix.
  test-ubuntu-cpu:
    needs:
      - check-commits
      - python-quality
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version:
          - "3.9"
          - "3.11"

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e  # v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build and install quanto
        run: |
          pip install --upgrade pip
          pip install -e .[dev]

      # tests/models and tests/cli need extra packages, so they run in their
      # own steps after those packages are installed.
      - name: Run base tests
        run: |
          python -m pytest tests --ignore=tests/models --ignore=tests/cli

      - name: Run models tests
        run: |
          pip install accelerate transformers diffusers
          python -m pytest tests/models

      - name: Run CLI tests
        run: |
          pip install optimum
          python -m pytest tests/cli

  # Runs only the model tests selected by `-k "hub"`, with
  # HUGGINGFACE_CO_STAGING=true set in the environment.
  run_staging_tests:
    needs:
      - check-commits
      - python-quality
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version:
          - "3.9"
          - "3.11"

    steps:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e  # v2
        with:
          python-version: ${{ matrix.python-version }}

      - name: Build and install quanto
        run: |
          pip install --upgrade pip
          pip install -e .[dev]

      - name: Run models hub tests
        run: |
          pip install accelerate transformers diffusers
          HUGGINGFACE_CO_STAGING=true python -m pytest tests/models -k "hub"


================================================
FILE: .github/workflows/linux-cuda-tests.yml
================================================
# CUDA test workflow: same triggers as the CPU workflow, but the tests run
# inside pytorch/pytorch devel containers, one per CUDA version in the matrix.
name: Linux CUDA tests

on:
  push:
    branches: [main]
    paths: ["optimum/quanto/**", "tests/**", "pyproject.toml"]
  pull_request:
    types:
      - assigned
      - opened
      - synchronize
      - reopened
    paths: ["optimum/quanto/**", "tests/**", "pyproject.toml"]

jobs:
  # Gate jobs: both reusable workflows must succeed before the test job starts.
  check-commits:
    uses: ./.github/workflows/check-commits.yml
  python-quality:
    uses: ./.github/workflows/python-quality.yml

  test-ubuntu-cuda:
    needs:
      - check-commits
      - python-quality
    runs-on:
      group: aws-g5-4xlarge-plus
    strategy:
      fail-fast: false
      matrix:
        cuda-version:
          - "11.8"
          - "12.4"
          - "12.6"
    container:
      # The matrix value selects the CUDA flavour of the PyTorch 2.6.0 image.
      image: pytorch/pytorch:2.6.0-cuda${{ matrix.cuda-version }}-cudnn9-devel
      options: --gpus 0

    steps:
      - uses: actions/checkout@v2
      # Prints the CUDA compiler version available in the container.
      - name: Check CUDA installation
        run: |
          nvcc -V

      - name: Build and install quanto
        run: |
          pip install --upgrade pip
          pip install -e .[dev]

      # tests/models and tests/cli need extra packages, so they run in their
      # own steps after those packages are installed.
      - name: Run base tests
        run: |
          python -m pytest tests --ignore=tests/models --ignore=tests/cli

      - name: Run models tests
        run: |
          pip install accelerate transformers diffusers
          python -m pytest tests/models

      - name: Run CLI tests
        run: |
          pip install optimum
          python -m pytest tests/cli


================================================
FILE: .github/workflows/linux-examples.yml
================================================
# Examples workflow: runs the example scripts on both devices of the matrix,
# whenever the library, the examples, or pyproject.toml change.
name: Linux examples (CPU, CUDA)

on:
  push:
    branches: [main]
    paths: ["optimum/quanto/**", "examples/**", "pyproject.toml"]
  pull_request:
    types:
      - assigned
      - opened
      - synchronize
      - reopened
    paths: ["optimum/quanto/**", "examples/**", "pyproject.toml"]

jobs:
  # Gate jobs: both reusable workflows must succeed before the examples run.
  check-commits:
    uses: ./.github/workflows/check-commits.yml
  python-quality:
    uses: ./.github/workflows/python-quality.yml

  run-examples:
    needs:
      - check-commits
      - python-quality
    runs-on:
      group: aws-g5-4xlarge-plus
    strategy:
      fail-fast: false
      matrix:
        device:
          - "cpu"
          - "cuda"
    container:
      image: pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel
      options: --gpus 0

    steps:
      - uses: actions/checkout@v2
      # Prints the CUDA compiler version available in the container.
      - name: Check CUDA installation
        run: |
          nvcc -V

      - name: Build and install packages
        run: |
          pip install --upgrade pip
          pip install -e .[examples]

      # Run examples
      # Each step sweeps the listed weight (and, where present, activation)
      # types on the device selected by the matrix.
      - name: Run MNIST classification example
        run: |
          for w in int4 int8 float8; do \
            for a in none int8 float8; do \
              python examples/vision/image-classification/mnist/quantize_mnist_model.py \
                --weights $w --activations $a --device ${{ matrix.device }}; \
            done; \
          done
      - name: Run OWL detection example
        run: |
          for w in int4 int8 float8; do \
            python examples/vision/object-detection/quantize_owl_model.py \
              --image http://images.cocodataset.org/val2017/000000039769.jpg \
              --texts "a photo of a cat" "a remote" \
              --weights $w --device ${{ matrix.device }}; \
          done
      - name: Run text-classification example
        run: |
          for w in int4 int8; do \
            for a in none int8; do \
              python examples/nlp/text-classification/sst2/quantize_sst2_model.py \
                --weights $w --activations $a --device ${{ matrix.device }}; \
            done; \
          done
      # This step is skipped for the cpu entry of the matrix.
      - name: Run text-to-image example
        if: ${{ matrix.device == 'cuda'}}
        run: |
          for w in int4 int8 fp8; do \
            python examples/vision/text-to-image/quantize_pixart_sigma.py \
              --qtype $w --device ${{ matrix.device }}; \
          done


================================================
FILE: .github/workflows/python-quality.yml
================================================
# Reusable workflow (triggered only through `workflow_call`) that checks
# formatting and lint rules with ruff.
name: Python code quality

on: [workflow_call]

jobs:
  check_code_quality:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML keeps the version a string: an unquoted 3.9 is a
          # float, and a future bump to 3.10 would silently become 3.1.
          python-version: "3.9"
      - name: Install dependencies
        run: |
          pip install --upgrade pip
          pip install .[dev]
      # `--diff` prints the formatting changes ruff would make.
      - run: ruff format bench examples optimum tests --diff
      - run: ruff check --show-fixes bench examples optimum tests


================================================
FILE: .github/workflows/security.yml
================================================
# Scans the pushed commits for leaked secrets with trufflehog.
name: Security Checks

on:
  push:

permissions:
  contents: read

jobs:
  secrets:
    runs-on: ubuntu-latest
    steps:
      # Compute how much history to fetch and which branch to check out, then
      # export both through $GITHUB_ENV for the checkout step below.
      - shell: bash
        env:
          # Ref names are user-controlled, so they reach the script through
          # the environment instead of being interpolated into it.
          REF_NAME: ${{ github.ref_name }}
          HEAD_REF: ${{ github.event.pull_request.head.ref }}
        run: |
          if [ "${{ github.event_name }}" == "push" ]; then
            # Count the pushed commits from the event payload file rather than
            # expanding toJson(github.event.commits) into the script: commit
            # messages are untrusted, and a single quote in one would break
            # out of the shell string (script injection).
            echo "depth=$(($(jq '.commits | length' "$GITHUB_EVENT_PATH") + 2))" >> "$GITHUB_ENV"
            echo "branch=$REF_NAME" >> "$GITHUB_ENV"
          fi
          # NOTE(review): this workflow only triggers on `push`, so the branch
          # below is unreachable unless a `pull_request` trigger is added.
          if [ "${{ github.event_name }}" == "pull_request" ]; then
            echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> "$GITHUB_ENV"
            echo "branch=$HEAD_REF" >> "$GITHUB_ENV"
          fi
      - name: Checkout code
        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd  # v6.0.2
        with:
          ref: ${{env.branch}}
          fetch-depth: ${{env.depth}}
      - name: Scan for secrets
        uses: trufflesecurity/trufflehog@6bd2d14f7a4bc1e569fa3550efa7ec632a4fa67b  # main

================================================
FILE: .github/workflows/stale.yml
================================================
# Marks inactive issues and pull requests as stale, then closes them.
# Runs on the cron schedule below and can also be started manually.
name: Close stale issues and PRs
on:
  schedule:
    - cron: "30 1 * * *"
  workflow_dispatch:

permissions:
  issues: write
  pull-requests: write

jobs:
  stale:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/stale@v9
        with:
          # Thresholds in days: issues go stale after 30, PRs after 15, and
          # either is closed 5 days after being marked stale.
          days-before-issue-stale: 30
          days-before-pr-stale: 15
          days-before-issue-close: 5
          days-before-pr-close: 5
          # Messages posted when an item is marked stale or closed.
          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
          stale-pr-message: 'This PR is stale because it has been open 15 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
          close-issue-message: 'This issue was closed because it has been stalled for 5 days with no activity.'
          close-pr-message: 'This PR was closed because it has been stalled for 5 days with no activity.'


================================================
FILE: .gitignore
================================================
# Python bytecode cache and pytest cache directories
__pycache__
.pytest_cache
# Packaging metadata and distribution artifacts
*.egg-info
dist
# Local virtual environment
.venv
# Build output directory
build/

================================================
FILE: CONTRIBUTING.md
================================================
<!---
Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Contribute to optimum-quanto

Everyone is welcome to contribute, and we value everybody's contribution. Code
contributions are not the only way to help the community. Answering questions, helping
others, and improving the documentation are also immensely valuable.

It also helps us if you spread the word! Reference the library in blog posts
about the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply ⭐️ the repository to say thank you.

However you choose to contribute, please be mindful and respect our
[code of conduct](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md).

**This guide is directly inspired by [transformers guide to contributing](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md).**

## Ways to contribute

There are several ways you can contribute:

* Fix outstanding issues with the existing code.
* Submit issues related to bugs or desired new features.
* Implement new kernels.

> All contributions are equally valuable to the community. 🥰

## Fixing outstanding issues

If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md#create-a-pull-request) and open a Pull Request!

## Submitting a bug-related issue or feature request

Do your best to follow these guidelines when submitting a bug-related issue or a feature
request. It will make it easier for us to come back to you quickly and with good
feedback.

### Did you find a bug?

The `optimum-quanto` backend will become more robust and reliable thanks to users who will report the problems they encounter.

Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.

Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:

* Your **OS type and version** and **Python** and **PyTorch** versions.
* A short, self-contained, code snippet that allows us to reproduce the bug in
  less than 30s.
* The *full* traceback if an exception is raised.
* Attach any other additional information, like screenshots, you think may help.

### Do you want a new feature?

If there is a new feature you'd like to see, please open an issue and describe:

1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it a feature related to something you need for a project? Is it something you worked on and think it could benefit the community?

   Whatever it is, we'd love to hear about it!

2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you.
3. Provide a *code snippet* that demonstrates the feature's usage.
4. If the feature is related to a paper, please include a link.

If your issue is well written we're already 80% of the way there by the time you create it.

## Do you want to implement a new kernel?

With the constant evolution of hardware backends, there is always a need for updating the kernels for better performance.

If you would like to request a new kernel, please open an issue and provide:

* The hardware configuration(s) it will apply to.
* If any, a short description of the novel techniques that should be used to implement the kernel.

If you are willing to contribute the kernel yourself, let us know so we can help you add it to `optimum-quanto`!

## Create a Pull Request

Before writing any code, we strongly advise you to search through the existing PRs or
issues to make sure nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.

You will need basic `git` proficiency to contribute. While `git` is not the easiest tool to use, it has the greatest manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference.

You'll need **Python 3.9** or above to contribute (the oldest version exercised by the continuous integration). Follow the steps below to start contributing:

1. Fork the [repository](https://github.com/huggingface/optimum-quanto) by
   clicking on the **[Fork](https://github.com/huggingface/optimum-quanto/fork)** button on the repository's page. This creates a copy of the code
   under your GitHub user account.

2. Clone your fork to your local disk, and add the base repository as a remote:

   ```bash
   git clone git@github.com:<your Github handle>/optimum-quanto.git
   cd optimum-quanto
   git remote add upstream https://github.com/huggingface/optimum-quanto.git
   ```

3. Create a new branch to hold your development changes:

   ```bash
   git checkout -b a-descriptive-name-for-my-changes
   ```

   🚨 **Do not** work on the `main` branch!

4. Set up a development environment by running the following command in a virtual environment:

   ```bash
   pip install -e ".[dev]"
   ```

   If `optimum-quanto` was already installed in the virtual environment, remove
   it with `pip uninstall optimum-quanto` before reinstalling it in editable
   mode with the `-e` flag.

5. Develop the features in your branch.

   As you work on your code, you should make sure the test suite
   passes. Run the tests impacted by your changes like this:

   ```bash
   pytest tests/<TEST_TO_RUN>.py
   ```

   `optimum-quanto` relies on `black` and `ruff` to format its source code
   consistently. After you make changes, apply the automatic style corrections and
   run the code checks in one go with:

   ```bash
   make style
   ```
   Once you're happy with your changes, add the changed files with `git add` and
   record your changes locally with `git commit`:

   ```bash
   git add modified_file.py
   git commit
   ```

   This repository uses a `rebase` strategy when merging pull-requests, meaning that your commits will **not** be squashed automatically.

   We therefore request you to keep a tidy queue of commits in your pull-request, clearly communicating the changes you made in each commit.

   **This is enforced by the continuous integration, so your pull-request will not be reviewed if your commit queue is not clean.**

   Although this is not mandatory, we kindly ask you to consider using [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/#summary)
   (here the full [specification](https://www.conventionalcommits.org/en/v1.0.0/))!

   This article gives a brief [rationale](https://julien.ponge.org/blog/the-power-of-conventional-commits/) of why this will make our life and yours easier.

   To keep your copy of the code up to date with the original
   repository, rebase your branch on `upstream/main` *before* you open a pull request or if requested by a maintainer:

   ```bash
   git fetch upstream
   git rebase upstream/main
   ```

   Before submitting, clean up your commit history to make it more readable for the reviewer (for example by squashing temporary commits and editing commit messages to clearly explain what you changed).

   Push your changes to your branch:

   ```bash
   git push -u origin a-descriptive-name-for-my-changes
   ```

   If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.

6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](https://github.com/huggingface/optimum-quanto/blob/main/CONTRIBUTING.md#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.

7. It's ok if maintainers request changes, it happens to our core contributors
   too! So everyone can see the changes in the pull request, work in your local
   branch and push the changes to your fork. They will automatically appear in
   the pull request.

### Pull request checklist

☐ The pull request title should summarize your contribution.<br>
☐ If your pull request addresses an issue, please mention the issue number in the pull
request description to make sure they are linked (and people viewing the issue know you
are working on it).<br>
☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
☐ Make sure existing tests pass.<br>
☐ If adding a new feature, also add tests for it.<br>
☐ All public methods must have informative docstrings.<br>

### Tests

An extensive test suite is included to test the library behavior in the [tests](https://github.com/huggingface/optimum-quanto/tree/main/tests) folder.

From the root of the repository, specify a *path to a subfolder or a test file* to run the test.

```bash
python -m pytest -sv ./tests/<subfolder>/<test>.py
```

You can run all tests by typing:

```bash
make test
```

### Style guide

For documentation strings, `optimum-quanto` follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
Check `transformers` [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
for more information.


================================================
FILE: LICENSE
================================================
Copyright 2023 - The Hugging Face team. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: Makefile
================================================
# Developer helper targets: lint/format verification, auto-formatting, and tests.
# None of the targets below produce a file of the same name, hence .PHONY.
.PHONY: check test style

# Directories handed to ruff by the `check` and `style` targets.
check_dirs := optimum tests bench examples

# Lint the sources and print the formatting diff ruff would apply.
check:
	ruff check --show-fixes ${check_dirs}
	ruff format ${check_dirs} --diff

# Apply ruff's automatic lint fixes, then reformat the sources.
style:
	ruff check ${check_dirs} --fix
	ruff format ${check_dirs}

# Run pytest on the `tests` directory (-s: do not capture output, -v: verbose).
test:
	python -m pytest -sv tests


================================================
FILE: README.md
================================================
# Optimum Quanto

> This project is currently in maintenance mode. We accept pull requests only for minor bug fixes, documentation improvements, and other maintenance tasks. Major new features or breaking changes are unlikely to be merged. For production-ready quantization features or active development, consider alternative projects such as [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) or [torchAO](https://github.com/pytorch/ao).

🤗 Optimum Quanto is a pytorch quantization backend for [optimum](https://huggingface.co/docs/optimum/en/index).

It has been designed with versatility and simplicity in mind:

- all features are available in eager mode (works with non-traceable models),
- quantized models can be placed on any device (including CUDA and MPS),
- automatically inserts quantization and dequantization stubs,
- automatically inserts quantized functional operations,
- automatically inserts quantized modules (see below the list of supported modules),
- provides a seamless workflow from a float model to a dynamic to a static quantized model,
- serialization compatible with pytorch `weight_only` and 🤗 `safetensors`,
- accelerated matrix multiplications on CUDA devices (int8-int8, fp16-int4, bf16-int8, bf16-int4),
- supports int2, int4, int8 and float8 weights,
- supports int8 and float8 activations.

Features yet to be implemented:

- dynamic activations smoothing,
- kernels for all mixed matrix multiplications on all devices,
- compatibility with [torch compiler](https://pytorch.org/docs/stable/torch.compiler.html) (aka dynamo).

## Performances

In a nutshell:

- accuracy: models compiled with `int8`/`float8` weights and `float8` activations are very close to the full-precision models,
- latency: whenever optimized kernels are available, the inference latency of quantized models is comparable to that of the full-precision models when quantizing only the model weights,
- device memory: approximately divided by float bits / integer bits.

The paragraph below is just an example. Please refer to the `bench` folder for detailed results per use case and model.

### meta-llama/Meta-Llama-3.1-8B

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/optimum-quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png" alt="meta-llama/Meta-Llama-3.1-8B WikiText perplexity">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/optimum-quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png" alt="meta-llama/Meta-Llama-3.1-8B Latency">
  </div>
 </center>
</div>

## Installation

Optimum Quanto is available as a pip package.

```sh
pip install optimum-quanto
```

## Quantization workflow for Hugging Face models

`optimum-quanto` provides helper classes to quantize, save and reload Hugging Face quantized models.

### LLM models

The first step is to quantize the model

```python
from transformers import AutoModelForCausalLM
from optimum.quanto import QuantizedModelForCausalLM, qint4

model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3-8B')
qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')
```

Note: the model quantized weights will be frozen. If you want to keep them unfrozen to train them you need to use `optimum.quanto.quantize` directly.

The quantized model can be saved using `save_pretrained`:

```python
qmodel.save_pretrained('./Llama-3-8B-quantized')
```

It can later be reloaded using `from_pretrained`:

```python
from optimum.quanto import QuantizedModelForCausalLM

qmodel = QuantizedModelForCausalLM.from_pretrained('Llama-3-8B-quantized')
```

### Diffusers models

You can quantize any of the submodels inside a diffusers pipeline and seamlessly include them later in another pipeline.

Here we quantize the `transformer` of a `Pixart` pipeline.

```python
from diffusers import PixArtTransformer2DModel
from optimum.quanto import QuantizedPixArtTransformer2DModel, qfloat8

model = PixArtTransformer2DModel.from_pretrained("PixArt-alpha/PixArt-Sigma-XL-2-1024-MS", subfolder="transformer")
qmodel = QuantizedPixArtTransformer2DModel.quantize(model, weights=qfloat8)
qmodel.save_pretrained("./pixart-sigma-fp8")
```

Later, we can reload the quantized model and recreate the pipeline:

```python
import torch
from diffusers import PixArtSigmaPipeline
from optimum.quanto import QuantizedPixArtTransformer2DModel

transformer = QuantizedPixArtTransformer2DModel.from_pretrained("./pixart-sigma-fp8")
transformer.to(device="cuda")
pipe = PixArtSigmaPipeline.from_pretrained(
  "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
  transformer=None,
  torch_dtype=torch.float16,
).to("cuda")
pipe.transformer = transformer
```

## Quantization workflow for vanilla pytorch models (low-level API)

One thing to keep in mind when using the low-level quanto API is that by default model
weights are dynamically quantized: an explicit call must be made to 'freeze' the quantized weights.

A typical quantization workflow would consist of the following steps:

**1. Quantize**

The first step converts a standard float model into a dynamically quantized model.

```python
from optimum.quanto import quantize, qint8

quantize(model, weights=qint8, activations=qint8)
```

At this stage, only the inference of the model is modified to dynamically quantize the weights.

**2. Calibrate (optional if activations are not quantized)**

Quanto supports a calibration mode that records the activation ranges while representative samples are passed through the quantized model.

```python
from optimum.quanto import Calibration

with Calibration(momentum=0.9):
    model(samples)
```

This automatically activates the quantization of the activations in the quantized modules.


**3. Tune, aka Quantization-Aware-Training (optional)**

If the performance of the model degrades too much, one can tune it for a few epochs to recover the float model performance.

```python
import torch

model.train()
for batch_idx, (data, target) in enumerate(train_loader):
    data, target = data.to(device), target.to(device)
    optimizer.zero_grad()
    output = model(data).dequantize()
    loss = torch.nn.functional.nll_loss(output, target)
    loss.backward()
    optimizer.step()
```

**4. Freeze integer weights**

When freezing a model, its float weights are replaced by quantized integer weights.

```python
from optimum.quanto import freeze

freeze(model)
```

**5. Serialize quantized model**

Quantized model weights can be serialized to a `state_dict`, and saved to a file.
Both `pickle` and `safetensors` (recommended) are supported.

```python
from safetensors.torch import save_file

save_file(model.state_dict(), 'model.safetensors')
```

In order to be able to reload these weights, you also need to store the quantized
model quantization map.

```python
import json

from optimum.quanto import quantization_map

with open('quantization_map.json', 'w') as f:
  json.dump(quantization_map(model), f)
```

**6. Reload a quantized model**

A serialized quantized model can be reloaded from a `state_dict` and a `quantization_map` using the `requantize` helper.
Note that you first need to instantiate an empty model.

```python
import json

import torch
from safetensors.torch import load_file
from optimum.quanto import requantize

state_dict = load_file('model.safetensors')
with open('quantization_map.json', 'r') as f:
  quantization_map = json.load(f)

# Create an empty model from your modeling code and requantize it
with torch.device('meta'):
  new_model = ...
requantize(new_model, state_dict, quantization_map, device=torch.device('cuda'))
```

Please refer to the [examples](https://github.com/huggingface/quanto/tree/main/examples) for instantiations of that workflow.


## Design overview

### Tensors

At the heart of quanto is a Tensor subclass that corresponds to:
- the projection of a source Tensor into the optimal range for a given destination type,
- the mapping of projected values to the destination type.

For floating-point destination types, the mapping is done by the native pytorch cast (i.e. `Tensor.to()`).

For integer destination types, the mapping is a simple rounding operation (i.e. `torch.round()`).

The goal of the projection is to increase the accuracy of the conversion by minimizing the number of:
- saturated values (i.e. mapped to the destination type min/max),
- zeroed values (because they are below the smallest number that can be represented by the destination type)

The projection is symmetric per-tensor or per-channel for `int8` and `float8`, and group-wise affine (with a shift or 'zero-point') for lower bitwidth.

One of the benefits of using a lower-bitwidth representation is that you will be able to take advantage of accelerated operations
for the destination type, which are typically faster than their higher-precision equivalents.

Quanto does not support the conversion of a Tensor using mixed destination types.

### Modules

Quanto provides a generic mechanism to replace `torch` modules by `optimum-quanto` modules that are able to process quanto tensors.

`optimum-quanto` modules dynamically convert their weights until a model is frozen, which slows down inference a bit but is
required if the model needs to be tuned.

Weights are usually quantized per-channel along the first dimension (output features).

Biases are not converted to preserve the accuracy of a typical `addmm` operation.

Explanation: to be consistent with the unquantized arithmetic operations, biases would need to be quantized with a scale that
is equal to the product of the input and weight scales, which leads to a ridiculously small scale, and conversely
requires a very high bitwidth to avoid clipping. Typically, with `int8` inputs and weights, biases would need to be quantized
with at least `12` bits, i.e. in `int16`. Since most biases are today `float16`, this is a waste of time.

Activations are dynamically quantized per-tensor using static scales (defaults to the range `[-1, 1]`).

To preserve accuracy, the model needs to be calibrated to evaluate the best activation scales (using a momentum).

The following modules can be quantized:

- [Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) (QLinear).
Weights are always quantized, and biases are not quantized. Inputs and outputs can be quantized.
- [Conv2d](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) (QConv2D).
Weights are always quantized, and biases are not quantized. Inputs and outputs can be quantized.
- [LayerNorm](https://pytorch.org/docs/stable/generated/torch.nn.LayerNorm.html),
Weights and biases are __not__ quantized. Outputs can be quantized.

## Pitfalls to avoid when quantizing activations

Activations are always quantized per-tensor because most linear algebra operations in a model graph are not compatible
with per-axis inputs: you simply cannot add numbers that are not expressed in the same base (`you cannot add apples and oranges`).

Weights involved in matrix multiplications are, on the contrary, always quantized along their first axis, because all output features
are evaluated independently from one another.

The outputs of a quantized matrix multiplication will anyway always be dequantized, even if activations are quantized, because:

- the resulting accumulated values are expressed with a much higher bitwidth (typically `int32` or `float32`) than the activation bitwidth (typically `int8` or `float8`),
- they might be combined with a `float` bias.

Quantizing activations per-tensor to `int8` can lead to serious quantization errors if the corresponding tensors contain large outlier values.
Typically, this will lead to quantized tensors with most values set to zero (except the outliers).

A possible solution to work around that issue is to 'smooth' the activations statically as illustrated by [SmoothQuant](https://github.com/mit-han-lab/smoothquant).
You can find a script to smooth some model architectures under [external/smoothquant](external/smoothquant).

A better option is to represent activations using `float8`.


================================================
FILE: bench/generation/README.md
================================================
# Quanto generation benchmark

This repository contains scripts to evaluate the performances of quantized models using three metrics:

- `latency.py` evaluates the latency per generated token,
- `prediction.py` evaluates the accuracy when predicting the last token of prompts from the [Lambada dataset](https://huggingface.co/datasets/lambada),
- `perplexity.py` evaluates the perplexity of the model on the [WikiText dataset](https://huggingface.co/datasets/wikitext), as defined in the [transformers documentation](https://huggingface.co/docs/transformers/en/perplexity).

An `evaluate_model.py` utility script is also provided to evaluate the metrics on a specific model for several quantization configurations, and output the result to a `png` barchart and/or a `json` file.

Note: the language modeling head (lm_head) of the tested models is not quantized.

The paragraphs below display results for some popular models on a NVIDIA A10 GPU.

## meta-llama/Meta-Llama-3.1-8B

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Accuracy.png" alt="meta-llama/Meta-llama-3.1-8B Lambada prediction accuracy">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Perplexity.png" alt="meta-llama/Meta-Llama-3.1-8B WikiText perplexity">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/meta-llama-Meta-Llama-3.1-8B_bf16_Latency__ms_.png" alt="meta-llama/Meta-Llama-3.1-8B Latency">
  </div>
 </center>
</div>

## mistralai/Mistral-7B-Instruct-v0.3

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Accuracy.png" alt="mistralai/Mistral-7B-Instruct-v0.3 Lambada prediction accuracy">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Perplexity.png" alt="mistralai/Mistral-7B-Instruct-v0.3 WikiText perplexity">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/mistralai-Mistral-7B-Instruct-v0.3_bf16_Latency__ms_.png" alt="mistralai/Mistral-7B-Instruct-v0.3 Latency">
  </div>
 </center>
</div>

## google/gemma-2b

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/google-gemma-2b_bf16_Accuracy.png" alt="google-gemma-2b Lambada prediction accuracy">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/google-gemma-2b_bf16_Perplexity.png" alt="google-gemma-2b WikiText perplexity">
  </div>
 </center>
</div>

<div class="row"><center>
  <div class="column">
    <img src="https://github.com/huggingface/quanto/blob/main/bench/generation/charts/google-gemma-2b_bf16_Latency__ms_.png" alt="google-gemma-2b Latency">
  </div>
 </center>
</div>


================================================
FILE: bench/generation/evaluate_configurations.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json

import torch
from evaluate_model import evaluate
from gen_barchart import gen_barchart
from transformers import AutoConfig

from optimum.quanto import qtype


def evaluate_model_configurations(
    model_id: str, metric: str, device: torch.device, batch_size: int = 32, dtype: torch.dtype = torch.float16
):
    """Run `evaluate` on the float model and on every weight/activation quantization pair.

    Args:
        model_id: model identifier forwarded to `evaluate`.
        metric: metric name forwarded to `evaluate`.
        device: device forwarded to `evaluate`.
        batch_size: batch size forwarded to `evaluate`.
        dtype: float dtype forwarded to `evaluate`; it also selects the "f16"/"bf16"
            tag used when naming configurations.

    Returns:
        A dict mapping configuration names (e.g. "Wi4Af8") to the value returned by
        `evaluate` for that configuration.
    """
    # Tag for non-quantized weights/activations: anything but float16 is labelled "bf16".
    float_tag = "f16" if dtype == torch.float16 else "bf16"
    tags = {"none": float_tag, "int4": "i4", "int8": "i8", "float8": "f8"}

    def short_name(name: qtype):
        # Every call below passes one of the string keys of `tags`.
        return tags[name]

    # Float baseline first, then the quantized pairs (weights outer, activations inner).
    configurations = [("none", "none")]
    configurations += [(w, a) for w in ("int4", "int8", "float8") for a in ("none", "float8")]

    results = {}
    for w, a in configurations:
        config_name = f"W{short_name(w)}A{short_name(a)}"
        print(f"{model_id}[{config_name}]:")
        results[config_name] = evaluate(model_id, metric, "quanto", w, a, batch_size, device, dtype)

    return results


def main():
    """Command-line entry point: evaluate one metric over all quantization configurations.

    Parses the command-line arguments, seeds torch, selects the device and the
    float dtype, calls `evaluate_model_configurations`, then optionally dumps
    the results to `<model_name>-<metric>.json` and/or renders a bar chart
    through `gen_barchart`.
    """
    parser = argparse.ArgumentParser(description="Evaluate quantized model predictions on Lambada Dataset")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the trained Model.",
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    parser.add_argument("--metric", type=str, default="prediction", choices=["latency", "prediction", "perplexity"])
    parser.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
    parser.add_argument("--dtype", type=str, help="Use the following dtype to load the model (fp16 or bf16).")
    parser.add_argument("--json", action="store_true", help="Dump the results to a json file.")
    parser.add_argument("--png", action="store_true", help="Generate a PNG.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        # Probe accelerators in priority order; `torch.xpu` is looked up defensively
        # because not every torch build exposes that submodule.
        xpu = getattr(torch, "xpu", None)
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif xpu is not None and xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    if args.dtype is None:
        config = AutoConfig.from_pretrained(args.model)
        # The attribute may exist with a None value, in which case the `getattr`
        # default alone would not apply: fall back to float16 explicitly.
        dtype = getattr(config, "torch_dtype", None)
        if dtype is None:
            dtype = torch.float16
    else:
        # Reject unknown names instead of silently benchmarking in bfloat16.
        dtype_names = {
            "fp16": torch.float16,
            "float16": torch.float16,
            "bf16": torch.bfloat16,
            "bfloat16": torch.bfloat16,
        }
        if args.dtype not in dtype_names:
            parser.error(f"Unsupported dtype {args.dtype!r}: expected one of {sorted(dtype_names)}")
        dtype = dtype_names[args.dtype]
    results = evaluate_model_configurations(args.model, args.metric, device, batch_size=args.batch_size, dtype=dtype)
    if args.json:
        # Only the last path component of the model id is used in the file name and as top-level key.
        model_name = args.model.split("/")[-1]
        json_path = f"{model_name}-{args.metric}.json"
        with open(json_path, "w") as fp:
            json.dump({model_name: results}, fp, indent=4)
    if args.png:
        # `--metric` is restricted by `choices`, so one of these branches always matches.
        if args.metric == "latency":
            title = f"{args.model}: Mean latency per token"
            label = "Latency (ms)"
        elif args.metric == "prediction":
            title = f"{args.model}: Prediction accuracy on Lambada dataset"
            label = "Accuracy"
        elif args.metric == "perplexity":
            title = f"{args.model}: Perplexity evaluated on WikiText dataset"
            label = "Perplexity"
        gen_barchart(args.model, title, label, results, dtype)


================================================
FILE: bench/generation/evaluate_many_models.sh
================================================
#!/bin/bash
# Run evaluate_configurations.py for every model and metric listed below,
# requesting both a PNG chart and a JSON dump for each run.

# Absolute path to this script, e.g. /home/user/bin/foo.sh
SCRIPT=$(readlink -f "$0")
# Absolute path this script is in, thus /home/user/bin
SCRIPT_PATH=$(dirname "$SCRIPT")

models=(
    google/gemma-2b
    meta-llama/Meta-Llama-3.1-8B
    mistralai/Mistral-7B-Instruct-v0.3
)

# Metrics are evaluated in this order for each model.
metrics=(
    prediction
    perplexity
    latency
)

# All expansions are quoted so that a checkout path containing spaces or glob
# characters is passed to python as a single, unmodified argument.
for m in "${models[@]}"; do
    for metric in "${metrics[@]}"; do
        python "${SCRIPT_PATH}/evaluate_configurations.py" --model "$m" --metric "$metric" --png --json --batch_size 16
    done
done


================================================
FILE: bench/generation/evaluate_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import importlib

import torch
from datasets import load_dataset
from metrics.latency import latency
from metrics.perplexity import perplexity
from metrics.prediction import prediction_accuracy


if importlib.util.find_spec("awq") is not None:
    from setup.awq import setup as awq_setup
if importlib.util.find_spec("bitsandbytes") is not None:
    from setup.bnb import setup as bnb_setup
if importlib.util.find_spec("hqq") is not None:
    from setup.hqq import setup as hqq_setup
from setup.quanto import setup as quanto_setup
from transformers import AutoConfig


@torch.no_grad()
def calibrate(model, tokenizer, batch_size, batches):
    """Run forward passes over the Lambada validation split.

    Batches of `batch_size` texts are tokenized (with padding) and fed to the
    model until at least `batch_size * batches` sequences have been processed
    or the dataset is exhausted. Outputs are discarded: the function is only
    useful for the side effects of the forward pass.
    """
    target_count = batch_size * batches
    validation_set = load_dataset("lambada", split=["validation"])[0]
    model.eval()
    seen = 0
    for chunk in validation_set.iter(batch_size=batch_size):
        encoded = tokenizer(chunk["text"], return_tensors="pt", padding=True)
        ids = encoded.input_ids.to(model.device)
        mask = encoded.attention_mask.to(model.device)
        model(ids, attention_mask=mask)
        seen += ids.size(0)
        if seen >= target_count:
            break


def evaluate(
    model_id: str,
    metric: str,
    quantizer: str,
    weights: str,
    activations: str,
    batch_size: int,
    device: torch.device,
    dtype: torch.dtype = None,
):
    """Load and quantize a model with the selected backend, then compute one metric.

    Args:
        model_id: identifier of the model passed to the backend setup function.
        metric: one of "latency", "prediction" or "perplexity".
        quantizer: one of "quanto", "awq", "bnb" or "hqq".
        weights: weights quantization keyword ("none" means not quantized).
        activations: activations quantization keyword ("none" means not quantized).
        batch_size: batch size forwarded to the quanto setup and to the
            prediction metric (the latency metric always uses a batch of 1).
        device: device forwarded to the setup functions and the latency metric.
        dtype: model dtype used by the quanto backend. When None, it is read
            from the model configuration, falling back to float16.

    Returns:
        The value returned by the selected metric function.

    Raises:
        ValueError: if `metric` or `quantizer` is not supported.
    """
    # Fail fast: reject an unknown metric before paying for model loading and
    # quantization (previously the function silently returned None at the end).
    if metric not in ("latency", "prediction", "perplexity"):
        raise ValueError(f"Unsupported metric {metric}")
    if quantizer == "quanto":
        if dtype is None:
            config = AutoConfig.from_pretrained(model_id)
            dtype = getattr(config, "torch_dtype", None)
            if dtype is None:
                # The attribute can be present but unset, in which case the
                # getattr default alone would never kick in.
                dtype = torch.float16
        model, tokenizer = quanto_setup(model_id, weights, activations, batch_size, device, dtype)
    elif quantizer == "awq":
        model, tokenizer = awq_setup(model_id, weights, activations, group_size=128)
    elif quantizer == "bnb":
        model, tokenizer = bnb_setup(model_id, weights, activations, device)
    elif quantizer == "hqq":
        model, tokenizer = hqq_setup(model_id, weights, activations, device)
    else:
        raise ValueError(f"Unsupported quantizer {quantizer}")
    # Report the dtype the model actually ended up with for non-quantized parts.
    dtype = next(model.parameters()).dtype
    weights = dtype if weights == "none" else weights
    activations = dtype if activations == "none" else activations
    print(f"Evaluating {model_id} {metric} with {weights} weights and {activations} activations.")
    if metric == "latency":
        return latency(model, tokenizer, device, batch_size=1, prompt_length=512, nb_tokens=512, iterations=3)
    if metric == "prediction":
        return prediction_accuracy(model, tokenizer, batch_size)
    # Only "perplexity" remains after the validation above.
    return perplexity(model, tokenizer)


def main():
    """Command-line entry point: parse options, pick a device and run `evaluate`."""
    cli = argparse.ArgumentParser(description="Evaluate quantized model metrics")
    cli.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    cli.add_argument("--model", type=str, default="facebook/opt-350m", help="The name of the trained Model.")
    cli.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    cli.add_argument("--metric", type=str, default="prediction", choices=["latency", "prediction", "perplexity"])
    cli.add_argument("--quantizer", type=str, default="quanto", choices=["quanto", "awq", "bnb", "hqq"])
    cli.add_argument("--weights", type=str, default="none", choices=["none", "int4", "int8", "float8"])
    cli.add_argument("--activations", type=str, default="none", choices=["none", "int8", "float8"])
    cli.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
    cli.add_argument("--dtype", type=str, default="none", choices=["none", "fp16", "bf16"])
    options = cli.parse_args()

    torch.manual_seed(options.seed)

    # An explicit --device wins; otherwise probe accelerators in order of
    # preference and fall back to the CPU.
    if options.device is not None:
        device = torch.device(options.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    else:
        device = torch.device("cpu")

    dtype_by_flag = {"none": None, "fp16": torch.float16, "bf16": torch.bfloat16}
    evaluate(
        options.model,
        options.metric,
        options.quantizer,
        options.weights,
        options.activations,
        options.batch_size,
        device,
        dtype_by_flag[options.dtype],
    )


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: bench/generation/gen_barchart.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json

import matplotlib.pyplot as plt
import numpy as np
import torch


def save_bar_chart(title, labels, ylabel, series, save_path):
    """Render a grouped bar chart and save it to `save_path`.

    Args:
        title: the chart title.
        labels: one tick label per group of bars on the x-axis.
        ylabel: the label of the vertical axis.
        series: mapping of legend name to a sequence of values, one per label.
        save_path: destination passed to matplotlib's `savefig`.
    """
    x = np.arange(len(labels))  # the label locations
    width = 0.15  # the width of the bars

    fig, ax = plt.subplots(layout="constrained")
    try:
        fig.set_figwidth(10)

        max_value = 0

        # Each series is drawn as one bar per label, shifted right by one bar
        # width per series so that bars of the same label sit side by side.
        for multiplier, (attribute, measurement) in enumerate(series.items()):
            max_value = max(max_value, max(measurement))
            offset = width * multiplier
            rects = ax.bar(x + offset, measurement, width, label=attribute)
            ax.bar_label(rects, padding=5)

        # Add some text for labels, title and custom x-axis tick labels, etc.
        ax.set_ylabel(ylabel)
        ax.set_title(title)
        ax.set_xticks(x + width, labels)
        ax.legend(loc="upper left", ncols=4)
        # Leave headroom above the tallest bar for the value labels and legend.
        ax.set_ylim(0, max_value * 1.2)

        fig.savefig(save_path)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed; release
        # it so that generating many charts in one process does not leak memory.
        plt.close(fig)


def gen_barchart(model_id, title, label, results, dtype):
    """Build the per-weights series from `results` and save them as a bar chart.

    `results` is indexed with keys of the form ``W<weights>A<activations>``,
    where the non-quantized entries use "f16" when `dtype` is `torch.float16`
    and "bf16" for any other value. The chart is written to
    ``<model>_<dtype>_<metric>.png`` in the current directory.
    """
    if dtype is torch.float16:
        base = "f16"
    else:
        base = "bf16"
    activation_kinds = (base, "f8")

    # The non-quantized reference is repeated for every activations group.
    baseline = round(results[f"W{base}A{base}"], 2)
    series = {f"Weights {base}": [baseline] * len(activation_kinds)}
    for w in ("i4", "i8", "f8"):
        series[f"Weights {w}"] = [round(results[f"W{w}A{a}"], 2) for a in activation_kinds]

    file_stem = model_id.replace("/", "-")
    metric_slug = label.replace(" ", "_").replace("(", "_").replace(")", "_")
    group_labels = [f"Activations {a}" for a in activation_kinds]
    save_bar_chart(
        title=title,
        labels=group_labels,
        series=series,
        ylabel=label,
        save_path=f"{file_stem}_{base}_{metric_slug}.png",
    )


def main():
    """Command-line entry point: generate one bar chart per model of a benchmark file.

    The benchmark file is a JSON object mapping a model id to its results.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("benchmark", type=str, help="A benchmark result file (.json).")
    parser.add_argument("--title", type=str, required=True, help="The graph title.")
    parser.add_argument("--label", type=str, required=True, help="The graph vertical label.")
    args = parser.parse_args()
    with open(args.benchmark) as f:
        benchmark = json.load(f)
    for model_id, results in benchmark.items():
        # gen_barchart requires the dtype of the non-quantized reference, which
        # is not stored explicitly in the file: recover it from the key of the
        # reference entry ("Wbf16Abf16" for bfloat16, "Wf16Af16" otherwise).
        dtype = torch.bfloat16 if "Wbf16Abf16" in results else torch.float16
        gen_barchart(model_id, args.title, args.label, results, dtype)


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


================================================
FILE: bench/generation/metrics/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: bench/generation/metrics/latency.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import time

import numpy as np
import torch
from tqdm.auto import tqdm
from transformers import GenerationConfig


def latency(model, tokenizer, device, batch_size=1, prompt_length=512, nb_tokens=512, iterations=10):
    """Measure the mean generation latency per new token, in milliseconds.

    A random prompt of `prompt_length` tokens is generated once, then
    `model.generate` is timed `iterations` times while being forced to produce
    exactly `nb_tokens` new tokens with greedy decoding.

    Note: as a side effect, `model.generation_config.eos_token_id` is set to
    None (when the model has a generation config) and is not restored.

    Args:
        model: the model to benchmark (must expose `generate` and `config.vocab_size`).
        tokenizer: only used to obtain `pad_token_id`.
        device: the torch device on which inputs are created and timing is done.
        batch_size: number of prompts generated in parallel.
        prompt_length: number of tokens in each random prompt.
        nb_tokens: number of new tokens generated per iteration.
        iterations: number of timed `generate` calls.

    Returns:
        The mean latency over all iterations divided by `nb_tokens`.
    """

    def synchronize(device):
        # Wait for all pending work on the device so that timings are accurate.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()
        else:
            torch.cpu.synchronize()

    def timing_event(device):
        if device.type == "cuda":
            return torch.cuda.Event(enable_timing=True)
        elif device.type == "mps":
            return torch.mps.Event(enable_timing=True)
        elif device.type == "xpu":
            return torch.xpu.Event(enable_timing=True)

        class CPUEvent:
            """Minimal stand-in for device timing events, for the CPU fallback."""

            def __init__(self):
                self.time = None

            def record(self):
                # perf_counter is monotonic and high-resolution, unlike
                # time.time() which can jump when the system clock is adjusted.
                self.time = time.perf_counter()

            def elapsed_time(self, other):
                assert self.time is not None
                assert other.time is not None
                # Milliseconds, to match the unit of the device events.
                return (other.time - self.time) * 1000

        return CPUEvent()

    generation_config = GenerationConfig(
        max_new_tokens=nb_tokens,
        min_new_tokens=nb_tokens,
        use_cache=True,
        pad_token_id=tokenizer.pad_token_id,
        num_beams=1,
        do_sample=False,
        eos_token_id=None,  # This is required for min_new_tokens to actually have an effect.
    )
    if getattr(model, "generation_config", None) is not None:
        # greedy_search falls back on this eos_token_id that we need to set to None
        # as well for min_new_tokens to have an effect.
        model.generation_config.eos_token_id = None

    synchronize(device)
    if device.type == "cuda":
        torch.cuda.reset_peak_memory_stats()
    elif device.type == "xpu":
        torch.xpu.reset_peak_memory_stats()

    memory = get_device_memory(device)
    if memory is not None:
        print(f"Device memory: {memory / (2**30):.4f} GB")

    latencies = []
    input_ids = torch.randint(1, model.config.vocab_size - 1, size=(batch_size, prompt_length)).to(device)
    masks = torch.ones(batch_size, prompt_length, dtype=torch.int32).to(device)

    for _ in tqdm(range(iterations)):
        start_event = timing_event(device)
        end_event = timing_event(device)
        synchronize(device)
        start_event.record()

        _ = model.generate(input_ids, attention_mask=masks, generation_config=generation_config)
        end_event.record()
        # The end event must have completed before its elapsed time can be read.
        synchronize(device)

        latency_ms = start_event.elapsed_time(end_event)
        latencies.append(latency_ms)

    if device.type == "cuda":
        peak_memory = torch.cuda.max_memory_allocated()
        print(f"Peak memory during benchmark: {peak_memory / (2**30):.4f} GB")
    elif device.type == "xpu":
        peak_memory = torch.xpu.max_memory_allocated()
        print(f"Peak memory during benchmark: {peak_memory / (2**30):.4f} GB")

    mean_latency = np.mean(latencies) / generation_config.min_new_tokens
    print(f"Average latency per token: {mean_latency} ms")
    return mean_latency


def get_device_memory(device):
    """Return the memory currently allocated on `device`, or None if unsupported.

    Garbage is collected and the backend cache is emptied first. Only "cuda",
    "mps" and "xpu" devices report a value; any other device type yields None.
    """
    gc.collect()
    kind = device.type
    if kind not in ("cuda", "mps", "xpu"):
        return None
    backend = getattr(torch, kind)
    backend.empty_cache()
    if kind == "mps":
        # The MPS backend exposes the allocated size under a different name.
        return backend.current_allocated_memory()
    return backend.memory_allocated()


================================================
FILE: bench/generation/metrics/perplexity.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys

import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm


class Perplexity:
    """
    A class for calculating the perplexity of a language model.
    """

    def __init__(self, model, tokenizer, dataset_path="wikitext", dataset_name=None, split="test", text_column="text"):
        """
        Calculate perplexity using the same method as seen in llama.cpp.

        Parameters
        ----------
        model : AutoModelForCausalLM
            The language model for which the perplexity is calculated.
        tokenizer : AutoTokenizer
            The tokenizer corresponding to the model.
        dataset_path : str, optional
            The path to the dataset on the Hugging Face dataset hub. Default is 'wikitext'.
        dataset_name : str, optional
            The name of the dataset. Default is None, in which case
            'wikitext-2-raw-v1' is used when dataset_path is 'wikitext'.
        split : str, optional
            The split of the dataset to use. Default is 'test'.
        text_column : str, optional
            The name of the column in the dataset that contains the text data. Default is 'text'.
        """
        self._model = model
        self._tokenizer = tokenizer
        self._dataset_path = dataset_path
        self._dataset_name = dataset_name
        self._split = split
        self._text_column = text_column
        self._text = self._prepare_data()

    def _prepare_data(self):
        """
        Prepares the dataset by loading and formatting.

        Returns
        -------
        str
            The formatted dataset as a single string.
        """
        # Only pick a default configuration when the caller did not name one,
        # so that an explicit dataset_name is honoured for wikitext too.
        if self._dataset_path == "wikitext" and self._dataset_name is None:
            self._dataset_name = "wikitext-2-raw-v1"

        # Load the dataset
        data = load_dataset(self._dataset_path, self._dataset_name, split=self._split)
        # Format the text column of the dataset
        text_list = [" \n" if s == "" else s for s in data[self._text_column]]
        return "".join(text_list)

    @staticmethod
    def softmax(logits):
        """
        Static method for applying the softmax function.

        Parameters
        ----------
        logits : np.ndarray
            The input to the softmax function.

        Returns
        -------
        np.ndarray
            The output of the softmax function.
        """
        e_x = np.exp(logits - np.max(logits))
        return e_x / e_x.sum(axis=0)

    @staticmethod
    def _log_softmax(logits):
        """
        Static method for applying the log-softmax function.

        Computing the logarithm directly (instead of taking the log of the
        softmax) keeps the result finite even when a probability would
        underflow to zero.

        Parameters
        ----------
        logits : np.ndarray
            The input to the log-softmax function.

        Returns
        -------
        np.ndarray
            The log-probabilities.
        """
        shifted = logits - np.max(logits)
        return shifted - np.log(np.exp(shifted).sum(axis=0))

    def calculate_perplexity(self, n_ctx=512, n_batch=512):
        """
        Calculates the perplexity of the language model.

        Parameters
        ----------
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.

        Returns
        -------
        list
            The list of running perplexity scores, one per processed context window.
        """
        # Tokenize the text
        self._tokenizer.model_max_length = sys.maxsize
        tokens = self._tokenizer(self._text, truncation=False, return_tensors="pt").input_ids.to(self._model.device)

        nll = 0.0  # Negative log likelihood
        count = 0  # Counter for processed tokens
        curr_ppl = 0
        all_perplexity = []

        with tqdm(range(len(tokens[0]) // n_ctx), desc="Perplexity: - ") as progress:
            for i in progress:
                # Process each batch of tokens
                nll, count = self._process_batch(i, n_ctx, n_batch, tokens, nll, count)

                # Calculate and display the current perplexity
                curr_ppl = np.exp(nll / count)
                all_perplexity.append(curr_ppl)
                progress.set_description(f"Perplexity: {curr_ppl:.4f}")

        return all_perplexity

    def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
        """
        Processes each batch of tokens.

        Parameters
        ----------
        i : int
            The batch index.
        n_ctx : int
            The context size.
        n_batch : int
            The batch size.
        tokens : torch.Tensor
            The tokenized text.
        nll : float
            The current negative log likelihood.
        count : int
            The current count of processed tokens.

        Returns
        -------
        float
            The updated negative log likelihood.
        int
            The updated count of processed tokens.
        """
        start = i * n_ctx
        end = start + n_ctx

        num_batches = (n_ctx + n_batch - 1) // n_batch

        logits = []

        for j in range(num_batches):
            batch_start = start + j * n_batch
            batch_size = min(end - batch_start, n_batch)

            token_org = tokens[0][batch_start].item()

            if j == 0:
                # Replace the first token with the BOS token
                tokens[0][batch_start] = self._tokenizer.bos_token_id

            # Compute the logits for the current batch of tokens
            batch_logits = self._compute_batch_logits(tokens, batch_start, batch_size)

            # Restore the original token so the shared tensor is left untouched
            tokens[0][batch_start] = token_org

            logits.append(batch_logits)

        # We rely on the fact that attention in the forward pass only looks at previous
        # tokens here, so the logits returned for each token are an accurate representation
        # of what the model would have predicted at that point.
        #
        # Example, we have a context window of 512, we will compute perplexity for each of the
        # last 256 tokens.  Then, we split the input up into context window size chunks to
        # process the entire prompt.
        #
        # NOTE(review): only the logits of the first batch are read below, so this
        # presumably expects n_batch >= n_ctx - confirm before using smaller batches.

        for j in range(min(512, n_ctx // 2), n_ctx - 1):
            # Upcast to float32: numpy cannot represent bfloat16, and a float16
            # softmax easily underflows.
            tok_logits = logits[0][0][j].float().cpu().numpy()
            # Compute the log-probability of the next token
            target = int(tokens[0][start + j + 1])
            log_prob = self._log_softmax(tok_logits)[target]

            # Update the negative log likelihood and the count of processed tokens
            nll -= float(log_prob)
            count += 1

        return nll, count

    def _compute_batch_logits(self, tokens, batch_start, batch_size):
        """
        Computes the logits for a batch of tokens.

        Parameters
        ----------
        tokens : torch.Tensor
            The tokenized text.
        batch_start : int
            The start index of the batch.
        batch_size : int
            The size of the batch.

        Returns
        -------
        torch.Tensor
            The logits for the batch of tokens.
        """
        # Compute the logits without keeping track of gradients
        with torch.no_grad():
            outputs = self._model(tokens[:, batch_start : batch_start + batch_size])
        return outputs.logits.detach()


def perplexity(
    model,
    tokenizer,
    stride: int = 512,
):
    """Evaluate `model` with the `Perplexity` helper using a context of `stride` tokens.

    Returns the mean of the running perplexity values reported by
    `Perplexity.calculate_perplexity`.
    """
    print("Evaluating perplexity")
    evaluator = Perplexity(model, tokenizer)
    running_values = evaluator.calculate_perplexity(n_ctx=stride)
    return np.mean(running_values)


================================================
FILE: bench/generation/metrics/prediction.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time

import torch
from datasets import load_dataset


@torch.no_grad()
def prediction_accuracy(model, tokenizer, batch_size, samples=None):
    """Measure last-token prediction accuracy on the Lambada test split.

    For every sequence, all tokens but the last are fed to the model and the
    most likely next token is compared with the actual last token. Evaluation
    stops once at least `samples` sequences were processed (when `samples` is
    given) or when the dataset is exhausted.

    Returns the fraction of sequences whose last token was predicted correctly.
    """
    dataset = load_dataset("lambada", split=["test"])[0]
    model.eval()
    # The task is to predict the last token of the input.
    n_seen = 0
    n_correct = 0
    t0 = time.time()
    for chunk in dataset.iter(batch_size=batch_size):
        encoded = tokenizer(chunk["text"], return_tensors="pt", padding=True)
        ids = encoded.input_ids.to(model.device)
        mask = encoded.attention_mask.to(model.device)
        expected = ids[:, -1]
        # Pass only the first tokens
        result = model(ids[:, :-1], attention_mask=mask[:, :-1])
        predicted = result.logits[:, -1, :].argmax(dim=-1)
        n_seen += expected.size(0)
        n_correct += (predicted == expected).sum().item()
        if samples is not None and n_seen >= samples:
            break
    t1 = time.time()
    acc = n_correct / n_seen
    print(f"{n_seen} sequences evaluated in {t1 - t0:.2f} s. accuracy = {acc:.2f}")
    return acc


================================================
FILE: bench/generation/setup/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


================================================
FILE: bench/generation/setup/awq.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer


def prepare_inputs_for_generation(input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs):
    """Assemble the keyword arguments of a forward pass during generation.

    When a cache is provided, `input_ids` is trimmed to the tokens that were
    not processed yet. Position ids are derived from the attention mask unless
    they are passed explicitly through `kwargs`.

    Returns a dict holding either "inputs_embeds" (first step only) or
    "input_ids", followed by "position_ids", "past_key_values", "use_cache"
    and "attention_mask".
    """
    if past_key_values is not None:
        past_length = past_key_values[0][0].shape[2]
        ids_length = input_ids.shape[1]

        # Keep only the unprocessed tokens:
        if attention_mask is not None and attention_mask.shape[1] > ids_length:
            # 1 - The attention mask is longer than input_ids: some of the inputs were
            # passed exclusively as part of the cache (e.g. when passing input_embeds).
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        elif past_length < ids_length:
            # 2 - input_ids holds all input tokens: drop those already in the cache.
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), input_ids is assumed to
        # only hold unprocessed tokens.
        # No maximum cache length is enforced here, so the attention mask is never cropped.

    position_ids = kwargs.get("position_ids", None)
    if position_ids is None and attention_mask is not None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    use_embeds = inputs_embeds is not None and past_key_values is None
    model_inputs = {"inputs_embeds": inputs_embeds} if use_embeds else {"input_ids": input_ids}

    model_inputs["position_ids"] = position_ids
    model_inputs["past_key_values"] = past_key_values
    model_inputs["use_cache"] = kwargs.get("use_cache")
    model_inputs["attention_mask"] = attention_mask
    return model_inputs


def setup(model_id: str, weights: str, activations: str, group_size: int = 64, version="GEMV_FAST"):
    if activations != "none":
        raise ValueError("Activation quantization is not supported by HQQ")
    if weights != "int4":
        raise ValueError("AWQ only supports int4 weights.")
    quant_config = {"zero_point": True, "q_group_size": group_size, "w_bit": 4, "version": version}
    # Load model
    model = AutoAWQForCausalLM.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    # Quantize
    model.quantize(tokenizer, quant_config=quant_config)
    # We need to save otherwise it doesn't work
    quant_path = model_id.replace("/", "-") + f"_{group_size}_{version}"
    model.save_quantized(quant_path)
    # Reload model
    model = AutoAWQForCausalLM.from_quantized(quant_path)
    # Hack: force transformers 4.36.2 behaviour
    model.model.prepare_inputs_for_generation = prepare_inputs_for_generation
    # Hack because AWQ models are not transformers models
    model.device = next(model.parameters()).device
    return model, tokenizer


================================================
FILE: bench/generation/setup/bnb.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def setup(
    model_id: str,
    weights: str,
    activations: str,
    device: torch.device,
):
    if activations != "none":
        raise ValueError("Activation quantization is not supported by BitsAndBytes")
    if weights == "int4":
        quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="fp4")
    elif weights == "int8":
        quantization_config = BitsAndBytesConfig(load_in_8bit=True)
    else:
        raise ValueError("BitsAndBytes only supports int4 and int8 weights.")
    dtype = torch.float32 if device.type == "cpu" else torch.float16
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    quantization_config.bnb_4bit_compute_dtype = dtype
    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=dtype, low_cpu_mem_usage=True, quantization_config=quantization_config
    )

    return model, tokenizer


================================================
FILE: bench/generation/setup/hqq.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
from hqq.core.quantize import BaseQuantizeConfig
from hqq.engine.hf import HQQModelForCausalLM
from transformers import AutoTokenizer


def setup(model_id: str, weights: str, activations: str, device: torch.device, group_size: int = 64):
    if activations != "none":
        raise ValueError("Activation quantization is not supported by HQQ")
    if weights == "int4":
        quant_config = BaseQuantizeConfig(nbits=4, group_size=group_size)
    elif weights == "int8":
        quant_config = BaseQuantizeConfig(nbits=8, group_size=group_size)
    else:
        raise ValueError("HQQ only supports int4 and int8 weights.")
    # Load model
    model = HQQModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
    # Quantize
    model.quantize_model(quant_config=quant_config, compute_dtype=torch.float16, device=device)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    return model, tokenizer


================================================
FILE: bench/generation/setup/quanto.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.quanto import Calibration, freeze, qfloat8, qint4, qint8, quantize


@torch.no_grad()
def calibrate(model, tokenizer, batch_size, batches):
    """Run forward passes over the Lambada validation split.

    Batches of `batch_size` texts are tokenized (with padding) and fed to the
    model until at least `batch_size * batches` sequences have been processed
    or the dataset is exhausted. Outputs are discarded: the function is only
    useful for the side effects of the forward pass.
    """
    target_count = batch_size * batches
    validation_set = load_dataset("lambada", split=["validation"])[0]
    model.eval()
    seen = 0
    for chunk in validation_set.iter(batch_size=batch_size):
        encoded = tokenizer(chunk["text"], return_tensors="pt", padding=True)
        ids = encoded.input_ids.to(model.device)
        mask = encoded.attention_mask.to(model.device)
        model(ids, attention_mask=mask)
        seen += ids.size(0)
        if seen >= target_count:
            break


def setup(
    model_id: str,
    weights: str,
    activations: str,
    batch_size: int,
    device: torch.device,
    dtype: torch.dtype,
):
    """Load a causal LM and optionally quantize it with quanto.

    Args:
        model_id: identifier passed to `from_pretrained` for model and tokenizer.
        weights: weight quantization keyword understood by `keyword_to_qtype`.
        activations: activation quantization keyword understood by `keyword_to_qtype`.
        batch_size: batch size used for the calibration forward passes.
        device: device the model is moved to.
        dtype: torch dtype the model is loaded with.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer pads on the left using the EOS token.
    """
    weights = keyword_to_qtype(weights)
    activations = keyword_to_qtype(activations)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"

    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=dtype, low_cpu_mem_usage=True).to(device)

    # Nothing to quantize: hand back the float model untouched.
    if weights is None and activations is None:
        return model, tokenizer

    print("Quantizing")
    start = time.time()
    # Quantize the inner `model` attribute when there is one, the whole model otherwise.
    quantization_root = getattr(model, "model", model)
    quantize(quantization_root, weights=weights, activations=activations)
    if activations is not None:
        # Calibration forward passes are only run when activations are quantized.
        print("Calibrating")
        with Calibration():
            calibrate(model, tokenizer, batch_size, batches=4)
    print("Freezing")
    freeze(model)
    print(f"Finished: {time.time() - start:.2f}")
    return model, tokenizer


def keyword_to_qtype(k):
    """Map a command-line keyword to the matching quanto qtype.

    "none" maps to `None`; an unknown keyword raises `KeyError`.
    """
    qtypes = dict(none=None, int4=qint4, int8=qint8, float8=qfloat8)
    return qtypes[k]


================================================
FILE: bench/kernels/benchmark.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time
from contextlib import nullcontext

import numpy as np
import torch
from tqdm.auto import tqdm

from optimum.quanto.library import disable_extensions


def get_unpack_bench(bits, device):
    """Build a zero-argument callable that unpacks a random uint8 tensor.

    A 10240x10240 uint8 tensor with values drawn from [0, 2**bits) is created
    once and moved to `device`; every call of the returned function runs
    `torch.ops.quanto.unpack` on it and returns the result.
    """
    upper_bound = 2**bits
    packed = torch.randint(0, upper_bound, [10240, 10240], dtype=torch.uint8).to(device)
    return lambda: torch.ops.quanto.unpack(packed, bits)


def timing(get_bench_func, device, iterations=10):
    """Measure a benchmark function with the quanto extensions disabled and enabled.

    Args:
        get_bench_func: callable taking the device and returning the zero-argument
            function to benchmark.
        device: the `torch.device` the benchmark runs on.
        iterations: number of timed runs for each of the two configurations.

    Returns:
        A tuple `(mean_disabled_ms, mean_enabled_ms)`: the mean latency in
        milliseconds measured inside `disable_extensions()` and inside a null
        context respectively.
    """

    def synchronize(device):
        # Wait for pending work on the device so that recorded times are meaningful.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()
        else:
            torch.cpu.synchronize()

    def timing_event(device):
        # Use the native device events when available.
        if device.type == "cuda":
            return torch.cuda.Event(enable_timing=True)
        elif device.type == "mps":
            return torch.mps.Event(enable_timing=True)
        elif device.type == "xpu":
            return torch.xpu.Event(enable_timing=True)

        class CPUEvent:
            """Minimal stand-in for device events exposing record()/elapsed_time()."""

            def __init__(self):
                self.time = None

            def record(self):
                # perf_counter is monotonic and high resolution, unlike the
                # wall clock which can be adjusted while the benchmark runs.
                self.time = time.perf_counter()

            def elapsed_time(self, other):
                assert self.time is not None
                assert other.time is not None
                # Seconds to milliseconds, to match the device events.
                return (other.time - self.time) * 1000

        return CPUEvent()

    synchronize(device)

    bench_func = get_bench_func(device)
    # Warmup to load library
    bench_func()
    # One row per iteration; column 0: extensions disabled, column 1: extensions enabled.
    latencies = np.empty((iterations, 2))
    for i in tqdm(range(iterations)):
        for j, context in enumerate([disable_extensions(), nullcontext()]):
            start_event = timing_event(device)
            end_event = timing_event(device)
            synchronize(device)
            start_event.record()
            with context:
                bench_func()
            end_event.record()
            synchronize(device)
            latencies[i, j] = start_event.elapsed_time(end_event)
    return np.mean(latencies[:, 0]), np.mean(latencies[:, 1])


# Registry of benchmarks: kernel name -> factory taking the device and
# returning the zero-argument function to time.
GET_BENCH_FUNCTIONS = {
    f"unpack_{bits}bit": (lambda device, bits=bits: get_unpack_bench(bits, device)) for bits in (2, 4)
}


def main():
    """Command-line entry point: time the selected kernel(s) and print a summary.

    For each kernel the mean latency with extensions disabled ("python") and
    enabled ("ext") is printed together with their ratio.
    """
    parser = argparse.ArgumentParser(description="Kernel benchmark")
    parser.add_argument("--kernel", type=str, default=None, help="The kernel to benchmark. None to test all of them")
    parser.add_argument("--device", type=str, default=None, help="The device to use for benchmark.")
    parser.add_argument("--it", type=int, default=10, help="The number of benchmark iterations")
    args = parser.parse_args()

    # An explicit --device wins, otherwise pick the first available accelerator.
    if args.device is not None:
        device = torch.device(args.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        device = torch.device("mps")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    else:
        device = torch.device("cpu")

    selected = [args.kernel] if args.kernel is not None else list(GET_BENCH_FUNCTIONS)
    for kernel in selected:
        python_ms, ext_ms = timing(GET_BENCH_FUNCTIONS[kernel], device, iterations=args.it)
        ratio = python_ms / ext_ms
        print(f"\n{kernel}[{device.type}]: python = {python_ms:.3f} ms, ext = {ext_ms:.3f} ms, ratio = {ratio:.1f}x")


if __name__ == "__main__":
    main()


================================================
FILE: bench/kernels/benchmark_marlin_fp8.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from typing import Optional

import numpy as np
import torch

from optimum.quanto.tensor.weights.marlin.packed import pack_fp8_as_int32


# Default problem sizes: when no --m is passed on the command line, every
# (m, n, k) combination of the three lists below is benchmarked.
# m is the number of rows of the input A (m x k), n and k are the
# out_features and in_features of the weight B (k x n).
M_SHAPES = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
N_SHAPES = [4096]
K_SHAPES = [4096]


def run_benchmark(
    m: Optional[int],
    n: Optional[int],
    k: Optional[int],
    n_runs: int,
    n_warmup: int,
    dtype: torch.dtype = torch.float16,
):
    """Time the quanto FP8 Marlin gemm against a dequantize + `torch.matmul` reference.

    Args:
        m: number of rows of the input (used as the number of tokens).
        n: number of output features of the weight.
        k: number of input features of the weight.
        n_runs: total number of iterations, warmup included.
        n_warmup: number of leading iterations whose latency is discarded.
        dtype: floating point type of the inputs and scales.

    Returns:
        A tuple `(mean_latency_torch, mean_latency_marlin_fp8)` of the mean
        latencies over the non-warmup iterations, as reported by CUDA events.

    The benchmark always runs on the "cuda" device.
    NOTE(review): when `n_runs <= n_warmup` no latency is recorded, so the
    means are presumably NaN -- confirm and guard in the caller if needed.
    """
    print(f"\n----------- m={m}, n={n}, k={k}")
    n_tokens = m
    in_features = k
    out_features = n

    assert m is not None

    device = torch.device("cuda")
    inputs = torch.rand(n_tokens, in_features, dtype=dtype, device=device)

    # Random weight, converted to float8 then packed and repacked for the Marlin kernel.
    other_shape = (in_features, out_features)
    other_data = torch.rand(other_shape, dtype=dtype, device=device).to(torch.float8_e4m3fn)
    other_data_int32 = pack_fp8_as_int32(other_data)
    # Empty permutation tensor passed to the repack op.
    perm = torch.empty(0, dtype=torch.int, device=device)

    other_data_repack = torch.ops.quanto.gptq_marlin_repack(
        b_q_weight=other_data_int32, perm=perm, size_k=in_features, size_n=out_features, num_bits=8
    )
    # A single random scale, repeated to shape (1, out_features).
    other_scale = torch.rand(1, dtype=dtype, device=device)
    other_scale = other_scale.repeat(1, out_features)

    # Zero-initialized int buffer handed to the Marlin kernel.
    workspace = torch.zeros(out_features // 64 * 16, dtype=torch.int, device=device)

    latencies_marlin_fp8 = []
    latencies_torch = []
    with torch.no_grad():
        for i in range(n_runs):
            # --- Marlin FP8 kernel ---
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            # Synchronize around the timed region so only this call is measured.
            torch.cuda.synchronize(device)
            start_event.record()

            _ = torch.ops.quanto.fp8_marlin_gemm(
                a=inputs,
                b_q_weight=other_data_repack,
                b_scales=other_scale,
                workspace=workspace,
                num_bits=8,
                size_m=n_tokens,
                size_n=out_features,
                size_k=in_features,
            )
            end_event.record()
            torch.cuda.synchronize(device)

            latency_ms = start_event.elapsed_time(end_event)
            # Skip the warmup iterations.
            if i >= n_warmup:
                latencies_marlin_fp8.append(latency_ms)

            # --- Reference: convert the float8 weight back to dtype, scale it, then matmul ---
            start_event = torch.cuda.Event(enable_timing=True)
            end_event = torch.cuda.Event(enable_timing=True)
            torch.cuda.synchronize(device)
            start_event.record()
            # The conversion and scaling are part of the timed region.
            other = other_data.to(dtype) * other_scale
            _ = torch.matmul(inputs, other)
            end_event.record()
            torch.cuda.synchronize(device)

            latency_ms = start_event.elapsed_time(end_event)
            if i >= n_warmup:
                latencies_torch.append(latency_ms)

    mean_latency_torch = np.mean(latencies_torch)
    mean_latency_marlin_fp8 = np.mean(latencies_marlin_fp8)
    print("mean_latency_torch:", mean_latency_torch)
    print("mean_latency_marlin_fp8:", mean_latency_marlin_fp8)

    return mean_latency_torch, mean_latency_marlin_fp8


if __name__ == "__main__":
    # Script entry point: benchmark every requested (m, n, k) shape and print
    # the results as CSV.
    parser = argparse.ArgumentParser(description="Marlin FP8 kernel benchmark")
    parser.add_argument("--nruns", type=int, default=20, help="The number of benchmark iterations")
    parser.add_argument("--nwarmup", type=int, default=2, help="The number of warmup iterations (deducted from nruns)")
    parser.add_argument(
        "--m",
        type=int,
        help="m dimension of A=m*k",
        default=None,
    )
    parser.add_argument(
        "--n",
        type=int,
        help="n dimension of B=k*n (out_features)",
        default=None,
    )
    parser.add_argument(
        "--k",
        type=int,
        help="k dimension of A=m*k and B=k*n (in_features), hidden_size",
        default=None,
    )
    args = parser.parse_args()

    # Warmup iterations are deducted from nruns: without at least one extra
    # run no latency is recorded and the reported means are meaningless.
    if args.nruns <= args.nwarmup:
        parser.error("--nruns must be greater than --nwarmup")

    # Each dimension given on the command line replaces its default sweep, so
    # passing only --m no longer forwards n=None / k=None to the benchmark.
    m_shapes = M_SHAPES if args.m is None else [args.m]
    n_shapes = N_SHAPES if args.n is None else [args.n]
    k_shapes = K_SHAPES if args.k is None else [args.k]

    def shape_generator():
        for m in m_shapes:
            for n in n_shapes:
                for k in k_shapes:
                    yield (m, n, k)

    rows = ["m,n_out,k_in,torch_latency_ms,marlin_fp8_latency_ms"]
    for m, n, k in shape_generator():
        mean_latency_torch, mean_latency_marlin_fp8 = run_benchmark(m, n, k, args.nruns, args.nwarmup)
        rows.append(
            ",".join(
                [
                    str(m),
                    str(n),
                    str(k),
                    f"{mean_latency_torch:.4f}",
                    f"{mean_latency_marlin_fp8:.4f}",
                ]
            )
        )

    print("\nResults:")
    print("\n".join(rows) + "\n")


================================================
FILE: bench/kernels/benchmark_w4a16.py
================================================
# From: https://github.com/IST-DASLab/marlin/blob/master/bench.py
import argparse
import time

import torch

from optimum.quanto.tensor.weights.awq import AWQPackedTensor, AWQPacking
from optimum.quanto.tensor.weights.marlin import marlin_permute
from optimum.quanto.tensor.weights.marlin.int4 import MarlinInt4PackedTensor


def benchmark(f, warmup=1, iter=10):
    """Return the average duration in seconds of one call to `f` on CUDA.

    Args:
        f: zero-argument callable launching the work to measure.
        warmup: number of untimed calls issued before the measurement (may be 0).
        iter: number of timed calls.

    Returns:
        The elapsed time of the `iter` timed calls divided by `iter`.
    """
    for _ in range(warmup):
        f()
    # Start the clock only once the warmup work has completed. Doing this
    # outside the loop also covers warmup=0, which previously left the start
    # time unassigned.
    torch.cuda.synchronize()
    tick = time.perf_counter()
    for _ in range(iter):
        # We do not synchronize here in order to hide the kernel launch overhead during benchmarking as this will
        # also happen during realistic model inference as many launches are submitted to the kernel queue.
        f()
    torch.cuda.synchronize()
    res = (time.perf_counter() - tick) / iter
    # Make sure there is enough to "cool down" the GPU in between benchmarks to avoid throttling for later runs when
    # we execute many benchmarks consecutively
    time.sleep(1.0)
    return res


def get_problem(m, n, k, groupsize=128):
    """Create random tensors for one (m, n, k) 4-bit matmul benchmark on "cuda:0".

    Returns, in order: the half-precision input (m, k), a half-precision dense
    reference weight (k, n), the AWQ-packed and Marlin-packed data of the same
    random 4-bit weight, the scales, the Marlin-permuted scales, the scaled
    negated zero-points and their Marlin-permuted version.
    """
    bits = 4
    device = torch.device("cuda:0")
    n_groups = k // groupsize

    # The random draws are kept in this order so that results are unchanged for a given seed.
    inputs = torch.rand((m, k), dtype=torch.half, device=device)
    weight_4bit = torch.randint(0, 2**bits, (n, k), dtype=torch.uint8, device=device)
    weight_awq = AWQPackedTensor.pack(weight_4bit, packing=AWQPacking.V2)._data
    weight_marlin = MarlinInt4PackedTensor.pack(weight_4bit)._data
    weight_ref = torch.rand((k, n), dtype=torch.half, device=device)

    scales = torch.rand((n_groups, n), dtype=torch.half, device=device) / 2**bits
    scales_marlin = marlin_permute(scales)

    half_range = 2 ** (bits - 1)
    zeros = torch.randint(-half_range, half_range, (n_groups, n), dtype=torch.int8, device=device)
    scaled_zeros = -zeros * scales
    scaled_zeros_marlin = marlin_permute(scaled_zeros)

    torch.cuda.synchronize()
    return (
        inputs,
        weight_ref,
        weight_awq,
        weight_marlin,
        scales,
        scales_marlin,
        scaled_zeros,
        scaled_zeros_marlin,
    )


def benchmark_dense(A, B, m, n, k):
    """Benchmark `torch.matmul(A, B)` and derive throughput figures.

    Returns a dict with the average time per call ("s"), "TFLOP/s" and "GB/s".
    The factor 2 in the traffic estimate is presumably the byte size of a
    half-precision element -- TODO confirm.
    """
    seconds = benchmark(lambda: torch.matmul(A, B))
    operations = 2 * A.numel() * n
    traffic = 2 * A.numel() + 2 * B.numel() + 2 * (m * n)
    return {
        "s": seconds,
        "TFLOP/s": operations / seconds / 10**12,
        "GB/s": traffic / seconds / 10**9,
    }


def benchmark_awq(A, B, s, sz, m, n, k):
    """Benchmark the quanto AWQ fp16 x int4 gemm and derive throughput figures.

    The kernel is always invoked with bits=4 and group_size=128. Returns a dict
    with the average time per call ("s"), "TFLOP/s" and "GB/s".
    """

    def run_kernel():
        return torch.ops.quanto.gemm_f16i4_awq(A, B, s, sz, rows=m, out_cols=n, in_cols=k, bits=4, group_size=128)

    seconds = benchmark(run_kernel)
    operations = 2 * (m * k) * n
    traffic = 2 * A.numel() + 2 * B.numel() + 2 * (m * n) + 2 * s.numel() + 2 * sz.numel()
    return {
        "s": seconds,
        "TFLOP/s": operations / seconds / 10**12,
        "GB/s": traffic / seconds / 10**9,
    }


def benchmark_marlin(A, B, s, sz, m, n, k):
    """Benchmark the quanto Marlin fp16 x int4 gemm and derive throughput figures.

    A zero-initialized int workspace of `n // 128 * 16` elements is allocated on
    "cuda:0" and passed to the kernel. Returns a dict with the average time per
    call ("s"), "TFLOP/s" and "GB/s".
    """
    cuda0 = torch.device("cuda:0")
    workspace = torch.zeros(n // 128 * 16, dtype=torch.int, device=cuda0)

    def run_kernel():
        return torch.ops.quanto.gemm_f16i4_marlin(A, B, s, sz, workspace)

    seconds = benchmark(run_kernel)
    operations = 2 * (m * k) * n
    # B is weighted by 4 here (presumably 32-bit packed words -- TODO confirm).
    traffic = 2 * A.numel() + 4 * B.numel() + 2 * (m * n) + 2 * s.numel() + 2 * sz.numel()
    return {
        "s": seconds,
        "TFLOP/s": operations / seconds / 10**12,
        "GB/s": traffic / seconds / 10**9,
    }


def _llama_layer_shapes(hidden, inner):
    """Return the four (k, n) matmul shapes benchmarked for a Llama configuration."""
    return [
        (hidden, 3 * hidden),
        (hidden, hidden),
        (hidden, 2 * inner),
        (inner, hidden),
    ]


# Model name -> list of (k, n) weight shapes benchmarked for that model.
MODELS = {
    "Llama7B": _llama_layer_shapes(4096, 10752),
    "Llama13B": _llama_layer_shapes(5120, 13568),
    "Llama33B": _llama_layer_shapes(6656, 17664),
    "Llama65B": _llama_layer_shapes(8192, 21760),
    "Falcon180B": [
        # Note that parallel attention and FC allows layer fusions
        (14848, 14848 * 5 + 1024),
        (14848 * 5, 14848),
    ],
}


def run_benchmark(model, tokens=None):
    """Benchmark the AWQ and Marlin kernels on every layer shape of `model`.

    Args:
        model: a key of `MODELS`.
        tokens: a number of tokens, a list/tuple of them, or None to sweep the
            powers of two from 1 to 2048.

    For each number of tokens, one line per kernel is printed with the total
    time over all layers and the time-weighted averages of the throughput and
    of the speedup over the dense matmul.
    """
    if tokens is None:
        tokens = [2**exponent for exponent in range(12)]
    elif not isinstance(tokens, (list, tuple)):
        tokens = [tokens]
    groupsize = 128
    layers = MODELS[model]
    print(model)

    weighted_metrics = ("TFLOP/s", "GB/s", "speedup")

    def new_totals():
        return {"s": 0, "TFLOP/s": 0, "GB/s": 0, "speedup": 0}

    def accumulate(totals, result, dense_seconds):
        # Speedup is relative to the dense matmul on the same problem.
        result["speedup"] = dense_seconds / result["s"]
        totals["s"] += result["s"]
        # Weight every metric by the time spent in the layer.
        for metric in weighted_metrics:
            totals[metric] += result[metric] * result["s"]

    def normalize(totals):
        for metric in weighted_metrics:
            totals[metric] /= totals["s"]

    for m in tokens:
        tot_awq = new_totals()
        tot_marlin = new_totals()
        for k, n in layers:
            A, B_ref, B_awq, B_marlin, s, s_marlin, sz, sz_marlin = get_problem(m, n, k, groupsize)
            res_d = benchmark_dense(A, B_ref, m, n, k)
            accumulate(tot_awq, benchmark_awq(A, B_awq, s, sz, m, n, k), res_d["s"])
            accumulate(tot_marlin, benchmark_marlin(A, B_marlin, s_marlin, sz_marlin, m, n, k), res_d["s"])
        normalize(tot_awq)
        normalize(tot_marlin)
        print(
            "AWQ, tokens=%04d: s=%.5f, TFLOP/s=%07.3f, GB/s=%08.3f, speedup=%.2f"
            % (m, tot_awq["s"], tot_awq["TFLOP/s"], tot_awq["GB/s"], tot_awq["speedup"])
        )
        print(
            "Marlin, batch=%04d: s=%.5f, TFLOP/s=%07.3f, GB/s=%08.3f, speedup=%.2f"
            % (m, tot_marlin["s"], tot_marlin["TFLOP/s"], tot_marlin["GB/s"], tot_marlin["speedup"])
        )


def main():
    """Command-line entry point: benchmark one model configuration or all of them."""
    parser = argparse.ArgumentParser(description="W4A16 Matrix Multiplication Kernel benchmark")
    parser.add_argument(
        "--model", type=str, default=None, help="The model configuration to benchmark. None to test all of them."
    )
    parser.add_argument(
        "--tokens",
        type=int,
        default=None,
        help="The numbers of input tokens used to benchmark. None to test a predefined range.",
    )
    args = parser.parse_args()

    # Without --model, iterate over every known configuration.
    selected = [args.model] if args.model is not None else MODELS
    for name in selected:
        run_benchmark(name, args.tokens)
        print()


if __name__ == "__main__":
    main()


================================================
FILE: bench/torch_kernels/README.md
================================================
This directory contains a few scripts to test PyTorch kernels that are relevant for quantization.


================================================
FILE: bench/torch_kernels/test_int_mm.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import timeit

import torch


def main():
    """Compare the latency of `torch._int_mm` with a floating point `torch.matmul`.

    Two int8 matrices are multiplied with the integer kernel, then converted to
    float (float32 on CPU, float16 elsewhere) and multiplied again. The average
    latency of each variant over `--it` iterations is printed in milliseconds.
    """
    parser = argparse.ArgumentParser(description="Torch integer matmul benchmark")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--device", type=str, default=None, help="The device to use for the test.")
    parser.add_argument("--it", type=int, default=100, help="Number of iterations for average")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    def synchronize():
        # Accelerator kernels are launched asynchronously: wait for the queued
        # work to complete. Nothing to do on CPU.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()

    def avg_time(f, it):
        def run():
            for _ in range(it):
                f()
            # Without this the timer stops while kernels are still running.
            synchronize()

        # Do not charge previously queued work to the measurement.
        synchronize()
        return timeit.Timer(run).timeit(1) / it

    # Restrictions for accelerated integer matmul:
    # - input matrices must be 2D
    # - the collapsing dimension must be a multiple of 8
    A = torch.randint(1, 10, [2400, 3200]).type(torch.int8).to(device)
    B = torch.randint(1, 10, [3200, 4800]).type(torch.int8).to(device)

    print(f"Evaluating integer matmul on {device.type}:")
    # Warmup (slow)
    torch._int_mm(A, B)
    # Average on several calls
    t = avg_time(lambda: torch._int_mm(A, B), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")

    # Convert inputs to float

    def to_float(x):
        if x.device.type == ("cpu"):
            # matrix multiplication is not supported for float16 on CPU
            return x.to(torch.float32)
        return x.to(torch.float16)

    A = to_float(A)
    B = to_float(B)
    print(f"Evaluating {A.dtype} matmul on {device.type}:")

    # Warmup (slow)
    torch.matmul(A, B)
    # Average on several calls
    t = avg_time(lambda: torch.matmul(A, B), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")


if __name__ == "__main__":
    main()


================================================
FILE: bench/torch_kernels/test_int_mm_inductor.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import timeit

import torch


def mm(a, b):
    """Multiply two int8 matrices with the torch integer matmul kernel."""
    return torch._int_mm(a, b)


def avg_time(f, it):
    """Return the average duration in seconds of `it` calls to `f` on CUDA."""

    def run():
        for _ in range(it):
            f()
        # CUDA kernels are launched asynchronously: wait for them to complete
        # before the timer stops, otherwise only the launches are measured.
        torch.cuda.synchronize()

    # Do not charge previously queued work to the measurement.
    torch.cuda.synchronize()
    return timeit.Timer(run).timeit(1) / it


A = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda()
B = torch.randint(1, 10, [2400, 2400]).type(torch.int8).cuda()
it = 100

# Warmup (slow)
mm(A, B)
# Get a reference
print(avg_time(lambda: mm(A, B), it))

cmm = torch.compile(mm, backend="inductor")
# First invocation will trigger the actual compilation
cmm(A, B)
# Now compare execution time
print(avg_time(lambda: cmm(A, B), it))


================================================
FILE: bench/torch_kernels/test_weight_int4pack_mm.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import timeit

import torch


def _group_quantize_tensor(w, n_bit=4, q_group_size=16):
    assert w.dim() == 2
    w = w.transpose(0, 1).contiguous()
    assert q_group_size > 1
    assert w.shape[-1] % q_group_size == 0

    to_quant = w.reshape(-1, q_group_size)
    assert torch.isnan(to_quant).sum() == 0

    max_val = to_quant.amax(dim=1, keepdim=True)
    min_val = to_quant.amin(dim=1, keepdim=True)
    max_int = 2**n_bit - 1
    min_int = 0
    scales = (max_val - min_val).clamp(min=1e-6) / max_int
    assert torch.isnan(scales).sum() == 0

    zeros = min_val + scales * (2 ** (n_bit - 1))
    assert torch.isnan(zeros).sum() == 0

    out = to_quant.sub(min_val).div(scales).round().clamp_(min_int, max_int)
    assert torch.isnan(out).sum() == 0

    out = out.to(dtype=torch.int32).reshape(w.shape)

    # Scales and zeros for the same q-group should be contiguous, so we can
    # load as a 32-bit word
    scales = scales.view(w.shape[0], -1)
    zeros = zeros.view(w.shape[0], -1)
    scales_and_zeros = (
        torch.cat(
            [
                scales.reshape(scales.size(0), scales.size(1), 1),
                zeros.reshape(zeros.size(0), zeros.size(1), 1),
            ],
            2,
        )
        .transpose(0, 1)
        .contiguous()
    )

    return out, scales_and_zeros


def main():
    """Compare the latency of the torch int4 weight-packed matmul with `torch.mm`.

    A random weight is group-quantized to 4 bits and packed, the mean relative
    error of the quantized matmul against the float one is printed, then the
    average latency of each variant over `--it` iterations is printed in
    milliseconds.
    """
    parser = argparse.ArgumentParser(description="Torch quantized int4 weight matmul benchmark")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--dtype", type=str, default="fp16", choices=["fp16", "bf16"], help="floating point type")
    parser.add_argument("--device", type=str, default=None, help="The device to use for the test.")
    parser.add_argument("--it", type=int, default=10, help="Number of iterations for average")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    def synchronize():
        # Accelerator kernels are launched asynchronously: wait for the queued
        # work to complete. Nothing to do on CPU.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()

    def avg_time(f, it):
        def run():
            for _ in range(it):
                f()
            # Without this the timer stops while kernels are still running.
            synchronize()

        # Do not charge previously queued work to the measurement.
        synchronize()
        return timeit.Timer(run).timeit(1) / it

    dtype = {"fp16": torch.float16, "bf16": torch.bfloat16}[args.dtype]

    A = torch.rand([2400, 3200], dtype=dtype, device=device)
    B = torch.rand([3200, 4800], dtype=dtype, device=device)
    group_size = 128
    B_int32, B_scale_and_zeros = _group_quantize_tensor(B, n_bit=4, q_group_size=group_size)
    # Select the packing and the matmul op once: CPU uses dedicated variants.
    if device.type == "cpu":
        B_packed = torch._convert_weight_to_int4pack_for_cpu(B_int32, innerKTiles=2)
        int4_mm = torch._weight_int4pack_mm_for_cpu
    else:
        # Two 4-bit values are combined into each uint8 before packing.
        B_uint8 = (B_int32[::, ::2] << 4 | B_int32[::, 1::2]).to(torch.uint8)
        B_packed = torch._convert_weight_to_int4pack(B_uint8, innerKTiles=2)
        int4_mm = torch._weight_int4pack_mm

    def quantized_mm():
        return int4_mm(A, B_packed, group_size, B_scale_and_zeros)

    # Check quantized mm is close to float mm
    qout = quantized_mm()
    out = torch.mm(A, B)

    mean_err = ((qout - out).abs() / out.abs()).mean()
    print(mean_err)

    print(f"Evaluating quantized int4 matmul on {device.type}:")
    # Warmup (slow)
    quantized_mm()
    # Average on several calls
    t = avg_time(quantized_mm, args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")

    print(f"Evaluating {A.dtype} matmul on {device.type}:")

    # Warmup (slow)
    torch.mm(A, B)
    # Average on several calls
    t = avg_time(lambda: torch.mm(A, B), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")


if __name__ == "__main__":
    main()


================================================
FILE: bench/torch_kernels/test_weight_int8pack_mm.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import timeit

import torch


def main():
    """Compare the latency of `torch._weight_int8pack_mm` with a bfloat16 matmul.

    The quantized kernel is timed first, then the int8 weight is converted to
    bfloat16 and the equivalent `torch.matmul(A, B) * B_scale` is timed. The
    average latency of each variant over `--it` iterations is printed in
    milliseconds.
    """
    parser = argparse.ArgumentParser(description="Torch quantized int8 weight matmul benchmark")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--device", type=str, default=None, help="The device to use for the test.")
    parser.add_argument("--it", type=int, default=10, help="Number of iterations for average")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    def synchronize():
        # Accelerator kernels are launched asynchronously: wait for the queued
        # work to complete. Nothing to do on CPU.
        if device.type == "cuda":
            torch.cuda.synchronize()
        elif device.type == "mps":
            torch.mps.synchronize()
        elif device.type == "xpu":
            torch.xpu.synchronize()

    def avg_time(f, it):
        def run():
            for _ in range(it):
                f()
            # Without this the timer stops while kernels are still running.
            synchronize()

        # Do not charge previously queued work to the measurement.
        synchronize()
        return timeit.Timer(run).timeit(1) / it

    A = torch.rand([2400, 3200], dtype=torch.bfloat16, device=device)
    B = torch.randint(-128, 127, [4800, 3200], dtype=torch.int8, device=device)
    B_scale = torch.rand([4800], dtype=torch.bfloat16, device=device)

    print(f"Evaluating quantized int8 matmul on {device.type}:")
    # Warmup (slow)
    torch._weight_int8pack_mm(A, B, B_scale)
    # Average on several calls
    t = avg_time(lambda: torch._weight_int8pack_mm(A, B, B_scale), args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")

    # Convert weights to float

    B = B.to(torch.bfloat16).t()
    print(f"Evaluating {A.dtype} matmul on {device.type}:")

    # Warmup (slow)
    torch.matmul(A, B) * B_scale
    # Average on several calls
    t = avg_time(lambda: torch.matmul(A, B) * B_scale, args.it) * 1000
    print(f"Average inference on {args.it} iterations: {t:.4f} ms")


if __name__ == "__main__":
    main()


================================================
FILE: examples/nlp/text-classification/sst2/quantize_sst2_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import io
import time

import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
from transformers.pipelines.pt_utils import KeyDataset

from optimum.quanto import Calibration, freeze, qint4, qint8, quantize


def evaluate_model(model, tokenizer, dataset, device, batch_size):
    """Run sentiment analysis on the dataset sentences and print the accuracy.

    Predictions labelled "NEGATIVE" count as class 0, any other label as
    class 1; they are compared with the dataset "label" column. The number of
    sentences, the elapsed time and the accuracy are printed.
    """
    classifier = pipeline("sentiment-analysis", model, tokenizer=tokenizer, device=device)
    predictions = classifier(KeyDataset(dataset, "sentence"), batch_size=batch_size)
    # The clock is started after the pipeline call and stopped once all results are collected.
    start = time.time()
    pred_labels = [int(prediction["label"] != "NEGATIVE") for prediction in predictions]
    end = time.time()
    n_correct = np.equal(pred_labels, dataset["label"]).sum()
    accuracy = n_correct / len(pred_labels)
    print(f"{len(pred_labels)} sentences evaluated in {end - start:.2f} s. accuracy = {accuracy}")


def keyword_to_itype(k):
    """Map a command-line keyword to the matching quanto integer qtype.

    "none" maps to `None`; an unknown keyword raises `KeyError`.
    """
    itypes = dict(none=None, int8=qint8, int4=qint4)
    return itypes[k]


def main():
    """Evaluate an SST2 classifier as float, quantized, and after a state_dict round-trip.

    Steps:
      1. load the model, tokenizer and the first `--samples` validation sentences,
      2. evaluate the float model,
      3. quantize (and calibrate when activations are quantized), freeze and evaluate,
      4. serialize the quantized state_dict to memory, reload it into a freshly
         quantized model and evaluate that reloaded model.
    """
    parser = argparse.ArgumentParser(description="Transformers SST2 Example")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="distilbert-base-uncased-finetuned-sst-2-english",
        help="The name of the trained Model.",
    )
    parser.add_argument("--samples", type=int, default=872, help="The number of sst2 samples to use for evaluation.")
    parser.add_argument("--batch_size", type=int, default=100, help="The batch size to use for evaluation.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8"])
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Pick the first available accelerator unless a device was requested explicitly
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    model = AutoModelForSequenceClassification.from_pretrained(args.model).to(device)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    dataset = load_dataset("sst2", split=f"validation[:{args.samples}]")

    print("Float model")
    evaluate_model(model, tokenizer, dataset, device, args.batch_size)
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        # Activations scales are recorded by running the model under Calibration
        print("Calibrating ...")
        with Calibration():
            evaluate_model(model, tokenizer, dataset, device, args.batch_size)
    freeze(model)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    evaluate_model(model, tokenizer, dataset, device, args.batch_size)
    # Round-trip the quantized state_dict through an in-memory buffer
    b = io.BytesIO()
    torch.save(model.state_dict(), b)
    b.seek(0)
    state_dict = torch.load(b)
    model_reloaded = AutoModelForSequenceClassification.from_pretrained(args.model).to(device)
    quantize(model_reloaded, weights=weights, activations=activations)
    model_reloaded.load_state_dict(state_dict)
    print("Serialized quantized model")
    # Evaluate the reloaded model (not the original one) so that the
    # serialization round-trip is actually verified.
    evaluate_model(model_reloaded, tokenizer, dataset, device, args.batch_size)


# Only run main() when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/nlp/text-generation/quantize_causal_lm_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from optimum.quanto import Calibration, QuantizedModelForCausalLM, qfloat8, qint4, qint8


@torch.no_grad()
def generate(model, tokenizer, device, prompt, max_new_tokens):
    """Sample a continuation of `prompt` and print it with the elapsed time.

    Args:
        model: an object exposing `generate(**kwargs)`.
        tokenizer: a callable tokenizer that also exposes `decode`.
        device: the device the encoded inputs are moved to.
        prompt: the text to continue.
        max_new_tokens: the maximum number of tokens to generate.

    Only the first generated sequence is decoded and printed.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    sampling_options = {"do_sample": True, "top_k": 50, "top_p": 0.9}
    # The timed section includes moving the inputs to the device
    tic = time.time()
    input_ids = encoded.input_ids.to(device)
    attention_mask = encoded.attention_mask.to(device)
    sequences = model.generate(
        input_ids=input_ids,
        max_new_tokens=max_new_tokens,
        attention_mask=attention_mask,
        **sampling_options,
    )
    elapsed = time.time() - tic
    text = tokenizer.decode(sequences[0])
    print(f"Generated '{text}' in [{elapsed:.2f} s]")


@torch.no_grad()
def calibrate(model, tokenizer, dataset, device, batch_size, samples=None):
    """Run forward passes of `model` over batches of the dataset "text" column.

    Args:
        model: a callable model that also exposes `eval()`.
        tokenizer: a callable tokenizer returning `input_ids` and `attention_mask`.
        dataset: an object exposing `iter(batch_size=...)` that yields dicts with a "text" entry.
        device: the device the encoded inputs are moved to.
        batch_size: the number of texts per batch.
        samples: when not `None`, stop once at least that many samples have been processed.
    """
    model.eval()
    processed = 0
    batches = dataset.iter(batch_size=batch_size)
    for batch in batches:
        encoded = tokenizer(batch["text"], return_tensors="pt", padding=True)
        ids = encoded.input_ids.to(device)
        mask = encoded.attention_mask.to(device)
        model(ids, attention_mask=mask)
        processed += ids.size(0)
        budget_reached = samples is not None and processed >= samples
        if budget_reached:
            return


def keyword_to_itype(k):
    """Translate a command-line keyword into a quanto qtype.

    Args:
        k: one of "none", "int4", "int8" or "float8".

    Returns:
        The matching qtype, or `None` for "none".

    Raises:
        KeyError: if `k` is not a supported keyword.
    """
    itypes = {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8}
    return itypes[k]


def main():
    """Quantize a causal language model, optionally calibrate it, then generate text.

    The model is loaded with the requested precision, quantized through
    `QuantizedModelForCausalLM.quantize`, calibrated on shuffled "lambada"
    validation samples when activations are quantized, and finally used to
    generate a continuation of `--prompt`.
    """
    parser = argparse.ArgumentParser(description="Transformers Causal LM Example")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the trained Model.",
    )
    parser.add_argument("--prompt", type=str, default="One of my fondest memory is", help="The generation prompt.")
    parser.add_argument("--max_new_tokens", type=int, default=20, help="The maximum number of tokens to generate.")
    parser.add_argument("--batch_size", type=int, default=32, help="The batch_size for evaluation (and calibration).")
    parser.add_argument("--validation_batch", type=int, default=4, help="The number of batch to use for calibration.")
    parser.add_argument(
        "--load_dtype",
        type=str,
        default="float16",
        choices=["float16", "float32", "bfloat16"],
        help="Precision to load the initial model",
    )
    parser.add_argument(
        "--weights",
        type=str,
        default="int8",
        choices=["int4", "int8", "float8"],
    )
    parser.add_argument(
        "--activations",
        type=str,
        default="int8",
        choices=["none", "int8", "float8"],
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    # store_false: args.no_streamline is True unless the flag is passed, so it
    # can be forwarded as-is as the `streamline` value.
    parser.add_argument(
        "--no-streamline",
        action="store_false",
        help="Do not remove consecutive quantize/dequantize (not recommended).",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Provide detailed feedback on the console during calibration."
    )
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Pick the first available accelerator unless a device was requested explicitly
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    # argparse restricts --load_dtype to these three keys
    torch_dtype = {
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }[args.load_dtype]
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch_dtype, low_cpu_mem_usage=True).to(
        device
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # Batched calibration needs a padding token; pad on the left for generation
    tokenizer.pad_token_id = tokenizer.eos_token_id
    tokenizer.padding_side = "left"
    cal_dataset = load_dataset("lambada", split=["validation"])[0]

    print(f"{args.model} (w: {args.weights}, a: {args.activations})")
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    qmodel = QuantizedModelForCausalLM.quantize(model, weights=weights, activations=activations)
    if activations is not None:
        print("Calibrating ...")
        # Dataset.shuffle returns a new dataset instead of shuffling in place:
        # the result must be kept, otherwise calibration uses the unshuffled data.
        cal_dataset = cal_dataset.shuffle(seed=args.seed)
        with Calibration(streamline=args.no_streamline, debug=args.debug):
            cal_samples = args.batch_size * args.validation_batch
            calibrate(qmodel, tokenizer, cal_dataset, device, args.batch_size, samples=cal_samples)
    generate(qmodel, tokenizer, device, args.prompt, args.max_new_tokens)


# Only run main() when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/speech/speech_recognition/quantize_asr_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# REQUIRES: librosa, soundfile
import argparse
import io
import time
from functools import partial

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from evaluate import load
from transformers import WhisperForConditionalGeneration, WhisperProcessor

from optimum.quanto import Calibration, freeze, qint4, qint8, quantize


def map_to_feats(batch, processor):
    """Add model input features and a normalized reference text to `batch`.

    Args:
        batch: a mapping with an "audio" entry (holding "array" and
            "sampling_rate") and a "text" entry. It is modified in place.
        processor: a callable returning an object with `input_features`, and
            exposing `tokenizer.normalize`.

    Returns:
        The same `batch`, with "input_features" and "reference" entries set.
    """
    sample = batch["audio"]
    waveform, rate = sample["array"], sample["sampling_rate"]
    encoded = processor(waveform, sampling_rate=rate, return_tensors="pt")
    reference = processor.tokenizer.normalize(batch["text"])
    batch["input_features"] = encoded.input_features
    batch["reference"] = reference
    return batch


def transcribe_batch(batch, model, processor):
    """Generate and normalize a transcription for every item of `batch`.

    Args:
        batch: a mapping whose "input_features" entry is converted to a
            float32 array; axis 1 is squeezed before generation. It is
            modified in place.
        model: an object exposing `generate` and `device`.
        processor: an object exposing `decode` and `tokenizer.normalize`.

    Returns:
        The same `batch`, with a "prediction" entry holding the normalized texts.
    """
    feature_array = np.array(batch["input_features"], dtype=np.float32)
    with torch.no_grad():
        features = torch.from_numpy(feature_array).squeeze(1)
        token_ids = model.generate(features.to(model.device))
    # Decode everything first, then normalize the decoded texts
    decoded = list(map(processor.decode, token_ids))
    batch["prediction"] = list(map(processor.tokenizer.normalize, decoded))
    return batch


def evaluate_model(model, processor, dataset, metric: evaluate.EvaluationModule, batch_size=10):
    """Transcribe `dataset` in batches and print the metric score and timing.

    Args:
        model: the model passed to `transcribe_batch`.
        processor: the processor passed to `transcribe_batch`.
        dataset: a dataset exposing `map` and a "reference" column.
        metric: the metric used to compare predictions with references.
        batch_size: the number of items transcribed per `map` call.

    The score is the metric result multiplied by 100.
    """
    transcribe = partial(transcribe_batch, model=model, processor=processor)
    tic = time.time()
    transcribed = dataset.map(transcribe, batched=True, batch_size=batch_size)
    toc = time.time()
    raw_score = metric.compute(references=transcribed["reference"], predictions=transcribed["prediction"])
    score = 100 * raw_score
    print(score)
    print(f"{len(transcribed)} sentences evaluated in {toc - tic:.2f} s. {metric.name} = {score}")


def keyword_to_itype(k):
    """Translate a command-line keyword into a quanto qtype.

    Args:
        k: one of "none", "int8" or "int4".

    Returns:
        The matching qtype, or `None` for "none".

    Raises:
        KeyError: if `k` is not a supported keyword.
    """
    itypes = {"none": None, "int8": qint8, "int4": qint4}
    return itypes[k]


def main():
    """Evaluate a Whisper model as float, quantized, and after a state_dict round-trip.

    Steps:
      1. load the model, processor and the dummy librispeech validation split
         (truncated to `--samples` items when it is larger),
      2. evaluate the float model with the "wer" metric,
      3. quantize (and calibrate when activations are quantized), freeze and evaluate,
      4. serialize the quantized state_dict to memory, reload it into a freshly
         quantized model and evaluate that reloaded model.
    """
    parser = argparse.ArgumentParser(description="Transformers Whisper Example")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="openai/whisper-medium",
        help="The name of the trained Model.",
    )
    parser.add_argument(
        "--samples", type=int, default=872, help="The number of librispeech samples to use for evaluation."
    )
    parser.add_argument("--batch_size", type=int, default=10, help="The batch size to use for evaluation.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8"])
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Pick the first available accelerator unless a device was requested explicitly
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
            print("USING CUDA")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")
            print("USING CPU")
    else:
        device = torch.device(args.device)

    model = WhisperForConditionalGeneration.from_pretrained(args.model).to(device)
    model.config.forced_decoder_ids = None
    processor = WhisperProcessor.from_pretrained(args.model)
    dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    # Honor --samples: the option was parsed but never applied to the dataset.
    # Nothing changes when the dataset holds fewer items than requested.
    if 0 < args.samples < len(dataset):
        dataset = dataset.select(range(args.samples))
    processed_dataset = dataset.map(lambda x: map_to_feats(x, processor))
    wer = load("wer")

    print("Float model:")
    evaluate_model(model, processor, processed_dataset, wer, args.batch_size)
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        # Activations scales are recorded by running the model under Calibration
        print("Calibrating ...")
        with Calibration():
            evaluate_model(model, processor, processed_dataset, wer, args.batch_size)
    freeze(model)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    evaluate_model(model, processor, processed_dataset, wer, args.batch_size)
    # Round-trip the quantized state_dict through an in-memory buffer
    b = io.BytesIO()
    torch.save(model.state_dict(), b)
    b.seek(0)
    state_dict = torch.load(b)
    model_reloaded = WhisperForConditionalGeneration.from_pretrained(args.model).to(device)
    quantize(model_reloaded, weights=weights, activations=activations)
    model_reloaded.load_state_dict(state_dict)
    print("Serialized quantized model")
    # Evaluate the reloaded model (not the original one) so that the
    # serialization round-trip is actually verified.
    evaluate_model(model_reloaded, processor, processed_dataset, wer, args.batch_size)


# Only run main() when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/speech/speech_recognition/requirements.txt
================================================
transformers
evaluate
librosa
soundfile
jiwer


================================================
FILE: examples/vision/StableDiffusion/README.md
================================================
# Quantize Stable Diffusion examples

## Running locally with PyTorch

### Installing the dependencies

Before running the scripts, make sure to install the required dependencies:

**Important**

To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
```bash
git clone https://github.com/huggingface/quanto
cd quanto
pip install -e .
```

Then cd in the `examples/vision/StableDiffusion` folder and run
```bash
pip install -r requirements.txt
```

**Now, we can launch the image generation script:**

```bash
python quantize_StableDiffusion.py --batch_size=1 --torch_dtype="fp32"
```

The script accepts the following flags:

* `batch_size` The number of images generated per prompt.

* `torch_dtype` {fp32,fp16,bf16}
* `unet_qtype` {fp8,int8,int4,none}

Our experiments were conducted on a single 24GB A10 GPU.
```bash
fp16-fp16

batch_size: 1, torch_dtype: fp16, unet_dtype: none  in 3.307 seconds.Memory: 3.192GB.
```

```bash
bf16-int8

batch_size: 1, torch_dtype: bf16, unet_dtype: int8  in 3.918 seconds.Memory: 2.644GB.
```

```bash
fp16-int8

batch_size: 1, torch_dtype: fp16, unet_dtype: int8  in 3.920 seconds.Memory: 2.634GB.
``` 

Both int8 configurations above produce high-quality images while keeping generation fast.

================================================
FILE: examples/vision/StableDiffusion/quantize_StableDiffusion.py
================================================
import argparse
import gc

import torch
import torch.utils.benchmark as benchmark
from diffusers import DiffusionPipeline

from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize


# Identifier of the Stable Diffusion checkpoint loaded by load_pipeline().
CKPT = "runwayml/stable-diffusion-v1-5"
# Number of inference steps (same value as the --num_inference_steps default).
NUM_INFERENCE_STEPS = 50
# Number of un-timed pipeline calls issued before the benchmark.
WARM_UP_ITERS = 5
# Prompt text (same value as the --prompt default).
PROMPT = "ghibli style, a fantasy landscape with castles"

# Maps the --torch_dtype choices to torch dtypes.
TORCH_DTYPES = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
# Maps the --unet_qtype choices to quanto qtypes ("none" means no UNet quantization).
UNET_QTYPES = {
    "fp8": qfloat8,
    "int8": qint8,
    "int4": qint4,
    "none": None,
}


def load_pipeline(torch_dtype, unet_dtype=None, device="cpu"):
    """Load the diffusion pipeline and optionally quantize its UNet.

    Args:
        torch_dtype: the dtype passed to `DiffusionPipeline.from_pretrained`.
        unet_dtype: the qtype used for the UNet weights; when falsy the UNet
            is left untouched.
        device: the device the pipeline is moved to.

    Returns:
        The pipeline, with its progress bar disabled.
    """
    loaded = DiffusionPipeline.from_pretrained(CKPT, torch_dtype=torch_dtype, use_safetensors=True)
    pipe = loaded.to(device)

    quantize_unet = bool(unet_dtype)
    if quantize_unet:
        unet = pipe.unet
        quantize(unet, weights=unet_dtype)
        freeze(unet)

    pipe.set_progress_bar_config(disable=True)
    return pipe


def run_inference(pipe, batch_size=1):
    """Run one seeded generation with `pipe`, discarding the result.

    Args:
        pipe: the pipeline to call.
        batch_size: the number of images generated per prompt.

    The prompt and the number of inference steps are read from the
    module-level `args` namespace created when the file runs as a script.
    """
    _ = pipe(
        prompt=args.prompt,
        num_inference_steps=args.num_inference_steps,
        # Use the parameter: it was previously ignored in favour of args.batch_size
        num_images_per_prompt=batch_size,
        # Re-seed on every call so that each run performs the same work
        generator=torch.manual_seed(0),
    )


def benchmark_fn(f, *args, **kwargs):
    """Time `f(*args, **kwargs)` with `torch.utils.benchmark`.

    Returns:
        The mean of the `blocked_autorange()` measurement, formatted as a
        string with three decimals.
    """
    timer = benchmark.Timer(
        stmt="f(*args, **kwargs)",
        globals={"f": f, "args": args, "kwargs": kwargs},
    )
    measurement = timer.blocked_autorange()
    return f"{measurement.mean:.3f}"


def bytes_to_giga_bytes(bytes):
    """Convert a byte count to GiB (1024**3 bytes), formatted with three decimals."""
    gib = bytes / (1024**3)
    return "{:.3f}".format(gib)


def get_device_memory(device):
    """Return the memory currently allocated on `device`.

    Garbage is collected first, then the backend cache is emptied before the
    allocation is read.

    Returns:
        The value reported by the backend for "cuda", "mps" and "xpu" devices,
        `None` for any other device type.
    """
    gc.collect()
    backend = device.type
    if backend == "cuda":
        torch.cuda.empty_cache()
        allocated = torch.cuda.memory_allocated()
    elif backend == "mps":
        torch.mps.empty_cache()
        allocated = torch.mps.current_allocated_memory()
    elif backend == "xpu":
        torch.xpu.empty_cache()
        allocated = torch.xpu.memory_allocated()
    else:
        allocated = None
    return allocated


# Script entry point: benchmark the (optionally quantized) pipeline, report
# latency and peak memory, then generate and save one image.
if __name__ == "__main__":
    import os

    parser = argparse.ArgumentParser()
    # Defaults reuse the module constants instead of duplicating their values
    parser.add_argument("--prompt", type=str, default=PROMPT)
    parser.add_argument(
        "--output_path",
        type=str,
        default=None,
        help="File (or existing directory) where the generated image is saved.",
    )
    parser.add_argument("--num_inference_steps", type=int, default=NUM_INFERENCE_STEPS, help="Number of inference steps")
    parser.add_argument("--batch_size", type=int, default=1)
    parser.add_argument("--torch_dtype", type=str, default="fp32", choices=list(TORCH_DTYPES.keys()))
    parser.add_argument("--unet_qtype", type=str, default=None, choices=list(UNET_QTYPES.keys()))
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    # NOTE: run_inference() reads this module-level `args` namespace
    args = parser.parse_args()

    # Pick the first available accelerator unless a device was requested explicitly
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    pipeline = load_pipeline(
        TORCH_DTYPES[args.torch_dtype], UNET_QTYPES[args.unet_qtype] if args.unet_qtype else None, device
    )

    # Un-timed warm-up calls before the measurement
    for _ in range(WARM_UP_ITERS):
        run_inference(pipeline, args.batch_size)

    time = benchmark_fn(run_inference, pipeline, args.batch_size)
    if device.type == "cuda":
        memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated())  # in GBs.
    elif device.type == "xpu":
        memory = bytes_to_giga_bytes(torch.xpu.max_memory_allocated())  # in GBs.
    else:
        # No peak memory counter is queried for the other device types
        memory = 0
    # Called for its side effects (garbage collection and cache emptying)
    get_device_memory(device)
    print(
        f"batch_size: {args.batch_size}, torch_dtype: {args.torch_dtype}, unet_dtype: {args.unet_qtype}  in {time} seconds."
    )
    print(f"Memory: {memory}GB.")

    img_name = f"bs@{args.batch_size}-dtype@{args.torch_dtype}-unet_dtype@{args.unet_qtype}.png"
    image = pipeline(
        prompt=args.prompt,
        # Use the requested number of steps, consistently with the benchmarked runs
        num_inference_steps=args.num_inference_steps,
        num_images_per_prompt=args.batch_size,
    ).images[0]
    # Honor --output_path (previously ignored); default to the current directory
    if args.output_path is None:
        img_path = img_name
    elif os.path.isdir(args.output_path):
        img_path = os.path.join(args.output_path, img_name)
    else:
        img_path = args.output_path
    image.save(img_path)


================================================
FILE: examples/vision/StableDiffusion/requirements.txt
================================================
quanto
diffusers
torch
transformers
accelerate
wandb

================================================
FILE: examples/vision/image-classification/mnist/quantize_mnist_model.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import time
from tempfile import NamedTemporaryFile

import torch
import torch.nn.functional as F
from accelerate import init_empty_weights
from safetensors.torch import load_file, save_file
from torchvision import datasets, transforms
from transformers import AutoConfig, AutoModel

from optimum.quanto import (
    Calibration,
    QTensor,
    freeze,
    qfloat8,
    qint4,
    qint8,
    quantization_map,
    quantize,
    requantize,
)


def test(model, device, test_loader):
    """Evaluate `model` on `test_loader` and print timing, loss and accuracy.

    Args:
        model: the model to evaluate; it is moved to `device` and put in eval mode.
        device: the device used for the model and the data.
        test_loader: an iterable of (data, target) batches exposing `dataset`.

    The loss is the summed `nll_loss` divided by the dataset length.
    """
    model.to(device)
    model.eval()
    loss_sum = 0
    n_correct = 0
    with torch.no_grad():
        tic = time.time()
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            scores = model(inputs)
            if isinstance(scores, QTensor):
                scores = scores.dequantize()
            # accumulate the summed batch loss
            loss_sum += F.nll_loss(scores, labels, reduction="sum").item()
            # index of the highest score for each sample
            predicted = scores.argmax(dim=1, keepdim=True)
            n_correct += predicted.eq(labels.view_as(predicted)).sum().item()
        toc = time.time()

    n_samples = len(test_loader.dataset)
    mean_loss = loss_sum / n_samples
    accuracy_pct = 100.0 * n_correct / n_samples

    print(
        "\nTest set evaluated in {:.2f} s: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            toc - tic, mean_loss, n_correct, n_samples, accuracy_pct
        )
    )


def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Train `model` over every batch of `train_loader` once.

    Args:
        log_interval: the progress is printed every `log_interval` batches.
        model: the model to train; it is moved to `device` and put in train mode.
        device: the device used for the model and the data.
        train_loader: an iterable of (data, target) batches exposing `dataset`.
        optimizer: the optimizer stepped after each batch.
        epoch: the epoch number, only used in the printed progress.
    """
    model.to(device)
    model.train()
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        scores = model(inputs)
        if isinstance(scores, QTensor):
            scores = scores.dequantize()
        batch_loss = F.nll_loss(scores, labels)
        batch_loss.backward()
        optimizer.step()
        if step % log_interval != 0:
            continue
        progress_pct = 100.0 * step / len(train_loader)
        print(
            "Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                epoch,
                step * len(inputs),
                len(train_loader.dataset),
                progress_pct,
                batch_loss.item(),
            )
        )


def keyword_to_itype(k):
    """Translate a command-line keyword into a quanto qtype.

    Args:
        k: one of "none", "int4", "int8" or "float8".

    Returns:
        The matching qtype, or `None` for "none".

    Raises:
        KeyError: if `k` is not a supported keyword.
    """
    itypes = {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8}
    return itypes[k]


def main():
    """Quantize, tune, freeze and reload an MNIST classifier, testing it at each step.

    Steps:
      1. load MNIST and the pretrained model, and test the float model,
      2. quantize (and calibrate when activations are quantized) and test,
      3. tune the quantized model for one epoch and test,
      4. freeze the model and test,
      5. serialize the state_dict with safetensors, requantize an empty model
         from it and test that reloaded model.
    """
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument(
        "--batch-size", type=int, default=250, metavar="N", help="input batch size for testing (default: 250)"
    )
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument("--model", type=str, default="dacorvo/mnist-mlp", help="The name of the trained Model.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8", "float8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8", "float8"])
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    args = parser.parse_args()

    torch.manual_seed(args.seed)

    # Pick the first available accelerator unless a device was requested explicitly
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    dataset_kwargs = {"batch_size": args.batch_size}
    if torch.cuda.is_available() or torch.xpu.is_available():
        backend_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
        dataset_kwargs.update(backend_kwargs)

    transform = transforms.Compose(
        [
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
            transforms.Lambda(lambda x: torch.flatten(x)),
        ]
    )
    dataset1 = datasets.MNIST("./data", train=True, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(dataset1, **dataset_kwargs)
    dataset2 = datasets.MNIST("./data", train=False, download=True, transform=transform)
    test_loader = torch.utils.data.DataLoader(dataset2, **dataset_kwargs)
    model = AutoModel.from_pretrained(args.model, trust_remote_code=True)
    model.eval()
    print("Float model")
    test(model, device, test_loader)
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        # Activations scales are recorded by running the model under Calibration
        print("Calibrating ...")
        with Calibration():
            test(model, device, test_loader)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    test(model, device, test_loader)
    print("Tuning quantized model for one epoch")
    optimizer = torch.optim.Adadelta(model.parameters(), lr=0.5)
    train(50, model, device, train_loader, optimizer, 1)
    print("Quantized tuned model")
    test(model, device, test_loader)
    print("Quantized frozen model")
    freeze(model)
    test(model, device, test_loader)
    # Serialize model to a state_dict, save it to disk and reload it
    with NamedTemporaryFile() as tmp_file:
        save_file(model.state_dict(), tmp_file.name)
        state_dict = load_file(tmp_file.name)
    # Create an empty model: no pretrained weights are loaded here, since every
    # tensor is restored from the serialized state_dict below.
    config = AutoConfig.from_pretrained(args.model, trust_remote_code=True)
    with init_empty_weights():
        model_reloaded = AutoModel.from_config(config, trust_remote_code=True)
    # Requantize it using the serialized state_dict
    requantize(model_reloaded, state_dict, quantization_map(model), device)
    print("Serialized quantized model")
    test(model_reloaded, device, test_loader)


# Only run main() when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/vision/image-classification/pets/quantize_vit_model.py
================================================
import argparse
import time
from tempfile import NamedTemporaryFile

import torch
import torch.nn.functional as F
from accelerate import init_empty_weights
from datasets import load_dataset
from safetensors.torch import load_file, save_file
from transformers import (
    ViTConfig,
    ViTForImageClassification,
    ViTImageProcessor,
)

from optimum.quanto import (
    Calibration,
    QTensor,
    freeze,
    qfloat8,
    qint4,
    qint8,
    quantization_map,
    quantize,
    requantize,
)


def test(model, device, test_loader):
    """Evaluate `model` on `test_loader` and print timing, loss and accuracy.

    Args:
        model: the classifier to evaluate; its output must expose `logits`.
            It is moved to `device` and put in eval mode.
        device: the device used for the model and the data.
        test_loader: an iterable of batches holding "pixel_values" and
            "labels" entries, and exposing `dataset`.

    The loss is the summed cross-entropy divided by the dataset length.
    """
    model.to(device)
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        start = time.time()
        for batch in test_loader:
            data, target = batch["pixel_values"], batch["labels"]
            data, target = data.to(device), target.to(device)
            output = model(data).logits
            if isinstance(output, QTensor):
                output = output.dequantize()
            # The model returns raw logits, not log-probabilities: nll_loss would
            # produce a meaningless value, cross_entropy applies log_softmax first.
            test_loss += F.cross_entropy(output, target, reduction="sum").item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max logit
            correct += pred.eq(target.view_as(pred)).sum().item()
        end = time.time()

    test_loss /= len(test_loader.dataset)

    print(
        "\nTest set evaluated in {:.2f} s: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n".format(
            end - start, test_loss, correct, len(test_loader.dataset), 100.0 * correct / len(test_loader.dataset)
        )
    )


def keyword_to_itype(k):
    """Translate a command-line keyword into a quanto qtype.

    Args:
        k: one of "none", "int4", "int8" or "float8".

    Returns:
        The matching qtype, or `None` for "none".

    Raises:
        KeyError: if `k` is not a supported keyword.
    """
    itypes = {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8}
    return itypes[k]


def main():
    """Quantize, freeze and reload a ViT image classifier, testing it at each step.

    Steps:
      1. load the processor, the model and the "rokmr/pets" dataset, and test
         the float model,
      2. quantize (and calibrate when activations are quantized) and test,
      3. freeze the model and test,
      4. serialize the state_dict with safetensors, requantize an empty model
         from it and test that reloaded model.
    """
    parser = argparse.ArgumentParser(description="ViT PETS Example")
    parser.add_argument("--model", type=str, default="super-j/vit-base-pets")
    parser.add_argument("--device", type=str, default=None, help="The device to use for evaluation.")
    parser.add_argument("--weights", type=str, default="int8", choices=["int4", "int8", "float8"])
    parser.add_argument("--activations", type=str, default="int8", choices=["none", "int8", "float8"])
    args = parser.parse_args()

    dataset_kwargs = {}

    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
            cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
            dataset_kwargs.update(cuda_kwargs)
        # mps is only auto-selected when no float8 type is requested
        elif all([torch.backends.mps.is_available(), args.weights != "float8", args.activations != "float8"]):
            device = torch.device("mps")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    # load  the processor and model
    model_name = args.model
    processor = ViTImageProcessor.from_pretrained(model_name)
    model = ViTForImageClassification.from_pretrained(model_name)

    def transform(data_batch):
        # Take a list of PIL images and turn them to pixel values
        inputs = processor(data_batch["image"], return_tensors="pt")

        # Don't forget to include the labels!
        inputs["labels"] = data_batch["label"]
        return inputs

    ds = load_dataset("rokmr/pets")
    prepared_ds = ds.with_transform(transform)
    test_loader = torch.utils.data.DataLoader(prepared_ds["test"], **dataset_kwargs)
    print("Model before quantization...")
    test(model, device, test_loader)
    weights = keyword_to_itype(args.weights)
    activations = keyword_to_itype(args.activations)
    quantize(model, weights=weights, activations=activations)
    if activations is not None:
        # Activations scales are recorded by running the model under Calibration
        print("Calibrating ...")
        with Calibration():
            test(model, device, test_loader)
    print(f"Quantized model (w: {args.weights}, a: {args.activations})")
    test(model, device, test_loader)
    print("Quantized frozen model")
    freeze(model)
    test(model, device, test_loader)
    # Serialize model to a state_dict, save it to disk and reload it
    with NamedTemporaryFile() as tmp_file:
        save_file(model.state_dict(), tmp_file.name)
        state_dict = load_file(tmp_file.name)
    # Create an empty model from the configuration only: pretrained weights are
    # not loaded here, since every tensor is restored from the serialized
    # state_dict below.
    config = ViTConfig.from_pretrained(model_name)
    with init_empty_weights():
        model_reloaded = ViTForImageClassification(config)
    # Requantize it using the serialized state_dict
    requantize(model_reloaded, state_dict, quantization_map(model), device)
    print("Serialized quantized model")
    test(model_reloaded, device, test_loader)


# Only run main() when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: examples/vision/object-detection/quantize_owl_model.py
================================================
import argparse
import gc

import numpy as np
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, Owlv2ForObjectDetection
from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD

from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize


def detect(model, processor, image, texts):
    """Run object detection on `image` for the given text queries and print the detections.

    Args:
        model: the detection model; inputs are moved to `model.device` before the forward pass.
        processor: the processor used both to build the inputs and to post-process the outputs.
        image: the image to analyze.
        texts: a list of lists of text queries; only the first list is reported.

    Returns:
        None. Detections (or a message when there are none) are printed to stdout.
    """
    inputs = processor(text=texts, images=image, return_tensors="pt").to(model.device)

    # Inference only: no gradients are required
    with torch.no_grad():
        outputs = model(**inputs)

    # Boxes are expressed relative to the padded, unnormalized image, so rebuild
    # that image from the pixel values to obtain the target (height, width).
    pixels = inputs.pixel_values.squeeze().cpu().numpy()
    clip_std = np.array(OPENAI_CLIP_STD)[:, None, None]
    clip_mean = np.array(OPENAI_CLIP_MEAN)[:, None, None]
    restored = (pixels * clip_std) + clip_mean
    restored = (restored * 255).astype(np.uint8)
    # channels-first -> channels-last, as expected by PIL
    restored = np.moveaxis(restored, 0, -1)
    padded_image = Image.fromarray(restored)

    # PIL reports (width, height): reverse it to get (height, width)
    target_sizes = torch.Tensor([padded_image.size[::-1]])
    # Convert raw outputs (boxes and class logits) to final bounding boxes and scores
    results = processor.post_process_object_detection(outputs=outputs, threshold=0.2, target_sizes=target_sizes)

    # Only the predictions of the first image (and its text queries) are reported
    first = results[0]
    text = texts[0]
    boxes, scores, labels = first["boxes"], first["scores"], first["labels"]

    if len(boxes) == 0:
        print("None of the specified labels were detected")
        return

    for raw_box, score, label in zip(boxes, scores, labels):
        box = [round(coordinate, 2) for coordinate in raw_box.tolist()]
        print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")


def get_device_memory(device):
    """Return the memory currently allocated on `device`, or None when it cannot be queried.

    The garbage collector is run first, then the backend cache is emptied before
    reading the allocated memory. Only "cuda", "mps" and "xpu" device types are
    supported; any other type yields None.
    """
    gc.collect()
    kind = device.type
    if kind not in ("cuda", "mps", "xpu"):
        return None
    backend = getattr(torch, kind)
    backend.empty_cache()
    if kind == "mps":
        # The MPS backend exposes a differently named query
        return backend.current_allocated_memory()
    return backend.memory_allocated()


def keyword_to_qtype(k):
    """Translate a command-line keyword into the matching quanto qtype (None for "none").

    Raises:
        KeyError: if `k` is not one of "none", "int4", "int8" or "float8".
    """
    qtypes_by_keyword = {"none": None, "int4": qint4, "int8": qint8, "float8": qfloat8}
    return qtypes_by_keyword[k]


def main():
    """Parse the command line, optionally quantize an OWLv2 model and run detection on one image."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="google/owlv2-base-patch16")
    parser.add_argument("--image", type=str, required=True)
    parser.add_argument("--texts", type=str, nargs="+", required=True)
    parser.add_argument("--weights", type=str, default="none", choices=["none", "int4", "int8", "float8"])
    parser.add_argument("--exclude-heads", action="store_true", help="Do not quantize detection heads")
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    args = parser.parse_args()

    # An explicit --device wins; otherwise pick the first available backend
    if args.device is not None:
        device = torch.device(args.device)
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    elif torch.backends.mps.is_available():
        # MPS backend does not support torch.float64 that is required for owl models
        device = torch.device("cpu")
    elif torch.xpu.is_available():
        device = torch.device("xpu")
    else:
        device = torch.device("cpu")

    processor = AutoProcessor.from_pretrained(args.model)
    model = Owlv2ForObjectDetection.from_pretrained(args.model, low_cpu_mem_usage=True).to(device)

    weights_qtype = keyword_to_qtype(args.weights)
    if weights_qtype is not None:
        # With --exclude-heads only the `owlv2` submodule is quantized
        quantized_module = model.owlv2 if args.exclude_heads else model
        quantize(quantized_module, weights=weights_qtype)
        freeze(model)

    memory = get_device_memory(device)
    if memory is not None:
        memory_gb = memory / 2**30
        print(f"{device.type} device memory: {memory_gb:.2f} GB.")

    # The image is either a local path or an URL whose raw stream is handed to PIL
    image_source = args.image
    if image_source.startswith("http"):
        image_source = requests.get(args.image, stream=True).raw
    image = Image.open(image_source)

    detect(model, processor, image, [args.texts])


# Script entry point: run the example only when executed directly, not on import.
if __name__ == "__main__":
    main()


================================================
FILE: examples/vision/text-to-image/quantize_pixart_sigma.py
================================================
import argparse
import gc

import torch
from diffusers import DiffusionPipeline

from optimum.quanto import freeze, qfloat8, qint4, qint8, quantize


# Number of inference steps passed to the pipeline for each generation
NUM_INFERENCE_STEPS = 50

# Maps the --torch_dtype command-line keyword to the corresponding torch dtype
TORCH_DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16}
# Maps the --qtype command-line keyword to a quanto qtype ("none" maps to None, i.e. no quantization)
QTYPES = {
    "fp8": qfloat8,
    "int8": qint8,
    "int4": qint4,
    "none": None,
}


def load_pipeline(model_id, torch_dtype, qtype=None, device="cpu"):
    """Load a diffusion pipeline on `device`, optionally quantizing its weights.

    Args:
        model_id: identifier of the pipeline passed to `DiffusionPipeline.from_pretrained`.
        torch_dtype: torch dtype used to load the pipeline.
        qtype: when truthy, the quanto qtype used to quantize then freeze the
            transformer and the text encoder (in that order).
        device: the device the pipeline is moved to.

    Returns:
        The pipeline, with its progress bar disabled.
    """
    pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True)
    pipe = pipe.to(device)

    if qtype:
        for component in (pipe.transformer, pipe.text_encoder):
            quantize(component, weights=qtype)
            freeze(component)

    pipe.set_progress_bar_config(disable=True)
    return pipe


def get_device_memory(device):
    """Return the memory currently allocated on `device`, or None when it cannot be queried.

    Garbage collection runs first, then the backend cache is emptied before the
    allocated memory is read. Device types other than "cuda", "mps" and "xpu"
    yield None.
    """
    gc.collect()
    device_type = device.type
    if device_type not in ("cuda", "mps", "xpu"):
        return None
    backend = getattr(torch, device_type)
    backend.empty_cache()
    if device_type == "mps":
        # The MPS backend exposes a differently named query
        return backend.current_allocated_memory()
    return backend.memory_allocated()


# Script entry point: load (and optionally quantize) the pipeline, report the
# device memory, then generate and save a single image.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="PixArt-alpha/PixArt-Sigma-XL-2-1024-MS")
    parser.add_argument("--prompt", type=str, default="ghibli style, a fantasy landscape with castles")
    parser.add_argument("--torch_dtype", type=str, default="fp16", choices=list(TORCH_DTYPES.keys()))
    parser.add_argument("--qtype", type=str, default=None, choices=list(QTYPES.keys()))
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    args = parser.parse_args()

    # An explicit --device wins; otherwise pick the first available backend
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    pipeline = load_pipeline(
        args.model_id, TORCH_DTYPES[args.torch_dtype], QTYPES[args.qtype] if args.qtype else None, device
    )

    print(f"torch_dtype: {args.torch_dtype}, qtype: {args.qtype}.")
    memory = get_device_memory(device)
    if memory is not None:
        memory_gb = memory / 2**30
        print(f"{device.type} device memory: {memory_gb:.2f} GB.")

    # torch device types are lowercase ("cuda"): comparing against "CUDA" never matched,
    # which silently disabled this guard.
    if args.qtype == "int4" and device.type == "cuda":
        raise ValueError("This example does not work (yet) for int4 on CUDA")

    img_name = f"pixart-sigma-dtype@{args.torch_dtype}-qtype@{args.qtype}.png"
    # Fixed seed so that runs with different dtype/qtype settings are comparable
    image = pipeline(
        prompt=args.prompt,
        num_inference_steps=NUM_INFERENCE_STEPS,
        num_images_per_prompt=1,
        generator=torch.manual_seed(0),
    ).images[0]
    image.save(img_name)


================================================
FILE: external/awq/conftest.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import torch


# Device types the `device` fixture is parametrized over: always "cpu", plus
# "cuda" when available, otherwise "mps" when available.
devices = ["cpu"]
if torch.cuda.is_available():
    devices += ["cuda"]
elif torch.backends.mps.is_available():
    devices += ["mps"]


@pytest.fixture(scope="module", params=devices)
def device(request):
    """Module-scoped fixture yielding a `torch.device` for each entry of `devices`."""
    device_type = request.param
    return torch.device(device_type)


def pytest_configure(config):
    """Register the custom `skip_device` marker with pytest."""
    marker_description = "skip_device(type): mark test to be skipped for the specified device type"
    config.addinivalue_line("markers", marker_description)


def pytest_runtest_call(item):
    """Skip a test when the type of its `device` fixture is listed in a `skip_device` marker."""
    fixture_name = "device"
    if fixture_name not in item.fixturenames:
        return
    # TODO: should be able to recover the fixture id instead of the actual value
    fixture_arg = item.funcargs[fixture_name].type
    skip_marks = set()
    for mark in item.iter_markers(name=f"skip_{fixture_name}"):
        # The first positional argument of the marker is the device type to skip
        skip_marks.add(mark.args[0])
    if fixture_arg in skip_marks:
        pytest.skip(f"Test skipped for {fixture_name} {fixture_arg}")


================================================
FILE: external/awq/pack_intweight.py
================================================
# MIT License
#
# Copyright (c) 2023 MIT HAN Lab
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import torch


def pack_intweight(unpacked_qweight, interleave, kstride):
    """Pack a 2D tensor of 4-bit values into int16 words, four values per word.

    Args:
        unpacked_qweight: tensor of shape [N, K] holding one quantized value per element.
        interleave: number of rows grouped together before packing. The packing
            step reads exactly four lanes, so this is expected to be 4.
        kstride: number of columns grouped together when interleaving rows.

    Returns:
        An int16 tensor of shape [N // interleave, K] on the device of the input.
    """
    n_rows = unpacked_qweight.shape[0]
    n_cols = unpacked_qweight.shape[1]
    n_chunks = n_cols // 32

    values = unpacked_qweight.cpu().numpy().reshape(n_rows, n_chunks, 32)
    # Within each chunk of 32 values, swap the two middle axes of a (4, 4, 2) view:
    # np.arange(32).reshape(4, 4, 2).transpose(1, 0, 2) => [0, 1, 8, 9, 16, 17, 24, 25, ...]
    values = values.reshape(n_rows, n_chunks, 4, 4, 2).transpose(0, 1, 3, 2, 4)
    values = values.reshape(n_rows, n_chunks, 32)

    # Reorder each run of 8 values for fast dequantization:
    # [0, 1, 2, 3, 4, 5, 6, 7] => [0, 2, 4, 6, 1, 3, 5, 7]
    values = values.reshape(n_rows, n_chunks, 4, 8)
    values = values.reshape(n_rows, n_chunks, 4, 4, 2).transpose(0, 1, 2, 4, 3)
    values = values.reshape(n_rows, n_cols)

    # Group `interleave` rows and `kstride` columns together, then bring the
    # row-in-group axis next to the column-in-group axis.
    row_groups = n_rows // interleave
    col_groups = n_cols // kstride
    values = values.reshape(row_groups, interleave, col_groups, kstride)
    values = values.transpose(0, 2, 1, 3)
    lanes = values.reshape(row_groups, col_groups, kstride, interleave)

    # Pack the four lanes of the last axis into one word, 4 bits per lane
    packed = lanes[..., 0]
    for lane in range(1, 4):
        packed = packed | (lanes[..., lane] << (4 * lane))

    # Final layout is (N // interleave, K), stored as 16-bit words
    packed = packed.reshape(row_groups, n_cols)
    as_int16 = torch.tensor(packed.astype("int16"))
    return as_int16.to(unpacked_qweight.device).contiguous()


================================================
FILE: external/awq/packing_utils.py
================================================
import torch


# Permutation applied to each run of 8 values when packing with reordering
AWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]
# Inverse permutation of AWQ_ORDER, used to restore the original order after unpacking
AWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]


def pack_awq(intweight: torch.Tensor, reorder=False):
    """Pack 4-bit values column-wise into int32 words, eight values per word.

    Args:
        intweight: 2D integer tensor; each run of 8 consecutive columns is packed into one word.
        reorder: when true, the values of each run are stored in the
            [0, 2, 4, 6, 1, 3, 5, 7] order instead of their natural order.

    Returns:
        An int32 tensor of shape (rows, columns // 8) on the device of the input.
    """
    bits = 4
    pack_num = 32 // bits
    n_rows = intweight.shape[0]
    n_packed_cols = intweight.shape[1] // pack_num
    qweight = torch.zeros(n_rows, n_packed_cols, dtype=torch.int32, device=intweight.device)
    # Source column (within a run) stored at each 4-bit slot of the packed word
    order_map = [0, 2, 4, 6, 1, 3, 5, 7] if reorder else [0, 1, 2, 3, 4, 5, 6, 7]
    for col in range(n_packed_cols):
        first_source = col * pack_num
        for slot, source in enumerate(order_map):
            qweight[:, col] |= intweight[:, first_source + source] << (slot * bits)
    return qweight


def unpack_awq(qweight: torch.Tensor, bits: int):
    """Unpack column-wise packed 32-bit words into one int8 value per packed field.

    Each word is right-shifted by every multiple of `bits` below 32 and the result
    is cast to int8, so callers still need to mask the values to `bits` bits.

    Args:
        qweight: 2D tensor of packed words.
        bits: width in bits of each packed value.

    Returns:
        An int8 tensor with the same number of rows and (32 // bits) times more columns.
    """
    shifts = torch.arange(0, 32, bits, device=qweight.device)

    # unpacking columnwise: one extra trailing axis holds the shifted copies of each word
    words = qweight[:, :, None]
    shifted = torch.bitwise_right_shift(words, shifts[None, None, :])
    # int8 is the smallest dtype available
    fields = shifted.to(torch.int8)

    return fields.view(fields.shape[0], -1)


def reverse_awq_order(iweights: torch.Tensor, bits: int):
    """Reorder the columns of `iweights` by applying AWQ_REVERSE_ORDER to each run of 32 // bits columns.

    Args:
        iweights: 2D tensor of unpacked values.
        bits: width in bits of the packed values; 32 // bits is the length of a run.

    Returns:
        A tensor of the same shape with its columns permuted.
    """
    run_length = 32 // bits
    column_ids = torch.arange(
        iweights.shape[-1],
        dtype=torch.int32,
        device=iweights.device,
    )
    # Permute the column indices run by run, then flatten them back
    column_ids = column_ids.view(-1, run_length)
    column_ids = column_ids[:, AWQ_REVERSE_ORDER]
    column_ids = column_ids.view(-1)

    return iweights[:, column_ids]


def pack_exllama(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
    """Pack unpacked weights row-wise and unpacked zero-points column-wise into int32 words.

    Args:
        iweights: 2D tensor of values; each run of 32 // bits consecutive rows is packed into one row.
        izeros: 2D tensor of values; each run of 32 // bits consecutive columns is packed into one column.
        bits: width in bits of each packed value.

    Returns:
        A (qweight, qzeros) tuple of int32 tensors.
    """
    per_word = 32 // bits
    shifts = torch.arange(0, 32, bits, device=iweights.device)

    # packing rowwise: shift each row of a run to its slot, then add the slots together
    weight_runs = iweights.view(iweights.shape[0] // per_word, per_word, -1)
    shifted_weights = torch.bitwise_left_shift(weight_runs, shifts[None, :, None])
    qweight = shifted_weights.sum(dim=1).to(torch.int32)

    # packing columnwise: same operation along the last axis
    zero_runs = izeros.view(-1, izeros.shape[1] // per_word, per_word)
    shifted_zeros = torch.bitwise_left_shift(zero_runs, shifts[None, None, :])
    qzeros = shifted_zeros.sum(dim=-1).to(torch.int32)

    return qweight, qzeros


def unpack_reorder_pack(qweight, qzeros, bits):
    """Convert AWQ-packed weights and zero-points to the exllama packing.

    Both tensors are unpacked, restored to their natural column order, masked to
    `bits` bits and repacked with `pack_exllama`.

    Args:
        qweight: AWQ-packed weights.
        qzeros: AWQ-packed zero-points.
        bits: width in bits of each packed value.

    Returns:
        A (qweight, qzeros) tuple of int32 tensors in exllama packing.
    """
    # `unpack_awq` and `reverse_awq_order` both process a single tensor and return
    # a single tensor, so weights and zero-points are handled separately.
    iweight = reverse_awq_order(unpack_awq(qweight, bits), bits)
    izeros = reverse_awq_order(unpack_awq(qzeros, bits), bits)

    # overflow checks: unpacked int8 values still carry the neighbouring fields
    iweight = torch.bitwise_and(iweight, (2**bits) - 1)
    izeros = torch.bitwise_and(izeros, (2**bits) - 1)

    # Subtract 1 from the izeros tensor (exllama adds 1 during inference)
    # We can remove it if we remove the +1 in the exllama code
    izeros = izeros - 1
    # Pack the qweight and qzeros tensors
    qweight, qzeros = pack_exllama(iweight, izeros, bits)

    return qweight, qzeros


def dequantize_gemm(qweight, qzeros, scales, bits, group_size):
    """Dequantize AWQ-packed weights to floating point.

    Args:
        qweight: AWQ-packed weights.
        qzeros: AWQ-packed zero-points, one row per group of `group_size` weight rows.
        scales: scales, one row per group of `group_size` weight rows.
        bits: width in bits of each packed value.
        group_size: number of consecutive weight rows sharing a scale and zero-point.

    Returns:
        The dequantized weights, computed as (weight - zero-point) * scale.
    """
    # `unpack_awq` and `reverse_awq_order` both process a single tensor and return
    # a single tensor, so weights and zero-points are handled separately.
    iweight = reverse_awq_order(unpack_awq(qweight, bits), bits)
    izeros = reverse_awq_order(unpack_awq(qzeros, bits), bits)

    # overflow checks: unpacked int8 values still carry the neighbouring fields
    iweight = torch.bitwise_and(iweight, (2**bits) - 1)
    izeros = torch.bitwise_and(izeros, (2**bits) - 1)

    # fp16 weights: expand per-group scales and zero-points to one row per weight row
    scales = scales.repeat_interleave(group_size, dim=0)
    izeros = izeros.repeat_interleave(group_size, dim=0)
    iweight = (iweight - izeros) * scales

    return iweight


================================================
FILE: external/awq/test_awq_kernels.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from pack_intweight import pack_intweight

from optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup


def assert_similar(a, b, atol=None, rtol=None):
    """Verify that the cosine similarity of the two (flattened) inputs is close to 1.0.

    Args:
        a, b: tensors of identical dtype and shape.
        atol: absolute tolerance; defaults to the resolution of the inputs dtype.
        rtol: relative tolerance; defaults to a value chosen from the inputs dtype.

    Raises:
        ValueError: if the similarity deviates too much from 1.0.
    """
    assert a.dtype == b.dtype
    assert a.shape == b.shape
    if atol is None:
        # We use torch finfo resolution
        atol = torch.finfo(a.dtype).resolution
    if rtol is None:
        # Please refer to that discussion for default rtol values based on the float type:
        # https://scicomp.stackexchange.com/questions/43111/float-equality-tolerance-for-single-and-half-precision
        rtol = {torch.float32: 1e-5, torch.float16: 1e-3, torch.bfloat16: 1e-1}[a.dtype]
    sim = torch.nn.functional.cosine_similarity(a.flatten(), b.flatten(), dim=0)
    if not torch.allclose(sim, torch.tensor(1.0, dtype=sim.dtype), atol=atol, rtol=rtol):
        max_deviation = torch.min(sim)
        raise ValueError(f"Alignment {max_deviation:.8f} deviates too much from 1.0 with atol={atol}, rtol={rtol}")


def _awq_mm(kernel, inputs, packed_data, scales, scaled_zeropoints, in_features, out_features, group_size):
    """Evaluate the matrix multiplication with the selected quanto AWQ kernel ("gemv" or "gemm")."""
    awq_op = torch.ops.quanto.gemv if kernel == "gemv" else torch.ops.quanto.gemm
    return awq_op(
        inputs,
        packed_data,
        scales,
        scaled_zeropoints,
        rows=inputs.numel() // inputs.shape[-1],
        out_cols=out_features,
        in_cols=in_features,
        bits=4,
        group_size=group_size,
    )


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features, out_features", [(256, 256), (512, 256)])
@pytest.mark.parametrize("kernel", ["gemv", "gemm"])
def test_standalone_kernel(in_features, out_features, kernel):
    """This test verifies that the AWQ kernel output is equivalent to torch.matmul
    on the dequantized weights.
    """
    bits = 4
    group_size = 128  # Hard-coded in kernels
    interleave = 4  # Hard-coded in kernels
    kstride = 64  # Hard-coded in kernels
    device = torch.device("cuda")
    batch_size, tokens = (4, 1) if kernel == "gemv" else (10, 128)
    input_shape = (batch_size, tokens, in_features)
    # FIXME: does not work if inputs are negative !!??
    # NOTE(review): the reference below used to group rows incorrectly, which may
    # have been the cause -- to be re-evaluated with signed inputs.
    inputs = torch.rand(input_shape, dtype=torch.float16, device=device)
    qmax = 2**bits
    other_shape = (out_features, in_features)
    other_data = torch.randint(0, qmax, other_shape, dtype=torch.uint8, device=device)
    # `pack_intweight` is the packing that takes the interleave/kstride parameters
    packed_other_data = pack_intweight(other_data.to(torch.int32), interleave=interleave, kstride=kstride)
    # The GEMM kernel works on transposed scales
    n_groups = in_features // group_size
    scales_shape = (n_groups, out_features)
    other_scales = torch.rand(scales_shape, dtype=torch.float16, device=device) / qmax
    # The GEMM kernel works on transposed, negated and scaled zeropoints
    qmin = -(2 ** (bits - 1))
    qmax = 2 ** (bits - 1)
    other_zeropoints = torch.randint(qmin, qmax, scales_shape, dtype=torch.int8, device=device)
    # Negate and scale
    other_scaled_zeropoints = -other_zeropoints * other_scales
    # Evaluate mm outputs using the AWQ kernel
    awq_outputs = _awq_mm(
        kernel,
        inputs,
        packed_other_data,
        other_scales,
        other_scaled_zeropoints,
        in_features,
        out_features,
        group_size,
    )
    # Transpose other data and split its rows into groups of `group_size` consecutive
    # input features, so that row k uses the scale and zeropoint of group k // group_size
    other_data_t = other_data.t().reshape(n_groups, group_size, out_features)
    # Dequantize transposed other
    other_t = (other_data_t - other_zeropoints[:, None, :]) * other_scales[:, None, :]
    # Reshape it as expected by the matmul
    other_t = other_t.reshape(in_features, out_features)
    # Evaluate the matrix multiplication using pytorch float16 mm
    pt_outputs = torch.matmul(inputs, other_t)
    # Verify the results are similar
    assert_similar(awq_outputs, pt_outputs, rtol=5e-3)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features, out_features", [(256, 256), (512, 256)])
@pytest.mark.parametrize("kernel", ["gemm", "gemv"])
def test_integrated_kernel(in_features, out_features, kernel):
    """This test verifies that the AWQ kernels applied to quanto-quantized weights
    produce outputs similar to a matmul with the quanto quantized tensor.
    """
    group_size = 128  # Hard-coded in kernels
    interleave = 4  # Hard-coded in kernels
    kstride = 64  # Hard-coded in kernels
    device = torch.device("cuda")
    batch_size, tokens = (4, 1) if kernel == "gemv" else (10, 128)
    input_shape = (batch_size, tokens, in_features)
    inputs = torch.rand(input_shape, dtype=torch.float16, device=device) * 2 - 1
    other_shape = (out_features, in_features)
    other = torch.rand(other_shape, dtype=torch.float16, device=device) * 2 - 1
    # Quantize using quanto
    scale, zeropoint = MaxOptimizer()(other, bits=4, axis=0, group_size=group_size)
    quanto_base = AffineQuantizer.apply(other, qint4, 0, group_size, scale, zeropoint)
    # Evaluate mm
    quanto_outputs = torch.matmul(inputs, quanto_base.t())

    # Extract quantized data, unpack and ungroup to recover original shape
    quanto_data = ungroup(quanto_base._data.unpack(), axis=0, orig_shape=other_shape)
    # Pack data for AWQ kernel
    awq_data = pack_intweight(quanto_data.to(torch.int32), interleave=interleave, kstride=kstride)
    # Reshape and transpose scale as expected by AWQ kernel (! buffer must be contiguous)
    awq_scale = scale.reshape(out_features, in_features // group_size).t().contiguous()
    # Reshape and transpose zeropoint as expected by AWQ kernel (! buffer must be contiguous)
    awq_zeropoint = zeropoint.reshape(out_features, in_features // group_size).t().contiguous()
    # Negate and rescale
    awq_scaled_zeropoint = -awq_zeropoint * awq_scale

    # Evaluate mm outputs using the AWQ kernels
    awq_outputs = _awq_mm(
        kernel,
        inputs,
        awq_data,
        awq_scale,
        awq_scaled_zeropoint,
        in_features,
        out_features,
        group_size,
    )

    # Verify the results are similar
    assert_similar(awq_outputs, quanto_outputs, rtol=5e-3)


================================================
FILE: external/awq/test_awq_packing.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import pytest
import torch
from pack_intweight import pack_intweight
from packing_utils import pack_awq, reverse_awq_order, unpack_awq

from optimum.quanto import AWQPackedTensor, AWQPacking


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("out_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("reorder", [True, False])
@pytest.mark.parametrize("random", [True, False])
def test_awq_pack(in_features, out_features, reorder, random):
    """This test verifies two things:

    - that we are able to replicate awq packing,
    - that we can unpack awq packed tensors and recover the original tensor.
    """
    bits = 4
    qmax = 2**bits
    shape = (out_features, in_features)
    device = torch.device("cuda")
    if random:
        t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device)
    else:
        # Deterministic ramp 0, 1, ..., qmax - 1, 0, 1, ... built directly as a tensor
        # (materializing a Python range of up to 1M integers is needlessly slow)
        t = torch.arange(out_features * in_features, dtype=torch.int32)
        t = (t % qmax).reshape(shape).to(torch.uint8).to(device)
    packed = pack_awq(t.to(torch.int32), reorder=reorder)
    # Sanity check: verify we can recover the Tensor using AWQ unpacking
    unpacked = unpack_awq(packed, bits=4)
    if reorder:
        unpacked = reverse_awq_order(unpacked, bits=4)
    # Unpacked int8 values still carry the neighbouring fields: keep the low bits only
    unpacked = torch.bitwise_and(unpacked, qmax - 1)
    assert torch.equal(t, unpacked)
    # Compare with quanto packing
    repacked = AWQPackedTensor.pack(t, packing=AWQPacking.V1, reorder=reorder)
    assert torch.equal(packed, repacked._data)
    unpacked = repacked.unpack()
    assert torch.equal(unpacked, t)


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@pytest.mark.parametrize("in_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("out_features", [128, 256, 512, 1024])
@pytest.mark.parametrize("random", [True, False])
def test_awq_pack_v2(in_features, out_features, random):
    """This test verifies two things:

    - that we are able to replicate awq packing,
    - that we can unpack awq packed tensors and recover the original tensor.
    """
    bits = 4
    interleave = 4
    kstride = 64
    qmax = 2**bits
    shape = (out_features, in_features)
    device = torch.device("cuda")
    if random:
        t = torch.randint(0, qmax, shape, dtype=torch.uint8).to(device)
    else:
        # Deterministic ramp 0, 1, ..., qmax - 1, 0, 1, ... built directly as a tensor
        # (materializing a Python range of up to 1M integers is needlessly slow)
        t = torch.arange(out_features * in_features, dtype=torch.int32)
        t = (t % qmax).reshape(shape).to(torch.uint8).to(device)
    packed = pack_intweight(t.to(torch.int32), interleave=interleave, kstride=kstride)
    # Compare with quanto packing
    repacked = AWQPackedTensor.pack(t, packing=AWQPacking.V2)
    assert torch.equal(packed, repacked._data)
    unpacked = repacked.unpack()
    assert torch.equal(unpacked, t)



================================================
FILE: external/awq/test_awq_quantize.py
================================================
import pytest
import torch

from optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup


def awq_quantize(base, scales, zeros, group_size):
    """Quantize `base` column by column following the AWQ formula.

    Each column `idx` is quantized with the scale and zero-point of group
    `idx // group_size`: round((base + scale * zero) / scale), cast to uint8.

    Args:
        base: 2D tensor of values to quantize.
        scales: per-group scales, indexed as scales[:, group].
        zeros: per-group zero-points, with the same layout as `scales`.
        group_size: number of consecutive columns sharing a scale and zero-point.

    Returns:
        A uint8 tensor with the same shape as `base`.
    """
    _, in_features = base.shape
    scaled_zeros = scales * zeros

    def quantize_column(idx):
        group = idx // group_size
        shifted = base[:, idx] + scaled_zeros[:, group]
        return torch.round(shifted / scales[:, group]).to(torch.uint8)[:, None]

    # From https://github.com/casper-hansen/AutoAWQ/blob/main/awq/modules/linear/gemv_fast.py#L165
    columns = [quantize_column(idx) for idx in range(in_features)]
    return torch.cat(columns, dim=1)


@pytest.mark.parametrize("in_features, out_features", [(256, 512), (1024, 1024)])
def test_awq_quantize(in_features, out_features):
    """Verify that AWQ quantization is equivalent to quanto affine quantization
    """
    shape = (out_features, in_features)
    base = torch.rand(shape, dtype=torch.float16)
    group_size = 128

    # Quantize using quanto (same group size for the optimizer and the quantizer)
    scale, zeropoint = MaxOptimizer()(base, bits=4, axis=0, group_size=group_size)
    quanto_base = AffineQuantizer.apply(base, qint4, 0, group_size, scale, zeropoint)
    # Extract quantized data, unpack and ungroup to recover original shape
    quanto_data = ungroup(quanto_base._data.unpack(), axis=0, orig_shape=shape)

    # Reshape scale and zeropoint as expected by awq
    awq_shape = (out_features, in_features // group_size)
    scale = scale.reshape(awq_shape)
    zeropoint = zeropoint.reshape(awq_shape)

    # Compare with awq quantization
    awq_data = awq_quantize(base, scale, zeropoint, group_size)
    # FIX: AWQ does not clamp values before packing
    qmax = 2 ** 4 - 1
    awq_data = torch.clamp(awq_data, 0, qmax)

    mismatches = quanto_data != awq_data
    n = torch.sum(mismatches).numpy()
    # `rate` is a fraction of the number of elements: scale it when printing a percentage
    rate = n / base.numel()
    print(f"Mismatches: {n}/{base.numel()} ({100 * rate:.8f} %)")
    # Extract mismatches
    display = 10
    quanto_values = torch.masked_select(quanto_data, mismatches)[:display]
    awq_values = torch.masked_select(awq_data, mismatches)[:display]
    print(f"First {display} mismatches")
    print(list(quanto_values.numpy()))
    print(list(awq_values.numpy()))
    # Due to a slightly different order of operations (zero is multiplied by scale before subtracting it),
    # there are some mismatches
    assert rate < 5e-4


================================================
FILE: external/smoothquant/README.md
================================================
# SmoothQuant original conversion script

This converts an OPT or Bloom [🤗 transformers](https://github.com/huggingface/transformers) model to a "smoothed" version, as described in
[SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438).

```bash
$ python smoothquant.py --model facebook/opt-1.3b --save-path smoothed-models/facebook/opt-1.3b
```

Note: due to hard-coded assumptions about the model architecture in the script, this only works for OPT models that apply the layer norm
before the attention (`do_layer_norm_before=true` in `config.json`). This means all OPT models except `facebook/opt-350m`.


================================================
FILE: external/smoothquant/smoothquant.py
================================================
import argparse
import functools
import os

import torch
import torch.nn as nn
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.bloom.modeling_bloom import BloomBlock
from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRMSNorm
from transformers.models.mistral.modeling_mistral import MistralDecoderLayer, MistralRMSNorm
from transformers.models.opt.modeling_opt import OPTDecoderLayer


def get_act_scales(model, tokenizer, dataset, num_samples=512, seq_len=512):
    """Record the per-channel absolute maximum of the inputs of every `nn.Linear` in `model`.

    A forward hook is attached to each `nn.Linear` module, then up to `num_samples`
    entries of `dataset` are tokenized and fed through the model. The hooks are always
    removed before returning, even if a forward pass fails.

    Args:
        model: the model to inspect; it is switched to eval mode.
        tokenizer: callable invoked as `tokenizer(text, return_tensors="pt", max_length=seq_len,
            truncation=True)` whose result exposes an `input_ids` tensor.
        dataset: indexable collection whose items have a `"text"` entry.
        num_samples (`int`): maximum number of dataset entries to run (clamped to `len(dataset)`).
        seq_len (`int`): maximum tokenized length passed to the tokenizer.

    Returns:
        `dict`: maps each `nn.Linear` module name to a float CPU tensor holding, for each index of
        the input's last dimension, the largest absolute value seen across all samples.
    """
    model.eval()
    device = next(model.parameters()).device
    act_scales = {}

    def stat_tensor(name, tensor):
        hidden_dim = tensor.shape[-1]
        # reshape (unlike view) also accepts non-contiguous activations.
        tensor = tensor.reshape(-1, hidden_dim).abs().detach()
        incoming_max = torch.max(tensor, dim=0)[0].float().cpu()
        if name in act_scales:
            act_scales[name] = torch.max(act_scales[name], incoming_max)
        else:
            act_scales[name] = incoming_max

    def stat_input_hook(m, x, y, name):
        # Forward hooks receive the positional inputs as a tuple: only the first one is tracked.
        if isinstance(x, tuple):
            x = x[0]
        stat_tensor(name, x)

    hooks = []
    try:
        for name, m in model.named_modules():
            if isinstance(m, nn.Linear):
                hooks.append(m.register_forward_hook(functools.partial(stat_input_hook, name=name)))

        # Never index past the end of the dataset.
        num_samples = min(num_samples, len(dataset))
        # Only statistics are needed: skip building the autograd graph.
        with torch.no_grad():
            for i in tqdm(range(num_samples)):
                input_ids = tokenizer(
                    dataset[i]["text"], return_tensors="pt", max_length=seq_len, truncation=True
                ).input_ids.to(device)
                model(input_ids)
    finally:
        # Leave the model without stray hooks, whatever happened above.
        for h in hooks:
            h.remove()

    return act_scales


@torch.no_grad()
def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5):
    """Move quantization difficulty from activations to weights for one norm/linear group.

    A per-channel factor `act_scales**alpha / weight_scales**(1 - alpha)` is computed, where
    `weight_scales` is the per-input-channel absolute maximum over all the linear weights.
    The norm weight (and bias, when present) is divided in place by that factor and every
    linear weight is multiplied in place by it along its input dimension.

    Args:
        ln: an `nn.LayerNorm`, `LlamaRMSNorm` or `MistralRMSNorm` module.
        fcs: an `nn.Linear` or a list of `nn.Linear` consuming the norm output.
        act_scales (`torch.Tensor`): one activation statistic per input channel.
        alpha (`float`): exponent balancing activation and weight statistics.
    """
    linears = fcs if isinstance(fcs, list) else [fcs]
    assert isinstance(ln, (nn.LayerNorm, LlamaRMSNorm, MistralRMSNorm))
    for linear in linears:
        assert isinstance(linear, nn.Linear)
        assert ln.weight.numel() == linear.in_features == act_scales.numel()

    reference = linears[0].weight
    device = reference.device
    dtype = reference.dtype
    act_scales = act_scales.to(device=device, dtype=dtype)

    # Per-input-channel maximum of each weight, then the maximum across all the linears.
    per_linear_max = [linear.weight.abs().max(dim=0, keepdim=True)[0] for linear in linears]
    weight_scales = torch.cat(per_linear_max, dim=0).max(dim=0)[0].clamp(min=1e-5)

    ratio = act_scales.pow(alpha) / weight_scales.pow(1 - alpha)
    scales = ratio.clamp(min=1e-5).to(device).to(dtype)

    ln.weight.div_(scales)
    ln_bias = getattr(ln, 'bias', None)
    if ln_bias is not None:
        ln.bias.div_(scales)

    column_scales = scales.view(1, -1)
    for linear in linears:
        linear.weight.mul_(column_scales)


@torch.no_grad()
def smooth_lm(model, scales, alpha=0.5):
    """Apply `smooth_ln_fcs` to every supported decoder block found in `model`.

    For OPT, Bloom, Llama and Mistral blocks, the norm preceding the attention is smoothed
    together with the attention input projection(s), then the norm preceding the feed-forward
    is smoothed together with the first feed-forward projection(s). Modules of any other type
    are left untouched.

    Args:
        model: the model whose modules are visited through `named_modules()`.
        scales (`dict`): activation statistics indexed by fully-qualified linear module name.
        alpha (`float`): forwarded unchanged to `smooth_ln_fcs`.
    """
    for prefix, block in model.named_modules():
        if isinstance(block, OPTDecoderLayer):
            smooth_ln_fcs(
                block.self_attn_layer_norm,
                [block.self_attn.q_proj, block.self_attn.k_proj, block.self_attn.v_proj],
                scales[f"{prefix}.self_attn.q_proj"],
                alpha,
            )
            smooth_ln_fcs(block.final_layer_norm, block.fc1, scales[f"{prefix}.fc1"], alpha)
            continue

        if isinstance(block, BloomBlock):
            smooth_ln_fcs(
                block.input_layernorm,
                block.self_attention.query_key_value,
                scales[f"{prefix}.self_attention.query_key_value"],
                alpha,
            )
            smooth_ln_fcs(
                block.post_attention_layernorm,
                block.mlp.dense_h_to_4h,
                scales[f"{prefix}.mlp.dense_h_to_4h"],
                alpha,
            )
            continue

        if isinstance(block, (LlamaDecoderLayer, MistralDecoderLayer)):
            # q/k/v share the same input, so the q_proj statistics are used for all three.
            smooth_ln_fcs(
                block.input_layernorm,
                [block.self_attn.q_proj, block.self_attn.k_proj, block.self_attn.v_proj],
                scales[f"{prefix}.self_attn.q_proj"],
                alpha,
            )
            # Likewise gate_proj and up_proj share the same input.
            smooth_ln_fcs(
                block.post_attention_layernorm,
                [block.mlp.gate_proj, block.mlp.up_proj],
                scales[f"{prefix}.mlp.gate_proj"],
                alpha,
            )


def main():
    """Command-line entry point: smooth a causal language model and save it.

    The model and tokenizer are loaded from `--model`, activation statistics are gathered on
    the first `--num-samples` entries of the `lambada` validation split, the model is smoothed
    with `smooth_lm`, and both model and tokenizer are saved under `--save-path`
    (default: `smoothed_models/<model>`).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, default="facebook/opt-125m", help="model name")
    parser.add_argument("--save-path", type=str, default=None, help="smoothed model save path")
    parser.add_argument("--num-samples", type=int, default=512)
    parser.add_argument("--seq-len", type=int, default=512)
    parser.add_argument(
        "--alpha",
        type=float,
        default=0.5,
        help="smoothing strength: exponent applied to the activation statistics.",
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    args = parser.parse_args()

    # Without an explicit device, prefer CUDA, then MPS, then fall back to CPU.
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)

    dataset = load_dataset("lambada", split=f"validation[:{args.num_samples}]").shuffle()
    tokenizer = AutoTokenizer.from_pretrained(args.model, model_max_length=args.seq_len)
    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype="auto").to(device)

    act_scales = get_act_scales(model, tokenizer, dataset, args.num_samples, args.seq_len)
    smooth_lm(model, act_scales, args.alpha)
    save_path = args.save_path
    if save_path is None:
        save_path = os.path.join("smoothed_models", args.model)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)


# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()


================================================
FILE: optimum/quanto/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Package version string.
__version__ = "0.2.7dev"

# Re-export the public names of each submodule at the package root.
from .calibrate import *
from .library import *
from .models import *
from .nn import *
from .quantize import *
from .tensor import *


================================================
FILE: optimum/quanto/calibrate.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional

import torch
from torch.nn.modules.module import (
    register_module_forward_hook,
    register_module_forward_pre_hook,
)
from torch.overrides import TorchFunctionMode

from .nn import QModuleMixin
from .tensor import ActivationQBytesTensor, QTensor, axis_to_dim, dtype_info, qint8, qtype


# Names exported by `from .calibrate import *`.
__all__ = ["Calibration", "absmax_scale"]


def _updated_scale(scale, new_scale, momentum):
    if torch.all(scale == 1):
        return new_scale
    return momentum * scale + new_scale * (1.0 - momentum)


def absmax_scale(base: torch.Tensor, qtype: qtype = qint8, axis: Optional[int] = None) -> torch.Tensor:
    """Compute a symmetric quantization scale with the absmax algorithm.

    The scale is the largest absolute value found in `base` (globally, or separately along
    the preserved axis) divided by the largest value representable by the target dtype.

    Args:
        base (`torch.Tensor`): the tensor for which the scale is evaluated.
        qtype (`quanto.qtype`): the quantization type whose dtype bounds the range.
        axis (`int`): the index of the axis to preserve, or -1 for the last one.
            When None (the default), the maximum is taken over the whole tensor.

    Returns:
        `torch.Tensor`: a scale tensor of the same dtype as the base.
    """
    magnitudes = torch.abs(base)
    if axis is not None:
        # Reduce over every dimension but the preserved one, keeping the rank unchanged.
        reduced_dims = axis_to_dim(magnitudes, axis)
        qranges = torch.amax(magnitudes, dim=reduced_dims, keepdim=True)
    else:
        qranges = torch.max(magnitudes)
    qmax = dtype_info(qtype.dtype).max
    return qranges / qmax


class Calibration(TorchFunctionMode):
    """A custom torch function mode to calibrate quantized modules.

    In order to improve the accuracy of the quantized activations, the input and output
    scales of each quantized module are evaluated per-batch using the absmax algorithm and aggregated using a
    momentum.

    The mode also tracks the calls to each torch function down the model graph, and applies optional
    optimizations:
    - streamline: do not quantize activations that are immediately consumed by an incompatible function (like `add` or `silu`).

    Args:
        momentum (`float`): the momentum to use when updating scales.
        streamline (`bool`): if True, avoid quantizing activations when they are consumed by an incompatible function. Defaults to True.
        debug (`bool`): provide very verbose feedback on the console during calibration.
    """

    def __init__(self, *args, momentum: float = 0.9, streamline=True, debug=False, **kwargs):
        super().__init__(*args, **kwargs)
        self.momentum = momentum
        self.streamline = streamline
        # Streamline bookkeeping, created unconditionally so that cleanup never depends on the flag:
        # - modules_qactivations: module -> True when a consumer of its outputs produced quantized activations,
        # - streamline_hooks: module -> handle of the `tag_outputs` hook registered on it.
        self.modules_qactivations = {}
        self.streamline_hooks = {}
        self.debug = debug

    def __torch_function__(self, func, types, args=(), kwargs=None):
        """Run `func` and record whether the modules that produced its arguments need quantized outputs."""
        kwargs = kwargs if kwargs is not None else {}
        # NOTE(review): this is an exact membership test on `types`; confirm that subclasses of
        # QTensor are expected not to match here.
        qinput = QTensor in types
        output = func(*args, **kwargs)
        if self.streamline and qinput:
            for arg in args:
                # Only arguments tagged by `tag_outputs` carry a source module.
                module = getattr(arg, "src_module", None)
                if module is None:
                    continue
                if isinstance(output, ActivationQBytesTensor):
                    # Quantized activations are required for that module
                    self.modules_qactivations[module] = True
                elif isinstance(output, torch.Tensor):
                    # Quantized activations are not required for that module unless another function requires them
                    self.modules_qactivations.setdefault(module, False)
        return output

    def __enter__(self):
        """Activate the mode, install the global calibration hooks and return the mode itself."""
        super().__enter__()
        self.pre_handle = register_module_forward_pre_hook(self.calibrate_input)
        self.post_handle = register_module_forward_hook(self.calibrate_output)
        # Returning self makes `with Calibration() as calibration:` usable.
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Deactivate the mode and remove every hook it installed."""
        try:
            super().__exit__(exc_type, exc_val, exc_tb)
        finally:
            # Hooks are global or attached to user modules: never leave them behind.
            self.pre_handle.remove()
            self.post_handle.remove()
            for handle in self.streamline_hooks.values():
                handle.remove()
            # Forget removed handles so that re-entering this mode registers fresh tag hooks.
            self.streamline_hooks.clear()

    def calibrate_input(self, module: torch.nn.Module, input, momentum: Optional[float] = None):
        """Calibrate a module input scale

        This is registered as a global hook that is called before any module forward pre hook.

        Args:
            module (`torch.nn.Module`): the module about to be called.
            input: the tuple of positional inputs of the module.
            momentum (`Optional[float]`): the momentum used to update the scale. Defaults to None,
                in which case the momentum configured on this mode is used.

        Returns:
            The first positional input when the module is a QModuleMixin with quantized activations,
            None otherwise.
        """
        if momentum is None:
            # The global hook only passes (module, input): honor the configured momentum.
            momentum = self.momentum
        if isinstance(module, QModuleMixin) and module.activation_qtype is not None:
            input = input[0]
            if isinstance(input, ActivationQBytesTensor):
                # Just adopt the maximum scale of the input
                module.input_scale = torch.max(input._scale)
            else:
                # Evaluate the best scale
                input_scale = absmax_scale(input, module.activation_qtype)
                module.input_scale = _updated_scale(module.input_scale, input_scale, momentum)
            if self.streamline and module not in self.streamline_hooks:
                # Add a hook to tag the module outputs (after the module quantization hook in QModuleMixin)
                self.streamline_hooks[module] = module.register_forward_hook(self.tag_outputs)
            return input

    def calibrate_output(
        self,
        module: torch.nn.Module,
        input: torch.Tensor,
        output: torch.Tensor,
    ):
        """Calibrate a module output scale

        This is registered as a global hook that is called before any module forward hook.

        When the module is a QModuleMixin, its outputs are not quantized yet because they
        are only quantized in the QModuleMixin.quantize_output forward hook.

        For any other module, the direct QModuleMixin children are inspected instead: in streamline
        mode, output quantization is disabled for those whose outputs were never required to be
        quantized, and in debug mode their quantization status is printed.
        """
        if isinstance(module, QModuleMixin) and module.activation_qtype is not None:
            # Evaluate the optimal scale per-tensor and update output scale
            output_scale = absmax_scale(output, module.activation_qtype, axis=None)
            module.output_scale = _updated_scale(module.output_scale, output_scale, self.momentum)
            return output
        else:
            if self.streamline:
                for name, child in module.named_children():
                    if isinstance(child, QModuleMixin) and child.activation_qtype is not None:
                        qactivations_required = self.modules_qactivations.get(child, False)
                        if not qactivations_required:
                            # Disable output quantization for this child as its outputs are only consumed by incompatible functions.
                            child.disable_output_quantization()
            if self.debug:
                for name, child in module.named_children():
                    if isinstance(child, QModuleMixin):
                        classname = child.__class__.__name__
                        trace = f"{name}({classname}) activations are"
                        if child.activation_qtype is None:
                            trace += " not quantized."
                        else:
                            trace += f" quantized to {child.activation_qtype} with scale {child.output_scale}."
                        print(trace)

    def tag_outputs(
        self,
        module: torch.nn.Module,
        input: torch.Tensor,
        output: torch.Tensor,
    ):
        """Mark outputs as generated by a module

        This is called as a module forward hook that is called after the QModuleMixin.quantize_output
        forward hook.

        This is useful in streamline mode to identify the module that generated a specific QTensor.
        """
        output.src_module = module


================================================
FILE: optimum/quanto/library/README.md
================================================
# Quanto operations library

This contains the `quanto::` operations, available in python under `torch.ops.quanto`.

To add a new operation:

- add a definition for the operation in `library/ops.py`,
- provide a default implementation using pytorch operators only under `library/python`,
- provide optimized kernels for all devices under `library/extensions`.


================================================
FILE: optimum/quanto/library/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .extensions import *
from .qbytes_mm import *
from .quantize import *
from .unpack import *


================================================
FILE: optimum/quanto/library/extensions/README.md
================================================
# Quanto library extensions

This folder contains device-specific `quanto::` operations.

Implementations can be provided as part of:

- the generic C++ pytorch extension under `cpp`,
- the CUDA extension under `cuda`,
- the Metal Performance Shader extension under `mps`,
- the XPU SYCL extension under `xpu`.


To provide a device-specific implementation of an operation that already has a default implementation (such as unpack), use the following syntax:

```python
@torch.library.impl("quanto::unpack", ["CPU", "CUDA"])
def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:
    return ext.unpack(packed, bits)
```

To declare a new device-specific operation, you need to add it to the library:

```python
torch.library.define(
    "quanto::gemm_f16i4",
    "(Tensor input,"
    " Tensor other,"
    " Tensor other_scale,"
    " Tensor other_shift,"
    " int group_size)"
    " -> Tensor",
)
```

Then you can provide its implementation:

```python
@torch.library.impl("quanto::gemm_f16i4", ["CUDA"])
def gemm_f16i4(
    input: torch.Tensor,
    other: torch.Tensor,
    scales: torch.Tensor,
    shift: torch.Tensor,
    group_size: int,
) -> torch.Tensor:
    ...
```


Please refer to each extension folder for examples.


================================================
FILE: optimum/quanto/library/extensions/__init__.py
================================================
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHO

Download .txt

gitextract_e7pf933s/

├── .github/
│   ├── CODEOWNERS
│   ├── PULL_REQUEST_TEMPLATE.md
│   └── workflows/
│       ├── check-commits.yml
│       ├── linux-cpu-tests.yml
│       ├── linux-cuda-tests.yml
│       ├── linux-examples.yml
│       ├── python-quality.yml
│       ├── security.yml
│       └── stale.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── bench/
│   ├── generation/
│   │   ├── README.md
│   │   ├── evaluate_configurations.py
│   │   ├── evaluate_many_models.sh
│   │   ├── evaluate_model.py
│   │   ├── gen_barchart.py
│   │   ├── metrics/
│   │   │   ├── __init__.py
│   │   │   ├── latency.py
│   │   │   ├── perplexity.py
│   │   │   └── prediction.py
│   │   └── setup/
│   │       ├── __init__.py
│   │       ├── awq.py
│   │       ├── bnb.py
│   │       ├── hqq.py
│   │       └── quanto.py
│   ├── kernels/
│   │   ├── benchmark.py
│   │   ├── benchmark_marlin_fp8.py
│   │   └── benchmark_w4a16.py
│   └── torch_kernels/
│       ├── README.md
│       ├── test_int_mm.py
│       ├── test_int_mm_inductor.py
│       ├── test_weight_int4pack_mm.py
│       └── test_weight_int8pack_mm.py
├── examples/
│   ├── nlp/
│   │   ├── text-classification/
│   │   │   └── sst2/
│   │   │       └── quantize_sst2_model.py
│   │   └── text-generation/
│   │       └── quantize_causal_lm_model.py
│   ├── speech/
│   │   └── speech_recognition/
│   │       ├── quantize_asr_model.py
│   │       └── requirements.txt
│   └── vision/
│       ├── StableDiffusion/
│       │   ├── README.md
│       │   ├── quantize_StableDiffusion.py
│       │   └── requirements.txt
│       ├── image-classification/
│       │   ├── mnist/
│       │   │   └── quantize_mnist_model.py
│       │   └── pets/
│       │       └── quantize_vit_model.py
│       ├── object-detection/
│       │   └── quantize_owl_model.py
│       └── text-to-image/
│           └── quantize_pixart_sigma.py
├── external/
│   ├── awq/
│   │   ├── conftest.py
│   │   ├── pack_intweight.py
│   │   ├── packing_utils.py
│   │   ├── test_awq_kernels.py
│   │   ├── test_awq_packing.py
│   │   └── test_awq_quantize.py
│   └── smoothquant/
│       ├── README.md
│       └── smoothquant.py
├── optimum/
│   └── quanto/
│       ├── __init__.py
│       ├── calibrate.py
│       ├── library/
│       │   ├── README.md
│       │   ├── __init__.py
│       │   ├── extensions/
│       │   │   ├── README.md
│       │   │   ├── __init__.py
│       │   │   ├── cpp/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.cpp
│       │   │   │   └── unpack.h
│       │   │   ├── cuda/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── awq/
│       │   │   │   │   ├── dequantize.cuh
│       │   │   │   │   └── v2/
│       │   │   │   │       ├── gemm_cuda.cu
│       │   │   │   │       ├── gemm_cuda.h
│       │   │   │   │       ├── gemv_cuda.cu
│       │   │   │   │       ├── gemv_cuda.h
│       │   │   │   │       └── semaphore.h
│       │   │   │   ├── marlin/
│       │   │   │   │   ├── COPYRIGHT
│       │   │   │   │   ├── fp8_marlin.cu
│       │   │   │   │   ├── fp8_marlin.cuh
│       │   │   │   │   ├── gptq_marlin.cuh
│       │   │   │   │   ├── gptq_marlin_dtypes.cuh
│       │   │   │   │   ├── gptq_marlin_repack.cu
│       │   │   │   │   ├── gptq_marlin_repack.cuh
│       │   │   │   │   ├── marlin_cuda.cpp
│       │   │   │   │   ├── marlin_cuda.h
│       │   │   │   │   ├── marlin_cuda_kernel.cu
│       │   │   │   │   └── marlin_cuda_kernel.cuh
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.cu
│       │   │   │   └── unpack.h
│       │   │   ├── extension.py
│       │   │   ├── hip/
│       │   │   │   ├── __init__.py
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.cu
│       │   │   │   └── unpack.h
│       │   │   ├── mps/
│       │   │   │   ├── README.md
│       │   │   │   ├── __init__.py
│       │   │   │   ├── pybind_module.cpp
│       │   │   │   ├── unpack.h
│       │   │   │   └── unpack.mm
│       │   │   └── xpu/
│       │   │       ├── __init__.py
│       │   │       ├── pybind_module.cpp
│       │   │       ├── unpack.h
│       │   │       └── unpack.sycl
│       │   ├── qbytes_mm.py
│       │   ├── quantize.py
│       │   └── unpack.py
│       ├── models/
│       │   ├── __init__.py
│       │   ├── diffusers_models.py
│       │   ├── shared_dict.py
│       │   └── transformers_models.py
│       ├── nn/
│       │   ├── __init__.py
│       │   ├── qconv2d.py
│       │   ├── qlayernorm.py
│       │   ├── qlinear.py
│       │   └── qmodule.py
│       ├── quantize.py
│       ├── subpackage/
│       │   ├── __init__.py
│       │   └── commands/
│       │       ├── __init__.py
│       │       ├── base.py
│       │       └── quantize.py
│       └── tensor/
│           ├── __init__.py
│           ├── activations/
│           │   ├── __init__.py
│           │   ├── qbytes.py
│           │   ├── qbytes_ops.py
│           │   └── quantization.py
│           ├── core.py
│           ├── function.py
│           ├── grouped.py
│           ├── optimizers/
│           │   ├── __init__.py
│           │   ├── absmax_optimizer.py
│           │   ├── affine_optimizer.py
│           │   ├── hqq_optimizer.py
│           │   ├── max_optimizer.py
│           │   ├── optimizer.py
│           │   └── symmetric_optimizer.py
│           ├── packed.py
│           ├── qbits.py
│           ├── qbytes.py
│           ├── qtensor.py
│           ├── qtype.py
│           └── weights/
│               ├── __init__.py
│               ├── awq/
│               │   ├── __init__.py
│               │   ├── packed.py
│               │   └── qbits.py
│               ├── marlin/
│               │   ├── __init__.py
│               │   ├── fp8/
│               │   │   ├── __init__.py
│               │   │   ├── packed.py
│               │   │   └── qbits.py
│               │   ├── int4/
│               │   │   ├── __init__.py
│               │   │   ├── packed.py
│               │   │   └── qbits.py
│               │   └── permutations.py
│               ├── packing.py
│               ├── qbits.py
│               ├── qbytes.py
│               ├── quantization.py
│               ├── reordering.py
│               └── tinygemm/
│                   ├── __init__.py
│                   ├── packed.py
│                   └── qbits.py
├── pyproject.toml
├── setup.sh
└── tests/
    ├── cli/
    │   ├── cli_helpers.py
    │   └── test_quantize_cli.py
    ├── conftest.py
    ├── helpers.py
    ├── library/
    │   ├── test_extensions.py
    │   ├── test_mm.py
    │   ├── test_quantize.py
    │   └── test_unpack.py
    ├── models/
    │   ├── conftest.py
    │   ├── test_quantized_model_for_causal_lm.py
    │   └── test_quantized_model_for_pixart.py
    ├── nn/
    │   ├── test_calibrate.py
    │   ├── test_qattention.py
    │   ├── test_qconv2d.py
    │   ├── test_qlayernorm.py
    │   ├── test_qlinear.py
    │   └── test_qmodule.py
    ├── quantize/
    │   ├── test_quantize_mlp.py
    │   ├── test_quantize_patterns.py
    │   └── test_requantize.py
    └── tensor/
        ├── activations/
        │   ├── test_activations_compile.py
        │   ├── test_activations_dispatch.py
        │   └── test_activations_quantize.py
        ├── ops/
        │   ├── test_linear_dispatch.py
        │   └── test_mm_dispatch.py
        ├── optimizers/
        │   └── test_hqq_optimizer.py
        ├── test_absmax.py
        ├── test_packed_tensor.py
        └── weights/
            ├── optimized/
            │   ├── test_awq_packed_tensor.py
            │   ├── test_awq_weight_qbits_tensor.py
            │   ├── test_marlin_fp8_packed_tensor.py
            │   ├── test_marlin_int4_packed_tensor.py
            │   ├── test_marlin_int4_weight_qbits_tensor.py
            │   ├── test_marlin_qbytes_tensor.py
            │   ├── test_tinygemm_packed_tensor.py
            │   └── test_tinygemm_weight_qbits_tensor.py
            ├── test_weight_qbits_tensor.py
            ├── test_weight_qbits_tensor_dispatch.py
            ├── test_weight_qbits_tensor_instantiate.py
            ├── test_weight_qbits_tensor_quantize.py
            ├── test_weight_qbytes_tensor_backward.py
            ├── test_weight_qbytes_tensor_dispatch.py
            ├── test_weight_qbytes_tensor_instantiate.py
            ├── test_weight_qbytes_tensor_quantize.py
            ├── test_weight_qbytes_tensor_serialization.py
            └── weight_helpers.py

Download .txt

SYMBOL INDEX (641 symbols across 138 files)

FILE: bench/generation/evaluate_configurations.py
  function evaluate_model_configurations (line 26) | def evaluate_model_configurations(
  function main (line 64) | def main():

FILE: bench/generation/evaluate_model.py
  function calibrate (line 36) | def calibrate(model, tokenizer, batch_size, batches):
  function evaluate (line 51) | def evaluate(
  function main (line 86) | def main():

FILE: bench/generation/gen_barchart.py
  function save_bar_chart (line 23) | def save_bar_chart(title, labels, ylabel, series, save_path):
  function gen_barchart (line 50) | def gen_barchart(model_id, title, label, results, dtype):
  function main (line 76) | def main():

FILE: bench/generation/metrics/latency.py
  function latency (line 24) | def latency(model, tokenizer, device, batch_size=1, prompt_length=512, n...
  function get_device_memory (line 108) | def get_device_memory(device):

FILE: bench/generation/metrics/perplexity.py
  class Perplexity (line 23) | class Perplexity:
    method __init__ (line 28) | def __init__(self, model, tokenizer, dataset_path="wikitext", dataset_...
    method _prepare_data (line 55) | def _prepare_data(self):
    method softmax (line 74) | def softmax(logits):
    method calculate_perplexity (line 91) | def calculate_perplexity(self, n_ctx=512, n_batch=512):
    method _process_batch (line 128) | def _process_batch(self, i, n_ctx, n_batch, tokens, nll, count):
    method _compute_batch_logits (line 197) | def _compute_batch_logits(self, tokens, batch_start, batch_size):
  function perplexity (line 221) | def perplexity(

FILE: bench/generation/metrics/prediction.py
  function prediction_accuracy (line 22) | def prediction_accuracy(model, tokenizer, batch_size, samples=None):

FILE: bench/generation/setup/awq.py
  function prepare_inputs_for_generation (line 19) | def prepare_inputs_for_generation(input_ids, past_key_values=None, atten...
  function setup (line 69) | def setup(model_id: str, weights: str, activations: str, group_size: int...

FILE: bench/generation/setup/bnb.py
  function setup (line 19) | def setup(

FILE: bench/generation/setup/hqq.py
  function setup (line 21) | def setup(model_id: str, weights: str, activations: str, device: torch.d...

FILE: bench/generation/setup/quanto.py
  function calibrate (line 25) | def calibrate(model, tokenizer, batch_size, batches):
  function setup (line 40) | def setup(
  function keyword_to_qtype (line 71) | def keyword_to_qtype(k):

FILE: bench/kernels/benchmark.py
  function get_unpack_bench (line 26) | def get_unpack_bench(bits, device):
  function timing (line 36) | def timing(get_bench_func, device, iterations=10):
  function main (line 95) | def main():

FILE: bench/kernels/benchmark_marlin_fp8.py
  function run_benchmark (line 28) | def run_benchmark(
  function shape_generator (line 132) | def shape_generator():
  function shape_generator (line 137) | def shape_generator():

FILE: bench/kernels/benchmark_w4a16.py
  function benchmark (line 12) | def benchmark(f, warmup=1, iter=10):
  function get_problem (line 28) | def get_problem(m, n, k, groupsize=128):
  function benchmark_dense (line 44) | def benchmark_dense(A, B, m, n, k):
  function benchmark_awq (line 53) | def benchmark_awq(A, B, s, sz, m, n, k):
  function benchmark_marlin (line 64) | def benchmark_marlin(A, B, s, sz, m, n, k):
  function run_benchmark (line 87) | def run_benchmark(model, tokens=None):
  function main (line 130) | def main():

FILE: bench/torch_kernels/test_int_mm.py
  function main (line 21) | def main():

FILE: bench/torch_kernels/test_int_mm_inductor.py
  function mm (line 20) | def mm(a, b):

FILE: bench/torch_kernels/test_weight_int4pack_mm.py
  function _group_quantize_tensor (line 21) | def _group_quantize_tensor(w, n_bit=4, q_group_size=16):
  function main (line 64) | def main():

FILE: bench/torch_kernels/test_weight_int8pack_mm.py
  function main (line 21) | def main():

FILE: examples/nlp/text-classification/sst2/quantize_sst2_model.py
  function evaluate_model (line 28) | def evaluate_model(model, tokenizer, dataset, device, batch_size):
  function keyword_to_itype (line 38) | def keyword_to_itype(k):
  function main (line 42) | def main():

FILE: examples/nlp/text-generation/quantize_causal_lm_model.py
  function generate (line 26) | def generate(model, tokenizer, device, prompt, max_new_tokens):
  function calibrate (line 43) | def calibrate(model, tokenizer, dataset, device, batch_size, samples=None):
  function keyword_to_itype (line 56) | def keyword_to_itype(k):
  function main (line 65) | def main():

FILE: examples/speech/speech_recognition/quantize_asr_model.py
  function map_to_feats (line 31) | def map_to_feats(batch, processor):
  function transcribe_batch (line 42) | def transcribe_batch(batch, model, processor):
  function evaluate_model (line 51) | def evaluate_model(model, processor, dataset, metric: evaluate.Evaluatio...
  function keyword_to_itype (line 61) | def keyword_to_itype(k):
  function main (line 65) | def main():

FILE: examples/vision/StableDiffusion/quantize_StableDiffusion.py
  function load_pipeline (line 25) | def load_pipeline(torch_dtype, unet_dtype=None, device="cpu"):
  function run_inference (line 36) | def run_inference(pipe, batch_size=1):
  function benchmark_fn (line 45) | def benchmark_fn(f, *args, **kwargs):
  function bytes_to_giga_bytes (line 50) | def bytes_to_giga_bytes(bytes):
  function get_device_memory (line 54) | def get_device_memory(device):

FILE: examples/vision/image-classification/mnist/quantize_mnist_model.py
  function test (line 39) | def test(model, device, test_loader):
  function train (line 65) | def train(log_interval, model, device, train_loader, optimizer, epoch):
  function keyword_to_itype (line 89) | def keyword_to_itype(k):
  function main (line 93) | def main():

FILE: examples/vision/image-classification/pets/quantize_vit_model.py
  function test (line 29) | def test(model, device, test_loader):
  function keyword_to_itype (line 56) | def keyword_to_itype(k):
  function main (line 60) | def main():

FILE: examples/vision/object-detection/quantize_owl_model.py
  function detect (line 14) | def detect(model, processor, image, texts):
  function get_device_memory (line 52) | def get_device_memory(device):
  function keyword_to_qtype (line 66) | def keyword_to_qtype(k):
  function main (line 70) | def main():

FILE: examples/vision/text-to-image/quantize_pixart_sigma.py
  function load_pipeline (line 21) | def load_pipeline(model_id, torch_dtype, qtype=None, device="cpu"):
  function get_device_memory (line 34) | def get_device_memory(device):

FILE: external/awq/conftest.py
  function device (line 27) | def device(request):
  function pytest_configure (line 31) | def pytest_configure(config):
  function pytest_runtest_call (line 36) | def pytest_runtest_call(item):

FILE: external/awq/pack_intweight.py
  function pack_intweight (line 25) | def pack_intweight(unpacked_qweight, interleave, kstride):

FILE: external/awq/packing_utils.py
  function pack_awq (line 8) | def pack_awq(intweight: torch.Tensor, reorder=False):
  function unpack_awq (line 23) | def unpack_awq(qweight: torch.Tensor, bits: int):
  function reverse_awq_order (line 35) | def reverse_awq_order(iweights: torch.Tensor, bits: int):
  function pack_exllama (line 50) | def pack_exllama(iweights: torch.Tensor, izeros: torch.Tensor, bits: int):
  function unpack_reorder_pack (line 72) | def unpack_reorder_pack(qweight, qzeros, bits):
  function dequantize_gemm (line 91) | def dequantize_gemm(qweight, qzeros, scales, bits, group_size):

FILE: external/awq/test_awq_kernels.py
  function assert_similar (line 21) | def assert_similar(a, b, atol=None, rtol=None):
  function test_standalone_kernel (line 41) | def test_standalone_kernel(in_features, out_features, kernel):
  function test_integrated_kernel (line 103) | def test_integrated_kernel(in_features, out_features, kernel):

FILE: external/awq/test_awq_packing.py
  function test_awq_pack (line 28) | def test_awq_pack(in_features, out_features, reorder, random):
  function test_awq_pack_v2 (line 64) | def test_awq_pack_v2(in_features, out_features, random):

FILE: external/awq/test_awq_quantize.py
  function awq_quantize (line 7) | def awq_quantize(base, scales, zeros, group_size):
  function test_awq_quantize (line 24) | def test_awq_quantize(in_features, out_features):

FILE: external/smoothquant/smoothquant.py
  function get_act_scales (line 16) | def get_act_scales(model, tokenizer, dataset, num_samples=512, seq_len=5...
  function smooth_ln_fcs (line 53) | def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5):
  function smooth_lm (line 77) | def smooth_lm(model, scales, alpha=0.5):
  function main (line 111) | def main():

FILE: optimum/quanto/calibrate.py
  function _updated_scale (line 31) | def _updated_scale(scale, new_scale, momentum):
  function absmax_scale (line 37) | def absmax_scale(base: torch.Tensor, qtype: qtype = qint8, axis: Optiona...
  class Calibration (line 64) | class Calibration(TorchFunctionMode):
    method __init__ (line 81) | def __init__(self, *args, momentum: float = 0.9, streamline=True, debu...
    method __torch_function__ (line 90) | def __torch_function__(self, func, types, args=(), kwargs=None):
    method __enter__ (line 107) | def __enter__(self):
    method __exit__ (line 112) | def __exit__(self, exc_type, exc_val, exc_tb):
    method calibrate_input (line 120) | def calibrate_input(self, module: torch.nn.Module, input, momentum: fl...
    method calibrate_output (line 139) | def calibrate_output(
    method tag_outputs (line 176) | def tag_outputs(

FILE: optimum/quanto/library/extensions/__init__.py
  function _is_xpu_available (line 34) | def _is_xpu_available():

FILE: optimum/quanto/library/extensions/cpp/__init__.py
  function unpack_cpp (line 35) | def unpack_cpp(t: torch.Tensor, bits: int):

FILE: optimum/quanto/library/extensions/cpp/pybind_module.cpp
  function PYBIND11_MODULE (line 24) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: optimum/quanto/library/extensions/cpp/unpack.cpp
  function unpack_4bit (line 19) | static torch::Tensor unpack_4bit(torch::Tensor &t) {
  function unpack_2bit (line 27) | static torch::Tensor unpack_2bit(torch::Tensor &t) {
  function unpack (line 37) | torch::Tensor unpack(torch::Tensor &t, int bits) {

FILE: optimum/quanto/library/extensions/cuda/__init__.py
  function get_max_cuda_arch (line 25) | def get_max_cuda_arch():
  function unpack_cuda (line 78) | def unpack_cuda(t: torch.Tensor, bits: int):
  function gemm_f16i4_awq (line 98) | def gemm_f16i4_awq(
  function fp8_marlin_gemm (line 139) | def fp8_marlin_gemm(
  function gptq_marlin_repack (line 162) | def gptq_marlin_repack(
  function gemm_f16i4_marlin (line 177) | def gemm_f16i4_marlin(

FILE: optimum/quanto/library/extensions/cuda/awq/v2/semaphore.h
  class Semaphore (line 44) | class Semaphore

FILE: optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp
  function mul (line 28) | void mul(

FILE: optimum/quanto/library/extensions/cuda/pybind_module.cpp
  function PYBIND11_MODULE (line 30) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: optimum/quanto/library/extensions/extension.py
  class Extension (line 13) | class Extension(object):
    method __init__ (line 14) | def __init__(
    method lib (line 30) | def lib(self):
  function register_extension (line 60) | def register_extension(extension: Extension):
  function get_extension (line 65) | def get_extension(extension_type: str):
  function is_extension_available (line 77) | def is_extension_available(extension_type: str):

FILE: optimum/quanto/library/extensions/hip/__init__.py
  function unpack_hip (line 35) | def unpack_hip(t: torch.Tensor, bits: int):

FILE: optimum/quanto/library/extensions/hip/pybind_module.cpp
  function PYBIND11_MODULE (line 19) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: optimum/quanto/library/extensions/mps/__init__.py
  function unpack_mps (line 35) | def unpack_mps(t: torch.Tensor, bits: int):

FILE: optimum/quanto/library/extensions/mps/pybind_module.cpp
  function PYBIND11_MODULE (line 19) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: optimum/quanto/library/extensions/xpu/__init__.py
  function unpack_xpu (line 41) | def unpack_xpu(t: torch.Tensor, bits: int):
  function gemm_f16i4_awq (line 61) | def gemm_f16i4_awq(

FILE: optimum/quanto/library/extensions/xpu/pybind_module.cpp
  function PYBIND11_MODULE (line 25) | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {

FILE: optimum/quanto/library/qbytes_mm.py
  function qbytes_mm (line 25) | def qbytes_mm(activations: torch.Tensor, weights: torch.Tensor, output_s...
  function qbytes_int_mm (line 36) | def qbytes_int_mm(activations: torch.Tensor, weights: torch.Tensor, outp...
  function qbytes_int8pack_mm (line 53) | def qbytes_int8pack_mm(activations: torch.Tensor, weights: torch.Tensor,...
  function qbytes_mm_impl_default (line 67) | def qbytes_mm_impl_default(
  function qbytes_mm_impl_cuda (line 74) | def qbytes_mm_impl_cuda(activations: torch.Tensor, weights: torch.Tensor...
  function qbytes_mm_impl_cpu (line 92) | def qbytes_mm_impl_cpu(activations: torch.Tensor, weights: torch.Tensor,...
  function qbytes_mm_impl_mps (line 109) | def qbytes_mm_impl_mps(activations: torch.Tensor, weights: torch.Tensor,...

FILE: optimum/quanto/library/quantize.py
  function quantize_symmetric (line 28) | def quantize_symmetric(
  function quantize_affine (line 65) | def quantize_affine(

FILE: optimum/quanto/library/unpack.py
  function unpack (line 22) | def unpack(packed: torch.Tensor, bits: int) -> torch.Tensor:

FILE: optimum/quanto/models/__init__.py
  function is_transformers_available (line 21) | def is_transformers_available() -> bool:
  function is_diffusers_available (line 25) | def is_diffusers_available() -> bool:

FILE: optimum/quanto/models/diffusers_models.py
  class QuantizedDiffusersModel (line 44) | class QuantizedDiffusersModel(ModelHubMixin):
    method __init__ (line 48) | def __init__(self, model: ModelMixin):
    method __getattr__ (line 53) | def __getattr__(self, name: str) -> Any:
    method forward (line 61) | def forward(self, *args, **kwargs):
    method __call__ (line 64) | def __call__(self, *args, **kwargs):
    method _qmap_name (line 68) | def _qmap_name():
    method quantize (line 72) | def quantize(
    method from_pretrained (line 119) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os....
    method _save_pretrained (line 180) | def _save_pretrained(self, save_directory: Path) -> None:
  class QuantizedPixArtTransformer2DModel (line 189) | class QuantizedPixArtTransformer2DModel(QuantizedDiffusersModel):

FILE: optimum/quanto/models/shared_dict.py
  class ShardedStateDict (line 22) | class ShardedStateDict(Mapping):
    method __init__ (line 30) | def __init__(self, base_dir: str, tensor_index: Dict[str, str]):
    method __iter__ (line 35) | def __iter__(self):
    method __len__ (line 38) | def __len__(self):
    method __getitem__ (line 41) | def __getitem__(self, key: Any) -> Any:
    method __contains__ (line 49) | def __contains__(self, key: object) -> bool:
    method keys (line 52) | def keys(self):

FILE: optimum/quanto/models/transformers_models.py
  class QuantizedTransformersModel (line 38) | class QuantizedTransformersModel(ModelHubMixin):
    method __init__ (line 42) | def __init__(self, model: PreTrainedModel):
    method __getattr__ (line 47) | def __getattr__(self, name: str) -> Any:
    method forward (line 55) | def forward(self, *args, **kwargs):
    method __call__ (line 58) | def __call__(self, *args, **kwargs):
    method __repr__ (line 61) | def __repr__(self):
    method _qmap_name (line 65) | def _qmap_name():
    method quantize (line 69) | def quantize(
    method from_pretrained (line 115) | def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os....
    method _save_pretrained (line 165) | def _save_pretrained(self, save_directory: Path) -> None:
  class QuantizedModelForCausalLM (line 182) | class QuantizedModelForCausalLM(QuantizedTransformersModel):

FILE: optimum/quanto/nn/qconv2d.py
  class QConv2d (line 27) | class QConv2d(QModuleMixin, torch.nn.Conv2d):
    method qcreate (line 29) | def qcreate(
    method forward (line 54) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: optimum/quanto/nn/qlayernorm.py
  class QLayerNorm (line 27) | class QLayerNorm(QModuleMixin, torch.nn.LayerNorm):
    method qcreate (line 29) | def qcreate(
    method forward (line 52) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: optimum/quanto/nn/qlinear.py
  class QLinear (line 27) | class QLinear(QModuleMixin, torch.nn.Linear):
    method qcreate (line 29) | def qcreate(
    method forward (line 49) | def forward(self, input: torch.Tensor) -> torch.Tensor:

FILE: optimum/quanto/nn/qmodule.py
  function register_qmodule (line 44) | def register_qmodule(module_cls):
  function quantize_module (line 81) | def quantize_module(
  class QModuleMixin (line 94) | class QModuleMixin(ABC):
    method __init__ (line 95) | def __init__(
    method disable_output_quantization (line 143) | def disable_output_quantization(self):
    method _save_to_state_dict (line 147) | def _save_to_state_dict(self, destination, prefix, keep_vars):
    method _load_from_state_dict (line 161) | def _load_from_state_dict(
    method from_module (line 210) | def from_module(
    method qcreate (line 235) | def qcreate(
    method qweight (line 246) | def qweight(self):
    method qforward (line 281) | def qforward(self, input: torch.Tensor) -> torch.Tensor:
    method quantize_input (line 284) | def quantize_input(self, module: torch.nn.Module, input: torch.Tensor)...
    method quantize_output (line 296) | def quantize_output(
    method freeze (line 304) | def freeze(self):
    method frozen (line 311) | def frozen(self):

FILE: optimum/quanto/quantize.py
  function set_module_by_name (line 27) | def set_module_by_name(parent_module, name, child_module):
  function _quantize_submodule (line 37) | def _quantize_submodule(
  function quantize (line 55) | def quantize(
  function requantize (line 101) | def requantize(
  function freeze (line 143) | def freeze(model):
  function quantization_map (line 149) | def quantization_map(model: torch.nn.Module) -> Dict[str, Dict[str, str]]:

FILE: optimum/quanto/subpackage/commands/base.py
  class QuantoCommand (line 25) | class QuantoCommand(BaseOptimumCLICommand):

FILE: optimum/quanto/subpackage/commands/quantize.py
  function parse_quantize_args (line 32) | def parse_quantize_args(parser: "ArgumentParser"):
  class QuantizeCommand (line 95) | class QuantizeCommand(BaseOptimumCLICommand):
    method parse_args (line 97) | def parse_args(parser: "ArgumentParser"):
    method run (line 100) | def run(self):

FILE: optimum/quanto/tensor/activations/qbytes.py
  class ActivationQBytesQuantizer (line 28) | class ActivationQBytesQuantizer(Function):
    method forward (line 30) | def forward(ctx, base: torch.Tensor, qtype: qtype, scale: torch.Tensor...
    method backward (line 41) | def backward(ctx, gO):
  class ActivationQBytesTensor (line 46) | class ActivationQBytesTensor(QBytesTensor):
    method __new__ (line 48) | def __new__(cls, qtype, size, stride, data, scale, requires_grad=False):
    method __init__ (line 54) | def __init__(self, qtype, size, stride, data, scale, requires_grad=Fal...
    method quantize (line 58) | def quantize(cls, base: torch.Tensor, qtype: qtype, scale: torch.Tenso...
    method __tensor_flatten__ (line 61) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 71) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_dispatch__ (line 82) | def __torch_dispatch__(cls, op, types, args, kwargs=None):

FILE: optimum/quanto/tensor/activations/qbytes_ops.py
  function register_qbytestensor_op (line 34) | def register_qbytestensor_op(aten_ops: List[Callable]):
  function get_qbytestensor_op_dispatch (line 52) | def get_qbytestensor_op_dispatch(aten_op):
  function is_scalar (line 56) | def is_scalar(t):
  function _to_copy (line 61) | def _to_copy(op, t, dtype=None, **kwargs):
  function detach (line 70) | def detach(op, t):
  function cat (line 78) | def cat(op, inputs, dim=0):
  function lt (line 97) | def lt(op, input, other):
  function clone (line 109) | def clone(op, t, memory_format=torch.preserve_format):
  function copy_ (line 121) | def copy_(op, dest, src):
  function div (line 129) | def div(op, input, other):
  function neg (line 137) | def neg(op, input, *args, **kwargs):
  function unary_type_agnostic_op (line 154) | def unary_type_agnostic_op(op, input, *args, **kwargs):
  function is_same_size (line 164) | def is_same_size(op, input, other):
  function cannot_mm (line 170) | def cannot_mm(t: QTensor):
  function bmm (line 176) | def bmm(op, input, other):
  function mul (line 190) | def mul(op, input, other):
  function relu (line 200) | def relu(op, input):
  function _softmax (line 209) | def _softmax(op, input, dim, half_to_float):
  function stack (line 219) | def stack(op, inputs, dim=0):
  function split (line 237) | def split(op, input, *args, **kwargs):
  function transpose (line 248) | def transpose(op, input, *args):
  function transpose2d (line 257) | def transpose2d(op, input):
  function view (line 268) | def view(op, input, *shape):
  function where (line 277) | def where(op, condition, input, other):

FILE: optimum/quanto/tensor/activations/quantization.py
  function quantize_activation (line 24) | def quantize_activation(t: torch.Tensor, qtype: qtype, scale: torch.Tens...

FILE: optimum/quanto/tensor/core.py
  function dtype_info (line 22) | def dtype_info(dtype):
  function axis_to_dim (line 27) | def axis_to_dim(t, axis):

FILE: optimum/quanto/tensor/function.py
  class QuantizedLinearFunction (line 21) | class QuantizedLinearFunction(torch.autograd.Function):
    method forward (line 42) | def forward(ctx, input, other, bias=None):
    method backward (line 49) | def backward(ctx, gO):

FILE: optimum/quanto/tensor/grouped.py
  function grouped_shape (line 10) | def grouped_shape(shape: List, axis: int, group_size: int) -> List:
  function group (line 17) | def group(base: torch.Tensor, axis: int, group_size: int):
  function ungroup (line 39) | def ungroup(grouped: torch.Tensor, axis: int, orig_shape: torch.Size):

FILE: optimum/quanto/tensor/optimizers/absmax_optimizer.py
  class AbsmaxOptimizer (line 26) | class AbsmaxOptimizer(SymmetricOptimizer):
    method optimize (line 27) | def optimize(

FILE: optimum/quanto/tensor/optimizers/affine_optimizer.py
  class AffineOptimizer (line 27) | class AffineOptimizer(Optimizer):
    method __call__ (line 28) | def __call__(
    method optimize (line 63) | def optimize(self, base: torch.Tensor, qtype: qtype, axis: int) -> Tup...

FILE: optimum/quanto/tensor/optimizers/hqq_optimizer.py
  function shrink_lp_op (line 28) | def shrink_lp_op(x: torch.Tensor, beta: float, lp_norm: float) -> torch....
  class HqqOptimizer (line 37) | class HqqOptimizer(MaxOptimizer):
    method __init__ (line 46) | def __init__(
    method optimize (line 60) | def optimize(

FILE: optimum/quanto/tensor/optimizers/max_optimizer.py
  class MaxOptimizer (line 26) | class MaxOptimizer(AffineOptimizer):
    method optimize (line 27) | def optimize(

FILE: optimum/quanto/tensor/optimizers/optimizer.py
  class Optimizer (line 24) | class Optimizer(ABC):
    method __call__ (line 25) | def __call__(

FILE: optimum/quanto/tensor/optimizers/symmetric_optimizer.py
  class SymmetricOptimizer (line 26) | class SymmetricOptimizer(Optimizer):
    method __call__ (line 27) | def __call__(self, base: torch.Tensor, qtype: qtype, axis: Optional[in...
    method optimize (line 37) | def optimize(self, base: torch.Tensor, qmax: float, axis: Optional[int...

FILE: optimum/quanto/tensor/packed.py
  function pack_weights (line 24) | def pack_weights(intweights: torch.Tensor, bits: int) -> torch.Tensor:
  class PackedTensor (line 72) | class PackedTensor(torch.Tensor):
    method __new__ (line 74) | def __new__(cls, data, bits, size, stride, requires_grad=False):
    method __init__ (line 82) | def __init__(self, data, bits, size, stride, requires_grad=False):
    method __repr__ (line 86) | def __repr__(self):
    method pack (line 93) | def pack(cls, t, bits=4):
    method unpack (line 101) | def unpack(self):
    method bits (line 107) | def bits(self):
    method dtype (line 111) | def dtype(self):
    method load_from_state_dict (line 115) | def load_from_state_dict(state_dict, prefix, bits, size, stride, missi...
    method __tensor_flatten__ (line 125) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 132) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_dispatch__ (line 145) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
    method numpy (line 162) | def numpy(self):

FILE: optimum/quanto/tensor/qbits.py
  class QBitsDequantizer (line 27) | class QBitsDequantizer(Function):
    method forward (line 29) | def forward(ctx, t):
    method backward (line 52) | def backward(ctx, gO):
  class QBitsTensor (line 56) | class QBitsTensor(QTensor):
    method __init__ (line 57) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
    method __repr__ (line 64) | def __repr__(self):
    method dequantize (line 67) | def dequantize(self):

FILE: optimum/quanto/tensor/qbytes.py
  class QBytesDequantizer (line 23) | class QBytesDequantizer(Function):
    method forward (line 25) | def forward(ctx, t):
    method backward (line 34) | def backward(ctx, gO):
  class QBytesTensor (line 39) | class QBytesTensor(QTensor):
    method __init__ (line 40) | def __init__(self, qtype, axis, size, stride, data, scale, requires_gr...
    method __repr__ (line 45) | def __repr__(self):
    method dequantize (line 48) | def dequantize(self):

FILE: optimum/quanto/tensor/qtensor.py
  function qfallback (line 21) | def qfallback(callable, *args, **kwargs):
  class QTensor (line 32) | class QTensor(torch.Tensor):
    method __init__ (line 33) | def __init__(self, qtype, axis):
    method dequantize (line 37) | def dequantize(self):
    method save_to_state_dict (line 40) | def save_to_state_dict(self, destination, prefix, keep_vars):
    method axis (line 56) | def axis(self):
    method qtype (line 60) | def qtype(self):
    method numpy (line 63) | def numpy(self):
    method equal (line 66) | def equal(self, other):

FILE: optimum/quanto/tensor/qtype.py
  class qtype (line 21) | class qtype:
    method __str__ (line 32) | def __str__(self):
    method __hash__ (line 35) | def __hash__(self):
  function qint (line 42) | def qint(bits):
  function qfloat (line 55) | def qfloat(dtype: torch.dtype):

FILE: optimum/quanto/tensor/weights/awq/packed.py
  function pack (line 33) | def pack(unpacked: torch.Tensor, reorder=False):
  function reverse_awq_order (line 64) | def reverse_awq_order(t: torch.Tensor):
  function unpack (line 80) | def unpack(packed: torch.Tensor, reorder=False):
  function pack_v2 (line 100) | def pack_v2(unpacked: torch.Tensor) -> torch.Tensor:
  function unpack_v2 (line 156) | def unpack_v2(packed):
  class AWQPacking (line 204) | class AWQPacking(Enum):
  class AWQPackedTensor (line 209) | class AWQPackedTensor(torch.Tensor):
    method __new__ (line 211) | def __new__(cls, data, packing, reorder, size, stride, requires_grad=F...
    method __init__ (line 220) | def __init__(self, data, packing, reorder, size, stride, requires_grad...
    method __repr__ (line 225) | def __repr__(self):
    method pack (line 229) | def pack(cls, t, packing=AWQPacking.V1, reorder=False):
    method unpack (line 237) | def unpack(self):
    method dtype (line 243) | def dtype(self):
    method __tensor_flatten__ (line 246) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 258) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_dispatch__ (line 272) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
    method numpy (line 293) | def numpy(self):

FILE: optimum/quanto/tensor/weights/awq/qbits.py
  class AWQWeightQBitsDequantizer (line 30) | class AWQWeightQBitsDequantizer(Function):
    method forward (line 32) | def forward(ctx, t):
    method backward (line 49) | def backward(ctx, gO):
  class AWQWeightQBitsLinearFunction (line 53) | class AWQWeightQBitsLinearFunction(QuantizedLinearFunction):
    method forward (line 55) | def forward(ctx, input, other, bias):
  class AWQWeightQBitsTensor (line 77) | class AWQWeightQBitsTensor(WeightQBitsTensor):
    method __new__ (line 79) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale, s...
    method __init__ (line 87) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
    method dequantize (line 106) | def dequantize(self):
    method weight_qbits_tensor (line 109) | def weight_qbits_tensor(self):
    method __tensor_flatten__ (line 123) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 136) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_function__ (line 149) | def __torch_function__(cls, func, types, args=(), kwargs=None):

FILE: optimum/quanto/tensor/weights/marlin/fp8/packed.py
  function pack_fp8_as_int32 (line 22) | def pack_fp8_as_int32(fp8_tensor: torch.Tensor) -> torch.Tensor:
  function unpack_int32_to_fp8 (line 51) | def unpack_int32_to_fp8(int32_tensor: torch.Tensor) -> torch.Tensor:
  function get_scale_perms (line 71) | def get_scale_perms() -> torch.Tensor:
  function get_row_permutation (line 78) | def get_row_permutation(n_rows: int) -> torch.Tensor:
  function get_column_permutation (line 116) | def get_column_permutation(n_col: int) -> torch.Tensor:
  class MarlinF8PackedTensor (line 160) | class MarlinF8PackedTensor(torch.Tensor):
    method __new__ (line 161) | def __new__(cls, data, size, stride, requires_grad=False):
    method __init__ (line 169) | def __init__(self, data, size, stride, requires_grad=False):
    method __repr__ (line 172) | def __repr__(self):
    method pack (line 176) | def pack(cls, tensor: torch.Tensor):
    method unpack (line 189) | def unpack(self) -> torch.Tensor:
    method dtype (line 220) | def dtype(self):
    method __tensor_flatten__ (line 223) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 233) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_dispatch__ (line 245) | def __torch_dispatch__(cls, op, types, args, kwargs=None):

FILE: optimum/quanto/tensor/weights/marlin/fp8/qbits.py
  class MarlinF8QBytesLinearFunction (line 28) | class MarlinF8QBytesLinearFunction(QuantizedLinearFunction):
    method forward (line 30) | def forward(ctx, input, other, bias=None):
  class MarlinF8QBytesTensor (line 54) | class MarlinF8QBytesTensor(WeightQBytesTensor):
    method __new__ (line 56) | def __new__(cls, qtype, axis, size, stride, data, scale, requires_grad...
    method __init__ (line 63) | def __init__(self, qtype, axis, size, stride, data, scale, requires_gr...
    method dequantize (line 88) | def dequantize(self):
    method __repr__ (line 102) | def __repr__(self):
    method weight_qbytes_tensor (line 105) | def weight_qbytes_tensor(self):
    method __tensor_flatten__ (line 119) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 130) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_function__ (line 142) | def __torch_function__(cls, func, types, args=(), kwargs=None):

FILE: optimum/quanto/tensor/weights/marlin/int4/packed.py
  function _get_perm (line 19) | def _get_perm():
  function pack (line 59) | def pack(unpacked: torch.Tensor):
  function unpack (line 78) | def unpack(packed, orig_shape):
  class MarlinInt4PackedTensor (line 91) | class MarlinInt4PackedTensor(torch.Tensor):
    method __new__ (line 93) | def __new__(cls, data, size, stride, requires_grad=False):
    method __init__ (line 101) | def __init__(self, data, size, stride, requires_grad=False):
    method __repr__ (line 104) | def __repr__(self):
    method pack (line 108) | def pack(cls, t):
    method unpack (line 112) | def unpack(self):
    method dtype (line 116) | def dtype(self):
    method __tensor_flatten__ (line 119) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 128) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_dispatch__ (line 139) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
    method numpy (line 159) | def numpy(self):

FILE: optimum/quanto/tensor/weights/marlin/int4/qbits.py
  class MarlinQBitsDequantizer (line 31) | class MarlinQBitsDequantizer(Function):
    method forward (line 33) | def forward(ctx, t):
    method backward (line 49) | def backward(ctx, gO):
  class MarlinQBitsLinearFunction (line 53) | class MarlinQBitsLinearFunction(QuantizedLinearFunction):
    method forward (line 55) | def forward(ctx, input, other, bias):
  class MarlinInt4WeightQBitsTensor (line 72) | class MarlinInt4WeightQBitsTensor(WeightQBitsTensor):
    method __new__ (line 74) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale, s...
    method __init__ (line 82) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
    method dequantize (line 103) | def dequantize(self):
    method weight_qbits_tensor (line 106) | def weight_qbits_tensor(self):
    method __tensor_flatten__ (line 121) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 134) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_function__ (line 147) | def __torch_function__(cls, func, types, args=(), kwargs=None):

FILE: optimum/quanto/tensor/weights/marlin/permutations.py
  function _get_perms (line 28) | def _get_perms() -> Tuple[List[int], List[int]]:
  function _get_inverted_perms (line 39) | def _get_inverted_perms() -> Tuple[List[int], List[int]]:
  function marlin_permute (line 44) | def marlin_permute(t: torch.Tensor, reverse=False):

FILE: optimum/quanto/tensor/weights/packing.py
  function unpack_int32_to_uint8 (line 18) | def unpack_int32_to_uint8(packed: torch.Tensor, bits: int):

FILE: optimum/quanto/tensor/weights/qbits.py
  class WeightsQBitsQuantizer (line 34) | class WeightsQBitsQuantizer(Function):
    method forward (line 36) | def forward(
    method backward (line 60) | def backward(ctx, gO):
  class WeightQBitsTensor (line 65) | class WeightQBitsTensor(QBitsTensor):
    method create (line 67) | def create(qtype, axis, group_size, size, stride, data, scale, shift, ...
    method __new__ (line 141) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale, s...
    method __init__ (line 148) | def __init__(self, qtype, axis, group_size, size, stride, data, scale,...
    method quantize (line 154) | def quantize(
    method load_from_state_dict (line 167) | def load_from_state_dict(state_dict, prefix, qtype, axis, group_size, ...
    method optimize (line 201) | def optimize(self):
    method save_to_state_dict (line 223) | def save_to_state_dict(self, destination, prefix, keep_vars):
    method weight_qbits_tensor (line 230) | def weight_qbits_tensor(self):
    method __tensor_flatten__ (line 237) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 250) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_function__ (line 263) | def __torch_function__(cls, func, types, args=(), kwargs=None):
    method __torch_dispatch__ (line 290) | def __torch_dispatch__(cls, op, types, args, kwargs=None):

FILE: optimum/quanto/tensor/weights/qbytes.py
  class WeightQBytesQuantizer (line 31) | class WeightQBytesQuantizer(Function):
    method forward (line 33) | def forward(
    method backward (line 63) | def backward(ctx, gO):
  class WeightQBytesLinearFunction (line 68) | class WeightQBytesLinearFunction(QuantizedLinearFunction):
    method forward (line 70) | def forward(ctx, input, other, bias=None):
  class WeightQBytesTensor (line 85) | class WeightQBytesTensor(QBytesTensor):
    method create (line 87) | def create(
    method __new__ (line 146) | def __new__(cls, qtype, axis, size, stride, data, scale, activation_qt...
    method __init__ (line 152) | def __init__(self, qtype, axis, size, stride, data, scale, activation_...
    method quantize (line 157) | def quantize(
    method load_from_state_dict (line 169) | def load_from_state_dict(state_dict, prefix, qtype, axis, size, stride...
    method optimize (line 191) | def optimize(self):
    method save_to_state_dict (line 211) | def save_to_state_dict(self, destination, prefix, keep_vars):
    method weight_qbytes_tensor (line 218) | def weight_qbytes_tensor(self):
    method __tensor_flatten__ (line 225) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 237) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_function__ (line 250) | def __torch_function__(cls, func, types, args=(), kwargs=None):
    method __torch_dispatch__ (line 277) | def __torch_dispatch__(cls, op, types, args, kwargs=None):

FILE: optimum/quanto/tensor/weights/quantization.py
  function quantize_weight (line 27) | def quantize_weight(

FILE: optimum/quanto/tensor/weights/reordering.py
  function reorder (line 23) | def reorder(t: torch.Tensor, permutation: Union[torch.Tensor, List[int]]):
  function reverse (line 38) | def reverse(permutation: Union[torch.Tensor, List[int]]):

FILE: optimum/quanto/tensor/weights/tinygemm/packed.py
  class TinyGemmPackedTensor (line 25) | class TinyGemmPackedTensor(torch.Tensor):
    method __new__ (line 27) | def __new__(cls, data, size, stride, requires_grad=False):
    method __init__ (line 34) | def __init__(self, data, size, stride, requires_grad=False):
    method __repr__ (line 37) | def __repr__(self):
    method pack (line 41) | def pack(cls, t):
    method unpack (line 66) | def unpack(self):
    method dtype (line 98) | def dtype(self):
    method __tensor_flatten__ (line 101) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 111) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_dispatch__ (line 123) | def __torch_dispatch__(cls, op, types, args, kwargs=None):
    method numpy (line 147) | def numpy(self):

FILE: optimum/quanto/tensor/weights/tinygemm/qbits.py
  class TinyGemmQBitsDequantizer (line 30) | class TinyGemmQBitsDequantizer(Function):
    method forward (line 32) | def forward(ctx, t):
    method backward (line 38) | def backward(ctx, gO):
  class TinyGemmQBitsLinearFunction (line 42) | class TinyGemmQBitsLinearFunction(QuantizedLinearFunction):
    method forward (line 44) | def forward(ctx, input, other, bias):
  class TinyGemmWeightQBitsTensor (line 65) | class TinyGemmWeightQBitsTensor(WeightQBitsTensor):
    method __new__ (line 67) | def __new__(cls, qtype, axis, group_size, size, stride, data, scale_sh...
    method __init__ (line 82) | def __init__(self, qtype, axis, group_size, size, stride, data, scale_...
    method dequantize (line 111) | def dequantize(self):
    method weight_qbits_tensor (line 114) | def weight_qbits_tensor(self):
    method __tensor_flatten__ (line 130) | def __tensor_flatten__(self):
    method __tensor_unflatten__ (line 143) | def __tensor_unflatten__(inner_tensors, meta, outer_size, outer_stride):
    method __torch_function__ (line 156) | def __torch_function__(cls, func, types, args=(), kwargs=None):

FILE: tests/cli/test_quantize_cli.py
  function test_export_decoder_cli (line 26) | def test_export_decoder_cli(weights):

FILE: tests/conftest.py
  function device (line 29) | def device(request):
  function pytest_configure (line 33) | def pytest_configure(config):
  function pytest_runtest_call (line 38) | def pytest_runtest_call(item):

FILE: tests/helpers.py
  function torch_min_version (line 33) | def torch_min_version(v):
  function device_eq (line 46) | def device_eq(a, b):
  function random_tensor (line 54) | def random_tensor(shape, dtype=torch.float32, device="cpu"):
  function random_qactivation (line 65) | def random_qactivation(shape, qtype=qint8, dtype=torch.float32, device="...
  function random_qweight (line 71) | def random_qweight(shape, qtype, dtype=torch.float32, axis=0, group_size...
  function assert_similar (line 85) | def assert_similar(a, b, atol=None, rtol=None):
  function get_device_memory (line 102) | def get_device_memory(device):

FILE: tests/library/test_extensions.py
  function _is_xpu_available (line 10) | def _is_xpu_available():
  function test_extension_available (line 32) | def test_extension_available(extension_name):
  function test_extension_compilation (line 37) | def test_extension_compilation(extension_name):

FILE: tests/library/test_mm.py
  function test_qbytes_mm (line 35) | def test_qbytes_mm(batch_size, input_features, input_dtype, weight_dtype...
  function test_gemm_fp16_int4 (line 59) | def test_gemm_fp16_int4(batch_size, tokens, in_features, out_features):
  function test_fp8_marlin (line 112) | def test_fp8_marlin(tokens, in_features, out_features, dtype):
  function test_gemm_marlin_fp16_int4 (line 155) | def test_gemm_marlin_fp16_int4(batch_size, tokens, in_features, out_feat...

FILE: tests/library/test_quantize.py
  function test_symmetric_quantize_int (line 41) | def test_symmetric_quantize_int(input_shape, dtype, qtype, axis, device):
  function test_symmetric_quantize_float8 (line 63) | def test_symmetric_quantize_float8(input_shape, dtype, qtype, axis, devi...
  function test_affine_quantize (line 78) | def test_affine_quantize(input_shape, dtype, qtype, axis, group_size, sh...
  function test_affine_quantize_integer_tensor (line 107) | def test_affine_quantize_integer_tensor(dtype, qtype, device):

FILE: tests/library/test_unpack.py
  function test_unpack (line 24) | def test_unpack(bits, shape, device):

FILE: tests/models/conftest.py
  function staging (line 6) | def staging():
  function skip_if_staging (line 25) | def skip_if_staging(request):

FILE: tests/models/test_quantized_model_for_causal_lm.py
  function quantized_model_for_causal_lm (line 11) | def quantized_model_for_causal_lm(model_id, qtype, exclude, from_config=...
  function compare_models (line 49) | def compare_models(a_model, b_model):
  function test_quantized_model_for_causal_lm_base (line 79) | def test_quantized_model_for_causal_lm_base(model_id, qtype, exclude_lm_...
  function test_quantized_model_for_causal_lm_sharded (line 92) | def test_quantized_model_for_causal_lm_sharded():
  function test_causal_lm_base_push_to_hub (line 107) | def test_causal_lm_base_push_to_hub(staging, in_org):
  function test_quantized_model_load_state_dict_non_strict (line 134) | def test_quantized_model_load_state_dict_non_strict(model_id, qtype):

FILE: tests/models/test_quantized_model_for_pixart.py
  function quantized_model_for_pixart (line 11) | def quantized_model_for_pixart(qtype, exclude):
  function compare_models (line 40) | def compare_models(a_model, b_model):
  function test_quantized_model_for_pixart (line 80) | def test_quantized_model_for_pixart(qtype, exclude_proj_out):
  function test_push_to_hub (line 94) | def test_push_to_hub(staging, in_org):

FILE: tests/nn/test_calibrate.py
  function _test_calibrate_qlinear (line 23) | def _test_calibrate_qlinear(batch_size, tokens, embeddings, use_bias, ac...
  function test_calibrate_qlinear_activations_int8 (line 45) | def test_calibrate_qlinear_activations_int8(batch_size, tokens, embeddin...
  function test_calibrate_qlinear_activations_float8 (line 58) | def test_calibrate_qlinear_activations_float8(batch_size, tokens, embedd...
  function _test_calibrate_custom_module (line 62) | def _test_calibrate_custom_module(activations, device):
  function test_calibrate_custom_module_activations_int8 (line 88) | def test_calibrate_custom_module_activations_int8(device):
  function test_calibrate_custom_module_activations_float8 (line 98) | def test_calibrate_custom_module_activations_float8(activations, device):

FILE: tests/nn/test_qattention.py
  class RotaryEmbedding (line 27) | class RotaryEmbedding(nn.Module):
    method __init__ (line 28) | def __init__(self, dim, max_position_embeddings=2048, base=10000, devi...
    method _set_cos_sin_cache (line 42) | def _set_cos_sin_cache(self, seq_len, device, dtype):
    method forward (line 52) | def forward(self, x, seq_len=None):
  function rotate_half (line 63) | def rotate_half(x):
  function apply_rotary_pos_emb (line 70) | def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
  function repeat_kv (line 98) | def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
  class Attention (line 110) | class Attention(nn.Module):
    method __init__ (line 113) | def __init__(self, hidden_size=128, num_heads=4, max_position_embeddin...
    method _shape (line 130) | def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    method forward (line 133) | def forward(
  function _test_quantize_attention (line 174) | def _test_quantize_attention(device, dtype=torch.float32, weights=qint8,...
  function test_quantize_attention_weights_only (line 193) | def test_quantize_attention_weights_only(weights, device):
  function test_quantize_attention_weights_only_float8 (line 198) | def test_quantize_attention_weights_only_float8(device):
  function test_quantize_attention_activations_int8 (line 203) | def test_quantize_attention_activations_int8(weights, device):
  function test_quantize_attention_activations_float8 (line 214) | def test_quantize_attention_activations_float8(weights, activations, dev...

FILE: tests/nn/test_qconv2d.py
  function _test_quantize_conv2d (line 31) | def _test_quantize_conv2d(batch_size, img_shape, out_channels, use_bias,...
  function test_quantize_conv2d_float16_activations_int8 (line 59) | def test_quantize_conv2d_float16_activations_int8(batch_size, img_shape,...
  function test_quantize_conv2d_float32_activations_int8 (line 68) | def test_quantize_conv2d_float32_activations_int8(batch_size, img_shape,...
  function test_quantize_conv2d_float16_activations_float8 (line 83) | def test_quantize_conv2d_float16_activations_float8(
  function test_quantize_conv2d_float32_activations_float8 (line 100) | def test_quantize_conv2d_float32_activations_float8(
  function test_quantize_conv2d_float16_weight_only (line 111) | def test_quantize_conv2d_float16_weight_only(batch_size, img_shape, out_...
  function test_quantize_conv2d_float32_weight_only (line 120) | def test_quantize_conv2d_float32_weight_only(batch_size, img_shape, out_...
  function test_qconv2d_gradient (line 128) | def test_qconv2d_gradient(img_shape, out_channels, activations, weights,...

FILE: tests/nn/test_qlayernorm.py
  function _test_quantize_layernorm (line 23) | def _test_quantize_layernorm(batch_size, tokens, embeddings, affine, dty...
  function test_quantize_layernorm_float16_activations_int8 (line 47) | def test_quantize_layernorm_float16_activations_int8(batch_size, tokens,...
  function test_quantize_layernorm_float32_activations_int8 (line 54) | def test_quantize_layernorm_float32_activations_int8(batch_size, tokens,...
  function test_quantize_layernorm_float16_activations_float8 (line 67) | def test_quantize_layernorm_float16_activations_float8(batch_size, token...
  function test_quantize_layernorm_float32_activations_float8 (line 80) | def test_quantize_layernorm_float32_activations_float8(batch_size, token...
  function test_quantize_layernom_no_activation (line 84) | def test_quantize_layernom_no_activation():

FILE: tests/nn/test_qlinear.py
  function _test_quantize_linear (line 37) | def _test_quantize_linear(batch_size, tokens, embeddings, use_bias, weig...
  function test_quantize_linear_float16_activations_int8 (line 65) | def test_quantize_linear_float16_activations_int8(batch_size, tokens, em...
  function test_quantize_linear_float32_activations_int8 (line 73) | def test_quantize_linear_float32_activations_int8(batch_size, tokens, em...
  function test_quantize_linear_float16_activations_float8 (line 90) | def test_quantize_linear_float16_activations_float8(
  function test_quantize_linear_float32_activations_float8 (line 107) | def test_quantize_linear_float32_activations_float8(
  function test_quantize_linear_float16_weight_only (line 120) | def test_quantize_linear_float16_weight_only(batch_size, tokens, embeddi...
  function test_quantize_linear_float32_weight_only (line 134) | def test_quantize_linear_float32_weight_only(batch_size, tokens, embeddi...
  function test_qlinear_gradient (line 141) | def test_qlinear_gradient(tokens, embeddings, activations, weights, devi...
  function test_move_qlinear (line 182) | def test_move_qlinear(dtype, use_bias, weights, device):
  function test_qlinear_serialization (line 200) | def test_qlinear_serialization(features, use_bias, activations, weights,...

FILE: tests/nn/test_qmodule.py
  function test_qmodule_freeze (line 26) | def test_qmodule_freeze(in_features, out_features, use_bias, dtype):
  function test_qmodule_qtype_as_string (line 50) | def test_qmodule_qtype_as_string(weights, activations):

FILE: tests/quantize/test_quantize_mlp.py
  class MLP (line 40) | class MLP(torch.nn.Module):
    method __init__ (line 41) | def __init__(self, input_size, output_size, hidden_size):
    method forward (line 47) | def forward(self, inputs):
  function check_mlp (line 53) | def check_mlp(model, frozen):
  function _test_quantize_mlp (line 63) | def _test_quantize_mlp(weights, activations, optimizer, frozen, device, ...
  function test_quantize_mlp_weights_only (line 85) | def test_quantize_mlp_weights_only(weights, frozen, device):
  function test_quantize_mlp_weights_only_float8 (line 92) | def test_quantize_mlp_weights_only_float8(weights, frozen, device):
  function test_quantize_mlp_int8_activations (line 99) | def test_quantize_mlp_int8_activations(weights, frozen, device):
  function test_quantize_mlp_float8_activations (line 111) | def test_quantize_mlp_float8_activations(weights, activations, frozen, d...
  function test_quantized_mlp_device_memory (line 120) | def test_quantized_mlp_device_memory(weights, dtype, weights_only, device):
  function test_quantize_mlp_weights_only_optimizers (line 140) | def test_quantize_mlp_weights_only_optimizers(weights, optimizer, frozen...
  function test_quantize_mlp_wrong_optimizer (line 148) | def test_quantize_mlp_wrong_optimizer(weights, optimizer, device):

FILE: tests/quantize/test_quantize_patterns.py
  class MLP (line 25) | class MLP(torch.nn.Module):
    method __init__ (line 26) | def __init__(self, input_size, output_size, hidden_size):
    method forward (line 32) | def forward(self, inputs):
  class ClassificationModel (line 38) | class ClassificationModel(torch.nn.Module):
    method __init__ (line 39) | def __init__(self, input_size, output_size, hidden_size, classes):
    method forward (line 44) | def forward(self, inputs):
  function has_children (line 49) | def has_children(module: torch.nn.Module):
  function leaf_module_names (line 53) | def leaf_module_names(module: torch.nn.Module):
  function parent_module_names (line 57) | def parent_module_names(module: torch.nn.Module):
  function test_quantize_mlp_include_explicit_layers (line 61) | def test_quantize_mlp_include_explicit_layers():
  function test_quantize_mlp_exclude_explicit_layers (line 74) | def test_quantize_mlp_exclude_explicit_layers():
  function test_quantize_mlp_include_layer_patterns (line 87) | def test_quantize_mlp_include_layer_patterns():
  function test_quantize_mlp_exclude_layer_patterns (line 100) | def test_quantize_mlp_exclude_layer_patterns():

FILE: tests/quantize/test_requantize.py
  function save_and_reload_state_dict (line 28) | def save_and_reload_state_dict(state_dict, serialization):
  function test_requantize_serialized_model (line 50) | def test_requantize_serialized_model(
  function test_requantized_model_device_memory (line 78) | def test_requantized_model_device_memory(weights, dtype, serialization, ...

FILE: tests/tensor/activations/test_activations_compile.py
  function compile_for_device (line 22) | def compile_for_device(f, device):
  function test_compile_quantize_tensor (line 34) | def test_compile_quantize_tensor(input_shape, qtype, dtype, device):
  function test_compile_qtensor_to (line 51) | def test_compile_qtensor_to(device):

FILE: tests/tensor/activations/test_activations_dispatch.py
  function test_qactivation_mul_scalar (line 24) | def test_qactivation_mul_scalar(input_shape, scalar, device):
  function test_qactivation_relu (line 40) | def test_qactivation_relu(batch_size, tokens, embeddings, device):
  function test_qactivation_softmax (line 49) | def test_qactivation_softmax(batch_size, tokens, embeddings, device):
  function test_qactivation_view (line 58) | def test_qactivation_view(input_shape, device):
  function test_qactivation_cat (line 65) | def test_qactivation_cat(input_shape, device):
  function test_qactivation_transpose_2d (line 75) | def test_qactivation_transpose_2d(device):
  function test_qactivation_transpose (line 84) | def test_qactivation_transpose(device):

FILE: tests/tensor/activations/test_activations_quantize.py
  function test_symmetric_quantize_int (line 33) | def test_symmetric_quantize_int(input_shape, dtype, qtype, device):
  function test_symmetric_quantize_float8 (line 52) | def test_symmetric_quantize_float8(input_shape, dtype, qtype, device):

FILE: tests/tensor/ops/test_linear_dispatch.py
  function test_qactivation_qweight_linear (line 28) | def test_qactivation_qweight_linear(
  function test_linear_fp16_int4 (line 48) | def test_linear_fp16_int4(batch_size, tokens, embeddings, use_bias, devi...
  function test_linear_bf16_int4 (line 63) | def test_linear_bf16_int4(batch_size, tokens, embeddings, use_bias, devi...

FILE: tests/tensor/ops/test_mm_dispatch.py
  function test_qactivation_qweight_matmul (line 26) | def test_qactivation_qweight_matmul(dtype, in_features, hidden, out_feat...
  function test_qactivation_qactivation_bmm (line 38) | def test_qactivation_qactivation_bmm(dtype, batch_size, a_shape, b_shape...

FILE: tests/tensor/optimizers/test_hqq_optimizer.py
  function compare_quantized_tensor (line 28) | def compare_quantized_tensor(a, qtype, axis, group_size, scale, shift):
  function test_hqq_optimizer (line 42) | def test_hqq_optimizer(input_shape, dtype, qtype, axis, group_size, devi...

FILE: tests/tensor/test_absmax.py
  function test_absmax_scale (line 26) | def test_absmax_scale(input_shape, axis, dtype, qtype, device):

FILE: tests/tensor/test_packed_tensor.py
  function test_pack_tensor (line 26) | def test_pack_tensor(shape, bits, device):
  function test_packed_tensor_serialization (line 39) | def test_packed_tensor_serialization(bits, device):

FILE: tests/tensor/weights/optimized/test_awq_packed_tensor.py
  function test_pack_awq_tensor (line 30) | def test_pack_awq_tensor(in_features, out_features, random, packing, reo...
  function test_move_awq_tensor (line 51) | def test_move_awq_tensor(packing, reorder, device):

FILE: tests/tensor/weights/optimized/test_awq_weight_qbits_tensor.py
  function test_awq_weight_qbits_tensor_from_qbits_tensor (line 30) | def test_awq_weight_qbits_tensor_from_qbits_tensor(in_features, out_feat...
  function test_awq_weight_qbits_tensor_move (line 66) | def test_awq_weight_qbits_tensor_move(device):
  function _test_awq_weight_qbits_tensor_linear (line 94) | def _test_awq_weight_qbits_tensor_linear(
  function test_awq_weight_qbits_tensor_linear (line 124) | def test_awq_weight_qbits_tensor_linear(batch_size, tokens, in_features,...

FILE: tests/tensor/weights/optimized/test_marlin_fp8_packed_tensor.py
  function get_fp8_tensor (line 25) | def get_fp8_tensor(shape, device, random=False):
  function test_pack_marlin_fp8_tensor (line 44) | def test_pack_marlin_fp8_tensor(in_features, out_features, random):
  function test_move_marlin_fp8_tensor (line 55) | def test_move_marlin_fp8_tensor():

FILE: tests/tensor/weights/optimized/test_marlin_int4_packed_tensor.py
  function get_uint4_tensor (line 24) | def get_uint4_tensor(shape, device, random=False):
  function test_pack_marlin_int4_tensor (line 39) | def test_pack_marlin_int4_tensor(in_features, out_features, random):
  function test_move_marlin_int4_packed_tensor (line 50) | def test_move_marlin_int4_packed_tensor(device):

FILE: tests/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py
  function test_marlin_int4_weight_qbits_tensor_from_qbits_tensor (line 31) | def test_marlin_int4_weight_qbits_tensor_from_qbits_tensor(in_features, ...
  function test_marlin_int4_weight_qbits_tensor_move (line 67) | def test_marlin_int4_weight_qbits_tensor_move(device):
  function _test_marlin_int4_weight_qbits_tensor_linear (line 96) | def _test_marlin_int4_weight_qbits_tensor_linear(
  function test_marlin_int4_weight_qbits_tensor_linear (line 125) | def test_marlin_int4_weight_qbits_tensor_linear(batch_size, tokens, in_f...
  function test_marlin_int4_weight_qbits_tensor_linear_failing (line 144) | def test_marlin_int4_weight_qbits_tensor_linear_failing(batch_size, toke...

FILE: tests/tensor/weights/optimized/test_marlin_qbytes_tensor.py
  function test_pack_unpack (line 29) | def test_pack_unpack(in_features: int, out_features: int):

FILE: tests/tensor/weights/optimized/test_tinygemm_packed_tensor.py
  function test_pack_tinygemm_tensor (line 29) | def test_pack_tinygemm_tensor(in_features, out_features, random, device):
  function test_move_tinygemm_packed_tensor (line 53) | def test_move_tinygemm_packed_tensor(device):

FILE: tests/tensor/weights/optimized/test_tinygemm_weight_qbits_tensor.py
  function test_tinygemm_weight_qbits_tensor_from_qbits_tensor (line 28) | def test_tinygemm_weight_qbits_tensor_from_qbits_tensor(in_features, out...
  function test_tinygemm_weight_qbits_tensor_move (line 71) | def test_tinygemm_weight_qbits_tensor_move(device):
  function test_tinygemm_weight_qbits_tensor_linear (line 101) | def test_tinygemm_weight_qbits_tensor_linear(batch_size, tokens, embeddi...

FILE: tests/tensor/weights/test_weight_qbits_tensor.py
  function test_weight_qbits_tensor_serialization (line 26) | def test_weight_qbits_tensor_serialization(qtype, axis):
  function test_weight_qbits_tensor_requires_grad (line 43) | def test_weight_qbits_tensor_requires_grad(qtype, axis, group_size, devi...
  function test_weight_qbits_tensor_backward (line 54) | def test_weight_qbits_tensor_backward(qtype, axis, group_size, device):

FILE: tests/tensor/weights/test_weight_qbits_tensor_dispatch.py
  function test_qbitstensor_to_device (line 25) | def test_qbitstensor_to_device(dtype, group_size, device):
  function test_qbitstensor_detach (line 45) | def test_qbitstensor_detach():
  function test_qbitstensor_equal (line 54) | def test_qbitstensor_equal(dtype, qtype, axis, device):
  function test_weight_qbits_tensor_linear (line 68) | def test_weight_qbits_tensor_linear(dtype, batch_size, tokens, in_featur...
  function test_weight_qbits_tensor_linear_gpu (line 82) | def test_weight_qbits_tensor_linear_gpu(dtype, batch_size, tokens, in_fe...

FILE: tests/tensor/weights/test_weight_qbits_tensor_instantiate.py
  function random_data_scale_shift (line 23) | def random_data_scale_shift(input_shape, dtype, qtype, axis, group_size):
  function test_weight_qbits_tensor_instantiate (line 40) | def test_weight_qbits_tensor_instantiate(input_shape, dtype, qtype, axis...
  function test_weight_qbits_tensor_equal (line 56) | def test_weight_qbits_tensor_equal(input_shape, dtype, qtype, axis, grou...

FILE: tests/tensor/weights/test_weight_qbits_tensor_quantize.py
  function test_weight_qbits_tensor_quantize (line 33) | def test_weight_qbits_tensor_quantize(input_shape, dtype, qtype, axis, g...
  function test_weight_qbits_tensor_quantize_integer_tensor (line 58) | def test_weight_qbits_tensor_quantize_integer_tensor(dtype, qtype, device):

FILE: tests/tensor/weights/test_weight_qbytes_tensor_backward.py
  function test_weight_qbytes_tensor_requires_grad (line 22) | def test_weight_qbytes_tensor_requires_grad(device):
  function test_weight_qbytes_tensor_backward (line 30) | def test_weight_qbytes_tensor_backward(device):
  function test_weight_qbytes_tensor_chained_backward (line 41) | def test_weight_qbytes_tensor_chained_backward(device):

FILE: tests/tensor/weights/test_weight_qbytes_tensor_dispatch.py
  function test_weight_qytes_tensor_to_device (line 8) | def test_weight_qytes_tensor_to_device(device):
  function test_weight_qbytes_tensor_equal (line 20) | def test_weight_qbytes_tensor_equal(dtype, qtype, axis, device):
  function test_weight_qbytes_tensor_transpose_contiguous (line 30) | def test_weight_qbytes_tensor_transpose_contiguous(axis, qtype, device):
  function test_weight_qbytes_tensor_transposed_stride (line 43) | def test_weight_qbytes_tensor_transposed_stride(axis, qtype, device):

FILE: tests/tensor/weights/test_weight_qbytes_tensor_instantiate.py
  function random_data_scale (line 22) | def random_data_scale(input_shape, dtype, qtype):
  function test_qbytestensor_instantiate (line 37) | def test_qbytestensor_instantiate(input_shape, dtype, qtype, device):
  function test_qbytestensor_equal (line 53) | def test_qbytestensor_equal(input_shape, dtype, qtype, device):

FILE: tests/tensor/weights/test_weight_qbytes_tensor_quantize.py
  function test_symmetric_quantize_int (line 38) | def test_symmetric_quantize_int(input_shape, dtype, qtype, axis, device):
  function test_symmetric_quantize_float8 (line 62) | def test_symmetric_quantize_float8(input_shape, dtype, qtype, axis, devi...
  function test_quantize_weight_axis_dim_1 (line 74) | def test_quantize_weight_axis_dim_1(axis, device):

FILE: tests/tensor/weights/test_weight_qbytes_tensor_serialization.py
  function test_weights_qbytes_tensor_serialization (line 28) | def test_weights_qbytes_tensor_serialization(input_shape, qtype, dtype, ...

FILE: tests/tensor/weights/weight_helpers.py
  function check_weight_qtensor_linear (line 19) | def check_weight_qtensor_linear(qweight, batch_size, tokens, use_bias, r...

Download .json

Condensed preview — 207 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (827K chars).

[
  {
    "path": ".github/CODEOWNERS",
    "chars": 20,
    "preview": "* @dacorvo @sunmarc\n"
  },
  {
    "path": ".github/PULL_REQUEST_TEMPLATE.md",
    "chars": 1525,
    "preview": "# What does this PR do?\n\n<!--\nCongratulations! You've made it this far! You're not quite done yet though.\n\nOnce merged, "
  },
  {
    "path": ".github/workflows/check-commits.yml",
    "chars": 362,
    "preview": "name: Check Commits\n\non: [workflow_call]\n\njobs:\n  build:\n    name: Check commits\n    runs-on: ubuntu-latest\n    steps:\n "
  },
  {
    "path": ".github/workflows/linux-cpu-tests.yml",
    "chars": 2207,
    "preview": "name: Linux CPU tests\n\non:\n  push:\n    branches:\n      - main\n    paths:\n      - \"optimum/quanto/**\"\n      - \"tests/**\"\n"
  },
  {
    "path": ".github/workflows/linux-cuda-tests.yml",
    "chars": 1375,
    "preview": "name: Linux CUDA tests\n\non:\n  push:\n    branches:\n      - main\n    paths:\n      - \"optimum/quanto/**\"\n      - \"tests/**\""
  },
  {
    "path": ".github/workflows/linux-examples.yml",
    "chars": 2355,
    "preview": "name: Linux examples (CPU, CUDA)\n\non:\n  push:\n    branches:\n      - main\n    paths:\n      - \"optimum/quanto/**\"\n      - "
  },
  {
    "path": ".github/workflows/python-quality.yml",
    "chars": 501,
    "preview": "name: Python code quality\n\non: [workflow_call]\n\njobs:\n  check_code_quality:\n    runs-on: ubuntu-latest\n\n    steps:\n     "
  },
  {
    "path": ".github/workflows/security.yml",
    "chars": 1036,
    "preview": "name: Security Checks\n\non:\n  push:\n\npermissions:\n  contents: read\n\njobs:\n  secrets:\n    runs-on: ubuntu-latest\n    steps"
  },
  {
    "path": ".github/workflows/stale.yml",
    "chars": 941,
    "preview": "name: 'Close stale issues and PRs'\non:\n  schedule:\n    - cron: '30 1 * * *'\n  workflow_dispatch:\n\npermissions:\n  issues:"
  },
  {
    "path": ".gitignore",
    "chars": 54,
    "preview": "__pycache__\n.pytest_cache\n*.egg-info\ndist\n.venv\nbuild/"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 10213,
    "preview": "<!---\nCopyright 2024 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the \"Li"
  },
  {
    "path": "LICENSE",
    "chars": 11419,
    "preview": "Copyright 2023 - The Hugging Face team. All rights reserved.\n\n                                 Apache License\n          "
  },
  {
    "path": "Makefile",
    "chars": 252,
    "preview": ".PHONY: check test style\n\ncheck_dirs := optimum tests bench examples\n\ncheck:\n\truff check --show-fixes ${check_dirs}\n\truf"
  },
  {
    "path": "README.md",
    "chars": 12178,
    "preview": "# Optimum Quanto\n\n> This project is currently in maintenance mode. We accept pull requests only for minor bug fixes, doc"
  },
  {
    "path": "bench/generation/README.md",
    "chars": 3403,
    "preview": "# Quanto generation benchmark\n\nThis repository contains scripts to evaluate the performances of quantized models using t"
  },
  {
    "path": "bench/generation/evaluate_configurations.py",
    "chars": 4421,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/evaluate_many_models.sh",
    "chars": 658,
    "preview": "#!/bin/bash\n# Absolute path to this script, e.g. /home/user/bin/foo.sh\nSCRIPT=$(readlink -f \"$0\")\n# Absolute path this s"
  },
  {
    "path": "bench/generation/evaluate_model.py",
    "chars": 5134,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/gen_barchart.py",
    "chars": 2983,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/metrics/__init__.py",
    "chars": 606,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/metrics/latency.py",
    "chars": 4189,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/metrics/perplexity.py",
    "chars": 7439,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/metrics/prediction.py",
    "chars": 1696,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/setup/__init__.py",
    "chars": 606,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/setup/awq.py",
    "chars": 4238,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/setup/bnb.py",
    "chars": 1676,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/setup/hqq.py",
    "chars": 1654,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/generation/setup/quanto.py",
    "chars": 2642,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/kernels/benchmark.py",
    "chars": 4134,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/kernels/benchmark_marlin_fp8.py",
    "chars": 5215,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/kernels/benchmark_w4a16.py",
    "chars": 5850,
    "preview": "# From: https://github.com/IST-DASLab/marlin/blob/master/bench.py\nimport argparse\nimport time\n\nimport torch\n\nfrom optimu"
  },
  {
    "path": "bench/torch_kernels/README.md",
    "chars": 88,
    "preview": "This contains a few scripts to test pytorch kernels that are relevant for quantization.\n"
  },
  {
    "path": "bench/torch_kernels/test_int_mm.py",
    "chars": 2700,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/torch_kernels/test_int_mm_inductor.py",
    "chars": 1111,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/torch_kernels/test_weight_int4pack_mm.py",
    "chars": 4939,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "bench/torch_kernels/test_weight_int8pack_mm.py",
    "chars": 2506,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "examples/nlp/text-classification/sst2/quantize_sst2_model.py",
    "chars": 4158,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "examples/nlp/text-generation/quantize_causal_lm_model.py",
    "chars": 5479,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "examples/speech/speech_recognition/quantize_asr_model.py",
    "chars": 5079,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "examples/speech/speech_recognition/requirements.txt",
    "chars": 46,
    "preview": "transformers\nevaluate\nlibrosa\nsoundfile\njiwer\n"
  },
  {
    "path": "examples/vision/StableDiffusion/README.md",
    "chars": 1566,
    "preview": "# Quantize Stable Diffusion examples\n\n## Running locally with PyTorch\n\n### Installing the dependencies\n\nBefore running t"
  },
  {
    "path": "examples/vision/StableDiffusion/quantize_StableDiffusion.py",
    "chars": 3910,
    "preview": "import argparse\nimport gc\n\nimport torch\nimport torch.utils.benchmark as benchmark\nfrom diffusers import DiffusionPipelin"
  },
  {
    "path": "examples/vision/StableDiffusion/requirements.txt",
    "chars": 52,
    "preview": "quanto\ndiffusers\ntorch\ntransformers\naccelerate\nwandb"
  },
  {
    "path": "examples/vision/image-classification/mnist/quantize_mnist_model.py",
    "chars": 6640,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "examples/vision/image-classification/pets/quantize_vit_model.py",
    "chars": 4622,
    "preview": "import argparse\nimport time\nfrom tempfile import NamedTemporaryFile\n\nimport torch\nimport torch.nn.functional as F\nfrom a"
  },
  {
    "path": "examples/vision/object-detection/quantize_owl_model.py",
    "chars": 4464,
    "preview": "import argparse\nimport gc\n\nimport numpy as np\nimport requests\nimport torch\nfrom PIL import Image\nfrom transformers impor"
  },
  {
    "path": "examples/vision/text-to-image/quantize_pixart_sigma.py",
    "chars": 2940,
    "preview": "import argparse\nimport gc\n\nimport torch\nfrom diffusers import DiffusionPipeline\n\nfrom optimum.quanto import freeze, qflo"
  },
  {
    "path": "external/awq/conftest.py",
    "chars": 1512,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "external/awq/pack_intweight.py",
    "chars": 2711,
    "preview": "# MIT License\n#\n# Copyright (c) 2023 MIT HAN Lab\n#\n# Permission is hereby granted, free of charge, to any person obtaini"
  },
  {
    "path": "external/awq/packing_utils.py",
    "chars": 3443,
    "preview": "import torch\n\n\nAWQ_ORDER = [0, 2, 4, 6, 1, 3, 5, 7]\nAWQ_REVERSE_ORDER = [0, 4, 1, 5, 2, 6, 3, 7]\n\n\ndef pack_awq(intweigh"
  },
  {
    "path": "external/awq/test_awq_kernels.py",
    "chars": 8278,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "external/awq/test_awq_packing.py",
    "chars": 3499,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "external/awq/test_awq_quantize.py",
    "chars": 2401,
    "preview": "import pytest\nimport torch\n\nfrom optimum.quanto import AffineQuantizer, MaxOptimizer, qint4, ungroup\n\n\ndef awq_quantize("
  },
  {
    "path": "external/smoothquant/README.md",
    "chars": 662,
    "preview": "# SmoothQuant original conversion script\n\nThis converts an OPT or Bloom [🤗 transformers](https://github.com/huggingface/"
  },
  {
    "path": "external/smoothquant/smoothquant.py",
    "chars": 5678,
    "preview": "import argparse\nimport functools\nimport os\n\nimport torch\nimport torch.nn as nn\nfrom datasets import load_dataset\nfrom tq"
  },
  {
    "path": "optimum/quanto/__init__.py",
    "chars": 767,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/calibrate.py",
    "chars": 8353,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/README.md",
    "chars": 353,
    "preview": "# Quanto operations library\n\nThis contains the `quanto::` operations, available in python under `torch.ops.quanto`.\n\nTo "
  },
  {
    "path": "optimum/quanto/library/__init__.py",
    "chars": 704,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/extensions/README.md",
    "chars": 1231,
    "preview": "# Quanto library extensions\n\nThis folder contains device-specific `quanto::` operations.\n\nImplementations can be provide"
  },
  {
    "path": "optimum/quanto/library/extensions/__init__.py",
    "chars": 1281,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/extensions/cpp/README.md",
    "chars": 419,
    "preview": "# Quanto generic C++ extension\n\nKernels in this extension must use only the C++ syntax.\n\nThey can use any pytorch operat"
  },
  {
    "path": "optimum/quanto/library/extensions/cpp/__init__.py",
    "chars": 1007,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/extensions/cpp/pybind_module.cpp",
    "chars": 1175,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/cpp/unpack.cpp",
    "chars": 1535,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/cpp/unpack.h",
    "chars": 700,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/README.md",
    "chars": 437,
    "preview": "# Quanto generic CUDA extension\n\nKernels in this extension can use both the C++ and CUDA syntax.\n\nThey can use any pytor"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/__init__.py",
    "chars": 6156,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/awq/dequantize.cuh",
    "chars": 3931,
    "preview": "/*\nModified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/c"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.cu",
    "chars": 50002,
    "preview": "#include <cuda_fp16.h>\n#include \"semaphore.h\"\n#include \"gemm_cuda.h\"\n#include \"../dequantize.cuh\"\n#include <torch/extens"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/awq/v2/gemm_cuda.h",
    "chars": 156,
    "preview": "#include <torch/extension.h>\n\ntorch::Tensor awq_v2_gemm_f16i4(torch::Tensor _in_feats, torch::Tensor _kernel, torch::Ten"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.cu",
    "chars": 11626,
    "preview": "/*\n * Modified from NVIDIA [TRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/tree/d37b507f41a87457fe9f10f7459d08f5db23574"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/awq/v2/gemv_cuda.h",
    "chars": 248,
    "preview": "#pragma once\n#include <torch/extension.h>\n\ntorch::Tensor awq_v2_gemv_f16i4(\n    torch::Tensor _in_feats,\n    torch::Tens"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/awq/v2/semaphore.h",
    "chars": 3886,
    "preview": "/***************************************************************************************************\n * Copyright (c) 20"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/COPYRIGHT",
    "chars": 751,
    "preview": "These kernels were vendored from VLLM. The Marlin kernels were developed\nby Elias Frantar and extended by Neural Magic.\n"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cu",
    "chars": 51215,
    "preview": "/*\n * Modified by Neural Magic\n * Copyright (C) Marlin.2024 Elias Frantar\n *\n * Licensed under the Apache License, Versi"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/fp8_marlin.cuh",
    "chars": 495,
    "preview": "// #pragma once\n#include <torch/all.h>\n#include <stdint.h>\n\n\n// #ifndef _fp8_marlin_cuh\n// #define _fp8_marlin_cuh\n\n// #"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin.cuh",
    "chars": 2051,
    "preview": "#pragma once\n\n#include <torch/all.h>\n\n#include <ATen/cuda/CUDAContext.h>\n#include <c10/cuda/CUDAGuard.h>\n#include <cuda."
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_dtypes.cuh",
    "chars": 1933,
    "preview": "\n#ifndef _data_types_cuh\n#define _data_types_cuh\n#include \"gptq_marlin.cuh\"\n#include <cuda_fp16.h>\n#include <cuda_bf16.h"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cu",
    "chars": 11605,
    "preview": "#include \"gptq_marlin.cuh\"\n\nnamespace gptq_marlin {\n\nstatic constexpr int repack_stages = 8;\n\nstatic constexpr int repac"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/gptq_marlin_repack.cuh",
    "chars": 342,
    "preview": "#include <torch/library.h>\n#include <torch/all.h>\n#include <stdint.h>\n\n#ifndef _gptq_marlin_repack_cuh\n#define _gptq_mar"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.cpp",
    "chars": 2270,
    "preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda.h",
    "chars": 922,
    "preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cu",
    "chars": 35383,
    "preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/marlin/marlin_cuda_kernel.cuh",
    "chars": 1005,
    "preview": "/*\n * Copyright (C) Marlin.2024 Elias Frantar (elias.frantar@ist.ac.at)\n *\n * Licensed under the Apache License, Version"
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/pybind_module.cpp",
    "chars": 1734,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/unpack.cu",
    "chars": 2937,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/cuda/unpack.h",
    "chars": 700,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/extension.py",
    "chars": 2671,
    "preview": "import os\nimport shutil\nimport warnings\nfrom typing import List\n\nimport torch\nfrom torch.utils.cpp_extension import load"
  },
  {
    "path": "optimum/quanto/library/extensions/hip/__init__.py",
    "chars": 1014,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/extensions/hip/pybind_module.cpp",
    "chars": 754,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/hip/unpack.cu",
    "chars": 2937,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/hip/unpack.h",
    "chars": 700,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/mps/README.md",
    "chars": 379,
    "preview": "# Quanto Metal Performance Shaders extension\n\nTo add a new implementation for an operation defined in `library./ops.py`:"
  },
  {
    "path": "optimum/quanto/library/extensions/mps/__init__.py",
    "chars": 1011,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/extensions/mps/pybind_module.cpp",
    "chars": 754,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/mps/unpack.h",
    "chars": 710,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/mps/unpack.mm",
    "chars": 6396,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/xpu/__init__.py",
    "chars": 2452,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n# Copyright 2024 Intel Corporation. All rights reserved.\n#\n#"
  },
  {
    "path": "optimum/quanto/library/extensions/xpu/pybind_module.cpp",
    "chars": 1230,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/xpu/unpack.h",
    "chars": 700,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n//\n// Licensed under the Apache License, Version 2.0 (the \""
  },
  {
    "path": "optimum/quanto/library/extensions/xpu/unpack.sycl",
    "chars": 4848,
    "preview": "// Copyright 2024 The HuggingFace Team. All rights reserved.\n// Copyright 2024 Intel Corporation. All rights reserved.\n/"
  },
  {
    "path": "optimum/quanto/library/qbytes_mm.py",
    "chars": 5306,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/quantize.py",
    "chars": 3047,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/library/unpack.py",
    "chars": 1887,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/models/__init__.py",
    "chars": 1070,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/models/diffusers_models.py",
    "chars": 8033,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/models/shared_dict.py",
    "chars": 1761,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/models/transformers_models.py",
    "chars": 8275,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/nn/__init__.py",
    "chars": 702,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/nn/qconv2d.py",
    "chars": 1766,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/nn/qlayernorm.py",
    "chars": 1797,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/nn/qlinear.py",
    "chars": 1550,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/nn/qmodule.py",
    "chars": 12393,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/quantize.py",
    "chars": 6438,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/subpackage/__init__.py",
    "chars": 631,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/subpackage/commands/__init__.py",
    "chars": 627,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/subpackage/commands/base.py",
    "chars": 1152,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/subpackage/commands/quantize.py",
    "chars": 4258,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/__init__.py",
    "chars": 813,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/activations/__init__.py",
    "chars": 50,
    "preview": "from .qbytes import *\nfrom .quantization import *\n"
  },
  {
    "path": "optimum/quanto/tensor/activations/qbytes.py",
    "chars": 3537,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/activations/qbytes_ops.py",
    "chars": 10495,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/activations/quantization.py",
    "chars": 1352,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/core.py",
    "chars": 928,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/function.py",
    "chars": 2428,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/grouped.py",
    "chars": 2183,
    "preview": "import math\nfrom typing import List\n\nimport torch\n\n\n__all__ = [\"group\", \"ungroup\", \"grouped_shape\"]\n\n\ndef grouped_shape("
  },
  {
    "path": "optimum/quanto/tensor/optimizers/__init__.py",
    "chars": 789,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/optimizers/absmax_optimizer.py",
    "chars": 1279,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/optimizers/affine_optimizer.py",
    "chars": 2554,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/optimizers/hqq_optimizer.py",
    "chars": 3202,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/optimizers/max_optimizer.py",
    "chars": 1312,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/optimizers/optimizer.py",
    "chars": 939,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/optimizers/symmetric_optimizer.py",
    "chars": 1344,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/packed.py",
    "chars": 6236,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/qbits.py",
    "chars": 2190,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/qbytes.py",
    "chars": 1559,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/qtensor.py",
    "chars": 3251,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/qtype.py",
    "chars": 1931,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/__init__.py",
    "chars": 71,
    "preview": "from .qbits import *\nfrom .qbytes import *\nfrom .quantization import *\n"
  },
  {
    "path": "optimum/quanto/tensor/weights/awq/__init__.py",
    "chars": 43,
    "preview": "from .packed import *\nfrom .qbits import *\n"
  },
  {
    "path": "optimum/quanto/tensor/weights/awq/packed.py",
    "chars": 11394,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/awq/qbits.py",
    "chars": 6998,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/__init__.py",
    "chars": 67,
    "preview": "from .fp8 import *\nfrom .int4 import *\nfrom .permutations import *\n"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/fp8/__init__.py",
    "chars": 43,
    "preview": "from .packed import *\nfrom .qbits import *\n"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/fp8/packed.py",
    "chars": 9220,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/fp8/qbits.py",
    "chars": 7032,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/int4/__init__.py",
    "chars": 43,
    "preview": "from .packed import *\nfrom .qbits import *\n"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/int4/packed.py",
    "chars": 6050,
    "preview": "import ast\nfrom copy import copy\n\nimport numpy as np\nimport torch\nfrom torch.utils import _pytree as pytree\n\nfrom ...pac"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/int4/qbits.py",
    "chars": 6847,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/marlin/permutations.py",
    "chars": 1668,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/packing.py",
    "chars": 1502,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/qbits.py",
    "chars": 13009,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/qbytes.py",
    "chars": 13089,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/quantization.py",
    "chars": 3076,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/reordering.py",
    "chars": 1791,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/tinygemm/__init__.py",
    "chars": 43,
    "preview": "from .packed import *\nfrom .qbits import *\n"
  },
  {
    "path": "optimum/quanto/tensor/weights/tinygemm/packed.py",
    "chars": 6316,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "optimum/quanto/tensor/weights/tinygemm/qbits.py",
    "chars": 7459,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "pyproject.toml",
    "chars": 1997,
    "preview": "[project]\nname = 'optimum-quanto'\ndescription = 'A pytorch quantization backend for optimum.'\nclassifiers = [\n    'Devel"
  },
  {
    "path": "setup.sh",
    "chars": 443,
    "preview": "#!/bin/bash\n\nNIGHTLY=${1:-0}\nVENV=\".venv\"\nif [ ! -d \"${VENV}\" ]; then\n    python3 -m venv ${VENV}\nfi\n. ${VENV}/bin/activ"
  },
  {
    "path": "tests/cli/cli_helpers.py",
    "chars": 777,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/cli/test_quantize_cli.py",
    "chars": 1619,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/conftest.py",
    "chars": 1566,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/helpers.py",
    "chars": 4187,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/library/test_extensions.py",
    "chars": 1146,
    "preview": "import platform\n\nimport pytest\nimport torch\nfrom packaging import version\n\nfrom optimum.quanto.library.extensions import"
  },
  {
    "path": "tests/library/test_mm.py",
    "chars": 9560,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/library/test_quantize.py",
    "chars": 4715,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/library/test_unpack.py",
    "chars": 1134,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/models/conftest.py",
    "chars": 845,
    "preview": "import pytest\nfrom huggingface_hub.constants import _staging_mode\n\n\n@pytest.fixture\ndef staging():\n    \"\"\"A pytest fixtu"
  },
  {
    "path": "tests/models/test_quantized_model_for_causal_lm.py",
    "chars": 5870,
    "preview": "import uuid\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\nfrom huggingface_hub import delete_repo\n"
  },
  {
    "path": "tests/models/test_quantized_model_for_pixart.py",
    "chars": 4200,
    "preview": "import uuid\nfrom tempfile import TemporaryDirectory\n\nimport pytest\nimport torch\nfrom huggingface_hub import delete_repo\n"
  },
  {
    "path": "tests/nn/test_calibrate.py",
    "chars": 4200,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/nn/test_qattention.py",
    "chars": 9521,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/nn/test_qconv2d.py",
    "chars": 7173,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/nn/test_qlayernorm.py",
    "chars": 4134,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/nn/test_qlinear.py",
    "chars": 11119,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/nn/test_qmodule.py",
    "chars": 2227,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/quantize/test_quantize_mlp.py",
    "chars": 5842,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/quantize/test_quantize_patterns.py",
    "chars": 4065,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/quantize/test_requantize.py",
    "chars": 4380,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/activations/test_activations_compile.py",
    "chars": 2424,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/activations/test_activations_dispatch.py",
    "chars": 3928,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/activations/test_activations_quantize.py",
    "chars": 2274,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/ops/test_linear_dispatch.py",
    "chars": 3748,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/ops/test_mm_dispatch.py",
    "chars": 2119,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/optimizers/test_hqq_optimizer.py",
    "chars": 2206,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/test_absmax.py",
    "chars": 1723,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/test_packed_tensor.py",
    "chars": 2005,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_awq_packed_tensor.py",
    "chars": 2658,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_awq_weight_qbits_tensor.py",
    "chars": 5034,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_marlin_fp8_packed_tensor.py",
    "chars": 2547,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_marlin_int4_packed_tensor.py",
    "chars": 2266,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_marlin_int4_weight_qbits_tensor.py",
    "chars": 6070,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_marlin_qbytes_tensor.py",
    "chars": 1757,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_tinygemm_packed_tensor.py",
    "chars": 2819,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/optimized/test_tinygemm_weight_qbits_tensor.py",
    "chars": 5499,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/test_weight_qbits_tensor.py",
    "chars": 2911,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/test_weight_qbits_tensor_dispatch.py",
    "chars": 4444,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  },
  {
    "path": "tests/tensor/weights/test_weight_qbits_tensor_instantiate.py",
    "chars": 3106,
    "preview": "# Copyright 2024 The HuggingFace Team. All rights reserved.\n#\n# Licensed under the Apache License, Version 2.0 (the \"Lic"
  }
]

// ... and 7 more files (download for full content)

About this extraction

This page contains the full source code of the huggingface/quanto GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 207 files (766.4 KB), approximately 208.4k tokens, and a symbol index with 641 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.

Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.

Extract another repo