Repository: ankane/informers Branch: master Commit: 14e74907c83d Files: 30 Total size: 199.8 KB Directory structure: gitextract_hoze518r/ ├── .github/ │ └── workflows/ │ └── build.yml ├── .gitignore ├── CHANGELOG.md ├── Gemfile ├── LICENSE.txt ├── README.md ├── Rakefile ├── informers.gemspec ├── lib/ │ ├── informers/ │ │ ├── backends/ │ │ │ └── onnx.rb │ │ ├── configs.rb │ │ ├── env.rb │ │ ├── model.rb │ │ ├── models.rb │ │ ├── pipelines.rb │ │ ├── processors.rb │ │ ├── tokenizers.rb │ │ ├── utils/ │ │ │ ├── audio.rb │ │ │ ├── core.rb │ │ │ ├── dtypes.rb │ │ │ ├── ffmpeg.rb │ │ │ ├── generation.rb │ │ │ ├── hub.rb │ │ │ ├── image.rb │ │ │ ├── math.rb │ │ │ └── tensor.rb │ │ └── version.rb │ └── informers.rb └── test/ ├── model_test.rb ├── pipeline_test.rb └── test_helper.rb ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/build.yml ================================================ name: build on: [push, pull_request] jobs: build: runs-on: ubuntu-latest steps: - uses: actions/checkout@v6 - uses: ruby/setup-ruby@v1 with: ruby-version: "4.0" bundler-cache: true - uses: actions/cache@v5 with: path: ~/.cache/informers key: informers - run: sudo apt-get update && sudo apt-get install libvips - run: bundle exec rake download:files - run: bundle exec rake test ================================================ FILE: .gitignore ================================================ /.bundle/ /.yardoc /_yardoc/ /coverage/ /doc/ /pkg/ /spec/reports/ /test/support/ /tmp/ *.lock ================================================ FILE: CHANGELOG.md ================================================ ## 1.3.0 (unreleased) - Dropped support for Ruby < 3.3 ## 1.2.1 (2025-02-01) - Fixed error when terminal width is zero ## 1.2.0 (2024-11-14) - Added support for models with external data - Added `device` option - Added `dtype` option - Added `session_options` 
option ## 1.1.1 (2024-10-14) - Added `audio-classification` pipeline - Fixed error with `sentence-transformers/all-MiniLM-L6-v2` ## 1.1.0 (2024-09-17) - Added more pipelines ## 1.0.3 (2024-08-29) - Added `model_output` option - Improved `model_file_name` option ## 1.0.2 (2024-08-28) - Added `embedding` pipeline - Added experimental `reranking` pipeline - Added support for `nomic-ai/nomic-embed-text-v1` ## 1.0.1 (2024-08-27) - Added support for `Supabase/gte-small` to `Model` - Fixed error with downloads ## 1.0.0 (2024-08-26) - Replaced task classes with `pipeline` method - Added `Model` class - Dropped support for Ruby < 3.1 ## 0.2.0 (2022-09-06) - Added support for `optimum` and `transformers.onnx` models - Dropped support for Ruby < 2.7 ## 0.1.3 (2021-09-25) - Added text generation - Added fill mask ## 0.1.2 (2020-11-24) - Added feature extraction ## 0.1.1 (2020-10-05) - Fixed question answering for Ruby < 2.7 ## 0.1.0 (2020-10-01) - First release ================================================ FILE: Gemfile ================================================ source "https://rubygems.org" gemspec gem "rake" gem "minitest" gem "ruby-vips" ================================================ FILE: LICENSE.txt ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # Informers :fire: Fast [transformer](https://github.com/huggingface/transformers.js) inference for Ruby For non-ONNX models, check out [Transformers.rb](https://github.com/ankane/transformers-ruby) :slightly_smiling_face: [![Build Status](https://github.com/ankane/informers/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/informers/actions) ## Installation Add this line to your application’s Gemfile: ```ruby gem "informers" ``` ## Getting Started - [Models](#models) - [Pipelines](#pipelines) ## Models Embedding - [sentence-transformers/all-MiniLM-L6-v2](#sentence-transformersall-MiniLM-L6-v2) - [sentence-transformers/multi-qa-MiniLM-L6-cos-v1](#sentence-transformersmulti-qa-MiniLM-L6-cos-v1) - [sentence-transformers/all-mpnet-base-v2](#sentence-transformersall-mpnet-base-v2) - [sentence-transformers/paraphrase-MiniLM-L6-v2](#sentence-transformersparaphrase-minilm-l6-v2) - [mixedbread-ai/mxbai-embed-large-v1](#mixedbread-aimxbai-embed-large-v1) - [Supabase/gte-small](#supabasegte-small) - [intfloat/e5-base-v2](#intfloate5-base-v2) - [nomic-ai/nomic-embed-text-v1](#nomic-ainomic-embed-text-v1) - [BAAI/bge-base-en-v1.5](#baaibge-base-en-v15) - [jinaai/jina-embeddings-v2-base-en](#jinaaijina-embeddings-v2-base-en) - 
[Snowflake/snowflake-arctic-embed-m-v1.5](#snowflakesnowflake-arctic-embed-m-v15) Reranking - [mixedbread-ai/mxbai-rerank-base-v1](#mixedbread-aimxbai-rerank-base-v1) - [jinaai/jina-reranker-v1-turbo-en](#jinaaijina-reranker-v1-turbo-en) - [BAAI/bge-reranker-base](#baaibge-reranker-base) - [Xenova/ms-marco-MiniLM-L-6-v2](#xenovams-marco-minilm-l-6-v2) ### sentence-transformers/all-MiniLM-L6-v2 [Docs](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) ```ruby sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2") embeddings = model.(sentences) ``` ### sentence-transformers/multi-qa-MiniLM-L6-cos-v1 [Docs](https://huggingface.co/Xenova/multi-qa-MiniLM-L6-cos-v1) ```ruby query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("embedding", "sentence-transformers/multi-qa-MiniLM-L6-cos-v1") query_embedding = model.(query) doc_embeddings = model.(docs) scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } } doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s } ``` ### sentence-transformers/all-mpnet-base-v2 [Docs](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) ```ruby sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "sentence-transformers/all-mpnet-base-v2") embeddings = model.(sentences) ``` ### sentence-transformers/paraphrase-MiniLM-L6-v2 [Docs](https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2) ```ruby sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "sentence-transformers/paraphrase-MiniLM-L6-v2") embeddings = model.(sentences, normalize: false) ``` ### mixedbread-ai/mxbai-embed-large-v1 [Docs](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1) 
```ruby query_prefix = "Represent this sentence for searching relevant passages: " input = [ "The dog is barking", "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1") embeddings = model.(input) ``` ### Supabase/gte-small [Docs](https://huggingface.co/Supabase/gte-small) ```ruby sentences = ["That is a happy person", "That is a very happy person"] model = Informers.pipeline("embedding", "Supabase/gte-small") embeddings = model.(sentences) ``` ### intfloat/e5-base-v2 [Docs](https://huggingface.co/intfloat/e5-base-v2) ```ruby doc_prefix = "passage: " query_prefix = "query: " input = [ doc_prefix + "Ruby is a programming language created by Matz", query_prefix + "Ruby creator" ] model = Informers.pipeline("embedding", "intfloat/e5-base-v2") embeddings = model.(input) ``` ### nomic-ai/nomic-embed-text-v1 [Docs](https://huggingface.co/nomic-ai/nomic-embed-text-v1) ```ruby doc_prefix = "search_document: " query_prefix = "search_query: " input = [ doc_prefix + "The dog is barking", doc_prefix + "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1") embeddings = model.(input) ``` ### BAAI/bge-base-en-v1.5 [Docs](https://huggingface.co/BAAI/bge-base-en-v1.5) ```ruby query_prefix = "Represent this sentence for searching relevant passages: " input = [ "The dog is barking", "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5") embeddings = model.(input) ``` ### jinaai/jina-embeddings-v2-base-en [Docs](https://huggingface.co/jinaai/jina-embeddings-v2-base-en) ```ruby sentences = ["How is the weather today?", "What is the current weather like today?"] model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model") embeddings = model.(sentences) ``` ### Snowflake/snowflake-arctic-embed-m-v1.5 
[Docs](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5) ```ruby query_prefix = "Represent this sentence for searching relevant passages: " input = [ "The dog is barking", "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5") embeddings = model.(input, model_output: "sentence_embedding", pooling: "none") ``` ### mixedbread-ai/mxbai-rerank-base-v1 [Docs](https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1) ```ruby query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1") result = model.(query, docs) ``` ### jinaai/jina-reranker-v1-turbo-en [Docs](https://huggingface.co/jinaai/jina-reranker-v1-turbo-en) ```ruby query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en") result = model.(query, docs) ``` ### BAAI/bge-reranker-base [Docs](https://huggingface.co/BAAI/bge-reranker-base) ```ruby query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "BAAI/bge-reranker-base") result = model.(query, docs) ``` ### Xenova/ms-marco-MiniLM-L-6-v2 [Docs](https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2) ```ruby query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "Xenova/ms-marco-MiniLM-L-6-v2") result = model.(query, docs) ``` ### Other The model must include a `.onnx` file ([example](https://huggingface.co/Xenova/all-MiniLM-L6-v2/tree/main/onnx)). 
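By default, the file is looked up under an `onnx/` prefix inside the model repository. A minimal pure-Ruby sketch of that path resolution, simplified from `PreTrainedModel.construct_session` in `lib/informers/models.rb` (the method name here is hypothetical, and suffix values depend on the chosen dtype; `_fp16` is shown as an example):

```ruby
# Sketch of how a model file name maps to a path in the model repo.
# A leading "../" or "/" opts out of the default "onnx/" prefix
# (as in the jinaai example above, which passes model_file_name: "../model").
def resolve_model_path(file_name, suffix = "")
  prefix = "onnx/"
  if file_name.start_with?("../")
    prefix = ""
    file_name = file_name[3..]
  elsif file_name.start_with?("/")
    prefix = ""
    file_name = file_name[1..]
  end
  "#{prefix}#{file_name}#{suffix}.onnx"
end

resolve_model_path("model")          # => "onnx/model.onnx"
resolve_model_path("../model")       # => "model.onnx"
resolve_model_path("model", "_fp16") # => "onnx/model_fp16.onnx"
```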
If the file is not at `onnx/model.onnx`, use the `model_file_name` option to specify the location.

## Pipelines

- [Text](#text)
- [Vision](#vision)
- [Audio](#audio)
- [Multimodal](#multimodal)

### Text

Embedding

```ruby
embed = Informers.pipeline("embedding")
embed.("We are very happy to show you the 🤗 Transformers library.")
```

Reranking

```ruby
rerank = Informers.pipeline("reranking")
rerank.("Who created Ruby?", ["Matz created Ruby", "Another doc"])
```

Named-entity recognition

```ruby
ner = Informers.pipeline("ner")
ner.("Ruby is a programming language created by Matz")
```

Sentiment analysis

```ruby
classifier = Informers.pipeline("sentiment-analysis")
classifier.("We are very happy to show you the 🤗 Transformers library.")
```

Question answering

```ruby
qa = Informers.pipeline("question-answering")
qa.("Who invented Ruby?", "Ruby is a programming language created by Matz")
```

Zero-shot classification

```ruby
classifier = Informers.pipeline("zero-shot-classification")
classifier.("text", ["label1", "label2", "label3"])
```

Text generation

```ruby
generator = Informers.pipeline("text-generation")
generator.("I enjoy walking with my cute dog,")
```

Text-to-text generation

```ruby
text2text = Informers.pipeline("text2text-generation")
text2text.("translate from English to French: I'm very happy")
```

Translation

```ruby
translator = Informers.pipeline("translation", "Xenova/nllb-200-distilled-600M")
translator.("जीवन एक चॉकलेट बॉक्स की तरह है।", src_lang: "hin_Deva", tgt_lang: "fra_Latn")
```

Summarization

```ruby
summarizer = Informers.pipeline("summarization")
summarizer.("Many paragraphs of text")
```

Fill mask

```ruby
unmasker = Informers.pipeline("fill-mask")
unmasker.("Paris is the [MASK] of France.")
```

Feature extraction

```ruby
extractor = Informers.pipeline("feature-extraction")
extractor.("We are very happy to show you the 🤗 Transformers library.")
```

### Vision

Note: [ruby-vips](https://github.com/libvips/ruby-vips) is required to load images
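Since ruby-vips is not a dependency of the gem, it must be added separately; a minimal Gemfile sketch (this repository's own Gemfile does the same for its tests):

```ruby
# Gemfile
gem "informers"
gem "ruby-vips" # only needed for the image pipelines
```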
Image classification ```ruby classifier = Informers.pipeline("image-classification") classifier.("image.jpg") ``` Zero-shot image classification ```ruby classifier = Informers.pipeline("zero-shot-image-classification") classifier.("image.jpg", ["label1", "label2", "label3"]) ``` Image segmentation ```ruby segmenter = Informers.pipeline("image-segmentation") segmenter.("image.jpg") ``` Object detection ```ruby detector = Informers.pipeline("object-detection") detector.("image.jpg") ``` Zero-shot object detection ```ruby detector = Informers.pipeline("zero-shot-object-detection") detector.("image.jpg", ["label1", "label2", "label3"]) ``` Depth estimation ```ruby estimator = Informers.pipeline("depth-estimation") estimator.("image.jpg") ``` Image-to-image ```ruby upscaler = Informers.pipeline("image-to-image") upscaler.("image.jpg") ``` Image feature extraction ```ruby extractor = Informers.pipeline("image-feature-extraction") extractor.("image.jpg") ``` ### Audio Note: [ffmpeg](https://www.ffmpeg.org/) is required to load audio files Audio classification ```ruby classifier = Informers.pipeline("audio-classification") classifier.("audio.wav") ``` ### Multimodal Image captioning ```ruby captioner = Informers.pipeline("image-to-text") captioner.("image.jpg") ``` Document question answering ```ruby qa = Informers.pipeline("document-question-answering") qa.("image.jpg", "What is the invoice number?") ``` ## Reference Specify a variant of the model if available (`fp32`, `fp16`, `int8`, `uint8`, `q8`, `q4`, `q4f16`, or `bnb4`) ```ruby Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", dtype: "fp16") ``` Specify a device (`cpu`, `cuda`, or `coreml`) ```ruby Informers.pipeline("embedding", device: "cuda") ``` Note: Follow [these instructions](https://github.com/ankane/onnxruntime-ruby?tab=readme-ov-file#gpu-support) for `cuda` Specify ONNX Runtime [session options](https://github.com/ankane/onnxruntime-ruby?tab=readme-ov-file#session-options) ```ruby 
Informers.pipeline("embedding", session_options: {log_severity_level: 2}) ``` ## Credits This library was ported from [Transformers.js](https://github.com/huggingface/transformers.js) and is available under the same license. ## History View the [changelog](https://github.com/ankane/informers/blob/master/CHANGELOG.md) ## Contributing Everyone is encouraged to help improve this project. Here are a few ways you can help: - [Report bugs](https://github.com/ankane/informers/issues) - Fix bugs and [submit pull requests](https://github.com/ankane/informers/pulls) - Write, clarify, or fix documentation - Suggest or add new features To get started with development: ```sh git clone https://github.com/ankane/informers.git cd informers bundle install bundle exec rake download:files bundle exec rake test ``` ================================================ FILE: Rakefile ================================================ require "bundler/gem_tasks" require "rake/testtask" Rake::TestTask.new do |t| t.pattern = FileList["test/**/*_test.rb"].exclude("test/model_test.rb") end task default: :test def download_file(url) require "open-uri" file = File.basename(url) puts "Downloading #{file}..." 
dest = "test/support/#{file}" File.binwrite(dest, URI.parse(url).read) puts "Saved #{dest}" end namespace :download do task :files do Dir.mkdir("test/support") unless Dir.exist?("test/support") download_file("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg") download_file("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_1.png") end end ================================================ FILE: informers.gemspec ================================================ require_relative "lib/informers/version" Gem::Specification.new do |spec| spec.name = "informers" spec.version = Informers::VERSION spec.summary = "Fast transformer inference for Ruby" spec.homepage = "https://github.com/ankane/informers" spec.license = "Apache-2.0" spec.author = "Andrew Kane" spec.email = "andrew@ankane.org" spec.files = Dir["*.{md,txt}", "{lib}/**/*"] spec.require_path = "lib" spec.required_ruby_version = ">= 3.3" spec.add_dependency "onnxruntime", ">= 0.9" spec.add_dependency "tokenizers", ">= 0.5.3" end ================================================ FILE: lib/informers/backends/onnx.rb ================================================ module Informers module Backends module Onnx def self.device_to_execution_providers(device) case device&.to_s when "cpu", nil [] when "cuda" ["CUDAExecutionProvider"] when "coreml" ["CoreMLExecutionProvider"] else supported_devices = ["cpu", "cuda", "coreml"] raise ArgumentError, "Unsupported device: #{device}. 
Should be one of: #{supported_devices.join(", ")}" end end end end end ================================================ FILE: lib/informers/configs.rb ================================================ module Informers class PretrainedConfig def initialize(config_json) @config_json = config_json.to_h end def [](key) @config_json[key.to_s] end def []=(key, value) @config_json[key.to_s] = value end def to_h @config_json.to_h end def self.from_pretrained( pretrained_model_name_or_path, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", **kwargs ) data = config || load_config( pretrained_model_name_or_path, progress_callback:, config:, cache_dir:, local_files_only:, revision: ) new(data) end def self.load_config(pretrained_model_name_or_path, **options) info = Utils::Hub.get_model_json(pretrained_model_name_or_path, "config.json", true, **options) info end end class AutoConfig def self.from_pretrained(...) PretrainedConfig.from_pretrained(...) end end end ================================================ FILE: lib/informers/env.rb ================================================ module Informers CACHE_HOME = ENV.fetch("XDG_CACHE_HOME", File.join(ENV.fetch("HOME"), ".cache")) DEFAULT_CACHE_DIR = File.expand_path(File.join(CACHE_HOME, "informers")) class << self attr_accessor :allow_remote_models, :remote_host, :remote_path_template, :cache_dir end self.allow_remote_models = ENV["INFORMERS_OFFLINE"].to_s.empty? self.remote_host = "https://huggingface.co/" self.remote_path_template = "{model}/resolve/{revision}/" self.cache_dir = DEFAULT_CACHE_DIR end ================================================ FILE: lib/informers/model.rb ================================================ module Informers # TODO remove in 2.0 class Model def initialize(model_id, quantized: false) @model = Informers.pipeline("embedding", model_id, quantized: quantized) @options = model_id == "mixedbread-ai/mxbai-embed-large-v1" ? 
{pooling: "cls", normalize: false} : {} end def embed(texts) @model.(texts, **@options) end end end ================================================ FILE: lib/informers/models.rb ================================================ module Informers MODEL_TYPES = { EncoderOnly: 0, EncoderDecoder: 1, Seq2Seq: 2, Vision2Seq: 3, DecoderOnly: 4, MaskGeneration: 5 } # NOTE: These will be populated fully later MODEL_TYPE_MAPPING = {} MODEL_NAME_TO_CLASS_MAPPING = {} MODEL_CLASS_TO_NAME_MAPPING = {} class PretrainedMixin def self.from_pretrained( pretrained_model_name_or_path, quantized: true, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", device: nil, dtype: nil, model_file_name: nil, session_options: {} ) options = { quantized:, progress_callback:, config:, cache_dir:, local_files_only:, revision:, device:, dtype:, model_file_name:, session_options: } config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **options) if options[:config].nil? # If no config was passed, reuse this config for future processing options[:config] = config end if !const_defined?(:MODEL_CLASS_MAPPINGS) raise Error, "`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: #{name}" end const_get(:MODEL_CLASS_MAPPINGS).each do |model_class_mapping| model_info = model_class_mapping[config[:model_type]] if !model_info next # Item not found in this mapping end return model_info[1].from_pretrained(pretrained_model_name_or_path, **options) end if const_defined?(:BASE_IF_FAIL) warn "Unknown model class #{config[:model_type].inspect}, attempting to construct from base class." 
PreTrainedModel.from_pretrained(pretrained_model_name_or_path, **options) else raise Error, "Unsupported model type: #{config[:model_type]}" end end end class PreTrainedModel MAIN_INPUT_NAME = :input_ids attr_reader :config def initialize(config, session) super() @config = config @session = session @output_names = nil model_name = MODEL_CLASS_TO_NAME_MAPPING[self.class] model_type = MODEL_TYPE_MAPPING[model_name] case model_type when MODEL_TYPES[:DecoderOnly] @can_generate = true @run_beam = method(:decoder_run_beam) @get_start_beams = method(:decoder_start_beams) @update_beam = method(:decoder_update_beam) @forward = method(:decoder_forward) when MODEL_TYPES[:Seq2Seq], MODEL_TYPES[:Vision2Seq] @can_generate = true @run_beam = method(:seq2seq_run_beam) @get_start_beams = method(:seq2seq_start_beams) @update_beam = method(:seq2seq_update_beam) @forward = method(:seq2seq_forward) when MODEL_TYPES[:EncoderDecoder] @forward = method(:encoder_forward) else @forward = method(:encoder_forward) end end def self.from_pretrained( pretrained_model_name_or_path, quantized: true, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", device: nil, dtype: nil, model_file_name: nil, session_options: {} ) options = { quantized:, progress_callback:, config:, cache_dir:, local_files_only:, revision:, device:, dtype:, model_file_name:, session_options: } model_name = MODEL_CLASS_TO_NAME_MAPPING[self] model_type = MODEL_TYPE_MAPPING[model_name] config ||= AutoConfig.from_pretrained(pretrained_model_name_or_path, **options) if model_type == MODEL_TYPES[:DecoderOnly] info = [ construct_session(pretrained_model_name_or_path, options[:model_file_name] || "decoder_model_merged", **options), Utils::Hub.get_model_json(pretrained_model_name_or_path, "generation_config.json", false, **options) ] elsif model_type == MODEL_TYPES[:Seq2Seq] || model_type == MODEL_TYPES[:Vision2Seq] info = [ construct_session(pretrained_model_name_or_path, "encoder_model", 
**options),
          construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options),
          Utils::Hub.get_model_json(pretrained_model_name_or_path, "generation_config.json", false, **options)
        ]
      elsif model_type == MODEL_TYPES[:MaskGeneration]
        info = [
          construct_session(pretrained_model_name_or_path, "vision_encoder", **options),
          construct_session(pretrained_model_name_or_path, "prompt_encoder_mask_decoder", **options)
        ]
      elsif model_type == MODEL_TYPES[:EncoderDecoder]
        info = [
          construct_session(pretrained_model_name_or_path, "encoder_model", **options),
          construct_session(pretrained_model_name_or_path, "decoder_model_merged", **options)
        ]
      else
        if model_type != MODEL_TYPES[:EncoderOnly]
          warn "Model type for '#{model_name || config[:model_type]}' not found, assuming encoder-only architecture. Please report this."
        end
        info = [
          construct_session(pretrained_model_name_or_path, options[:model_file_name] || "model", **options)
        ]
      end

      new(config, *info)
    end

    def self.construct_session(pretrained_model_name_or_path, file_name, **options)
      prefix = "onnx/"
      if file_name.start_with?("../")
        prefix = ""
        file_name = file_name[3..]
      elsif file_name.start_with?("/")
        prefix = ""
        file_name = file_name[1..]
      end

      dtype = options[:dtype] || (options[:quantized] ? "q8" : "fp32")
      suffix = Utils::DEFAULT_DTYPE_SUFFIX_MAPPING[dtype.to_sym]
      if !suffix
        raise ArgumentError, "Invalid dtype: #{dtype}. Should be one of: #{Utils::DEFAULT_DTYPE_SUFFIX_MAPPING.keys.join(", ")}"
      end

      model_file_name = "#{prefix}#{file_name}#{suffix}.onnx"
      path = Utils::Hub.get_model_file(pretrained_model_name_or_path, model_file_name, true, **options)

      session_options = {
        providers: Backends::Onnx.device_to_execution_providers(options[:device]),
        log_severity_level: 4
      }.merge(options[:session_options] || {})

      begin
        OnnxRuntime::InferenceSession.new(path, **session_options)
      rescue OnnxRuntime::Error => e
        raise e unless e.message.include?("No such file or directory") && e.message.include?(".onnx_data")

        Utils::Hub.get_model_file(pretrained_model_name_or_path, "#{model_file_name}_data", true, **options)
        OnnxRuntime::InferenceSession.new(path, **session_options)
      end
    end

    def call(model_inputs, **kwargs)
      @forward.(model_inputs, **kwargs)
    end

    def generate(inputs, generation_config = nil, logits_processor = nil, inputs_attention_mask: nil)
      if !@can_generate
        model_name = MODEL_CLASS_TO_NAME_MAPPING[self.class]
        error_message = "The current model class (#{model_name}) is not compatible with `.generate()`, as it doesn't have a language model head."
        raise Error, error_message
      end

      if !inputs.is_a?(Array)
        raise ArgumentError, "`inputs` must be an Array, but is #{inputs.class.name}"
      end

      if @config[:is_encoder_decoder]
        # Generating from the encoder outputs
        input_ids_seq_length = 0
      else
        input_ids_seq_length = inputs.length

        # decoder-only
        if input_ids_seq_length == 0
          raise Error, "Must supply a non-empty array of input token ids."
        end
      end

      # Update generation config with defaults
      generation_config = get_generation_config(generation_config)

      logits_processor ||= Utils::LogitsProcessorList.new

      # Update logits processor
      logits_processor = get_logits_processor(
        generation_config,
        input_ids_seq_length,
        logits_processor
      )

      eos_token_ids = generation_config[:eos_token_id]
      if !eos_token_ids.nil? && !eos_token_ids.is_a?(Array)
        eos_token_ids = [eos_token_ids]
      end

      num_output_tokens = 1
      max_output_tokens = num_output_tokens + (generation_config[:max_new_tokens] || Float::INFINITY)

      # Only use max length if max_new_tokens is not provided
      use_max_length = generation_config[:max_length].is_a?(Integer) && generation_config[:max_new_tokens].nil?
      sampler = Utils::Sampler.get_sampler(generation_config)

      beams = get_start_beams(inputs, generation_config, num_output_tokens, inputs_attention_mask)

      while beams.any? { |x| !x[:done] } && num_output_tokens < max_output_tokens
        newest_beams = []
        beams.each do |beam|
          if beam[:done]
            # Add this beam back into the pool
            newest_beams << beam
            next
          end

          if use_max_length && beam[:output_token_ids].length >= generation_config["max_length"]
            # Set this beam to done and add it back into the pool
            beam[:done] = true
            newest_beams << beam
            next
          end

          output = run_beam(beam)

          # add attentions/scores to beam only if user requested
          if generation_config["output_attentions"]
            add_attentions_to_beam(beam, output)
          end

          # Logits are of the form [batch_size, out_seq_length, vocab_size]
          # In most cases, this will be [batch_size, 1, vocab_size]
          # So, we select the last token's logits:
          # (equivalent to `logits = outputs.logits[:, -1, :]`)
          logits = output["logits"].map { |v| v[-1] }

          # Apply logits processor
          logits_processor.(beam[:output_token_ids], logits)

          sampled_tokens = sampler.(logits)
          sampled_tokens.each do |new_token_id, log_prob|
            # use previous beam as a starting point
            new_beam = beam.dup

            # update new beam
            update_beam(new_beam, new_token_id)
            new_beam[:score] += log_prob

            if eos_token_ids && eos_token_ids.include?(new_token_id)
              new_beam[:done] = true
            end

            newest_beams << new_beam
          end
        end

        num_output_tokens += 1

        # Next, we get the best beams, per ID
        newest_beams =
          group_beams(newest_beams).map do |group|
            group.sort_by { |v| -v[:score] }[0...generation_config["num_beams"]]
          end

        # Flatten beams
        beams = newest_beams.flatten(1)

        # Run callback
        if generation_config["callback_function"]
          generation_config["callback_function"].(beams)
        end
      end

      # TODO: Ensure that we can return non-batched outputs
      grouped_beams = group_beams(beams)

      get_flattened = lambda do |key|
        grouped_beams.flat_map do |batch|
          if generation_config["num_return_sequences"] > 1
            raise Todo
          else
            [batch[0][key]]
          end
        end
      end

      sequences = get_flattened.(:output_token_ids) # [1, seqLength]

      if generation_config["return_dict_in_generate"]
        raise Todo
      else
        sequences
      end
    end

    private

    def get_logits_processor(
      generation_config,
      input_ids_seq_length,
      logits_processor = nil
    )
      processors = Utils::LogitsProcessorList.new

      if !generation_config["repetition_penalty"].nil? && generation_config["repetition_penalty"] != 1.0
        processors.push(Utils::RepetitionPenaltyLogitsProcessor.new(generation_config["repetition_penalty"]))
      end

      if !generation_config["no_repeat_ngram_size"].nil? && generation_config["no_repeat_ngram_size"] > 0
        processors.push(Utils::NoRepeatNGramLogitsProcessor.new(generation_config["no_repeat_ngram_size"]))
      end

      if !generation_config["bad_words_ids"].nil?
        processors.push(Utils::NoBadWordsLogitsProcessor.new(generation_config["bad_words_ids"], generation_config["eos_token_id"]))
      end

      if !generation_config["min_length"].nil? && !generation_config["eos_token_id"].nil? && generation_config["min_length"] > 0
        processors.push(Utils::MinLengthLogitsProcessor.new(generation_config["min_length"], generation_config["eos_token_id"]))
      end

      if !generation_config["min_new_tokens"].nil? && !generation_config["eos_token_id"].nil? && generation_config["min_new_tokens"] > 0
        processors.push(Utils::MinNewTokensLengthLogitsProcessor.new(
          input_ids_seq_length,
          generation_config["min_new_tokens"],
          generation_config["eos_token_id"]
        ))
      end

      if !generation_config["forced_bos_token_id"].nil?
        processors.push(Utils::ForcedBOSTokenLogitsProcessor.new(generation_config["forced_bos_token_id"]))
      end

      if !generation_config["forced_eos_token_id"].nil?
        processors.push(Utils::ForcedEOSTokenLogitsProcessor.new(
          generation_config["max_length"],
          generation_config["forced_eos_token_id"]
        ))
      end

      if !generation_config["begin_suppress_tokens"].nil?
        raise Todo
      end

      if !generation_config["forced_decoder_ids"].nil?
        processors.push(Utils::ForceTokensLogitsProcessor.new(generation_config["forced_decoder_ids"]))
      end

      if !logits_processor.nil?
        processors.concat(logits_processor)
      end

      processors
    end

    def get_generation_config(generation_config)
      # Create empty generation config (contains defaults)
      # We pass `@config` so that if `eos_token_id` or `bos_token_id` exist in the model's config, we will use them
      gen_config = Utils::GenerationConfig.new(@config.to_h)

      # Apply model's generation config, if it exists
      if @generation_config
        gen_config.merge!(@generation_config)
      end

      # Finally, use any generation config specified by the user
      # when calling `generate`
      if !generation_config.nil?
        gen_config.merge!(generation_config)
      end

      gen_config
    end

    def seq2seq_forward(model_inputs)
      encoder_outputs = model_inputs[:encoder_outputs]
      past_key_values = model_inputs[:past_key_values]

      if !encoder_outputs
        # Encoder outputs are not given, so we must compute them.
        encoder_outputs = encoder_forward(model_inputs)[0]
      end

      decoder_feeds = {
        input_ids: model_inputs[:decoder_input_ids],
        encoder_hidden_states: encoder_outputs
      }
      use_cache_branch = !!past_key_values

      if @decoder_merged_session.inputs.map { |v| v[:name] }.include?("use_cache_branch")
        decoder_feeds[:use_cache_branch] = [use_cache_branch]
      end

      if @decoder_merged_session.inputs.map { |v| v[:name] }.include?("encoder_attention_mask")
        decoder_feeds[:encoder_attention_mask] = model_inputs[:attention_mask]
      end

      prepare_position_ids(@decoder_merged_session, decoder_feeds, use_cache_branch)
      add_past_key_values(decoder_feeds, past_key_values)

      decoder_results = session_run(@decoder_merged_session, decoder_feeds)
      decoder_results = @decoder_merged_session.outputs.map { |v| v[:name] }.zip(decoder_results).to_h

      logits = decoder_results["logits"]
      past_key_values = get_past_key_values(decoder_results, past_key_values)

      # Get cross attention and/or decoder attentions if they are present
      attns = get_attentions(decoder_results)

      Seq2SeqLMOutput.new(logits, past_key_values, encoder_outputs, attns["decoder_attentions"], attns["cross_attentions"])
    end

    def prepare_position_ids(session, feeds, use_cache_branch)
      if !session.inputs.map { |v| v[:name] }.include?("position_ids")
        return
      end

      raise Todo
    end

    def get_past_key_values(decoder_results, past_key_values)
      pkvs = {}
      decoder_results.each_key do |name|
        if name.start_with?("present")
          new_name = name.sub("present", "past_key_values")
          if past_key_values && name.include?("encoder")
            # Optimization introduced by optimum to reuse past key values. So, we just replace the constant
            # outputs with the previous past key values.
            # https://github.com/huggingface/optimum/blob/0bf2c05fb7e1182b52d21b703cfc95fd9e4ea3dc/optimum/onnxruntime/base.py#L677-L704
            pkvs[new_name] = past_key_values[new_name]
          else
            pkvs[new_name] = decoder_results[name]
          end
        end
      end
      pkvs
    end

    def get_attentions(decoder_results)
      attns = {}
      ["cross_attentions", "decoder_attentions"].each do |attn_name|
        result = []
        decoder_results.each_key do |name|
          if name.start_with?(attn_name)
            # the layer index is the last dotted component of the output name
            index = name.split(".").pop.to_i
            result[index] = decoder_results[name]
          end
        end
        attns[attn_name] = result
      end
      attns
    end

    def add_past_key_values(decoder_feeds, past_key_values)
      if past_key_values
        decoder_feeds.merge!(past_key_values)
      else
        # TODO support batches (i.e., batch_size > 1)
        batch_size = 1

        if @config[:is_encoder_decoder] && (!@add_encoder_pkv.nil? ? @add_encoder_pkv : true)
          _encoder_dims = [batch_size, @num_encoder_heads, 0, @encoder_dim_kv]
          _decoder_dims = [batch_size, @num_decoder_heads, 0, @decoder_dim_kv]
          @num_decoder_layers.times do |i|
            # decoder_feeds["past_key_values.#{i}.encoder.key"] = OnnxRuntime::OrtValue.from_shape_and_type(encoder_dims, :float)
            # decoder_feeds["past_key_values.#{i}.encoder.value"] = OnnxRuntime::OrtValue.from_shape_and_type(encoder_dims, :float)
            # decoder_feeds["past_key_values.#{i}.decoder.key"] = OnnxRuntime::OrtValue.from_shape_and_type(decoder_dims, :float)
            # decoder_feeds["past_key_values.#{i}.decoder.value"] = OnnxRuntime::OrtValue.from_shape_and_type(decoder_dims, :float)
          end
        elsif @config[:model_type] == "falcon"
          raise Todo
        elsif @config[:multi_query]
          raise Todo
        elsif @config[:model_type] == "bloom"
          raise Todo
        else
          _dims = [batch_size, @num_heads, 0, @dim_kv]
          @num_layers.times do |i|
            # decoder_feeds["past_key_values.#{i}.key"] = OnnxRuntime::OrtValue.from_shape_and_type(dims, :float)
            # decoder_feeds["past_key_values.#{i}.value"] = OnnxRuntime::OrtValue.from_shape_and_type(dims, :float)
          end
        end
      end
    end

    def seq2seq_start_beams(input_token_ids, generation_config, num_output_tokens, inputs_attention_mask = nil)
      beams = []
      beam_id = 0

      requires_attention_mask = !@requires_attention_mask.nil? ? @requires_attention_mask : true

      # decoder_input_ids == output_token_ids
      decoder_input_ids =
        generation_config["decoder_input_ids"] ||
        generation_config["decoder_start_token_id"] ||
        generation_config["bos_token_id"] ||
        generation_config["eos_token_id"]

      if !decoder_input_ids.is_a?(Array)
        decoder_input_ids = [decoder_input_ids]
      end

      input_token_ids.each do |tokens|
        # TODO: Improve
        # Currently, just add back batch dimension.
        # In future, allow for true parallel execution
        tokens = [tokens]

        # Create beam
        start = {
          inputs: tokens,
          encoder_outputs: nil,
          prev_model_outputs: nil,
          output_token_ids: decoder_input_ids,
          done: false,
          score: 0,
          id: beam_id # assign unique id to beams
        }
        beam_id += 1

        if requires_attention_mask
          start[:attention_mask] = prepare_attention_mask(tokens)
        end

        beams << start
      end

      beams
    end

    def prepare_attention_mask(tokens)
      # Prepare attention mask
      pad_token_id = @config["pad_token_id"]
      eos_token_id = @config["eos_token_id"]
      if eos_token_id.is_a?(Integer)
        eos_token_id = [eos_token_id]
      end

      is_pad_token_in_inputs = !tokens.index(pad_token_id).nil?
      is_pad_token_not_equal_to_eos_token_id = eos_token_id.nil? || !eos_token_id.include?(pad_token_id)

      if is_pad_token_in_inputs && is_pad_token_not_equal_to_eos_token_id
        raise Todo
      else
        Utils.ones_like(tokens)
      end
    end

    def seq2seq_run_beam(beam)
      input_name = self.class.const_get(:MAIN_INPUT_NAME)

      decoder_input_ids = beam[:output_token_ids]
      if beam[:prev_model_outputs]
        # After the first step, `prev_model_outputs` won't be null.
        # So, we cut decoder_input_ids if past is used
        decoder_input_ids = [decoder_input_ids[-1]]
      end

      # 1. Prepare
      model_inputs = {
        input_name => beam[:inputs],
        decoder_input_ids: [decoder_input_ids],
        encoder_outputs: beam[:encoder_outputs],
        past_key_values: beam[:prev_model_outputs] && beam[:prev_model_outputs][:past_key_values]
      }
      if beam[:attention_mask]
        model_inputs[:attention_mask] = beam[:attention_mask]
      end

      # 2. Run
      output = @forward.(model_inputs)

      # 3. Update
      beam[:prev_model_outputs] = output
      beam[:encoder_outputs] = output[:encoder_outputs]

      output
    end

    def seq2seq_update_beam(beam, new_token_id)
      beam[:output_token_ids] += [new_token_id]
    end

    def group_beams(beams)
      # Group beams by their ids
      groups = {}
      beams.each do |obj|
        if !groups[obj[:id]]
          groups[obj[:id]] = [obj]
        else
          groups[obj[:id]] << obj
        end
      end
      groups.values
    end

    def encoder_forward(model_inputs, output_names: nil)
      encoder_feeds = {}
      @session.inputs.each do |input|
        key = input[:name].to_sym
        encoder_feeds[key] = model_inputs[key]
      end
      if @session.inputs.any? { |v| v[:name] == "token_type_ids" } && !encoder_feeds[:token_type_ids]
        raise Todo
      end
      session_run(@session, encoder_feeds, output_names:)
    end

    def decoder_forward(model_inputs)
      input_ids, past_key_values, attention_mask =
        model_inputs.values_at(:input_ids, :past_key_values, :attention_mask)
      decoder_feeds = {
        input_ids: input_ids,
        attention_mask: attention_mask || prepare_attention_mask(input_ids)
      }
      use_cache_branch = !!past_key_values

      if @session.inputs.map { |v| v[:name] }.include?("use_cache_branch")
        decoder_feeds[:use_cache_branch] = [use_cache_branch]
      end

      prepare_position_ids(@session, decoder_feeds, use_cache_branch)
      add_past_key_values(decoder_feeds, past_key_values)

      decoder_results = session_run(@session, decoder_feeds)
      decoder_results = @session.outputs.map { |v| v[:name] }.zip(decoder_results).to_h

      logits = decoder_results["logits"]
      past_key_values = get_past_key_values(decoder_results, past_key_values)
      {"logits" => logits, past_key_values: past_key_values}
    end

    def decoder_start_beams(input_token_ids, generation_config, num_output_tokens, inputs_attention_mask)
      beams = []

      beam_id = 0
      input_token_ids.each do |tokens|
        output_token_ids = tokens.dup

        # TODO: Improve
        # Currently, just add back batch dimension.
        # In future, allow for true parallel execution
        tokens = [tokens]

        if inputs_attention_mask
          attn_mask = inputs_attention_mask[beam_id]
          attn_mask = [attn_mask]
        else
          attn_mask = prepare_attention_mask(tokens)
        end

        start = {
          input: tokens,
          model_input_ids: tokens,
          attention_mask: attn_mask,
          prev_model_outputs: nil,

          output_token_ids: output_token_ids,
          num_output_tokens: num_output_tokens,

          done: false,
          score: 0,
          id: beam_id # assign unique id to beams
        }
        beam_id += 1

        beams << start
      end
      beams
    end

    def decoder_run_beam(beam)
      attn_mask_data = Array.new(beam[:output_token_ids].length, 1)

      # 1. Prepare
      model_inputs = {
        input_ids: beam[:model_input_ids],
        attention_mask: [attn_mask_data],
        past_key_values: beam[:prev_model_outputs] && beam[:prev_model_outputs][:past_key_values]
      }

      # 2. Run
      output = @forward.(model_inputs)

      # 3. Update
      beam[:prev_model_outputs] = output

      output
    end

    def decoder_update_beam(beam, new_token_id)
      beam[:output_token_ids] += [new_token_id]
      beam[:model_input_ids] = [[new_token_id]]
    end

    def session_run(session, inputs, output_names: nil)
      checked_inputs = validate_inputs(session, inputs)
      begin
        output = session.run(output_names || @output_names, checked_inputs)
        output = replace_tensors(output)
        output
      rescue => e
        raise e
      end
    end

    # TODO
    def replace_tensors(obj)
      obj
    end

    # TODO
    def validate_inputs(session, inputs)
      inputs
    end

    def get_start_beams(input_token_ids, generation_config, num_output_tokens, inputs_attention_mask)
      @get_start_beams.(input_token_ids, generation_config, num_output_tokens, inputs_attention_mask)
    end

    def run_beam(beam)
      @run_beam.(beam)
    end

    def update_beam(beam, new_token_id)
      @update_beam.(beam, new_token_id)
    end
  end

  class BertPreTrainedModel < PreTrainedModel
  end

  class BertModel < BertPreTrainedModel
  end

  class BertForMaskedLM < BertPreTrainedModel
    def call(model_inputs)
      MaskedLMOutput.new(*super(model_inputs))
    end
  end

  class BertForSequenceClassification < BertPreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class BertForTokenClassification < BertPreTrainedModel
    def call(model_inputs)
      TokenClassifierOutput.new(*super(model_inputs))
    end
  end

  class ModernBertPreTrainedModel < PreTrainedModel
  end

  class ModernBertModel < ModernBertPreTrainedModel
  end

  class ModernBertForMaskedLM < ModernBertPreTrainedModel
    def call(model_inputs)
      MaskedLMOutput.new(*super(model_inputs))
    end
  end

  class ModernBertForSequenceClassification < ModernBertPreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class ModernBertForTokenClassification < ModernBertPreTrainedModel
    def call(model_inputs)
      TokenClassifierOutput.new(*super(model_inputs))
    end
  end

  class NomicBertPreTrainedModel < PreTrainedModel
  end

  class NomicBertModel < NomicBertPreTrainedModel
  end

  class ConvBertPreTrainedModel < PreTrainedModel
  end

  class ConvBertModel < ConvBertPreTrainedModel
  end

  class ElectraPreTrainedModel < PreTrainedModel
  end

  # TODO add ElectraForPreTraining
  class ElectraModel < ElectraPreTrainedModel
  end

  class DebertaV2PreTrainedModel < PreTrainedModel
  end

  class DebertaV2Model < DebertaV2PreTrainedModel
  end

  class DistilBertPreTrainedModel < PreTrainedModel
  end

  class DistilBertModel < DistilBertPreTrainedModel
  end

  class DistilBertForSequenceClassification < DistilBertPreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class DistilBertForQuestionAnswering < DistilBertPreTrainedModel
    def call(model_inputs)
      QuestionAnsweringModelOutput.new(*super(model_inputs))
    end
  end

  class MPNetPreTrainedModel < PreTrainedModel
  end

  class MPNetModel < MPNetPreTrainedModel
  end

  class T5PreTrainedModel < PreTrainedModel
  end

  class T5Model < T5PreTrainedModel
  end

  class T5ForConditionalGeneration < T5PreTrainedModel
    def initialize(config, session, decoder_merged_session, generation_config)
      super(config, session)
      @decoder_merged_session = decoder_merged_session
      @generation_config = generation_config

      @num_decoder_layers = @config[:num_decoder_layers]
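In the encoder-decoder classes below, the per-head key/value size is either read directly from the config (T5's `d_kv`) or derived from the hidden size, e.g. Bart divides `d_model` by the number of attention heads; `add_past_key_values` then shapes the initial (empty) cache as `[batch_size, num_heads, 0, dim_kv]`. A small sketch with hypothetical config values (not taken from a real checkpoint):

```ruby
# Hypothetical config values, chosen only for illustration
d_model = 768
decoder_attention_heads = 12

# Per-head key/value dimension, derived as in BartForConditionalGeneration
dim_kv = d_model / decoder_attention_heads.to_f

# Initial (empty) KV-cache shape used on the first decoding step
batch_size = 1
dims = [batch_size, decoder_attention_heads, 0, dim_kv]
```

The zero-length third dimension grows by one per generated token once real past key values are fed back on subsequent steps.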
      @num_decoder_heads = @config[:num_heads]
      @decoder_dim_kv = @config[:d_kv]

      @num_encoder_layers = @config[:num_layers]
      @num_encoder_heads = @config[:num_heads]
      @encoder_dim_kv = @config[:d_kv]
    end
  end

  class BartPretrainedModel < PreTrainedModel
  end

  class BartModel < BartPretrainedModel
  end

  class BartForConditionalGeneration < BartPretrainedModel
    def initialize(config, session, decoder_merged_session, generation_config)
      super(config, session)
      @decoder_merged_session = decoder_merged_session
      @generation_config = generation_config

      @num_decoder_layers = @config["decoder_layers"]
      @num_decoder_heads = @config["decoder_attention_heads"]
      @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f

      @num_encoder_layers = @config["encoder_layers"]
      @num_encoder_heads = @config["encoder_attention_heads"]
      @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
    end
  end

  class BartForSequenceClassification < BartPretrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class MBartPreTrainedModel < PreTrainedModel
  end

  class MBartModel < MBartPreTrainedModel
  end

  class MBartForCausalLM < MBartPreTrainedModel
    attr_reader :num_decoder_layers, :num_decoder_heads, :decoder_dim_kv,
      :num_encoder_layers, :num_encoder_heads, :encoder_dim_kv

    def initialize(config, decoder_merged_session, generation_config)
      super(config, decoder_merged_session)
      @generation_config = generation_config

      @num_decoder_layers = @config["decoder_layers"]
      @num_decoder_heads = @config["decoder_attention_heads"]
      @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f

      @num_encoder_layers = @config["encoder_layers"]
      @num_encoder_heads = @config["encoder_attention_heads"]
      @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
    end
  end

  class M2M100PreTrainedModel < PreTrainedModel
  end

  class M2M100Model < M2M100PreTrainedModel
  end

  class M2M100ForConditionalGeneration < M2M100PreTrainedModel
    def initialize(config, session, decoder_merged_session, generation_config)
      super(config, session)
      @decoder_merged_session = decoder_merged_session
      @generation_config = generation_config

      @num_decoder_layers = @config["decoder_layers"]
      @num_decoder_heads = @config["decoder_attention_heads"]
      @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f

      @num_encoder_layers = @config["encoder_layers"]
      @num_encoder_heads = @config["encoder_attention_heads"]
      @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
    end
  end

  class Wav2Vec2PreTrainedModel < PreTrainedModel
  end

  class Wav2Vec2Model < Wav2Vec2PreTrainedModel
  end

  class Wav2Vec2ForSequenceClassification < Wav2Vec2PreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class RobertaPreTrainedModel < PreTrainedModel
  end

  class RobertaModel < RobertaPreTrainedModel
  end

  class RobertaForMaskedLM < RobertaPreTrainedModel
    def call(model_inputs)
      MaskedLMOutput.new(*super(model_inputs))
    end
  end

  class RobertaForTokenClassification < RobertaPreTrainedModel
    def call(model_inputs)
      TokenClassifierOutput.new(*super(model_inputs))
    end
  end

  class RobertaForSequenceClassification < RobertaPreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class XLMRobertaPreTrainedModel < PreTrainedModel
  end

  class XLMRobertaModel < XLMRobertaPreTrainedModel
  end

  class XLMRobertaForSequenceClassification < XLMRobertaPreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class ViTPreTrainedModel < PreTrainedModel
  end

  class ViTModel < ViTPreTrainedModel
  end

  class ViTForImageClassification < ViTPreTrainedModel
    def call(model_inputs)
      SequenceClassifierOutput.new(*super(model_inputs))
    end
  end

  class CLIPPreTrainedModel < PreTrainedModel
  end

  class CLIPModel < CLIPPreTrainedModel
  end

  class GPT2PreTrainedModel < PreTrainedModel
    attr_reader :num_heads, :num_layers, :dim_kv

    def initialize(config, session, generation_config)
      super(config, session)
      @generation_config = generation_config

      # config doesn't contain pad_token_id, so we assume it is the eos_token_id
      @config["pad_token_id"] = @config["eos_token_id"]

      @num_heads = @config["n_head"]
      @num_layers = @config["n_layer"]
      @dim_kv = @config["n_embd"] / @num_heads.to_f
    end
  end

  class GPT2Model < GPT2PreTrainedModel
  end

  class GPT2LMHeadModel < GPT2PreTrainedModel
  end

  class OwlViTPreTrainedModel < PreTrainedModel
  end

  class OwlViTModel < OwlViTPreTrainedModel
  end

  class OwlViTForObjectDetection < OwlViTPreTrainedModel
  end

  class DetrPreTrainedModel < PreTrainedModel
  end

  class DetrModel < DetrPreTrainedModel
  end

  class DetrForObjectDetection < DetrPreTrainedModel
    def call(model_inputs)
      DetrObjectDetectionOutput.new(*super(model_inputs))
    end
  end

  class DetrForSegmentation < DetrPreTrainedModel
    def call(model_inputs)
      DetrSegmentationOutput.new(*super(model_inputs))
    end
  end

  class Swin2SRPreTrainedModel < PreTrainedModel
  end

  class Swin2SRModel < Swin2SRPreTrainedModel
  end

  class Swin2SRForImageSuperResolution < Swin2SRPreTrainedModel
  end

  class DPTPreTrainedModel < PreTrainedModel
  end

  class DPTModel < DPTPreTrainedModel
  end

  class DPTForDepthEstimation < DPTPreTrainedModel
  end

  class VisionEncoderDecoderModel < PreTrainedModel
    MAIN_INPUT_NAME = :pixel_values

    def initialize(config, session, decoder_merged_session, generation_config)
      super(config, session)
      @decoder_merged_session = decoder_merged_session
      @generation_config = generation_config

      # Extract configs
      encoder_config = @config["encoder"]
      decoder_config = @config["decoder"]

      # Validate encoder
      encoder_model_type = encoder_config["model_type"]
      encoder_model =
        MODEL_MAPPING_NAMES_ENCODER_ONLY[encoder_model_type] || MODEL_MAPPING_NAMES_ENCODER_DECODER[encoder_model_type]
      if !encoder_model
        warn "Model type for encoder '#{encoder_model_type}' not found, assuming encoder-only architecture. Please report this."
      end

      # Validate decoder
      decoder_model = MODEL_WITH_LM_HEAD_MAPPING_NAMES[decoder_config["model_type"]]
      if !decoder_model
        raise Error, "Unable to construct `VisionEncoderDecoder` due to unsupported decoder: \"#{decoder_config["model_type"]}\""
      end

      decoder_model_class = decoder_model[1]
      decoder = decoder_model_class.new(decoder_config, decoder_merged_session, generation_config)

      @add_encoder_pkv = decoder.respond_to?(:num_decoder_layers)
      if @add_encoder_pkv
        # Decoder is part of an encoder-decoder model
        @num_decoder_layers = decoder.num_decoder_layers
        @num_decoder_heads = decoder.num_decoder_heads
        @decoder_dim_kv = decoder.decoder_dim_kv

        @num_encoder_layers = decoder.num_encoder_layers
        @num_encoder_heads = decoder.num_encoder_heads
        @encoder_dim_kv = decoder.encoder_dim_kv
      else
        # Decoder is a decoder-only model
        @num_layers = decoder.num_layers
        @num_heads = decoder.num_heads
        @dim_kv = decoder.dim_kv
      end
    end
  end

  class DonutSwinPreTrainedModel < PreTrainedModel
  end

  class DonutSwinModel < DonutSwinPreTrainedModel
  end

  class WhisperPreTrainedModel < PreTrainedModel
  end

  class WhisperModel < WhisperPreTrainedModel
  end

  class WhisperForConditionalGeneration < WhisperPreTrainedModel
    REQUIRES_ATTENTION_MASK = false
    MAIN_INPUT_NAME = :input_features

    def initialize(config, session, decoder_merged_session, generation_config)
      super(config, session)
      @decoder_merged_session = decoder_merged_session
      @generation_config = generation_config

      @num_decoder_layers = @config["decoder_layers"]
      @num_decoder_heads = @config["decoder_attention_heads"]
      @decoder_dim_kv = @config["d_model"] / @num_decoder_heads.to_f

      @num_encoder_layers = @config["encoder_layers"]
      @num_encoder_heads = @config["encoder_attention_heads"]
      @encoder_dim_kv = @config["d_model"] / @num_encoder_heads.to_f
    end

    def generate(inputs, generation_config = nil, logits_processor = nil)
      raise Todo
    end
  end

  class VitsPreTrainedModel < PreTrainedModel
  end

  class VitsModel < VitsPreTrainedModel
    def call(model_inputs)
VitsModelOutput.new(*super(model_inputs)) end end class SpeechT5PreTrainedModel < PreTrainedModel end class SpeechT5Model < SpeechT5PreTrainedModel end class SpeechT5ForSpeechToText < SpeechT5PreTrainedModel end class SpeechT5ForTextToSpeech < SpeechT5PreTrainedModel end class ClapPreTrainedModel < PreTrainedModel end class ClapModel < ClapPreTrainedModel end MODEL_MAPPING_NAMES_ENCODER_ONLY = { "bert" => ["BertModel", BertModel], "modernbert" => ["ModernBertModel", ModernBertModel], "nomic_bert" => ["NomicBertModel", NomicBertModel], "electra" => ["ElectraModel", ElectraModel], "convbert" => ["ConvBertModel", ConvBertModel], "deberta-v2" => ["DebertaV2Model", DebertaV2Model], "mpnet" => ["MPNetModel", MPNetModel], "distilbert" => ["DistilBertModel", DistilBertModel], "roberta" => ["RobertaModel", RobertaModel], "xlm-roberta" => ["XLMRobertaModel", XLMRobertaModel], "clap" => ["ClapModel", ClapModel], "clip" => ["CLIPModel", CLIPModel], "detr" => ["DetrModel", DetrModel], "vit" => ["ViTModel", ViTModel], "owlvit" => ["OwlViTModel", OwlViTModel], "donut-swin" => ["DonutSwinModel", DonutSwinModel] } MODEL_MAPPING_NAMES_ENCODER_DECODER = { "bart" => ["BartModel", BartModel] } MODEL_MAPPING_NAMES_DECODER_ONLY = { } MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = { "whisper" => ["WhisperForConditionalGeneration", WhisperForConditionalGeneration] } MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = { "speecht5" => ["SpeechT5ForTextToSpeech", SpeechT5ForTextToSpeech] } MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = { "vits" => ["VitsModel", VitsModel] } MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = { "bert" => ["BertForSequenceClassification", BertForSequenceClassification], "modernbert" => ["ModernBertForSequenceClassification", ModernBertForSequenceClassification], "distilbert" => ["DistilBertForSequenceClassification", DistilBertForSequenceClassification], "roberta" => ["RobertaForSequenceClassification", RobertaForSequenceClassification], "xlm-roberta" => 
["XLMRobertaForSequenceClassification", XLMRobertaForSequenceClassification], "bart" => ["BartForSequenceClassification", BartForSequenceClassification] } MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = { "bert" => ["BertForTokenClassification", BertForTokenClassification], "modernbert" => ["ModernBertForTokenClassification", ModernBertForTokenClassification], "roberta" => ["RobertaForTokenClassification", RobertaForTokenClassification] } MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = { "t5" => ["T5ForConditionalGeneration", T5ForConditionalGeneration], "bart" => ["BartForConditionalGeneration", BartForConditionalGeneration], "m2m_100" => ["M2M100ForConditionalGeneration", M2M100ForConditionalGeneration] } MODEL_WITH_LM_HEAD_MAPPING_NAMES = { "gpt2" => ["GPT2LMHeadModel", GPT2LMHeadModel], "mbart" => ["MBartForCausalLM", MBartForCausalLM] } MODEL_FOR_MASKED_LM_MAPPING_NAMES = { "bert" => ["BertForMaskedLM", BertForMaskedLM], "modernbert" => ["ModernBertForMaskedLM", ModernBertForMaskedLM], "roberta" => ["RobertaForMaskedLM", RobertaForMaskedLM] } MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = { "distilbert" => ["DistilBertForQuestionAnswering", DistilBertForQuestionAnswering] } MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = { "vision-encoder-decoder" => ["VisionEncoderDecoderModel", VisionEncoderDecoderModel] } MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = { "vision-encoder-decoder" => ["VisionEncoderDecoderModel", VisionEncoderDecoderModel] } MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = { "vit" => ["ViTForImageClassification", ViTForImageClassification] } MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = { "detr" => ["DetrForObjectDetection", DetrForObjectDetection] } MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = { "owlvit" => ["OwlViTForObjectDetection", OwlViTForObjectDetection] } MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = { "detr" => ["DetrForSegmentation", DetrForSegmentation] } MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = { } 
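The mapping hashes above all share the shape `model_type => [class_name, class]`; `PretrainedMixin.from_pretrained` scans its `MODEL_CLASS_MAPPINGS` in order and instantiates the first match for `config[:model_type]`. A minimal sketch of that lookup, using stand-in classes rather than the real model classes:

```ruby
# Stand-in classes, not real models
BertLike = Class.new
T5Like = Class.new

encoder_only_names = {"bert" => ["BertLike", BertLike]}
seq2seq_names = {"t5" => ["T5Like", T5Like]}

# Scan the ordered list of mapping hashes; the class itself is at index 1
# of the [class_name, class] pair, mirroring model_info[1] above.
def resolve_model_class(mappings, model_type)
  mappings.each do |mapping|
    info = mapping[model_type]
    return info[1] if info
  end
  nil # caller raises Error or falls back to the base class
end

resolve_model_class([encoder_only_names, seq2seq_names], "t5") # => T5Like
```

`AutoModel` additionally sets `BASE_IF_FAIL`, so a miss falls back to `PreTrainedModel` with a warning instead of raising.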
MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = { } MODEL_FOR_CTC_MAPPING_NAMES = { } MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = { "wav2vec2" => ["Wav2Vec2ForSequenceClassification", Wav2Vec2ForSequenceClassification] } MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = { } MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = { } MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = { } MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = { "swin2sr" => ["Swin2SRForImageSuperResolution", Swin2SRForImageSuperResolution] } MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = { "dpt" => ["DPTForDepthEstimation", DPTForDepthEstimation] } MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES = { } MODEL_CLASS_TYPE_MAPPING = [ [MODEL_MAPPING_NAMES_ENCODER_ONLY, MODEL_TYPES[:EncoderOnly]], [MODEL_MAPPING_NAMES_ENCODER_DECODER, MODEL_TYPES[:EncoderDecoder]], [MODEL_MAPPING_NAMES_DECODER_ONLY, MODEL_TYPES[:DecoderOnly]], [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]], [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]], [MODEL_WITH_LM_HEAD_MAPPING_NAMES, MODEL_TYPES[:DecoderOnly]], [MODEL_FOR_MASKED_LM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES, MODEL_TYPES[:Vision2Seq]], [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES, 
MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES, MODEL_TYPES[:MaskGeneration]], [MODEL_FOR_CTC_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES[:Seq2Seq]], [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]], [MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES, MODEL_TYPES[:EncoderOnly]] ] MODEL_CLASS_TYPE_MAPPING.each do |mappings, type| mappings.values.each do |name, model| MODEL_TYPE_MAPPING[name] = type MODEL_CLASS_TO_NAME_MAPPING[model] = name MODEL_NAME_TO_CLASS_MAPPING[name] = model end end class AutoModel < PretrainedMixin MODEL_CLASS_MAPPINGS = MODEL_CLASS_TYPE_MAPPING.map { |x| x[0] } BASE_IF_FAIL = true end class AutoModelForSequenceClassification < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES] end class AutoModelForTokenClassification < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES] end class AutoModelForSeq2SeqLM < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES] end class AutoModelForSpeechSeq2Seq < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES] end class AutoModelForTextToSpectrogram < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES] end class AutoModelForTextToWaveform < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES] end class AutoModelForCausalLM < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_WITH_LM_HEAD_MAPPING_NAMES] end class AutoModelForMaskedLM < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASKED_LM_MAPPING_NAMES] end class AutoModelForQuestionAnswering < PretrainedMixin MODEL_CLASS_MAPPINGS = 
[MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES] end class AutoModelForVision2Seq < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES] end class AutoModelForImageClassification < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES] end class AutoModelForImageSegmentation < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES] end class AutoModelForSemanticSegmentation < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES] end class AutoModelForObjectDetection < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES] end class AutoModelForZeroShotObjectDetection < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES] end class AutoModelForMaskGeneration < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_MASK_GENERATION_MAPPING_NAMES] end class AutoModelForCTC < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_CTC_MAPPING_NAMES] end class AutoModelForAudioClassification < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES] end class AutoModelForXVector < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES] end class AutoModelForAudioFrameClassification < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES] end class AutoModelForDocumentQuestionAnswering < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES] end class AutoModelForImageMatting < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES] end class AutoModelForImageToImage < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES] end class AutoModelForDepthEstimation < PretrainedMixin MODEL_CLASS_MAPPINGS = [MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES] end class AutoModelForImageFeatureExtraction < PretrainedMixin MODEL_CLASS_MAPPINGS = 
[MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES] end class ModelOutput def [](key) instance_variable_get("@#{key}") end end class Seq2SeqLMOutput < ModelOutput def initialize(logits, past_key_values, encoder_outputs, decoder_attentions = nil, cross_attentions = nil) super() @logits = logits @past_key_values = past_key_values @encoder_outputs = encoder_outputs @decoder_attentions = decoder_attentions @cross_attentions = cross_attentions end end class SequenceClassifierOutput < ModelOutput attr_reader :logits def initialize(logits) super() @logits = logits end end class TokenClassifierOutput < ModelOutput attr_reader :logits def initialize(logits) super() @logits = logits end end class MaskedLMOutput < ModelOutput attr_reader :logits def initialize(logits) super() @logits = logits end end class QuestionAnsweringModelOutput < ModelOutput attr_reader :start_logits, :end_logits def initialize(start_logits, end_logits) super() @start_logits = start_logits @end_logits = end_logits end end class DetrObjectDetectionOutput < ModelOutput attr_reader :logits, :pred_boxes def initialize(logits, pred_boxes) super() @logits = logits @pred_boxes = pred_boxes end end class DetrSegmentationOutput < ModelOutput attr_reader :logits, :pred_boxes, :pred_masks def initialize(logits, pred_boxes, pred_masks) super() @logits = logits @pred_boxes = pred_boxes @pred_masks = pred_masks end end end ================================================ FILE: lib/informers/pipelines.rb ================================================ module Informers class Pipeline def initialize(task:, model:, tokenizer: nil, processor: nil) super() @task = task @model = model @tokenizer = tokenizer @processor = processor end private def prepare_images(images) if !images.is_a?(Array) images = [images] end # Possibly convert any non-images to images images.map { |x| Utils::RawImage.read(x) } end def prepare_audios(audios, sampling_rate) if !audios.is_a?(Array) audios = [audios] end audios.map do |x| if 
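`ModelOutput#[]` above resolves hash-style access by reading the matching instance variable. A minimal sketch of the same pattern (`SketchOutput` is hypothetical):

```ruby
class SketchOutput
  def initialize(logits)
    @logits = logits
  end

  # output[:logits] reads @logits via instance_variable_get
  def [](key)
    instance_variable_get("@#{key}")
  end
end

out = SketchOutput.new([0.1, 0.9])
out[:logits] # => [0.1, 0.9]
```

This lets each output subclass expose only the fields it sets while sharing one access method.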
x.is_a?(String) || x.is_a?(URI) Utils.read_audio(x, sampling_rate) else x end end end def get_bounding_box(box, as_integer) if as_integer box = box.map { |x| x.to_i } end xmin, ymin, xmax, ymax = box {xmin:, ymin:, xmax:, ymax:} end end class TextClassificationPipeline < Pipeline def call(texts, top_k: 1) # Run tokenization model_inputs = @tokenizer.(texts, padding: true, truncation: true ) # Run model outputs = @model.(model_inputs) function_to_apply = if @model.config[:problem_type] == "multi_label_classification" ->(batch) { Utils.sigmoid(batch) } else ->(batch) { Utils.softmax(batch) } # single_label_classification (default) end id2label = @model.config[:id2label] to_return = [] outputs.logits.each do |batch| output = function_to_apply.(batch) scores = Utils.get_top_items(output, top_k) vals = scores.map do |x| { label: id2label[x[0].to_s], score: x[1] } end if top_k == 1 to_return.concat(vals) else to_return << vals end end texts.is_a?(Array) ? to_return : to_return[0] end end class TokenClassificationPipeline < Pipeline def call( texts, ignore_labels: ["O"], aggregation_strategy: "simple" ) is_batched = texts.is_a?(Array) # Run tokenization model_inputs = @tokenizer.(is_batched ? texts : [texts], padding: true, truncation: true, return_offsets: true ) # Run model outputs = @model.(model_inputs) logits = outputs.logits id2label = @model.config[:id2label] to_return = [] logits.length.times do |i| ids = model_inputs[:input_ids][i] batch = logits[i] offsets = model_inputs[:offsets][i] # List of tokens that aren't ignored tokens = [] batch.length.times do |j| token_data = batch[j] top_score_index = Utils.max(token_data)[1] entity = id2label ? id2label[top_score_index.to_s] : "LABEL_#{top_score_index}" if ignore_labels.include?(entity) # We predicted a token that should be ignored. So, we skip it. next end # TODO add option to keep special tokens? word = @tokenizer.decode([ids[j]], skip_special_tokens: true) if word == "" # Was a special token. So, we skip it. 
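`TextClassificationPipeline` above picks sigmoid for `multi_label_classification` (each label scored independently) and softmax otherwise (labels compete for probability mass). A plain-Ruby sketch of the two functions (stand-ins for `Utils.sigmoid`/`Utils.softmax`, not the gem's implementations):

```ruby
def sigmoid(values)
  values.map { |v| 1.0 / (1.0 + Math.exp(-v)) }
end

def softmax(values)
  max = values.max
  exps = values.map { |v| Math.exp(v - max) } # shift by max for stability
  sum = exps.sum
  exps.map { |e| e / sum }
end

logits = [2.0, -1.0]
softmax(logits).sum # ~ 1.0 (scores sum to one, labels compete)
sigmoid(logits)     # each score independent in (0, 1)
```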
next end scores = Utils.softmax(token_data) tokens << { entity: entity, score: scores[top_score_index], index: j, word: word, start: offsets[j][0], end: offsets[j][1] } end case aggregation_strategy when "simple" tokens = group_entities(tokens) when "none" # do nothing else raise ArgumentError, "Invalid aggregation_strategy" end to_return << tokens end is_batched ? to_return : to_return[0] end def group_sub_entities(entities) # Get the first entity in the entity group entity = entities[0][:entity].split("-", 2)[-1] scores = entities.map { |entity| entity[:score] } tokens = entities.map { |entity| entity[:word] } entity_group = { entity_group: entity, score: scores.sum / scores.count.to_f, word: @tokenizer.convert_tokens_to_string(tokens), start: entities[0][:start], end: entities[-1][:end] } entity_group end def get_tag(entity_name) if entity_name.start_with?("B-") bi = "B" tag = entity_name[2..] elsif entity_name.start_with?("I-") bi = "I" tag = entity_name[2..] else # It's not in B-, I- format # Default to I- for continuation. bi = "I" tag = entity_name end [bi, tag] end def group_entities(entities) entity_groups = [] entity_group_disagg = [] entities.each do |entity| if entity_group_disagg.empty? entity_group_disagg << entity next end # If the current entity is similar and adjacent to the previous entity, # append it to the disaggregated entity group # The split is meant to account for the "B" and "I" prefixes # Shouldn't merge if both entities are B-type bi, tag = get_tag(entity[:entity]) _last_bi, last_tag = get_tag(entity_group_disagg[-1][:entity]) if tag == last_tag && bi != "B" # Modify subword type to be previous_type entity_group_disagg << entity else # If the current entity is different from the previous entity # aggregate the disaggregated entity group entity_groups << group_sub_entities(entity_group_disagg) entity_group_disagg = [entity] end end if entity_group_disagg.any? 
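The `get_tag` helper above splits BIO-style labels: `B-` starts a new entity group, `I-` continues one, and unprefixed labels default to a continuation. A self-contained sketch of the same logic:

```ruby
def get_tag(entity_name)
  if entity_name.start_with?("B-")
    ["B", entity_name[2..]]
  elsif entity_name.start_with?("I-")
    ["I", entity_name[2..]]
  else
    # Not in B-/I- format; treat as a continuation
    ["I", entity_name]
  end
end

get_tag("B-PER") # => ["B", "PER"]
get_tag("I-PER") # => ["I", "PER"]
get_tag("MISC")  # => ["I", "MISC"]
```

During grouping, a token merges into the previous group only when its tag matches and it is not a fresh `B-` span.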
# it's the last entity, add it to the entity groups entity_groups << group_sub_entities(entity_group_disagg) end entity_groups end end class QuestionAnsweringPipeline < Pipeline def call(question, context, top_k: 1) # Run tokenization inputs = @tokenizer.(question, text_pair: context, padding: true, truncation: true, return_offsets: true ) output = @model.(inputs) to_return = [] output.start_logits.length.times do |j| ids = inputs[:input_ids][j] sep_index = ids.index(@tokenizer.sep_token_id) offsets = inputs[:offsets][j] s1 = Utils.softmax(output.start_logits[j]) .map.with_index .select { |x| x[1] > sep_index } e1 = Utils.softmax(output.end_logits[j]) .map.with_index .select { |x| x[1] > sep_index } options = s1.product(e1) .select { |x| x[0][1] <= x[1][1] } .map { |x| [x[0][1], x[1][1], x[0][0] * x[1][0]] } .sort_by { |v| -v[2] } [options.length, top_k].min.times do |k| start, end_, score = options[k] answer_tokens = ids.slice(start..end_) answer = @tokenizer.decode(answer_tokens, skip_special_tokens: true ) to_return << { answer:, score:, start: offsets[start][0], end: offsets[end_][1] } end end question.is_a?(Array) ? to_return : to_return[0] end end class FillMaskPipeline < Pipeline def call(texts, top_k: 5) model_inputs = @tokenizer.(texts, padding: true, truncation: true) outputs = @model.(model_inputs) to_return = [] model_inputs[:input_ids].each_with_index do |ids, i| mask_token_index = ids.index(@tokenizer.mask_token_id) if mask_token_index.nil? raise ArgumentError, "Mask token (#{@tokenizer.mask_token}) not found in text." end logits = outputs.logits[i] item_logits = logits[mask_token_index] scores = Utils.get_top_items(Utils.softmax(item_logits), top_k) to_return << scores.map do |x| sequence = ids.dup sequence[mask_token_index] = x[0] { score: x[1], token: x[0], token_str: @tokenizer.id_to_token(x[0]), sequence: @tokenizer.decode(sequence, skip_special_tokens: true) } end end texts.is_a?(Array) ?
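The question-answering span search above scores every `(start, end)` pair with `start <= end` by the product of its start and end probabilities, then sorts descending. A toy sketch with made-up probabilities (the `[probability, token_index]` shape matches the pipeline's intermediate data):

```ruby
starts = [[0.7, 1], [0.2, 2]] # [probability, token index]
ends   = [[0.1, 1], [0.8, 3]]

options = starts.product(ends)
  .select { |s, e| s[1] <= e[1] }            # valid spans only
  .map { |s, e| [s[1], e[1], s[0] * e[0]] }  # [start, end, joint score]
  .sort_by { |v| -v[2] }                     # best span first

options.first # => [1, 3, ~0.56]
```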
to_return : to_return[0] end end class Text2TextGenerationPipeline < Pipeline KEY = :generated_text def call(texts, **generate_kwargs) if !texts.is_a?(Array) texts = [texts] end # Add global prefix, if present if @model.config[:prefix] texts = texts.map { |x| @model.config[:prefix] + x } end # Handle task specific params: task_specific_params = @model.config[:task_specific_params] if task_specific_params && task_specific_params[@task] # Add prefixes, if present if task_specific_params[@task]["prefix"] texts = texts.map { |x| task_specific_params[@task]["prefix"] + x } end # TODO update generation config end tokenizer = @tokenizer tokenizer_options = { padding: true, truncation: true } if is_a?(TranslationPipeline) && tokenizer.respond_to?(:_build_translation_inputs) input_ids = tokenizer._build_translation_inputs(texts, tokenizer_options, generate_kwargs)[:input_ids] else input_ids = tokenizer.(texts, **tokenizer_options)[:input_ids] end output_token_ids = @model.generate(input_ids, generate_kwargs) tokenizer.batch_decode(output_token_ids, skip_special_tokens: true) .map { |text| {self.class.const_get(:KEY) => text} } end end class SummarizationPipeline < Text2TextGenerationPipeline KEY = :summary_text end class TranslationPipeline < Text2TextGenerationPipeline KEY = :translation_text end class TextGenerationPipeline < Pipeline def call(texts, **generate_kwargs) is_batched = false is_chat_input = false # Normalize inputs if texts.is_a?(String) texts = [texts] inputs = texts else raise Todo end # By default, do not add special tokens add_special_tokens = generate_kwargs[:add_special_tokens] || false # By default, return full text return_full_text = if is_chat_input false else generate_kwargs.fetch(:return_full_text, true) end @tokenizer.padding_side = "left" input_ids, attention_mask = @tokenizer.(inputs, add_special_tokens:, padding: true, truncation: true) .values_at(:input_ids, :attention_mask) output_token_ids = @model.generate( input_ids, generate_kwargs, nil,
inputs_attention_mask: attention_mask ) decoded = @tokenizer.batch_decode(output_token_ids, skip_special_tokens: true) if !return_full_text && Utils.dims(input_ids)[-1] > 0 prompt_lengths = @tokenizer.batch_decode(input_ids, skip_special_tokens: true).map { |x| x.length } end to_return = Array.new(texts.length) { [] } decoded.length.times do |i| text_index = (i / output_token_ids.length.to_f * texts.length).floor if prompt_lengths raise Todo end # TODO is_chat_input to_return[text_index] << { generated_text: decoded[i] } end !is_batched && to_return.length == 1 ? to_return[0] : to_return end end class ZeroShotClassificationPipeline < Pipeline def initialize(**options) super(**options) @label2id = @model.config[:label2id].transform_keys(&:downcase) @entailment_id = @label2id["entailment"] if @entailment_id.nil? warn "Could not find 'entailment' in label2id mapping. Using 2 as entailment_id." @entailment_id = 2 end @contradiction_id = @label2id["contradiction"] || @label2id["not_entailment"] if @contradiction_id.nil? warn "Could not find 'contradiction' in label2id mapping. Using 0 as contradiction_id." @contradiction_id = 0 end end def call(texts, candidate_labels, hypothesis_template: "This example is {}.", multi_label: false) is_batched = texts.is_a?(Array) if !is_batched texts = [texts] end if !candidate_labels.is_a?(Array) candidate_labels = [candidate_labels] end # Insert labels into hypothesis template hypotheses = candidate_labels.map { |x| hypothesis_template.sub("{}", x) } # How to perform the softmax over the logits: # - true: softmax over the entailment vs.
contradiction dim for each label independently # - false: softmax the "entailment" logits over all candidate labels softmax_each = multi_label || candidate_labels.length == 1 to_return = [] texts.each do |premise| entails_logits = [] hypotheses.each do |hypothesis| inputs = @tokenizer.( premise, text_pair: hypothesis, padding: true, truncation: true ) outputs = @model.(inputs) if softmax_each entails_logits << [ outputs.logits[0][@contradiction_id], outputs.logits[0][@entailment_id] ] else entails_logits << outputs.logits[0][@entailment_id] end end scores = if softmax_each entails_logits.map { |x| Utils.softmax(x)[1] } else Utils.softmax(entails_logits) end # Sort by scores (desc) and return scores with indices scores_sorted = scores.map.with_index { |x, i| [x, i] }.sort_by { |v| -v[0] } to_return << { sequence: premise, labels: scores_sorted.map { |x| candidate_labels[x[1]] }, scores: scores_sorted.map { |x| x[0] } } end is_batched ? to_return : to_return[0] end end class ImageToTextPipeline < Pipeline def call(images, **generate_kwargs) is_batched = images.is_a?(Array) prepared_images = prepare_images(images) pixel_values = @processor.(prepared_images)[:pixel_values] to_return = [] pixel_values.each do |batch| batch = [batch] output = @model.generate(batch, **generate_kwargs) decoded = @tokenizer .batch_decode(output, skip_special_tokens: true) .map { |x| {generated_text: x.strip} } to_return << decoded end is_batched ? 
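In the multi-label path above, softmax is taken over just the `[contradiction, entailment]` logits for each candidate label, and the entailment probability (index 1) is kept. A plain-Ruby sketch with made-up logits (`softmax` here is a stand-in for `Utils.softmax`):

```ruby
def softmax(values)
  exps = values.map { |v| Math.exp(v - values.max) }
  exps.map { |e| e / exps.sum }
end

# Per label: [contradiction logit, entailment logit]
entails_logits = [[0.0, 2.0], [1.0, -1.0]]

# Each label gets an independent probability of entailment
scores = entails_logits.map { |pair| softmax(pair)[1] }
```

With `multi_label: false`, the pipeline instead takes one softmax over the entailment logits of all labels, so the labels compete.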
to_return : to_return[0] end end class ImageClassificationPipeline < Pipeline def call(images, top_k: 1) is_batched = images.is_a?(Array) prepared_images = prepare_images(images) pixel_values = @processor.(prepared_images)[:pixel_values] output = @model.({pixel_values: pixel_values}) id2label = @model.config[:id2label] to_return = [] output.logits.each do |batch| scores = Utils.get_top_items(Utils.softmax(batch), top_k) vals = scores.map do |x| { label: id2label[x[0].to_s], score: x[1] } end if top_k == 1 to_return.push(*vals) else to_return << vals end end is_batched || top_k == 1 ? to_return : to_return[0] end end class ImageSegmentationPipeline < Pipeline def initialize(**options) super(**options) @subtasks_mapping = { "panoptic" => "post_process_panoptic_segmentation", "instance" => "post_process_instance_segmentation", "semantic" => "post_process_semantic_segmentation" } end def call( images, threshold: 0.5, mask_threshold: 0.5, overlap_mask_area_threshold: 0.8, label_ids_to_fuse: nil, target_sizes: nil, subtask: nil ) is_batched = images.is_a?(Array) if is_batched && images.length != 1 raise Error, "Image segmentation pipeline currently only supports a batch size of 1." end prepared_images = prepare_images(images) image_sizes = prepared_images.map { |x| [x.height, x.width] } model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask) output = @model.(model_inputs) if !subtask.nil? fn = @processor.feature_extractor.method(@subtasks_mapping[subtask]) else @subtasks_mapping.each do |task, func| if @processor.feature_extractor.respond_to?(func) fn = @processor.feature_extractor.method(func) subtask = task break end end end id2label = @model.config[:id2label] annotation = [] if subtask == "panoptic" || subtask == "instance" processed = fn.( output, threshold:, mask_threshold:, overlap_mask_area_threshold:, label_ids_to_fuse:, target_sizes: target_sizes || image_sizes, # TODO FIX?
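Image classification above keeps the `top_k` highest class probabilities as `[index, score]` pairs via `Utils.get_top_items`. A self-contained sketch of that selection (`top_items` is a stand-in, not the gem's implementation):

```ruby
def top_items(scores, k)
  scores.each_with_index
        .map { |score, i| [i, score] } # pair each score with its class index
        .sort_by { |_, score| -score } # highest score first
        .first(k)
end

top_items([0.1, 0.7, 0.2], 2) # => [[1, 0.7], [2, 0.2]]
```

The index is then mapped through the model's `id2label` table to produce a human-readable label.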
)[0] _segmentation = processed[:segmentation] processed[:segments_info].each do |segment| annotation << { label: id2label[segment[:label_id].to_s], score: segment[:score] # TODO mask } end elsif subtask == "semantic" raise Todo else raise Error, "Subtask #{subtask} not supported." end annotation end end class ZeroShotImageClassificationPipeline < Pipeline def call(images, candidate_labels, hypothesis_template: "This is a photo of {}") is_batched = images.is_a?(Array) prepared_images = prepare_images(images) # Insert label into hypothesis template texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) } # Run tokenization text_inputs = @tokenizer.(texts, padding: @model.config[:model_type] == "siglip" ? "max_length" : true, truncation: true ) # Run processor pixel_values = @processor.(prepared_images)[:pixel_values] # Run model with both text and pixel inputs output = @model.(text_inputs.merge(pixel_values: pixel_values)) function_to_apply = if @model.config[:model_type] == "siglip" ->(batch) { Utils.sigmoid(batch) } else ->(batch) { Utils.softmax(batch) } end # Compare each image with each candidate label to_return = [] output[0].each do |batch| # Compute softmax per image probs = function_to_apply.(batch) result = probs .map.with_index { |x, i| {label: candidate_labels[i], score: x} } .sort_by { |v| -v[:score] } to_return << result end is_batched ? to_return : to_return[0] end end class ObjectDetectionPipeline < Pipeline def call(images, threshold: 0.9, percentage: false) is_batched = images.is_a?(Array) if is_batched && images.length != 1 raise Error, "Object detection pipeline currently only supports a batch size of 1." end prepared_images = prepare_images(images) image_sizes = percentage ? 
nil : prepared_images.map { |x| [x.height, x.width] } model_inputs = @processor.(prepared_images).slice(:pixel_values, :pixel_mask) output = @model.(model_inputs) processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_sizes) # Add labels id2label = @model.config[:id2label] # Format output result = processed.map do |batch| batch[:boxes].map.with_index do |box, i| { label: id2label[batch[:classes][i].to_s], score: batch[:scores][i], box: get_bounding_box(box, !percentage) } end.sort_by { |v| -v[:score] } end is_batched ? result : result[0] end end class ZeroShotObjectDetectionPipeline < Pipeline def call( images, candidate_labels, threshold: 0.1, top_k: nil, percentage: false ) is_batched = images.is_a?(Array) prepared_images = prepare_images(images) # Run tokenization text_inputs = @tokenizer.(candidate_labels, padding: true, truncation: true ) # Run processor model_inputs = @processor.(prepared_images) # Since non-maximum suppression is performed for exporting, we need to # process each image separately. For more information, see: # https://github.com/huggingface/optimum/blob/e3b7efb1257c011db907ef40ab340e795cc5684c/optimum/exporters/onnx/model_configs.py#L1028-L1032 to_return = [] prepared_images.length.times do |i| image = prepared_images[i] image_size = percentage ? nil : [[image.height, image.width]] pixel_values = [model_inputs[:pixel_values][i]] # Run model with both text and pixel inputs output = @model.(text_inputs.merge(pixel_values: pixel_values)) # TODO remove output = @model.instance_variable_get(:@session).outputs.map { |v| v[:name].to_sym }.zip(output).to_h processed = @processor.feature_extractor.post_process_object_detection(output, threshold, image_size, true)[0] result = processed[:boxes].map.with_index do |box, i| { label: candidate_labels[processed[:classes][i]], score: processed[:scores][i], box: get_bounding_box(box, !percentage) } end result.sort_by! { |v| -v[:score] } if !top_k.nil? 
result = result[0...top_k] end to_return << result end is_batched ? to_return : to_return[0] end end class DocumentQuestionAnsweringPipeline < Pipeline def call(image, question, **generate_kwargs) # NOTE: For now, we only support a batch size of 1 # Preprocess image prepared_image = prepare_images(image)[0] pixel_values = @processor.(prepared_image)[:pixel_values] # Run tokenization task_prompt = "<s_docvqa><s_question>#{question}</s_question><s_answer>" decoder_input_ids = @tokenizer.( task_prompt, add_special_tokens: false, padding: true, truncation: true )[:input_ids] # Run model output = @model.generate( pixel_values, generate_kwargs.merge( decoder_input_ids: decoder_input_ids[0], max_length: @model.config["decoder"]["max_position_embeddings"] ).transform_keys(&:to_s) ) # Decode output decoded = @tokenizer.batch_decode(output, skip_special_tokens: false)[0] # Parse answer match = decoded.match(/<s_answer>(.*?)<\/s_answer>/) answer = nil if match && match.length >= 2 answer = match[1].strip end [{answer:}] end end class TextToAudioPipeline < Pipeline DEFAULT_VOCODER_ID = "Xenova/speecht5_hifigan" def initialize(**options) super(**options) # TODO: Find a better way for `pipeline` to set the default vocoder @vocoder = options[:vocoder] end def call(text_inputs, speaker_embeddings: nil) # If @processor is not set, we are using an `AutoModelForTextToWaveform` model if @processor call_text_to_spectrogram(text_inputs, speaker_embeddings:) else call_text_to_waveform(text_inputs) end end end class FeatureExtractionPipeline < Pipeline def call( texts, pooling: "none", normalize: false, quantize: false, precision: "binary", model_output: nil ) # Run tokenization model_inputs = @tokenizer.(texts, padding: true, truncation: true ) model_options = {} if !model_output.nil?
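The document-QA parsing above pulls the answer out of the decoded sequence between Donut-style `<s_answer>`/`</s_answer>` task tokens. A toy sketch with an invented decoded string:

```ruby
decoded = "<s_docvqa><s_question>What is the total?</s_question><s_answer> $1,234 </s_answer>"

# Capture the text between the answer tokens, non-greedily
match = decoded.match(/<s_answer>(.*?)<\/s_answer>/)
answer = match && match[1].strip
# answer => "$1,234"
```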
model_options[:output_names] = Array(model_output) elsif @model.instance_variable_get(:@output_names) == ["token_embeddings"] && pooling == "mean" && normalize # optimization for previous revision of sentence-transformers/all-MiniLM-L6-v2 model_options[:output_names] = ["sentence_embedding"] pooling = "none" normalize = false end # Run model outputs = @model.(model_inputs, **model_options) # TODO improve result = if outputs.is_a?(Array) # TODO show returned instead of all output_names = @model.instance_variable_get(:@session).outputs.map { |v| v[:name] } raise Error, "unexpected outputs: #{output_names}" if outputs.size != 1 outputs[0] else outputs.logits end case pooling when "none" # Skip pooling when "mean" result = Utils.mean_pooling(result, model_inputs[:attention_mask]) when "cls" result = result.map(&:first) else # TODO raise ArgumentError in 2.0 raise Error, "Pooling method '#{pooling}' not supported." end if normalize result = Utils.normalize(result) end if quantize result = quantize_embeddings(result, precision) end texts.is_a?(Array) ? result : result[0] end end class ImageFeatureExtractionPipeline < Pipeline def call(images) prepared_images = prepare_images(images) pixel_values = @processor.(prepared_images)[:pixel_values] outputs = @model.({pixel_values: pixel_values}) result = outputs[0] result end end class AudioClassificationPipeline < Pipeline def call(audio, top_k: nil) single = !audio.is_a?(Array) sampling_rate = @processor.feature_extractor.config["sampling_rate"] prepared_audios = prepare_audios(audio, sampling_rate) id2label = @model.config[:id2label] to_return = [] prepared_audios.each do |aud| inputs = @processor.(aud) output = @model.(inputs) logits = output.logits[0] scores = Utils.get_top_items(Utils.softmax(logits), top_k) vals = scores.map do |x| { label: id2label[x[0].to_s], score: x[1] } end if top_k == 1 to_return.concat(vals) else to_return << vals end end !single || top_k == 1 ? 
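Mean pooling above averages token embeddings into one sentence embedding, using the attention mask to exclude padded positions. A plain-Ruby sketch (a stand-in for `Utils.mean_pooling`, not the gem's implementation):

```ruby
def mean_pool(token_embeddings, attention_mask)
  sums = nil
  count = 0
  token_embeddings.each_with_index do |emb, i|
    next if attention_mask[i] == 0 # skip padding tokens
    sums = sums ? sums.zip(emb).map { |a, b| a + b } : emb.dup
    count += 1
  end
  sums.map { |v| v / count.to_f }
end

# Third token is padding (mask 0), so only the first two are averaged
mean_pool([[1.0, 2.0], [3.0, 4.0], [9.0, 9.0]], [1, 1, 0]) # => [2.0, 3.0]
```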
to_return : to_return[0] end end class ZeroShotAudioClassificationPipeline < Pipeline def call(audio, candidate_labels, hypothesis_template: "This is a sound of {}.") single = !audio.is_a?(Array) if single audio = [audio] end # Insert label into hypothesis template texts = candidate_labels.map { |x| hypothesis_template.sub("{}", x) } # Run tokenization text_inputs = @tokenizer.( texts, padding: true, truncation: true ) sampling_rate = @processor.feature_extractor.config["sampling_rate"] prepared_audios = prepare_audios(audio, sampling_rate) to_return = [] prepared_audios.each do |aud| audio_inputs = @processor.(aud) # Run model with both text and audio inputs output = @model.(text_inputs.merge(audio_inputs)) # Compute softmax per audio probs = Utils.softmax(output.logits_per_audio.data) to_return << probs.map.with_index do |x, i| { label: candidate_labels[i], score: x } end end single ? to_return[0] : to_return end end class AutomaticSpeechRecognitionPipeline < Pipeline def call(audio, **kwargs) case @model.config["model_type"] when "whisper" call_whisper(audio, **kwargs) else raise Error, "AutomaticSpeechRecognitionPipeline does not support model type '#{@model.config["model_type"]}'." end end private def call_whisper(audio, **kwargs) raise Todo end end class ImageToImagePipeline < Pipeline def call(images) prepared_images = prepare_images(images) inputs = @processor.(prepared_images) outputs = @model.(inputs) to_return = [] outputs[0].each do |batch| # TODO flatten first output = batch.map do |v| v.map do |v2| v2.map do |v3| (v3.clamp(0, 1) * 255).round end end end to_return << Utils::RawImage.from_array(output).image end to_return.length > 1 ? 
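The image-to-image postprocessing above converts float model outputs to 8-bit pixel values by clamping to [0, 1] and scaling by 255. A minimal sketch of that conversion:

```ruby
pixels = [-0.2, 0.0, 0.5, 1.3]           # raw float outputs
bytes = pixels.map { |v| (v.clamp(0, 1) * 255).round }
# bytes => [0, 0, 128, 255]
```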
to_return : to_return[0] end end class DepthEstimationPipeline < Pipeline def call(images) prepared_images = prepare_images(images) inputs = @processor.(prepared_images) predicted_depth = @model.(inputs)[0] to_return = [] prepared_images.length.times do |i| prediction = Utils.interpolate(predicted_depth[i], prepared_images[i].size.reverse, "bilinear", false) max_prediction = Utils.max(prediction.flatten)[0] formatted = prediction.map do |v| v.map do |v2| v2.map do |v3| (v3 * 255 / max_prediction).round end end end to_return << { predicted_depth: predicted_depth[i], depth: Utils::RawImage.from_array(formatted).image } end to_return.length > 1 ? to_return : to_return[0] end end class EmbeddingPipeline < FeatureExtractionPipeline def call( texts, pooling: "mean", normalize: true, model_output: nil ) super(texts, pooling:, normalize:, model_output:) end end class RerankingPipeline < Pipeline def call( query, documents, return_documents: false, top_k: nil ) model_inputs = @tokenizer.([query] * documents.size, text_pair: documents, padding: true, truncation: true ) outputs = @model.(model_inputs) result = Utils.sigmoid(outputs[0].map(&:first)) .map.with_index { |s, i| {doc_id: i, score: s} } .sort_by { |v| -v[:score] } if return_documents result.each do |v| v[:text] = documents[v[:doc_id]] end end top_k ? 
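The reranking scoring above maps each document's raw relevance logit through a sigmoid and sorts documents best-first. A self-contained sketch with invented logits (`sigmoid` is a stand-in for `Utils.sigmoid`):

```ruby
def sigmoid(v)
  1.0 / (1.0 + Math.exp(-v))
end

logits = [-1.0, 3.0, 0.5] # one relevance score per document
ranked = logits.map.with_index { |l, i| {doc_id: i, score: sigmoid(l)} }
               .sort_by { |r| -r[:score] }

ranked.map { |r| r[:doc_id] } # => [1, 2, 0]
```

With `return_documents: true`, the pipeline then attaches the original text of each document under `:text`.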
result.first(top_k) : result end end SUPPORTED_TASKS = { "text-classification" => { tokenizer: AutoTokenizer, pipeline: TextClassificationPipeline, model: AutoModelForSequenceClassification, default: { model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english" }, type: "text" }, "token-classification" => { tokenizer: AutoTokenizer, pipeline: TokenClassificationPipeline, model: AutoModelForTokenClassification, default: { model: "Xenova/bert-base-multilingual-cased-ner-hrl" }, type: "text" }, "question-answering" => { tokenizer: AutoTokenizer, pipeline: QuestionAnsweringPipeline, model: AutoModelForQuestionAnswering, default: { model: "Xenova/distilbert-base-cased-distilled-squad" }, type: "text" }, "fill-mask" => { tokenizer: AutoTokenizer, pipeline: FillMaskPipeline, model: AutoModelForMaskedLM, default: { model: "Xenova/bert-base-uncased" }, type: "text" }, "summarization" => { tokenizer: AutoTokenizer, pipeline: SummarizationPipeline, model: AutoModelForSeq2SeqLM, default: { model: "Xenova/distilbart-cnn-6-6" }, type: "text" }, "translation" => { tokenizer: AutoTokenizer, pipeline: TranslationPipeline, model: AutoModelForSeq2SeqLM, default: { model: "Xenova/t5-small" }, type: "text" }, "text2text-generation" => { tokenizer: AutoTokenizer, pipeline: Text2TextGenerationPipeline, model: AutoModelForSeq2SeqLM, default: { model: "Xenova/flan-t5-small" }, type: "text" }, "text-generation" => { tokenizer: AutoTokenizer, pipeline: TextGenerationPipeline, model: AutoModelForCausalLM, default: { model: "Xenova/gpt2" }, type: "text" }, "zero-shot-classification" => { tokenizer: AutoTokenizer, pipeline: ZeroShotClassificationPipeline, model: AutoModelForSequenceClassification, default: { model: "Xenova/distilbert-base-uncased-mnli" }, type: "text" }, "audio-classification" => { pipeline: AudioClassificationPipeline, model: AutoModelForAudioClassification, processor: AutoProcessor, default: { model: "Xenova/wav2vec2-base-superb-ks" }, type: "audio" }, # TODO # 
"zero-shot-audio-classification" => { # tokenizer: AutoTokenizer, # pipeline: ZeroShotAudioClassificationPipeline, # model: AutoModel, # processor: AutoProcessor, # default: { # model: "Xenova/clap-htsat-unfused" # }, # type: "multimodal" # }, # TODO # "automatic-speech-recognition" => { # tokenizer: AutoTokenizer, # pipeline: AutomaticSpeechRecognitionPipeline, # model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC], # processor: AutoProcessor, # default: { # model: "Xenova/whisper-tiny.en" # }, # type: "multimodal" # }, "text-to-audio" => { tokenizer: AutoTokenizer, pipeline: TextToAudioPipeline, model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram], processor: [AutoProcessor, nil], default: { model: "Xenova/speecht5_tts" }, type: "text" }, "image-to-text" => { tokenizer: AutoTokenizer, pipeline: ImageToTextPipeline, model: AutoModelForVision2Seq, processor: AutoProcessor, default: { model: "Xenova/vit-gpt2-image-captioning" }, type: "multimodal" }, "image-classification" => { pipeline: ImageClassificationPipeline, model: AutoModelForImageClassification, processor: AutoProcessor, default: { model: "Xenova/vit-base-patch16-224" }, type: "multimodal" }, "image-segmentation" => { pipeline: ImageSegmentationPipeline, model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation], processor: AutoProcessor, default: { model: "Xenova/detr-resnet-50-panoptic" }, type: "multimodal" }, "zero-shot-image-classification" => { tokenizer: AutoTokenizer, pipeline: ZeroShotImageClassificationPipeline, model: AutoModel, processor: AutoProcessor, default: { model: "Xenova/clip-vit-base-patch32" }, type: "multimodal" }, "object-detection" => { pipeline: ObjectDetectionPipeline, model: AutoModelForObjectDetection, processor: AutoProcessor, default: { model: "Xenova/detr-resnet-50" }, type: "multimodal" }, "zero-shot-object-detection" => { tokenizer: AutoTokenizer, pipeline: ZeroShotObjectDetectionPipeline, model: AutoModelForZeroShotObjectDetection, processor: 
AutoProcessor, default: { model: "Xenova/owlvit-base-patch32" }, type: "multimodal" }, "document-question-answering" => { tokenizer: AutoTokenizer, pipeline: DocumentQuestionAnsweringPipeline, model: AutoModelForDocumentQuestionAnswering, processor: AutoProcessor, default: { model: "Xenova/donut-base-finetuned-docvqa" }, type: "multimodal" }, "image-to-image" => { pipeline: ImageToImagePipeline, model: AutoModelForImageToImage, processor: AutoProcessor, default: { model: "Xenova/swin2SR-classical-sr-x2-64" }, type: "image" }, "depth-estimation" => { pipeline: DepthEstimationPipeline, model: AutoModelForDepthEstimation, processor: AutoProcessor, default: { model: "Xenova/dpt-large" }, type: "image" }, "feature-extraction" => { tokenizer: AutoTokenizer, pipeline: FeatureExtractionPipeline, model: AutoModel, default: { model: "Xenova/all-MiniLM-L6-v2" }, type: "text" }, "image-feature-extraction" => { processor: AutoProcessor, pipeline: ImageFeatureExtractionPipeline, model: [AutoModelForImageFeatureExtraction, AutoModel], default: { model: "Xenova/vit-base-patch16-224" }, type: "image" }, "embedding" => { tokenizer: AutoTokenizer, pipeline: EmbeddingPipeline, model: AutoModel, default: { model: "sentence-transformers/all-MiniLM-L6-v2" }, type: "text" }, "reranking" => { tokenizer: AutoTokenizer, pipeline: RerankingPipeline, model: AutoModel, default: { model: "mixedbread-ai/mxbai-rerank-base-v1" }, type: "text" } } TASK_ALIASES = { "sentiment-analysis" => "text-classification", "ner" => "token-classification", "text-to-speech" => "text-to-audio" } DEFAULT_PROGRESS_CALLBACK = lambda do |msg| stream = $stderr tty = stream.tty? width = tty ? 
stream.winsize[1] : 80
width = 80 if width == 0
if msg[:status] == "progress" && tty
  stream.print "\r#{Utils::Hub.display_progress(msg[:file], width, msg[:size], msg[:total_size])}"
elsif msg[:status] == "done" && !msg[:cache_hit]
  if tty
    stream.puts
  else
    stream.puts Utils::Hub.display_progress(msg[:file], width, 1, 1)
  end
end
end

NO_DEFAULT = Object.new

class << self
  def pipeline(
    task,
    model = nil,
    quantized: NO_DEFAULT,
    progress_callback: DEFAULT_PROGRESS_CALLBACK,
    config: nil,
    cache_dir: nil,
    local_files_only: false,
    revision: "main",
    device: nil,
    dtype: nil,
    model_file_name: nil,
    session_options: {}
  )
    # Apply aliases
    task = TASK_ALIASES[task] || task

    if quantized == NO_DEFAULT
      # TODO no quantization by default in 2.0
      quantized = ["text-classification", "token-classification", "question-answering", "feature-extraction"].include?(task)
    end

    # Get pipeline info ("translation_en_to_fr" resolves to "translation")
    pipeline_info = SUPPORTED_TASKS[task.split("_")[0]]
    if !pipeline_info
      raise Error, "Unsupported pipeline: #{task}. Must be one of #{SUPPORTED_TASKS.keys}"
    end

    # Use model if specified, otherwise use default
    if !model
      model = pipeline_info[:default][:model]
      warn "No model specified. Using default model: #{model.inspect}."
    end

    pretrained_options = {
      quantized:,
      progress_callback:,
      config:,
      cache_dir:,
      local_files_only:,
      revision:,
      device:,
      dtype:,
      model_file_name:,
      session_options:
    }

    classes = {
      tokenizer: pipeline_info[:tokenizer],
      model: pipeline_info[:model],
      processor: pipeline_info[:processor]
    }

    # Load model, tokenizer, and processor (if they exist)
    results = load_items(classes, model, pretrained_options)
    results[:task] = task

    # for previous revision of sentence-transformers/all-MiniLM-L6-v2
    if model == "sentence-transformers/all-MiniLM-L6-v2" && results[:model].instance_variable_get(:@session).outputs.any?
{ |v| v[:name] == "token_embeddings" } results[:model].instance_variable_set(:@output_names, ["token_embeddings"]) end Utils.dispatch_callback(progress_callback, { status: "ready", task: task, model: model }) pipeline_class = pipeline_info.fetch(:pipeline) pipeline_class.new(**results) end private def load_items(mapping, model, pretrained_options) result = {} mapping.each do |name, cls| next if !cls if cls.is_a?(Array) e = nil cls.each do |c| begin result[name] = c.from_pretrained(model, **pretrained_options) rescue => err e = err end end raise e unless result[name] else result[name] = cls.from_pretrained(model, **pretrained_options) end end result end end end ================================================ FILE: lib/informers/processors.rb ================================================ module Informers class FeatureExtractor attr_reader :config def initialize(config) super() @config = config end end class ImageFeatureExtractor < FeatureExtractor def initialize(config) super(config) @image_mean = @config["image_mean"] || @config["mean"] @image_std = @config["image_std"] || @config["std"] @resample = @config["resample"] || 2 # 2 => bilinear @do_rescale = @config.fetch("do_rescale", true) @rescale_factor = @config["rescale_factor"] || (1 / 255.0) @do_normalize = @config["do_normalize"] @do_resize = @config["do_resize"] @do_thumbnail = @config["do_thumbnail"] @size = @config["size"] @size_divisibility = @config["size_divisibility"] || @config["size_divisor"] @do_center_crop = @config["do_center_crop"] @crop_size = @config["crop_size"] @do_convert_rgb = @config.fetch("do_convert_rgb", true) @do_crop_margin = @config["do_crop_margin"] @pad_size = @config["pad_size"] @do_pad = @config["do_pad"] if @do_pad && !@pad_size && @size && !@size["width"].nil? && !@size["height"].nil? 
# Should pad, but no pad size specified # We infer the pad size from the resize size @pad_size = @size end @do_flip_channel_order = @config["do_flip_channel_order"] || false end def thumbnail(image, size, resample = 2) input_height = image.height input_width = image.width output_height = size["height"] output_width = size["width"] # We always resize to the smallest of either the input or output size. height = [input_height, output_height].min width = [input_width, output_width].min if height == input_height && width == input_width return image end if input_height > input_width width = (input_width * height / input_height).floor elsif input_width > input_height height = (input_height * width / input_width).floor end image.resize(width, height, resample:) end def pad_image( pixel_data, img_dims, pad_size, mode: "constant", center: false, constant_values: 0 ) image_height, image_width, image_channels = img_dims if pad_size.is_a?(Numeric) padded_image_width = pad_size padded_image_height = pad_size else padded_image_width = pad_size[:width] || pad_size["width"] padded_image_height = pad_size[:height] || pad_size["height"] end # Only add padding if there is a difference in size if padded_image_width != image_width || padded_image_height != image_height padded_pixel_data = Array.new(padded_image_width * padded_image_height * image_channels) if constant_values.is_a?(Array) # Fill with constant values, cycling through the array padded_pixel_data.length.times do |i| padded_pixel_data[i] = constant_values[i % image_channels] end elsif constant_values != 0 padded_pixel_data.fill(constant_values) end left, top = if center [((padded_image_width - image_width) / 2.0).floor, ((padded_image_height - image_height) / 2.0).floor] else [0, 0] end # Copy the original image into the padded image image_height.times do |i| a = (i + top) * padded_image_width b = i * image_width image_width.times do |j| c = (a + j + left) * image_channels d = (b + j) * image_channels image_channels.times do 
|k| padded_pixel_data[c + k] = pixel_data[d + k] end end end if mode == "symmetric" if center raise Error, "`center` padding is not supported when `mode` is set to `symmetric`." end h1 = image_height - 1 w1 = image_width - 1 padded_image_height.times do |i| a = i * padded_image_width b = Utils.calculate_reflect_offset(i, h1) * image_width padded_image_width.times do |j| next if i < image_height && j < image_width # Do not overwrite original image c = (a + j) * image_channels d = (b + Utils.calculate_reflect_offset(j, w1)) * image_channels # Copy channel-wise image_channels.times do |k| padded_pixel_data[c + k] = pixel_data[d + k] end end end end # Update pixel data and image dimensions pixel_data = padded_pixel_data img_dims = [padded_image_height, padded_image_width, image_channels] end [pixel_data, img_dims] end def rescale(pixel_data) pixel_data.length.times do |i| pixel_data[i] *= @rescale_factor end end def get_resize_output_image_size(image, size) src_width, src_height = image.size if @do_thumbnail # NOTE: custom logic for `Donut` models height = size["height"] width = size["width"] shortest_edge = [height, width].min elsif size.is_a?(Numeric) shortest_edge = size longest_edge = @config["max_size"] || shortest_edge elsif !size.nil? # Extract known properties from `size` shortest_edge = size["shortest_edge"] longest_edge = size["longest_edge"] end if !shortest_edge.nil? || !longest_edge.nil? # http://opensourcehacker.com/2011/12/01/calculate-aspect-ratio-conserving-resize-for-images-in-javascript/ # Try resize so that shortest edge is `shortest_edge` (target) short_resize_factor = if shortest_edge.nil? 
1 # If `shortest_edge` is not set, don't upscale else [shortest_edge / src_width.to_f, shortest_edge / src_height.to_f].max end new_width = src_width * short_resize_factor new_height = src_height * short_resize_factor # The new width and height might be greater than `longest_edge`, so # we downscale again to ensure the largest dimension is `longest_edge` long_resize_factor = if longest_edge.nil? 1 # If `longest_edge` is not set, don't downscale else [longest_edge / new_width.to_f, longest_edge / new_height.to_f].min end # To avoid certain floating point precision issues, we round to 2 decimal places final_width = (new_width * long_resize_factor).round(2).floor final_height = (new_height * long_resize_factor).round(2).floor if !@size_divisibility.nil? raise Todo end [final_width, final_height] elsif !size.nil? && !size["width"].nil? && !size["height"].nil? new_width = size["width"] new_height = size["height"] if @config["keep_aspect_ratio"] && @config["ensure_multiple_of"] raise Todo end [new_width, new_height] else raise Todo end end def resize(image) new_width, new_height = get_resize_output_image_size(image, @size) image.resize(new_width, new_height, resample: @resample) end def preprocess( image, do_normalize: nil, do_pad: nil, do_convert_rgb: nil, do_convert_grayscale: nil, do_flip_channel_order: nil ) if @do_crop_margin # NOTE: Specific to nougat processors. This is done before resizing, # and can be interpreted as a pre-preprocessing step. image = crop_margin(image) end src_width, src_height = image.size # original image size # Convert image to RGB if specified in config. if !do_convert_rgb.nil? ? do_convert_rgb : @do_convert_rgb image = image.rgb elsif do_convert_grayscale image = image.grayscale end # Resize all images if @do_resize image = resize(image) end # Resize the image using thumbnail method. 
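# A worked example of the thumbnail logic (numbers are illustrative, not from
# any model config): for a 1200x800 input and size {"width" => 224,
# "height" => 224}, both dimensions are first clamped to min(input, output),
# giving 224x224; the input is wider than tall, so the height is then rescaled
# to preserve the aspect ratio: (800 * 224 / 1200).floor => 149, and the image
# is resized to 224x149.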
if @do_thumbnail
  image = thumbnail(image, @size, @resample)
end

if @do_center_crop
  if @crop_size.is_a?(Integer)
    crop_width = @crop_size
    crop_height = @crop_size
  else
    crop_width = @crop_size["width"]
    crop_height = @crop_size["height"]
  end
  image = image.center_crop(crop_width, crop_height)
end

reshaped_input_size = [image.height, image.width]

# NOTE: All pixel-level manipulation (i.e., modifying `pixel_data`)
# occurs with data in the hwc format (height, width, channels),
# to emulate the behavior of the original Python code (w/ numpy).
pixel_data = image.data
img_dims = [image.height, image.width, image.channels]

if @do_rescale
  rescale(pixel_data)
end

if !do_normalize.nil? ? do_normalize : @do_normalize
  image_mean = @image_mean
  if !@image_mean.is_a?(Array)
    image_mean = Array.new(image.channels, image_mean)
  end

  image_std = @image_std
  if !@image_std.is_a?(Array)
    image_std = Array.new(image.channels, image_std)
  end

  if image_mean.length != image.channels || image_std.length != image.channels
    raise Error, "When set to arrays, the length of `image_mean` (#{image_mean.length}) and `image_std` (#{image_std.length}) must match the number of channels in the image (#{image.channels})."
  end

  i = 0
  while i < pixel_data.length
    image.channels.times do |j|
      pixel_data[i + j] = (pixel_data[i + j] - image_mean[j]) / image_std[j]
    end
    i += image.channels
  end
end

# do padding after rescaling/normalizing
if !do_pad.nil? ? do_pad : @do_pad
  if @pad_size
    padded = pad_image(pixel_data, [image.height, image.width, image.channels], @pad_size)
    pixel_data, img_dims = padded # Update pixel data and image dimensions
  elsif @size_divisibility
    raise Todo
  end
end

if !do_flip_channel_order.nil? ?
do_flip_channel_order : @do_flip_channel_order raise Todo end # convert to channel dimension format (hwc -> chw) h, w, c = img_dims pixel_values = c.times.map do |ci| h.times.map do |hi| w.times.map do |wi| index = (hi * w * c) + (wi * c) + ci pixel_data[index] end end end { original_size: [src_height, src_width], reshaped_input_size: reshaped_input_size, pixel_values: pixel_values } end def call(images, *args) if !images.is_a?(Array) images = [images] end image_data = images.map { |x| preprocess(x) } # Stack pixel values pixel_values = Utils.stack(image_data.map { |x| x[:pixel_values] }, 0) { pixel_values: pixel_values, # Original sizes of images original_sizes: image_data.map { |x| x[:original_size] }, # Reshaped sizes of images, before padding or cropping reshaped_input_sizes: image_data.map { |x| x[:reshaped_input_size] } } end end class CLIPFeatureExtractor < ImageFeatureExtractor end class DPTFeatureExtractor < ImageFeatureExtractor end class ViTFeatureExtractor < ImageFeatureExtractor end class OwlViTFeatureExtractor < ImageFeatureExtractor def post_process_object_detection(*args) Utils.post_process_object_detection(*args) end end class Swin2SRImageProcessor < ImageFeatureExtractor def pad_image(pixel_data, img_dims, pad_size, **options) # NOTE: In this case, `padSize` represents the size of the sliding window for the local attention. # In other words, the image is padded so that its width and height are multiples of `padSize`. image_height, image_width, _image_channels = img_dims super( pixel_data, img_dims, { # NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already # a multiple of `pad_size`. However, this is most likely a bug (PR: https://github.com/mv-lab/swin2sr/pull/19). # For this reason, we only add padding when the image's width/height is not a multiple of `pad_size`. 
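# A worked example of the pad-to-multiple arithmetic below (numbers are
# illustrative, not from any model config): with pad_size = 8, a width of 125
# becomes 125 + (8 - 125 % 8) % 8 = 125 + 3 = 128, while a width of 120,
# already a multiple of 8, stays 120 because (8 - 120 % 8) % 8 = 0.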
    width: image_width + (pad_size - image_width % pad_size) % pad_size,
    height: image_height + (pad_size - image_height % pad_size) % pad_size
  },
  mode: "symmetric",
  center: false,
  constant_values: -1,
  **options
)
end
end

class DonutFeatureExtractor < ImageFeatureExtractor
  def pad_image(pixel_data, img_dims, pad_size, **options)
    _image_height, _image_width, image_channels = img_dims

    image_mean = @image_mean
    if !image_mean.is_a?(Array)
      image_mean = Array.new(image_channels, image_mean)
    end

    image_std = @image_std
    if !image_std.is_a?(Array)
      image_std = Array.new(image_channels, image_std)
    end

    constant_values = image_mean.map.with_index { |x, i| -x / image_std[i] }

    super(
      pixel_data,
      img_dims,
      pad_size,
      center: true,
      # Since normalization is done after padding, we need to use certain constant values to ensure the same behaviour is observed.
      # For more information, see https://github.com/huggingface/transformers/blob/main/src/transformers/models/donut/image_processing_donut.py#L433-L451
      constant_values: constant_values,
      **options
    )
  end
end

class DetrFeatureExtractor < ImageFeatureExtractor
  def call(images)
    result = super(images)

    # TODO support differently-sized images, for now assume all images are the same size.
    # TODO support different mask sizes (not just 64x64)
    # Currently, just fill pixel mask with 1s
    mask_size = [result[:pixel_values].size, 64, 64]
    pixel_mask =
      mask_size[0].times.map do
        mask_size[1].times.map do
          mask_size[2].times.map do
            1
          end
        end
      end

    result.merge(pixel_mask: pixel_mask)
  end

  def post_process_object_detection(*args)
    Utils.post_process_object_detection(*args)
  end

  def remove_low_and_no_objects(class_logits, mask_logits, object_mask_threshold, num_labels)
    mask_probs_item = []
    pred_scores_item = []
    pred_labels_item = []

    class_logits.size.times do |j|
      cls = class_logits[j]
      mask = mask_logits[j]

      pred_label = Utils.max(cls)[1]
      if pred_label == num_labels
        # Is the background, so we ignore it
        next
      end

      scores = Utils.softmax(cls)
      pred_score = scores[pred_label]
      if pred_score > object_mask_threshold
        mask_probs_item << mask
        pred_scores_item << pred_score
        pred_labels_item << pred_label
      end
    end

    [mask_probs_item, pred_scores_item, pred_labels_item]
  end

  def check_segment_validity(
    mask_labels,
    mask_probs,
    k,
    mask_threshold = 0.5,
    overlap_mask_area_threshold = 0.8
  )
    # mask_k is a 1D array of indices, indicating where the mask is equal to k
    mask_k = []
    mask_k_area = 0
    original_area = 0

    mask_probs_k_data = mask_probs[k].flatten

    # Compute the area of all the stuff in query k
    mask_labels.length.times do |i|
      if mask_labels[i] == k
        mask_k << i
        mask_k_area += 1
      end

      if mask_probs_k_data[i] >= mask_threshold
        original_area += 1
      end
    end

    mask_exists = mask_k_area > 0 && original_area > 0

    # Eliminate disconnected tiny segments
    if mask_exists
      # Perform additional check
      area_ratio = mask_k_area / original_area.to_f
      mask_exists = area_ratio > overlap_mask_area_threshold
    end

    [mask_exists, mask_k]
  end

  def compute_segments(
    mask_probs,
    pred_scores,
    pred_labels,
    mask_threshold,
    overlap_mask_area_threshold,
    label_ids_to_fuse = nil,
    target_size = nil
  )
    height, width = target_size || Utils.dims(mask_probs[0])

    segmentation = Array.new(height * width)
    segments = []

    # 1.
If target_size is not null, we need to resize the masks to the target size if !target_size.nil? # resize the masks to the target size mask_probs.length.times do |i| mask_probs[i] = Utils.interpolate(mask_probs[i], target_size, "bilinear", false) end end # 2. Weigh each mask by its prediction score # NOTE: `mask_probs` is updated in-place # # Temporary storage for the best label/scores for each pixel ([height, width]): mask_labels = Array.new(mask_probs[0].flatten.length) best_scores = Array.new(mask_probs[0].flatten.length, 0) mask_probs.length.times do |i| score = pred_scores[i] mask_probs_i_data = mask_probs[i].flatten mask_probs_i_dims = Utils.dims(mask_probs[i]) mask_probs_i_data.length.times do |j| mask_probs_i_data[j] *= score if mask_probs_i_data[j] > best_scores[j] mask_labels[j] = i best_scores[j] = mask_probs_i_data[j] end end mask_probs[i] = Utils.reshape(mask_probs_i_data, mask_probs_i_dims) end current_segment_id = 0 # stuff_memory_list = {} pred_labels.length.times do |k| pred_class = pred_labels[k] # TODO add `should_fuse` # should_fuse = label_ids_to_fuse.include?(pred_class) # Check if mask exists and large enough to be a segment mask_exists, mask_k = check_segment_validity( mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold ) if !mask_exists # Nothing to see here next end current_segment_id += 1 # Add current object segment to final segmentation map mask_k.each do |index| segmentation[index] = current_segment_id end segments << { id: current_segment_id, label_id: pred_class, score: pred_scores[k] } end segmentation = Utils.reshape(segmentation, [height, width]) [segmentation, segments] end def post_process_panoptic_segmentation( outputs, threshold: 0.5, mask_threshold: 0.5, overlap_mask_area_threshold: 0.8, label_ids_to_fuse: nil, target_sizes: nil ) if label_ids_to_fuse.nil? warn "`label_ids_to_fuse` unset. No instance will be fused." 
label_ids_to_fuse = Set.new end class_queries_logits = outputs[:logits] # [batch_size, num_queries, num_classes+1] masks_queries_logits = outputs[:pred_masks] # [batch_size, num_queries, height, width] mask_probs = Utils.sigmoid(masks_queries_logits) # [batch_size, num_queries, height, width] batch_size, _num_queries, num_labels = class_queries_logits.size, class_queries_logits[0].size, class_queries_logits[0][0].size num_labels -= 1 # Remove last class (background) if !target_sizes.nil? && target_sizes.length != batch_size raise Error, "Make sure that you pass in as many target sizes as the batch dimension of the logits" end to_return = [] batch_size.times do |i| target_size = !target_sizes.nil? ? target_sizes[i] : nil class_logits = class_queries_logits[i] mask_logits = mask_probs[i] mask_probs_item, pred_scores_item, pred_labels_item = remove_low_and_no_objects(class_logits, mask_logits, threshold, num_labels) if pred_labels_item.length == 0 raise Todo end # Get segmentation map and segment information of batch item segmentation, segments = compute_segments( mask_probs_item, pred_scores_item, pred_labels_item, mask_threshold, overlap_mask_area_threshold, label_ids_to_fuse, target_size ) to_return << { segmentation: segmentation, segments_info: segments } end to_return end end module Utils def self.center_to_corners_format(v) centerX, centerY, width, height = v [ centerX - width / 2.0, centerY - height / 2.0, centerX + width / 2.0, centerY + height / 2.0 ] end def self.post_process_object_detection(outputs, threshold = 0.5, target_sizes = nil, is_zero_shot = false) out_logits = outputs[:logits] out_bbox = outputs[:pred_boxes] batch_size, num_boxes, num_classes = out_logits.size, out_logits[0].size, out_logits[0][0].size if !target_sizes.nil? && target_sizes.length != batch_size raise Error, "Make sure that you pass in as many target sizes as the batch dimension of the logits" end to_return = [] batch_size.times do |i| target_size = !target_sizes.nil? ? 
target_sizes[i] : nil info = { boxes: [], classes: [], scores: [] } logits = out_logits[i] bbox = out_bbox[i] num_boxes.times do |j| logit = logits[j] indices = [] if is_zero_shot # Get indices of classes with high enough probability probs = Utils.sigmoid(logit) probs.length.times do |k| if probs[k] > threshold indices << k end end else # Get most probable class max_index = Utils.max(logit)[1] if max_index == num_classes - 1 # This is the background class, skip it next end indices << max_index # Compute softmax over classes probs = Utils.softmax(logit) end indices.each do |index| box = bbox[j] # convert to [x0, y0, x1, y1] format box = center_to_corners_format(box) if !target_size.nil? box = box.map.with_index { |x, i| x * target_size[(i + 1) % 2] } end info[:boxes] << box info[:classes] << index info[:scores] << probs[index] end end to_return << info end to_return end end class WhisperFeatureExtractor < FeatureExtractor def initialize(config) super(config) raise Todo end def _extract_fbank_features(waveform) raise Todo end def call(audio) raise Todo end end class Wav2Vec2FeatureExtractor < FeatureExtractor def _zero_mean_unit_var_norm(input_values) sum = input_values.sum mean = sum / input_values.length.to_f variance = input_values.sum { |b| (b - mean) ** 2 } / input_values.length.to_f input_values.map { |x| (x - mean) / Math.sqrt(variance + 1e-7) } end def call(audio) # TODO # validate_audio_inputs(audio, 'Wav2Vec2FeatureExtractor') input_values = audio # zero-mean and unit-variance normalization if @config["do_normalize"] input_values = _zero_mean_unit_var_norm(input_values) end # TODO: allow user to pass in attention mask { input_values: [input_values], attention_mask: [Array.new(input_values.length, 1)] } end end class ClapFeatureExtractor < FeatureExtractor def initialize(config) super(config) # TODO end def call(audio, max_length: nil) raise Todo end end class Processor attr_reader :feature_extractor def initialize(feature_extractor) @feature_extractor = 
feature_extractor end def call(input, *args) @feature_extractor.(input, *args) end end class AutoProcessor FEATURE_EXTRACTOR_CLASS_MAPPING = { "ViTFeatureExtractor" => ViTFeatureExtractor, "OwlViTFeatureExtractor" => OwlViTFeatureExtractor, "CLIPFeatureExtractor" => CLIPFeatureExtractor, "DPTFeatureExtractor" => DPTFeatureExtractor, "DetrFeatureExtractor" => DetrFeatureExtractor, "Swin2SRImageProcessor" => Swin2SRImageProcessor, "DonutFeatureExtractor" => DonutFeatureExtractor, "WhisperFeatureExtractor" => WhisperFeatureExtractor, "Wav2Vec2FeatureExtractor" => Wav2Vec2FeatureExtractor, "ClapFeatureExtractor" => ClapFeatureExtractor } PROCESSOR_CLASS_MAPPING = {} def self.from_pretrained( pretrained_model_name_or_path, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", **kwargs ) preprocessor_config = config || Utils::Hub.get_model_json(pretrained_model_name_or_path, "preprocessor_config.json", true, progress_callback:, config:, cache_dir:, local_files_only:, revision: ) # Determine feature extractor class # TODO: Ensure backwards compatibility with old configs key = preprocessor_config["feature_extractor_type"] || preprocessor_config["image_processor_type"] feature_extractor_class = FEATURE_EXTRACTOR_CLASS_MAPPING[key] if !feature_extractor_class if preprocessor_config["size"] # Assume ImageFeatureExtractor warn "Feature extractor type #{key.inspect} not found, assuming ImageFeatureExtractor due to size parameter in config." 
feature_extractor_class = ImageFeatureExtractor else raise Error, "Unknown Feature Extractor type: #{key}" end end # If no associated processor class, use default processor_class = PROCESSOR_CLASS_MAPPING[preprocessor_config["processor_class"]] || Processor # Instantiate processor and feature extractor feature_extractor = feature_extractor_class.new(preprocessor_config) processor_class.new(feature_extractor) end end end ================================================ FILE: lib/informers/tokenizers.rb ================================================ module Informers class PreTrainedTokenizer attr_reader :mask_token, :mask_token_id, :sep_token_id def initialize(tokenizer_json, tokenizer_config) super() @tokenizer_config = tokenizer_config @tokenizer = Tokenizers::Tokenizer.from_file(tokenizer_json) # Add added_tokens to model @special_tokens = [] @all_special_ids = [] @added_tokens = [] @tokenizer.added_tokens_decoder.each do |id, token| @added_tokens << token if token.special @special_tokens << token.content @all_special_ids << id end end # Update additional_special_tokens @additional_special_tokens = tokenizer_config["additional_special_tokens"] || [] @special_tokens.concat(@additional_special_tokens) @mask_token = get_token("mask_token") @mask_token_id = @tokenizer.token_to_id(@mask_token) if @mask_token @sep_token = get_token("sep_token") @sep_token_id = @tokenizer.token_to_id(@sep_token) if @sep_token @model_max_length = tokenizer_config["model_max_length"] # for donut-base-finetuned-docvqa if @model_max_length && @model_max_length > (1 << 63) @model_max_length = 1 << 63 end end def get_token(*keys) keys.each do |key| item = @tokenizer_config[key] if !item next end if item.is_a?(Hash) if item["__type"] == "AddedToken" return item["content"] else raise Error, "Unknown token: #{item}" end else return item end end nil end def call( text, text_pair: nil, add_special_tokens: true, padding: false, truncation: nil, max_length: nil, return_tensor: true, 
return_token_type_ids: true, # TODO change default return_offsets: false ) is_batched = text.is_a?(Array) if is_batched if text.length == 0 raise Error, "text array must be non-empty" end if !text_pair.nil? if !text_pair.is_a?(Array) raise Error, "text_pair must also be an array" elsif text.length != text_pair.length raise Error, "text and text_pair must have the same length" end end end if padding @tokenizer.enable_padding else @tokenizer.no_padding end if truncation @tokenizer.enable_truncation(max_length || @model_max_length) else @tokenizer.no_truncation end if is_batched input = text_pair ? text.zip(text_pair) : text encoded = @tokenizer.encode_batch(input, add_special_tokens: add_special_tokens) else encoded = [@tokenizer.encode(text, text_pair, add_special_tokens: add_special_tokens)] end result = {input_ids: encoded.map(&:ids), attention_mask: encoded.map(&:attention_mask)} if return_token_type_ids result[:token_type_ids] = encoded.map(&:type_ids) end if return_offsets result[:offsets] = encoded.map(&:offsets) end result end def decode(tokens, skip_special_tokens:) @tokenizer.decode(tokens, skip_special_tokens: skip_special_tokens) end def convert_tokens_to_string(tokens) @tokenizer.decoder.decode(tokens) end def convert_tokens_to_ids(tokens) tokens.map { |t| @tokenizer.token_to_id(t) } end def id_to_token(id) @tokenizer.id_to_token(id) end def batch_decode(batch, **decode_args) @tokenizer.decode_batch(batch, **decode_args) end def padding_side=(side) @tokenizer.enable_padding(direction: side) end end class BertTokenizer < PreTrainedTokenizer # TODO # self.return_token_type_ids = true end class DebertaV2Tokenizer < PreTrainedTokenizer # TODO # self.return_token_type_ids = true end class DistilBertTokenizer < PreTrainedTokenizer end class T5Tokenizer < PreTrainedTokenizer end class GPT2Tokenizer < PreTrainedTokenizer # _default_chat_template = `{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}` end class BartTokenizer < 
PreTrainedTokenizer
end

class RobertaTokenizer < PreTrainedTokenizer
end

class XLMRobertaTokenizer < PreTrainedTokenizer
end

class MPNetTokenizer < PreTrainedTokenizer
end

class CLIPTokenizer < PreTrainedTokenizer
end

class NllbTokenizer < PreTrainedTokenizer
  attr_reader :language_regex, :language_codes, :lang_to_token

  def initialize(tokenizer_json, tokenizer_config)
    super(tokenizer_json, tokenizer_config)

    @language_regex = /^[a-z]{3}_[A-Z][a-z]{3}$/
    @language_codes = @special_tokens.filter { |x| @language_regex.match?(x) }
    @lang_to_token = ->(x) { x } # Identity function
  end

  def _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs)
    Utils._build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs)
  end
end

class M2M100Tokenizer < PreTrainedTokenizer
  attr_reader :language_regex, :language_codes, :lang_to_token

  def initialize(tokenizer_json, tokenizer_config)
    super(tokenizer_json, tokenizer_config)

    @language_regex = /^__[a-z]{2,3}__$/
    @language_codes = @special_tokens
      .filter { |x| @language_regex.match?(x) }
      .map { |x| x[2..-3] } # strip the "__" prefix and suffix, e.g. "__en__" => "en"
    @lang_to_token = ->(x) { "__#{x}__" }
  end

  def _build_translation_inputs(raw_inputs, tokenizer_options, generate_kwargs)
    Utils._build_translation_inputs(self, raw_inputs, tokenizer_options, generate_kwargs)
  end
end

module Utils
  def self._build_translation_inputs(slf, raw_inputs, tokenizer_options, generate_kwargs)
    if !slf.respond_to?(:language_codes) || !slf.language_codes.is_a?(Array)
      raise Error, "Tokenizer must have `language_codes` attribute set and it should be an array of language ids."
    end
    if !slf.respond_to?(:language_regex) || !slf.language_regex.is_a?(Regexp)
      raise Error, "Tokenizer must have `language_regex` attribute set and it should be a regular expression."
    end
    if !slf.respond_to?(:lang_to_token) || !slf.lang_to_token.respond_to?(:call)
      raise Error, "Tokenizer must have `lang_to_token` attribute set and it should be a function."
end src_lang_token = generate_kwargs[:src_lang] tgt_lang_token = generate_kwargs[:tgt_lang] if !slf.language_codes.include?(tgt_lang_token) raise Error, "Target language code #{tgt_lang_token.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}" end if !src_lang_token.nil? # Check that the source language is valid: if !slf.language_codes.include?(src_lang_token) raise Error, "Source language code #{src_lang_token.inspect} is not valid. Must be one of: #{slf.language_codes.join(", ")}" end end # Override the `forced_bos_token_id` to force the correct language generate_kwargs["forced_bos_token_id"] = slf.convert_tokens_to_ids([slf.lang_to_token.(tgt_lang_token)])[0] slf.(raw_inputs, **tokenizer_options) end end class SpeechT5Tokenizer < PreTrainedTokenizer end class AutoTokenizer TOKENIZER_CLASS_MAPPING = { "T5Tokenizer" => T5Tokenizer, "BertTokenizer" => BertTokenizer, "DebertaV2Tokenizer" => DebertaV2Tokenizer, "DistilBertTokenizer" => DistilBertTokenizer, "BartTokenizer" => BartTokenizer, "RobertaTokenizer" => RobertaTokenizer, "XLMRobertaTokenizer" => XLMRobertaTokenizer, "MPNetTokenizer" => MPNetTokenizer, "CLIPTokenizer" => CLIPTokenizer, "GPT2Tokenizer" => GPT2Tokenizer, "NllbTokenizer" => NllbTokenizer, "M2M100Tokenizer" => M2M100Tokenizer, "SpeechT5Tokenizer" => SpeechT5Tokenizer, "PreTrainedTokenizer" => PreTrainedTokenizer } def self.from_pretrained( pretrained_model_name_or_path, quantized: true, progress_callback: nil, config: nil, cache_dir: nil, local_files_only: false, revision: "main", legacy: nil, **kwargs ) tokenizer_json, tokenizer_config = load_tokenizer( pretrained_model_name_or_path, quantized:, progress_callback:, config:, cache_dir:, local_files_only:, revision:, legacy: ) # Some tokenizers are saved with the "Fast" suffix, so we remove that if present. 
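# For example, "BertTokenizerFast".delete_suffix("Fast") => "BertTokenizer",
# while names without the suffix (and nil, thanks to the safe navigation
# operator) pass through unchanged.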
tokenizer_name = tokenizer_config["tokenizer_class"]&.delete_suffix("Fast") || "PreTrainedTokenizer" cls = TOKENIZER_CLASS_MAPPING[tokenizer_name] if !cls warn "Unknown tokenizer class #{tokenizer_name.inspect}, attempting to construct from base class." cls = PreTrainedTokenizer end cls.new(tokenizer_json, tokenizer_config) end def self.load_tokenizer(pretrained_model_name_or_path, **options) info = [ Utils::Hub.get_model_file(pretrained_model_name_or_path, "tokenizer.json", true, **options), Utils::Hub.get_model_json(pretrained_model_name_or_path, "tokenizer_config.json", true, **options) ] # Override legacy option if `options.legacy` is not null if !options[:legacy].nil? info[1]["legacy"] = options[:legacy] end info end end end ================================================ FILE: lib/informers/utils/audio.rb ================================================ module Informers module Utils def self.read_audio(input, sampling_rate) data = if input.is_a?(URI) require "open-uri" input.read elsif input.is_a?(String) File.binread(input) else raise ArgumentError, "Unsupported input type: #{input.class.name}" end ffmpeg_read(data, sampling_rate) end end end ================================================ FILE: lib/informers/utils/core.rb ================================================ module Informers module Utils def self.dispatch_callback(progress_callback, data) progress_callback.(data) if progress_callback end def self.calculate_reflect_offset(i, w) ((i + w) % (2 * w) - w).abs end end end ================================================ FILE: lib/informers/utils/dtypes.rb ================================================ module Informers module Utils DEFAULT_DTYPE_SUFFIX_MAPPING = { fp32: "", fp16: "_fp16", int8: "_int8", uint8: "_uint8", q8: "_quantized", q4: "_q4", q4f16: "_q4f16", bnb4: "_bnb4" } end end ================================================ FILE: lib/informers/utils/ffmpeg.rb ================================================ # Copyright 2021 The 
HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

module Informers
  module Utils
    # from the Transformers Python library
    def self.ffmpeg_read(data, sampling_rate)
      ar = "#{sampling_rate}"
      ac = "1"
      format_for_conversion = "f32le"

      ffmpeg_command = [
        "ffmpeg",
        "-i", "pipe:0",
        "-ac", ac,
        "-ar", ar,
        "-f", format_for_conversion,
        "-hide_banner",
        "-loglevel", "quiet",
        "pipe:1"
      ]

      begin
        stdout, status = Open3.capture2(*ffmpeg_command, stdin_data: data)
      rescue Errno::ENOENT
        # Open3 raises ENOENT when the ffmpeg binary is missing
        raise Error, "ffmpeg was not found but is required to load audio files from filename"
      end
      if !status.success?
        raise Error, "ffmpeg failed to decode the audio data (exit status #{status.exitstatus})"
      end
      stdout.unpack("e*")
    end
  end
end

================================================
FILE: lib/informers/utils/generation.rb
================================================
module Informers
  module Utils
    class GenerationConfig
      def initialize(kwargs)
        @config = {}

        # Parameters that control the length of the output
        @config["max_length"] = kwargs["max_length"] || 20
        @config["max_new_tokens"] = kwargs["max_new_tokens"]
        @config["min_length"] = kwargs["min_length"] || 0
        @config["min_new_tokens"] = kwargs["min_new_tokens"]
        @config["early_stopping"] = kwargs["early_stopping"] || false
        @config["max_time"] = kwargs["max_time"]

        # Parameters that control the generation strategy used
        @config["do_sample"] = kwargs["do_sample"] || false
        @config["num_beams"] = kwargs["num_beams"] || 1
        @config["num_beam_groups"] = kwargs["num_beam_groups"] || 1
        @config["penalty_alpha"] = kwargs["penalty_alpha"]
        @config["use_cache"] =
kwargs.fetch("use_cache", true) # Parameters for manipulation of the model output logits @config["temperature"] = kwargs["temperature"] || 1.0 @config["top_k"] = kwargs["top_k"] || 50 @config["top_p"] = kwargs["top_p"] || 1.0 @config["typical_p"] = kwargs["typical_p"] || 1.0 @config["epsilon_cutoff"] = kwargs["epsilon_cutoff"] || 0.0 @config["eta_cutoff"] = kwargs["eta_cutoff"] || 0.0 @config["diversity_penalty"] = kwargs["diversity_penalty"] || 0.0 @config["repetition_penalty"] = kwargs["repetition_penalty"] || 1.0 @config["encoder_repetition_penalty"] = kwargs["encoder_repetition_penalty"] || 1.0 @config["length_penalty"] = kwargs["length_penalty"] || 1.0 @config["no_repeat_ngram_size"] = kwargs["no_repeat_ngram_size"] || 0 @config["bad_words_ids"] = kwargs["bad_words_ids"] @config["force_words_ids"] = kwargs["force_words_ids"] @config["renormalize_logits"] = kwargs["renormalize_logits"] || false @config["constraints"] = kwargs["constraints"] @config["forced_bos_token_id"] = kwargs["forced_bos_token_id"] @config["forced_eos_token_id"] = kwargs["forced_eos_token_id"] @config["remove_invalid_values"] = kwargs["remove_invalid_values"] || false @config["exponential_decay_length_penalty"] = kwargs["exponential_decay_length_penalty"] @config["suppress_tokens"] = kwargs["suppress_tokens"] @config["begin_suppress_tokens"] = kwargs["begin_suppress_tokens"] @config["forced_decoder_ids"] = kwargs["forced_decoder_ids"] # Parameters that define the output variables of `generate` @config["num_return_sequences"] = kwargs["num_return_sequences"] || 1 @config["output_attentions"] = kwargs["output_attentions"] || false @config["output_hidden_states"] = kwargs["output_hidden_states"] || false @config["output_scores"] = kwargs["output_scores"] || false @config["return_dict_in_generate"] = kwargs["return_dict_in_generate"] || false # Special tokens that can be used at generation time @config["pad_token_id"] = kwargs["pad_token_id"] @config["bos_token_id"] = kwargs["bos_token_id"] 
@config["eos_token_id"] = kwargs["eos_token_id"] # Generation parameters exclusive to encoder-decoder models @config["encoder_no_repeat_ngram_size"] = kwargs["encoder_no_repeat_ngram_size"] || 0 @config["decoder_start_token_id"] = kwargs["decoder_start_token_id"] # Wild card @generation_kwargs = kwargs["generation_kwargs"] || {} end def [](key) @config[key.to_s] end def merge!(config) @config.merge!(config) end end class Sampler def initialize(generation_config) super() @generation_config = generation_config end def call(logits, index = -1) # Sample from logits, of dims [batch, sequence_length, vocab_size]. # If index is specified, sample from [batch, index, vocab_size]. sample(logits, index) end def get_logits(logits, index) vocab_size = Utils.dims(logits)[-1] logs = logits.flatten if index == -1 logs = logs.last(vocab_size) else raise Todo end # add temperature if @generation_config["temperature"] > 0 logs = logs.map { |x| x / @generation_config["temperature"] } end logs end def self.get_sampler(generation_config) if generation_config[:do_sample] MultinomialSampler.new(generation_config) elsif generation_config[:num_beams] > 1 BeamSearchSampler.new(generation_config) else if generation_config[:num_return_sequences] > 1 raise Error, "num_return_sequences has to be 1 when doing greedy search, but is #{generation_config[:num_return_sequences]}." 
end GreedySampler.new(generation_config) end end end class GreedySampler < Sampler def sample(logits, index = -1) # NOTE: no need to do log_softmax here since we only take the maximum logs = get_logits(logits, index) argmax = Utils.max(logs)[1] # Note: score is meaningless in this context, since we are performing # greedy search (p = 1 => log(p) = 0) [ [argmax, 0] ] end end class BeamSearchSampler < Sampler def sample(logits, index = -1) k = Utils.dims(logits)[-1] # defaults to vocab size if @generation_config["top_k"] > 0 k = [@generation_config["top_k"], k].min end # Get logits of nth token logs = get_logits(logits, index) # Get top k tokens top_logits = Utils.get_top_items(logs, k) # Compute softmax over logits probabilities = Utils.softmax(top_logits.map { |x| x[1] }) Array.new(@generation_config["num_beams"]) do |i| [ top_logits[i][0], Math.log(probabilities[i]) ] end end end class LogitsProcessorList def initialize super @processors = [] end def push(item) @processors << item end def concat(items) @processors.concat(items) end def call(input_ids, batched_logits) # NOTE: This is different from the Python code, since vanilla Ruby does not support vectorized operations. # As a result, we apply each processor to each item in the batch. 
batched_logits.each do |logits| # Modifies logits inplace @processors.each do |func| func.(input_ids, logits) end end end def to_ary @processors end end class LogitsProcessor end class NoRepeatNGramLogitsProcessor < LogitsProcessor def initialize(no_repeat_ngram_size) super() @no_repeat_ngram_size = no_repeat_ngram_size end def get_ngrams(prev_input_ids) cur_len = prev_input_ids.length ngrams = [] j = 0 while j < cur_len + 1 - @no_repeat_ngram_size ngram = [] @no_repeat_ngram_size.times do |k| ngram << prev_input_ids[j + k] end ngrams << ngram j += 1 end generated_ngram = {} ngrams.each do |ngram| prev_ngram = ngram.slice(0, ngram.length - 1) prev_ngram_key = JSON.generate(prev_ngram) prev_ngram_value = generated_ngram[prev_ngram_key] || [] prev_ngram_value << ngram[ngram.length - 1] generated_ngram[prev_ngram_key] = prev_ngram_value end generated_ngram end def get_generated_ngrams(banned_ngrams, prev_input_ids) ngram_idx = prev_input_ids.slice(prev_input_ids.length + 1 - @no_repeat_ngram_size, prev_input_ids.length) banned = banned_ngrams[JSON.generate(ngram_idx)] || [] banned end def calc_banned_ngram_tokens(prev_input_ids) banned_tokens = [] if prev_input_ids.length + 1 < @no_repeat_ngram_size # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet banned_tokens else generated_ngrams = get_ngrams(prev_input_ids) banned_tokens = get_generated_ngrams(generated_ngrams, prev_input_ids) banned_tokens end end def call(input_ids, logits) banned_tokens = calc_banned_ngram_tokens(input_ids) banned_tokens.each do |token| logits[token] = -Float::INFINITY end logits end end class MinLengthLogitsProcessor < LogitsProcessor def initialize(min_length, eos_token_id) super() @min_length = min_length @eos_token_id = eos_token_id.is_a?(Array) ? 
eos_token_id : [eos_token_id] end def call(input_ids, logits) if input_ids.length < @min_length @eos_token_id.each do |eos_token| logits[eos_token] = -Float::INFINITY end end logits end end class ForcedBOSTokenLogitsProcessor < LogitsProcessor def initialize(bos_token_id) super() @bos_token_id = bos_token_id end def call(input_ids, logits) if input_ids.length == 1 logits.map! { -Float::INFINITY } logits[@bos_token_id] = 0 end logits end end class ForcedEOSTokenLogitsProcessor < LogitsProcessor def initialize(max_length, forced_eos_token_id) super() @max_length = max_length @forced_eos_token_id = forced_eos_token_id end def call(input_ids, logits) end end end end ================================================ FILE: lib/informers/utils/hub.rb ================================================ module Informers module Utils module Hub class FileResponse attr_reader :exists, :status def initialize(file_path) @file_path = file_path @exists = File.exist?(file_path) if @exists @status = ["200", "OK"] else @status = ["404", "Not Found"] end end def read File.binread(@file_path) end end def self.is_valid_url(string, protocols = nil, valid_hosts = nil) begin url = URI.parse(string) rescue return false end if protocols && !protocols.include?(url.scheme) return false end if valid_hosts && !valid_hosts.include?(url.host) return false end true end def self.get_file(url_or_path, progress_callback = nil, progress_info = {}) if !is_valid_url(url_or_path, ["http", "https"]) raise Error, "Invalid url" else headers = {} headers["User-Agent"] = "informers/#{VERSION};" # Check whether we are making a request to the Hugging Face Hub. is_hfurl = is_valid_url(url_or_path, ["http", "https"], ["huggingface.co", "hf.co"]) if is_hfurl # If an access token is present in the environment variables, # we add it to the request headers. 
token = ENV["HF_TOKEN"] if token headers["Authorization"] = "Bearer #{token}" end end options = {} if progress_callback total_size = nil options[:content_length_proc] = lambda do |size| total_size = size Utils.dispatch_callback(progress_callback, {status: "download"}.merge(progress_info).merge(total_size: size)) end options[:progress_proc] = lambda do |size| Utils.dispatch_callback(progress_callback, {status: "progress"}.merge(progress_info).merge(size: size, total_size: total_size)) end end URI.parse(url_or_path).open(**headers, **options) end end class FileCache attr_reader :path def initialize(path) @path = path end def match(request) file_path = resolve_path(request) file = FileResponse.new(file_path) file if file.exists end def put(request, response) output_path = resolve_path(request) begin tmp_path = "#{output_path}.incomplete" FileUtils.mkdir_p(File.dirname(output_path)) File.open(tmp_path, "wb") do |f| while !response.eof? f.write(response.read(1024 * 1024)) end end FileUtils.move(tmp_path, output_path) rescue => e warn "An error occurred while writing the file to cache: #{e}" end end def resolve_path(request) File.join(@path, request) end end def self.try_cache(cache, *names) names.each do |name| begin result = cache.match(name) return result if result rescue next end end nil end def self.get_model_file(path_or_repo_id, filename, fatal = true, **options) # Initiate file retrieval Utils.dispatch_callback(options[:progress_callback], { status: "initiate", name: path_or_repo_id, file: filename }) # If `cache_dir` is not specified, use the default cache directory cache = FileCache.new(options[:cache_dir] || Informers.cache_dir) revision = options[:revision] || "main" request_url = path_join(path_or_repo_id, filename) remote_url = path_join( Informers.remote_host, Informers.remote_path_template .gsub("{model}", path_or_repo_id) .gsub("{revision}", URI.encode_www_form_component(revision)), filename ) # Choose cache key for filesystem cache # When using the main 
revision (default), we use the request URL as the cache key. # If a specific revision is requested, we account for this in the cache key. fs_cache_key = revision == "main" ? request_url : path_join(path_or_repo_id, revision, filename) proposed_cache_key = fs_cache_key resolved_path = cache.resolve_path(proposed_cache_key) # Whether to cache the final response in the end. to_cache_response = false # A caching system is available, so we try to get the file from it. response = try_cache(cache, proposed_cache_key) cache_hit = !response.nil? if response.nil? # File is not cached, so we perform the request if response.nil? || response.status[0] == "404" # File not found locally. This means either: # - The user has disabled local file access (`Informers.allow_local_models = false`) # - the path is a valid HTTP url (`response.nil?`) # - the path is not a valid HTTP url and the file is not present on the file system or local server (`response.status[0] == "404"`) if options[:local_files_only] || !Informers.allow_remote_models # User requested local files only, but the file is not found locally. if fatal raise Error, "`local_files_only: true` or `Informers.allow_remote_models = false` and file was not found locally at #{resolved_path.inspect}." else # File not found, but this file is optional. # TODO in future, cache the response? return nil end end progress_info = { name: path_or_repo_id, file: filename } # File not found locally, so we try to download it from the remote server response = get_file(remote_url, options[:progress_callback], progress_info) if response.status[0] != "200" # should not happen raise Todo end # Success! We use the proposed cache key from earlier cache_key = proposed_cache_key end to_cache_response = cache && !response.is_a?(FileResponse) && response.status[0] == "200" end if to_cache_response && cache_key && cache.match(cache_key).nil? 
cache.put(cache_key, response)
        end

        Utils.dispatch_callback(options[:progress_callback], {
          status: "done",
          name: path_or_repo_id,
          file: filename,
          cache_hit: cache_hit
        })

        resolved_path
      end

      def self.get_model_json(model_path, file_name, fatal = true, **options)
        buffer = get_model_file(model_path, file_name, fatal, **options)
        if buffer.nil?
          # Return empty object
          return {}
        end

        JSON.load_file(buffer)
      end

      def self.path_join(*parts)
        parts = parts.map.with_index do |part, index|
          if index != 0
            part = part.delete_prefix("/")
          end
          if index != parts.length - 1
            part = part.delete_suffix("/")
          end
          part
        end
        parts.join("/")
      end

      def self.display_progress(filename, width, size, expected_size)
        bar_width = [width - (filename.length + 3), 1].max
        progress = expected_size && expected_size > 0 ? size / expected_size.to_f : 0
        done = (progress * bar_width).round
        not_done = bar_width - done
        "#{filename} |#{"█" * done}#{" " * not_done}|"
      end
    end
  end
end

================================================
FILE: lib/informers/utils/image.rb
================================================
module Informers
  module Utils
    class RawImage
      RESAMPLING_MAPPING = {
        0 => "nearest",
        1 => "lanczos",
        2 => "bilinear",
        3 => "bicubic",
        4 => "box",
        5 => "hamming"
      }

      attr_reader :image, :width, :height, :channels

      def initialize(image)
        @image = image
        @width = image.width
        @height = image.height
        @channels = image.bands
      end

      def data
        @image.write_to_memory.unpack("C*")
      end

      def size
        [@width, @height]
      end

      def resize(width, height, resample: 2)
        resample_method = RESAMPLING_MAPPING[resample] || resample

        case resample_method
        when "bilinear", "bicubic"
          img = @image.affine(
            [width / @width.to_f, 0, 0, height / @height.to_f],
            interpolate: Vips::Interpolate.new(resample_method.to_sym)
          )
        else
          raise Todo
        end

        RawImage.new(img)
      end

      def center_crop(crop_width, crop_height)
        # If the image is already the desired size, return it
        if @width == crop_width && @height == crop_height
          return self
        end

        # Determine bounds of the image in the new
canvas width_offset = (@width - crop_width) / 2.0 height_offset = (@height - crop_height) / 2.0 if width_offset >= 0 && height_offset >= 0 # Cropped image lies entirely within the original image img = @image.crop( width_offset.floor, height_offset.floor, crop_width, crop_height ) elsif width_offset <= 0 && height_offset <= 0 raise Todo else raise Todo end RawImage.new(img) end def rgb if @channels == 3 return self end raise Todo end def save(path) @image.write_to_file(path) end def self.read(input) if input.is_a?(RawImage) input elsif input.is_a?(URI) require "open-uri" RawImage.new(Vips::Image.new_from_buffer(input.read, "")) elsif input.is_a?(String) RawImage.new(Vips::Image.new_from_file(input)) else raise ArgumentError, "Unsupported input type: #{input.class.name}" end end def self.from_array(input) c, h, w = Utils.dims(input) pixel_data = Array.new(w * h * c) input.each_with_index do |cv, ci| cv.each_with_index do |hv, hi| hv.each_with_index do |v, wi| pixel_data[(hi * w * c) + (wi * c) + ci] = v end end end RawImage.new(Vips::Image.new_from_memory_copy(pixel_data.pack("C*"), w, h, c, :uchar)) end end end end ================================================ FILE: lib/informers/utils/math.rb ================================================ module Informers module Utils def self.interpolate_data(input, in_shape, out_shape, mode = "bilinear", align_corners = false) in_channels, in_height, in_width = in_shape out_height, out_width = out_shape # TODO use mode and align_corners # Output image dimensions x_scale = out_width / in_width.to_f y_scale = out_height / in_height.to_f # Output image out_img = Array.new(out_height * out_width * in_channels) # Pre-calculate strides in_stride = in_height * in_width out_stride = out_height * out_width out_height.times do |i| out_width.times do |j| # Calculate output offset out_offset = i * out_width + j # Calculate input pixel coordinates x = (j + 0.5) / x_scale - 0.5 y = (i + 0.5) / y_scale - 0.5 # Calculate the four nearest 
input pixels # We also check if the input pixel coordinates are within the image bounds x1 = x.floor y1 = y.floor x2 = [x1 + 1, in_width - 1].min y2 = [y1 + 1, in_height - 1].min x1 = [x1, 0].max y1 = [y1, 0].max # Calculate the fractional distances between the input pixel and the four nearest pixels s = x - x1 t = y - y1 # Perform bilinear interpolation w1 = (1 - s) * (1 - t) w2 = s * (1 - t) w3 = (1 - s) * t w4 = s * t # Calculate the four nearest input pixel indices y_stride = y1 * in_width x_stride = y2 * in_width idx1 = y_stride + x1 idx2 = y_stride + x2 idx3 = x_stride + x1 idx4 = x_stride + x2 in_channels.times do |k| # Calculate channel offset c_offset = k * in_stride out_img[k * out_stride + out_offset] = w1 * input[c_offset + idx1] + w2 * input[c_offset + idx2] + w3 * input[c_offset + idx3] + w4 * input[c_offset + idx4] end end end out_img end def self.softmax(arr) # Compute the maximum value in the array max_val = arr.max # Compute the exponentials of the array values exps = arr.map { |x| Math.exp(x - max_val) } # Compute the sum of the exponentials sum_exps = exps.sum # Compute the softmax values softmax_arr = exps.map { |x| x / sum_exps } softmax_arr end def self.sigmoid(arr) if arr[0].is_a?(Array) return arr.map { |a| sigmoid(a) } end arr.map { |v| 1 / (1 + Math.exp(-v)) } end def self.get_top_items(items, top_k = 0) # if top == 0, return all items = items .map.with_index { |x, i| [i, x] } # Get indices ([index, score]) .sort_by { |v| -v[1] } # Sort by log probabilities if !top_k.nil? 
&& top_k > 0 items = items.slice(0, top_k) # Get top k items end items end def self.max(arr) if arr.length == 0 raise Error, "Array must not be empty" end arr.map.with_index.max_by { |v, _| v } end end end ================================================ FILE: lib/informers/utils/tensor.rb ================================================ module Informers module Utils def self.mean_pooling(last_hidden_state, attention_mask) last_hidden_state.zip(attention_mask).map do |state, mask| state[0].size.times.map do |k| sum = 0.0 count = 0 state.zip(mask) do |s, m| count += m sum += s[k] * m end sum / count end end end def self.normalize(result) result.map do |row| norm = Math.sqrt(row.sum { |v| v * v }) row.map { |v| v / norm } end end def self.stack(tensors, dim = 0) tensors end def self.ones_like(tensor) if tensor[0].is_a?(Array) return tensor.map { |v| ones_like(v) } end tensor.map { |_| 1 } end def self.dims(tensor) dims = [] while tensor.is_a?(Array) dims << tensor.size tensor = tensor[0] end dims end def self.interpolate(input, shape, mode = "bilinear", align_corners = false) out_height, out_width = shape # Input image dimensions in_channels = dims(input)[-3] || 1 in_height = dims(input)[-2] in_width = dims(input)[-1] output = interpolate_data( input.flatten, [in_channels, in_height, in_width], [out_height, out_width], mode, align_corners ) reshape(output, [in_channels, out_height, out_width]) end def self.reshape(arr, dims) arr = arr.flatten dims[1..-1].reverse_each do |dim| arr = arr.each_slice(dim) end arr.to_a end end end ================================================ FILE: lib/informers/version.rb ================================================ module Informers VERSION = "1.2.1" end ================================================ FILE: lib/informers.rb ================================================ # dependencies require "onnxruntime" require "tokenizers" # stdlib require "io/console" require "json" require "open-uri" require "open3" require "stringio" 
require "uri" # modules require_relative "informers/backends/onnx" require_relative "informers/utils/audio" require_relative "informers/utils/core" require_relative "informers/utils/dtypes" require_relative "informers/utils/generation" require_relative "informers/utils/ffmpeg" require_relative "informers/utils/hub" require_relative "informers/utils/image" require_relative "informers/utils/math" require_relative "informers/utils/tensor" require_relative "informers/configs" require_relative "informers/env" require_relative "informers/model" require_relative "informers/models" require_relative "informers/processors" require_relative "informers/tokenizers" require_relative "informers/version" require_relative "informers/pipelines" module Informers class Error < StandardError; end class Todo < Error def message "not implemented yet" end end end ================================================ FILE: test/model_test.rb ================================================ require_relative "test_helper" class ModelTest < Minitest::Test # https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2 def test_all_minilm sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "sentence-transformers/all-MiniLM-L6-v2") embeddings = model.(sentences) assert_elements_in_delta [0.067657, 0.063496, 0.048713], embeddings[0][..2] assert_elements_in_delta [0.086439, 0.10276, 0.0053946], embeddings[1][..2] end # https://huggingface.co/Xenova/all-MiniLM-L6-v2 def test_all_minilm_xenova sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", dtype: "q8") embeddings = model.(sentences) assert_elements_in_delta [0.045927, 0.07328, 0.05401], embeddings[0][..2] assert_elements_in_delta [0.081881, 0.1076, -0.01324], embeddings[1][..2] end # https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1 def test_multi_qa_minilm query = "How many 
people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("embedding", "sentence-transformers/multi-qa-MiniLM-L6-cos-v1") query_embedding = model.(query) doc_embeddings = model.(docs) scores = doc_embeddings.map { |e| e.zip(query_embedding).sum { |d, q| d * q } } doc_score_pairs = docs.zip(scores).sort_by { |d, s| -s } assert_equal "Around 9 Million people live in London", doc_score_pairs[0][0] assert_in_delta 0.9156, doc_score_pairs[0][1] assert_equal "London is known for its financial district", doc_score_pairs[1][0] assert_in_delta 0.4948, doc_score_pairs[1][1] end # https://huggingface.co/sentence-transformers/paraphrase-MiniLM-L6-v2 def test_paraphrase_minilm sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "sentence-transformers/paraphrase-MiniLM-L6-v2") embeddings = model.(sentences, normalize: false) assert_elements_in_delta [0.067359, 0.783935, 0.270018], embeddings[0][..2] assert_elements_in_delta [0.122117, 0.670228, 0.317166], embeddings[1][..2] end # https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1 def test_mxbai_embed query_prefix = "Represent this sentence for searching relevant passages: " input = [ "The dog is barking", "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "mixedbread-ai/mxbai-embed-large-v1") embeddings = model.(input, pooling: "cls", normalize: false) assert_elements_in_delta [-0.61227727, 1.4060247, -0.04079155], embeddings[1][..2] assert_elements_in_delta [-0.00624076, 0.12864432, 0.5248165], embeddings[-1][..2] end # https://huggingface.co/Supabase/gte-small def test_gte_small sentences = ["That is a happy person", "That is a very happy person"] model = Informers.pipeline("embedding", "Supabase/gte-small") embeddings = model.(sentences) assert_elements_in_delta [-0.05316979, 0.01044252, 0.06194701], embeddings[0][..2] 
assert_elements_in_delta [-0.05246907, 0.03752426, 0.07344585], embeddings[-1][..2] end # https://huggingface.co/intfloat/e5-base-v2 def test_e5_base doc_prefix = "passage: " query_prefix = "query: " input = [ doc_prefix + "Ruby is a programming language created by Matz", query_prefix + "Ruby creator" ] model = Informers.pipeline("embedding", "intfloat/e5-base-v2") embeddings = model.(input) assert_elements_in_delta [-0.00596662, -0.03730119, -0.0703470], embeddings[0][..2] assert_elements_in_delta [0.00298353, -0.04421991, -0.0591884], embeddings[-1][..2] end # https://huggingface.co/nomic-ai/nomic-embed-text-v1 def test_nomic_embed doc_prefix = "search_document: " query_prefix = "search_query: " input = [ doc_prefix + "The dog is barking", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "nomic-ai/nomic-embed-text-v1") embeddings = model.(input) assert_elements_in_delta [-0.00645858, 0.01145126, 0.0099767], embeddings[0][..2] assert_elements_in_delta [-0.01173127, 0.04957652, -0.0176401], embeddings[-1][..2] end # https://huggingface.co/BAAI/bge-base-en-v1.5 def test_bge_base query_prefix = "Represent this sentence for searching relevant passages: " input = [ "The dog is barking", "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "BAAI/bge-base-en-v1.5") embeddings = model.(input) assert_elements_in_delta [-0.07482512, -0.0770234, 0.03398684], embeddings[1][..2] assert_elements_in_delta [0.00029264, -0.0619305, -0.06199387], embeddings[-1][..2] end # https://huggingface.co/jinaai/jina-embeddings-v2-base-en def test_jina_embeddings sentences = ["How is the weather today?", "What is the current weather like today?"] model = Informers.pipeline("embedding", "jinaai/jina-embeddings-v2-base-en", model_file_name: "../model") embeddings = model.(sentences) assert_elements_in_delta [-0.02488641, -0.0429398, 0.04303398], embeddings[0][..2] assert_elements_in_delta [-0.0081194, -0.06225249, 0.03116853], embeddings[1][..2] 
end # https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5 def test_snowflake_arctic_embed query_prefix = "Represent this sentence for searching relevant passages: " input = [ "The dog is barking", "The cat is purring", query_prefix + "puppy" ] model = Informers.pipeline("embedding", "Snowflake/snowflake-arctic-embed-m-v1.5") embeddings = model.(input, model_output: "sentence_embedding", pooling: "none") assert_elements_in_delta [0.03239886, 0.0009998, 0.08401278], embeddings[0][..2] assert_elements_in_delta [-0.02530634, -0.02715422, 0.01218867], embeddings[-1][..2] embeddings = model.(input, model_output: "token_embeddings", pooling: "cls") assert_elements_in_delta [0.03239886, 0.0009998, 0.08401278], embeddings[0][..2] assert_elements_in_delta [-0.02530634, -0.02715422, 0.01218867], embeddings[-1][..2] end # https://huggingface.co/sentence-transformers/all-mpnet-base-v2 def test_all_mpnet sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "sentence-transformers/all-mpnet-base-v2") embeddings = model.(sentences) assert_elements_in_delta [0.02250263, -0.07829167, -0.02303071], embeddings[0][..2] assert_elements_in_delta [0.04170236, 0.00109747, -0.01553415], embeddings[1][..2] end # https://huggingface.co/BAAI/bge-m3 def test_bge_m3 sentences = ["This is an example sentence", "Each sentence is converted"] model = Informers.pipeline("embedding", "BAAI/bge-m3") model.(sentences, model_output: "token_embeddings") end # https://huggingface.co/mixedbread-ai/mxbai-rerank-base-v1 def test_mxbai_rerank query = "How many people live in London?" 
docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "mixedbread-ai/mxbai-rerank-base-v1") result = model.(query, docs, return_documents: true) assert_equal 0, result[0][:doc_id] assert_in_delta 0.984, result[0][:score] assert_equal docs[0], result[0][:text] assert_equal 1, result[1][:doc_id] assert_in_delta 0.139, result[1][:score] assert_equal docs[1], result[1][:text] end # https://huggingface.co/jinaai/jina-reranker-v1-turbo-en def test_jina_reranker query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "jinaai/jina-reranker-v1-turbo-en") result = model.(query, docs, return_documents: true) assert_equal 0, result[0][:doc_id] assert_in_delta 0.912, result[0][:score] assert_equal docs[0], result[0][:text] assert_equal 1, result[1][:doc_id] assert_in_delta 0.0555, result[1][:score] assert_equal docs[1], result[1][:text] end # https://huggingface.co/BAAI/bge-reranker-base def test_bge_reranker query = "How many people live in London?" docs = ["Around 9 Million people live in London", "London is known for its financial district"] model = Informers.pipeline("reranking", "BAAI/bge-reranker-base") result = model.(query, docs, return_documents: true) assert_equal 0, result[0][:doc_id] assert_in_delta 0.996, result[0][:score] assert_equal docs[0], result[0][:text] assert_equal 1, result[1][:doc_id] assert_in_delta 0.000158, result[1][:score], 0.000001 assert_equal docs[1], result[1][:text] end # https://huggingface.co/Xenova/ms-marco-MiniLM-L-6-v2 def test_ms_marco_minilm query = "How many people live in London?" 
    docs = ["Around 9 Million people live in London", "London is known for its financial district"]
    model = Informers.pipeline("reranking", "Xenova/ms-marco-MiniLM-L-6-v2")
    result = model.(query, docs, return_documents: true)
    assert_equal 0, result[0][:doc_id]
    assert_in_delta 1, result[0][:score]
    assert_equal docs[0], result[0][:text]
    assert_equal 1, result[1][:doc_id]
    assert_in_delta 0.0067, result[1][:score]
    assert_equal docs[1], result[1][:text]
  end
end

================================================
FILE: test/pipeline_test.rb
================================================
require_relative "test_helper"

class PipelineTest < Minitest::Test
  def test_ner
    ner = Informers.pipeline("ner")
    result = ner.("Ruby is a programming language created by Matz")
    assert_equal 1, result.size
    assert_equal "PER", result[0][:entity_group]
    assert_in_delta 0.994, result[0][:score]
    assert_equal "Matz", result[0][:word]
    assert_equal 42, result[0][:start]
    assert_equal 46, result[0][:end]
  end

  def test_ner_aggregation_strategy
    ner = Informers.pipeline("ner")
    result = ner.("Ruby is a programming language created by Matz", aggregation_strategy: "none")
    assert_equal 2, result.size
    assert_equal "B-PER", result[0][:entity]
    assert_in_delta 0.996, result[0][:score]
    assert_equal 8, result[0][:index]
    assert_equal "Mat", result[0][:word]
    assert_equal 42, result[0][:start]
    assert_equal 45, result[0][:end]
  end

  def test_sentiment_analysis
    classifier = Informers.pipeline("sentiment-analysis")
    result = classifier.("I love transformers!")
    assert_equal "POSITIVE", result[:label]
    assert_in_delta 0.9997887, result[:score], 0.0000001

    result = classifier.("This is super cool")
    assert_equal "POSITIVE", result[:label]
    assert_in_delta 0.9998608, result[:score], 0.0000001

    result = classifier.(["This is super cool", "I didn't like it"])
    assert_equal "POSITIVE", result[0][:label]
    assert_in_delta 0.9998600, result[0][:score], 0.0000001
    assert_equal "NEGATIVE", result[1][:label]
    assert_in_delta 0.9985375, result[1][:score], 0.0000001
  end

  def test_question_answering
    qa = Informers.pipeline("question-answering")
    result = qa.("Who invented Ruby?", "Ruby is a programming language created by Matz")
    assert_in_delta 0.998, result[:score]
    assert_equal "Matz", result[:answer]
    assert_equal 42, result[:start]
    assert_equal 46, result[:end]
  end

  def test_zero_shot_classification
    classifier = Informers.pipeline("zero-shot-classification")
    text = "Last week I upgraded my iOS version and ever since then my phone has been overheating whenever I use your app."
    labels = ["mobile", "billing", "website", "account access"]
    result = classifier.(text, labels)
    assert_equal text, result[:sequence]
    assert_equal ["mobile", "billing", "account access", "website"], result[:labels]
    assert_elements_in_delta [0.633, 0.134, 0.121, 0.111], result[:scores]
  end

  def test_text2text_generation
    text2text = Informers.pipeline("text2text-generation")
    result = text2text.("translate from English to French: I'm very happy")
    assert_equal "Je suis très heureux.", result[0][:generated_text]
  end

  def test_translation
    translator = Informers.pipeline("translation", "Xenova/nllb-200-distilled-600M")
    result = translator.("जीवन एक चॉकलेट बॉक्स की तरह है।", src_lang: "hin_Deva", tgt_lang: "fra_Latn")
    assert_equal "La vie est comme une boîte à chocolat.", result[0][:translation_text]
  end

  def test_text_generation
    generator = Informers.pipeline("text-generation")
    result = generator.("I enjoy walking with my cute dog,")
    assert_equal "I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to", result[0][:generated_text]
  end

  def test_summarization
    skip "TODO"
    summarizer = Informers.pipeline("summarization")
    result = summarizer.("Ruby is awesome.")
    assert_equal "Ruby is awesome. Ruby is awesome. Ruby is great. Ruby's website is great. Ruby's site is great for the first time. Ruby will be great for all the people who want to know more about the site. Click here for more information. Click HERE for", result[0][:summary_text]
  end

  def test_fill_mask
    unmasker = Informers.pipeline("fill-mask")
    result = unmasker.("Paris is the [MASK] of France.")
    assert_equal 5, result.size
    assert_in_delta 0.997, result[0][:score]
    assert_equal 3007, result[0][:token]
    assert_equal "capital", result[0][:token_str]
    assert_equal "paris is the capital of france.", result[0][:sequence]
  end

  def test_fill_mask_no_mask_token
    unmasker = Informers.pipeline("fill-mask")
    error = assert_raises(ArgumentError) do
      unmasker.("Paris is the of France.")
    end
    assert_equal "Mask token ([MASK]) not found in text.", error.message
  end

  def test_feature_extraction
    sentences = ["This is an example sentence", "Each sentence is converted"]
    extractor = Informers.pipeline("feature-extraction")
    output = extractor.(sentences)
    assert_in_delta (-0.0145), output[0][0][0]
    assert_in_delta (-0.3130), output[-1][-1][-1]
  end

  def test_embedding
    sentences = ["This is an example sentence", "Each sentence is converted"]
    embed = Informers.pipeline("embedding")
    embeddings = embed.(sentences)
    assert_elements_in_delta [0.067657, 0.063496, 0.048713], embeddings[0][..2]
    assert_elements_in_delta [0.086439, 0.10276, 0.0053946], embeddings[1][..2]
  end

  def test_reranking
    query = "How many people live in London?"
    docs = ["Around 9 Million people live in London", "London is known for its financial district"]
    rerank = Informers.pipeline("reranking")
    result = rerank.(query, docs)
    assert_equal 2, result.size
    assert_equal 0, result[0][:doc_id]
    assert_in_delta 0.984, result[0][:score]
    assert_equal 1, result[1][:doc_id]
    assert_in_delta 0.139, result[1][:score]
  end

  def test_image_classification
    classifier = Informers.pipeline("image-classification")
    result = classifier.("test/support/pipeline-cat-chonk.jpeg", top_k: 2)
    assert_equal "lynx, catamount", result[0][:label]
    assert_in_delta 0.428, result[0][:score], 0.01
    assert_equal "cougar, puma, catamount, mountain lion, painter, panther, Felis concolor", result[1][:label]
    assert_in_delta 0.047, result[1][:score], 0.01
  end

  def test_zero_shot_image_classification
    classifier = Informers.pipeline("zero-shot-image-classification")
    result = classifier.("test/support/pipeline-cat-chonk.jpeg", ["dog", "cat", "tiger"])
    assert_equal 3, result.size
    assert_equal "cat", result[0][:label]
    assert_in_delta 0.756, result[0][:score]
    assert_equal "tiger", result[1][:label]
    assert_in_delta 0.189, result[1][:score]
    assert_equal "dog", result[2][:label]
    assert_in_delta 0.055, result[2][:score]
  end

  def test_object_detection
    detector = Informers.pipeline("object-detection")
    result = detector.("test/support/pipeline-cat-chonk.jpeg")
    assert_equal 3, result.size
    assert_equal "cat", result[0][:label]
    assert_in_delta 0.992, result[0][:score]
    assert_equal 177, result[0][:box][:xmin]
    assert_equal 153, result[0][:box][:ymin]
    assert_equal 959, result[0][:box][:xmax]
    assert_equal 600, result[0][:box][:ymax]
    assert_equal "bicycle", result[2][:label]
    assert_in_delta 0.726, result[2][:score]
    assert_equal 0, result[2][:box][:xmin]
    assert_equal 0, result[2][:box][:ymin]
    assert_equal 196, result[2][:box][:xmax]
    assert_equal 413, result[2][:box][:ymax]
  end

  def test_zero_shot_object_detection
    detector = Informers.pipeline("zero-shot-object-detection")
    result = detector.("test/support/zero-sh-obj-detection_1.png", ["human face", "rocket", "helmet", "american flag"])
    assert_equal 4, result.size
    assert_equal "human face", result[0][:label]
    assert_in_delta 0.351, result[0][:score]
    assert_equal 179, result[0][:box][:xmin]
    assert_equal 72, result[0][:box][:ymin]
    assert_equal 270, result[0][:box][:xmax]
    assert_equal 178, result[0][:box][:ymax]
    assert_equal "rocket", result[1][:label]
    assert_in_delta 0.211, result[1][:score]
    assert_equal 351, result[1][:box][:xmin]
    assert_equal 6, result[1][:box][:ymin]
    assert_equal 468, result[1][:box][:xmax]
    assert_equal 289, result[1][:box][:ymax]
  end

  def test_depth_estimation
    estimator = Informers.pipeline("depth-estimation")
    result = estimator.("test/support/pipeline-cat-chonk.jpeg")
    assert_in_delta 1.078, result[:predicted_depth][0][0]
    assert_kind_of Vips::Image, result[:depth]
    # result[:depth].write_to_file("/tmp/depth-estimation.jpg")
  end

  def test_image_to_text
    captioner = Informers.pipeline("image-to-text")
    result = captioner.("test/support/pipeline-cat-chonk.jpeg")
    assert_equal "a cat is standing in the snow", result[0][:generated_text]
  end

  def test_image_to_image
    skip "Expensive"
    upscaler = Informers.pipeline("image-to-image")
    result = upscaler.("test/support/pipeline-cat-chonk.jpeg")
    assert_kind_of Vips::Image, result
    result.write_to_file("/tmp/image-to-image.jpg")
  end

  def test_image_segmentation
    segmenter = Informers.pipeline("image-segmentation")
    result = segmenter.("test/support/pipeline-cat-chonk.jpeg")
    assert_equal 3, result.size
    assert_equal "snow", result[0][:label]
    assert_in_delta 0.997, result[0][:score]
    assert_equal "LABEL_184", result[1][:label]
    assert_in_delta 0.993, result[1][:score]
    assert_equal "cat", result[2][:label]
    assert_in_delta 0.998, result[2][:score]
  end

  def test_image_feature_extraction
    fe = Informers.pipeline("image-feature-extraction")
    result = fe.("test/support/pipeline-cat-chonk.jpeg")
    assert_in_delta 0.877, result[0][0], 0.01
  end

  def test_progress_callback
    msgs = []
    extractor = Informers.pipeline("feature-extraction", progress_callback: ->(msg) { msgs << msg })
    extractor.("I love transformers!")
    expected_msgs = [
      {status: "initiate", name: "Xenova/all-MiniLM-L6-v2", file: "tokenizer.json"},
      {status: "ready", task: "feature-extraction", model: "Xenova/all-MiniLM-L6-v2"}
    ]
    expected_msgs.each do |expected|
      assert_includes msgs, expected
    end
  end

  def test_device
    skip unless mac?
    sentences = ["This is an example sentence", "Each sentence is converted"]
    embed = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", device: "coreml")
    embeddings = embed.(sentences)
    assert_elements_in_delta [0.067657, 0.063496, 0.048713], embeddings[0][..2]
    assert_elements_in_delta [0.086439, 0.10276, 0.0053946], embeddings[1][..2]
  end

  def test_device_invalid
    error = assert_raises(ArgumentError) do
      Informers.pipeline("embedding", device: "bad")
    end
    assert_equal "Unsupported device: bad. Should be one of: cpu, cuda, coreml", error.message
  end

  def test_dtype
    sentences = ["This is an example sentence", "Each sentence is converted"]
    embed = Informers.pipeline("embedding", "Xenova/all-MiniLM-L6-v2", dtype: "fp16")
    embeddings = embed.(sentences)
    assert_elements_in_delta [0.067657, 0.063496, 0.048713], embeddings[0][..2]
    assert_elements_in_delta [0.086439, 0.10276, 0.0053946], embeddings[1][..2]
  end

  def test_dtype_invalid
    error = assert_raises(ArgumentError) do
      Informers.pipeline("embedding", dtype: "bad")
    end
    assert_equal "Invalid dtype: bad. Should be one of: fp32, fp16, int8, uint8, q8, q4, q4f16, bnb4", error.message
  end

  def test_session_options
    # TODO improve test
    Informers.pipeline("embedding", session_options: {log_severity_level: 2})
  end
end

================================================
FILE: test/test_helper.rb
================================================
require "bundler/setup"
Bundler.require(:default)
require "minitest/autorun"

class Minitest::Test
  def assert_elements_in_delta(expected, actual, delta = 0.001)
    assert_equal expected.size, actual.size
    expected.zip(actual) do |exp, act|
      assert_in_delta exp, act, delta
    end
  end

  def mac?
    RbConfig::CONFIG["host_os"] =~ /darwin/i
  end
end
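As a side note, the `assert_elements_in_delta` helper in test_helper.rb compares embedding vectors element-wise within a tolerance. A minimal standalone sketch of the same check, with no Minitest dependency (the `elements_in_delta?` name is hypothetical, not part of the repo):

```ruby
# Returns true when both arrays have the same length and every pair of
# corresponding elements differs by at most +delta+ in absolute value.
# This mirrors what assert_elements_in_delta verifies with Minitest assertions.
def elements_in_delta?(expected, actual, delta = 0.001)
  return false unless expected.size == actual.size
  expected.zip(actual).all? { |exp, act| (exp - act).abs <= delta }
end

p elements_in_delta?([0.067657, 0.063496], [0.0677, 0.0635]) # => true
p elements_in_delta?([0.067657, 0.063496], [0.07, 0.0635])   # => false (first diff > 0.001)
p elements_in_delta?([0.067657], [0.067657, 0.063496])       # => false (size mismatch)
```

Comparing with a delta rather than exact equality matters here because model outputs vary slightly across ONNX Runtime versions, devices, and dtypes (e.g. the `fp16` test expects the same values as `fp32` only within tolerance).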