Full Code of kyutai-labs/moshivis for AI

Repository: kyutai-labs/moshivis
Branch: main
Commit: 8624f4e01d4b
Files: 170
Total size: 751.6 KB

Directory structure:
gitextract_d_6tdl44/

├── .dockerignore
├── .gitattributes
├── .github/
│   ├── actions/
│   │   └── rust_build/
│   │       └── action.yml
│   ├── requirements_github_actions.txt
│   └── workflows/
│       ├── checks.yml
│       └── rust-ci.yml
├── .gitignore
├── CONTRIBUTING.md
├── ISSUE_TEMPLATE/
│   ├── bug.yml
│   └── question.yml
├── LICENSE-APACHE
├── LICENSE-MIT
├── LICENSE.md
├── PULL_REQUEST_TEMPLATE.md
├── README.md
├── client/
│   ├── .eslinrc.json
│   ├── .nvmrc
│   ├── .prettierignore
│   ├── .prettierrc.json
│   ├── Dockerfile
│   ├── LICENSE
│   ├── README.md
│   ├── index.html
│   ├── package.json
│   ├── postcss.config.js
│   ├── public/
│   │   └── assets/
│   │       ├── decoderWorker.min.wasm
│   │       └── images/
│   │           └── demo/
│   │               └── attribution.txt
│   ├── src/
│   │   ├── app.tsx
│   │   ├── audio-processor.ts
│   │   ├── components/
│   │   │   ├── Button/
│   │   │   │   └── Button.tsx
│   │   │   ├── ImageGallery/
│   │   │   │   └── ImageGallery.tsx
│   │   │   └── Input/
│   │   │       └── Input.tsx
│   │   ├── decoder/
│   │   │   └── decoderWorker.ts
│   │   ├── env.ts
│   │   ├── index.css
│   │   ├── modules.d.ts
│   │   ├── pages/
│   │   │   ├── Conversation/
│   │   │   │   ├── Conversation.tsx
│   │   │   │   ├── MediaContext.ts
│   │   │   │   ├── SocketContext.ts
│   │   │   │   ├── components/
│   │   │   │   │   ├── AudioVisualizer/
│   │   │   │   │   │   ├── AudioVisualizer.tsx
│   │   │   │   │   │   ├── ClientVisualizer.tsx
│   │   │   │   │   │   └── ServerVisualizer.tsx
│   │   │   │   │   ├── Controls/
│   │   │   │   │   │   └── Controls.tsx
│   │   │   │   │   ├── ModelParams/
│   │   │   │   │   │   └── ModelParams.tsx
│   │   │   │   │   ├── ServerAudio/
│   │   │   │   │   │   ├── ServerAudio.tsx
│   │   │   │   │   │   └── ServerAudioStats.tsx
│   │   │   │   │   ├── ServerInfo/
│   │   │   │   │   │   └── ServerInfo.tsx
│   │   │   │   │   ├── TextDisplay/
│   │   │   │   │   │   ├── TextDisplay.tsx
│   │   │   │   │   │   └── TextDisplayStats.tsx
│   │   │   │   │   └── UserAudio/
│   │   │   │   │       ├── UserAudio.tsx
│   │   │   │   │       └── UserAudioStats.tsx
│   │   │   │   ├── getMimeType.ts
│   │   │   │   └── hooks/
│   │   │   │       ├── audioUtils.ts
│   │   │   │       ├── useModelParams.ts
│   │   │   │       ├── useServerAudio.ts
│   │   │   │       ├── useServerInfo.ts
│   │   │   │       ├── useServerText.ts
│   │   │   │       ├── useSocket.ts
│   │   │   │       └── useUserAudio.ts
│   │   │   └── Queue/
│   │   │       ├── Queue.tsx
│   │   │       ├── api/
│   │   │       │   ├── client.ts
│   │   │       │   ├── errors/
│   │   │       │   │   ├── api_error.ts
│   │   │       │   │   └── response_error.ts
│   │   │       │   └── validators.ts
│   │   │       └── hooks/
│   │   │           └── useUserEmail.ts
│   │   └── protocol/
│   │       ├── encoder.ts
│   │       ├── testMessages.ts
│   │       └── types.ts
│   ├── tailwind.config.js
│   ├── tsconfig.json
│   └── vite.config.ts
├── docker-bake.hcl
├── kyuteye_mlx/
│   ├── .pylintrc
│   ├── LICENSE
│   ├── MANIFEST.in
│   ├── README.md
│   ├── kyuteye_mlx/
│   │   ├── __init__.py
│   │   ├── benchmark.py
│   │   ├── local_web.py
│   │   ├── mlx_vlm/
│   │   │   ├── LICENSE
│   │   │   ├── __init__.py
│   │   │   └── models/
│   │   │       ├── __init__.py
│   │   │       ├── pixtral/
│   │   │       │   ├── __init__.py
│   │   │       │   └── vision.py
│   │   │       └── siglip/
│   │   │           └── vision.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── generate.py
│   │   │   ├── lm.py
│   │   │   ├── pixtral.py
│   │   │   └── siglip.py
│   │   ├── modules/
│   │   │   ├── __init__.py
│   │   │   ├── config.py
│   │   │   ├── cross_attention.py
│   │   │   ├── kv_cache.py
│   │   │   └── transformer.py
│   │   ├── py.typed
│   │   ├── quantize.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── loading.py
│   │       ├── profiling.py
│   │       └── sampling.py
│   ├── pixtral-12b-8bit.config
│   ├── pyproject.toml
│   ├── siglip448.config
│   └── tests/
│       └── test_siglip.py
├── kyuteye_pt/
│   ├── .pylintrc
│   ├── LICENSE.md
│   ├── README.md
│   ├── configs/
│   │   └── moshika-vis.yaml
│   ├── kyuteye/
│   │   ├── __init__.py
│   │   ├── config/
│   │   │   ├── __init__.py
│   │   │   ├── enums.py
│   │   │   ├── kyuteye_config.py
│   │   │   └── subconfigs.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   ├── docker-bake.hcl
│   │   │   ├── helium.py
│   │   │   ├── hf_model_configs.py
│   │   │   ├── image_projection.py
│   │   │   ├── loaders.py
│   │   │   └── moshivis.py
│   │   ├── modules/
│   │   │   ├── __init__.py
│   │   │   ├── attention.py
│   │   │   ├── cross_attention.py
│   │   │   ├── image_encoder.py
│   │   │   ├── image_transforms.py
│   │   │   ├── streaming_utils.py
│   │   │   ├── transformer.py
│   │   │   └── utils.py
│   │   ├── server.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── dist_utils.py
│   │       ├── logging_utils.py
│   │       └── struct_utils.py
│   ├── pyproject.toml
│   └── tests/
│       └── hello.py
├── kyuteye_rs/
│   ├── Cargo.toml
│   ├── configs/
│   │   ├── config-moshika-vis-q8.json
│   │   └── config-moshika-vis.json
│   ├── moshi-backend/
│   │   ├── Cargo.toml
│   │   ├── build.rs
│   │   └── src/
│   │       ├── audio.rs
│   │       ├── build.rs
│   │       ├── image_embedder.rs
│   │       ├── main.rs
│   │       ├── metrics.rs
│   │       ├── standalone.rs
│   │       ├── stream_both.rs
│   │       └── utils.rs
│   └── moshi-core/
│       ├── Cargo.toml
│       └── src/
│           ├── conv.rs
│           ├── dynamic_logits_processor.rs
│           ├── lib.rs
│           ├── lm.rs
│           ├── lm_generate.rs
│           ├── lm_generate_multistream.rs
│           ├── mimi.rs
│           ├── nn.rs
│           ├── quantization.rs
│           ├── seanet.rs
│           ├── streaming.rs
│           └── transformer.rs
├── scripts/
│   ├── convert_ckpt_utils.py
│   └── get_static_client.py
└── ssvd/
    ├── README.md
    ├── __init__.py
    ├── generate.py
    ├── multiturn_instruct.py
    ├── multiturn_prompting.py
    └── utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .dockerignore
================================================
**/target/
**/node_modules/
**/dist
ssvd/synthetic_visual_dialogues/

================================================
FILE: .gitattributes
================================================
*.wav filter=lfs diff=lfs merge=lfs -text


================================================
FILE: .github/actions/rust_build/action.yml
================================================
name: rust_build
description: 'Setup rust env'
inputs:
  os:
    default: ubuntu-latest
  toolchain:
    default: stable
  target:
    default: check
runs:
  using: "composite"
  steps:
    - uses: actions-rs/toolchain@v1
      with:
        profile: minimal
        toolchain: ${{ inputs.toolchain }}
        override: true
    - name: cargo cache
      uses: actions/cache@v3
      with:
        path: |
          ~/.cargo/bin/
          ~/.cargo/registry/index/
          ~/.cargo/registry/cache/
          ~/.cargo/git/db/
          kyuteye_rs/target/
        key: ${{ inputs.os }}-cargo-${{ inputs.target }}-${{ hashFiles('**/Cargo.toml') }}
        restore-keys: ${{ inputs.os }}-cargo-
    - name: install deps
      shell: bash
      run: |
        sudo apt-get update
        sudo apt-get install libasound2-dev


================================================
FILE: .github/requirements_github_actions.txt
================================================
# Main setup
# old version: transformers 4.43.3 and accelerate 0.33.0
# new version (for pixtral): transformers 4.46.0 and accelerate 1.0.1
accelerate==1.0.1
anls
anls-star
av<12
auditok<0.3.0
cython
datasets
deepspeed
demucs
einops
encodec
fasttext
flashy>=0.0.1
gradio
huggingface_hub
hydra_colorlog
hydra-core>=1.1
ipywidgets
jiwer
julius
jupyterlab
librosa
maturin
num2words
numpy
onnxruntime
opencv-python
protobuf
pyannote.audio
pyannote.metrics
pycocoevalcap
pycocotools
sentencepiece
spacy==3.5.2
tensorboard
timm
torch==2.2.0  
torchaudio==2.2.0
torchmetrics
torchtyping
torchvision==0.17.0
tqdm
transformers==4.47.0  # need Encodec there.
webdataset==0.2.100  # for sanity
evaluate
rouge-score
xformers==0.0.24

# specific clip commit
clip @ https://github.com/openai/CLIP/archive/master.zip#sha256=11c3593912e6e6446fb0bde144c5ea374f7e19eeab9072c3eb00b59dd8afb706

# launcher + code prettifying stuff
fire
rich
pyyaml
black
mypy==1.11.2
pylint
matplotlib
seaborn

================================================
FILE: .github/workflows/checks.yml
================================================
name: Checks

on:
  push:
    branches:
      - main
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
  workflow_dispatch:

jobs:
  pylint_pytorch:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
    - uses: astral-sh/setup-uv@v5
    - name: Static lint analysis with pylint
      run: |
        cd kyuteye_pt && uv run --locked pylint --rcfile=.pylintrc --fail-under=8.5 ./kyuteye

  ruff_mlx:
    runs-on: macos-14
    steps:
    - uses: actions/checkout@v4
    - uses: astral-sh/setup-uv@v5
    - name: Static lint analysis with ruff
      run: |
        cd kyuteye_mlx && uv run ruff format --diff && uv run ruff check --select I

  sanity_check_pytorch:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v4
    - uses: astral-sh/setup-uv@v5
    - name: Sanity check
      run: |
        cd kyuteye_pt && uv run --locked sanity-check

  sanity_check_mlx:
    runs-on: macos-14
    steps:
    - uses: actions/checkout@v4
    - uses: astral-sh/setup-uv@v5
    - name: Sanity check
      run: |
        cd kyuteye_mlx && uv run --locked sanity-check

  sanity_check_rust:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Cache Cargo
        uses: actions/cache@v3
        with:
          path: |
            ~/.cargo/registry
            kyuteye_rs/target
          key: cargo-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: cargo-
      - run: cd kyuteye_rs && cargo fmt --all -- --check
      - name: Ubuntu dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y -qq libasound2-dev libssl-dev libpulse-dev libdbus-1-dev portaudio19-dev protobuf-compiler
      - name: Clippy
        run: cd kyuteye_rs && cargo clippy --locked --workspace --tests --examples -- -D warnings

  build_client:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - run: docker buildx bake client
      - run: tail client/dist/index.html


================================================
FILE: .github/workflows/rust-ci.yml
================================================
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main, refacto ]

name: Rust CI

jobs:
  check:
    name: Check
    defaults:
      run:
        working-directory: ./kyuteye_rs
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        rust: [stable]
    steps:
      - uses: actions/checkout@v2
      - uses: ./.github/actions/rust_build
      - name: check
        shell: bash
        run: |
          cargo check
      - name: clippy
        shell: bash
        run: |
          cargo clippy -- -D warnings
      - name: fmt
        shell: bash
        run: |
          cargo fmt --all -- --check
  test:
    name: Test
    defaults:
      run:
        working-directory: ./kyuteye_rs
    runs-on: ${{ matrix.os }}
    strategy:
      matrix:
        os: [ubuntu-latest]
        rust: [stable]
    steps:
      - uses: actions/checkout@v2
      - uses: actions/setup-python@v5
        with:
          python-version: 3.11
      - uses: ./.github/actions/rust_build
        with:
          target: test
      - name: test
        shell: bash
        run: |
          cargo test


================================================
FILE: .gitignore
================================================
~*
__pycache__
*.pt
*.pth
*.ipynb*
*.egg-info
*.jsonl
nohup.out
.idea/*
client/node_modules
client/dist
target/
*.safetensors
.DS_Store
*.lprof
*.prof
cert.pem
key.pem
.mypy_cache
Gemfile.lock
project_page/_site/*
kyuteye_mlx/static/*
ssvd/synthetic_visual_dialogues/

================================================
FILE: CONTRIBUTING.md
================================================
# Contributing to MoshiVis

## Pull Requests

MoshiVis is the implementation of a research paper.
Therefore, we do not plan on accepting many pull requests for new features.
However, we certainly welcome them for bug fixes.

1. Fork the repo and create your branch from `main`.
2. If you have changed APIs, update the documentation accordingly.
3. Ensure pre-commit hooks pass properly, in particular the linting and typing.
4. When changing the Rust code, run `cargo check`, `cargo clippy`, `cargo test`.
5. Accept the Contributor License Agreement (see after).

Note that in general, we will not accept refactoring of the code.


## Contributor License Agreement ("CLA")

In order to accept your pull request, we need you to submit a Contributor License Agreement.

If you agree with the full CLA provided in the next paragraph, copy the following statement in your PR, changing your Github Handle:

> I, {your GitHub handle}, confirm that I have read and understood the terms of the CLA of Kyutai-labs, as outlined in the repository's CONTRIBUTING.md, and I agree to be bound by these terms.

The full CLA is provided as follows:

> I, {your GitHub handle}, hereby grant to Kyutai-labs a perpetual, worldwide, non-exclusive, royalty-free,
> irrevocable license to use, modify, distribute, and sublicense my Contributions.

> I understand and accept that Contributions are limited to modifications, improvements, or changes
> to the project’s source code submitted via pull requests. I accept that Kyutai-labs has full discretion to
> review, accept, reject, or request changes to any Contributions I submit, and that submitting
> a pull request does not guarantee its inclusion in the project.

> By submitting a Contribution, I grant Kyutai-labs a perpetual, worldwide license to use, modify,
> reproduce, distribute, and create derivative works based on my Contributions.
> I also agree to assign all patent rights for any inventions or improvements that arise from my Contributions,
> giving the Kyutai-labs full rights to file for and enforce patents.
> I understand that the Kyutai-labs may commercialize, relicense, or exploit the project and my Contributions without further notice or obligation to me.
> I confirm that my Contributions are original and that I have the legal right to grant this license.
> If my Contributions include third-party materials, I will ensure that I have the necessary permissions
> and will disclose this information. I accept that once my Contributions are integrated, they may be altered or removed at the Kyutai-labs’s discretion.

> I acknowledge that I am making these Contributions voluntarily and will not receive any compensation.
> Furthermore, I understand that all Contributions, including mine, are provided on an "as-is" basis, with no warranties.
> By submitting a pull request, I agree to be bound by these terms.

## Issues

Please submit issues on our Github repository.

## License

By contributing to MoshiVis, you agree that your contributions will be licensed
under the LICENSE-* files in the root directory of this source tree.
In particular, the rust code is licensed under APACHE, and the python code under MIT.


================================================
FILE: ISSUE_TEMPLATE/bug.yml
================================================
name: Bug Report
description: You found a bug.
labels: ["bug", "triage"]
body:
  - type: dropdown
    id: backend
    attributes:
      label: Backend impacted
      description: Which backend is concerned with your bug report?
      options:
        - The PyTorch implementation
        - The MLX implementation
        - The Rust implementation
        - Other / All
      default: 0
    validations:
      required: true
  - type: dropdown
    id: os
    attributes:
      label: Operating system
      description: What is your operating system?
      options:
        - Linux
        - Mac OS X
        - Windows (unsupported)
      default: 0
    validations:
      required: true
  - type: dropdown
    id: hardware
    attributes:
      label: Hardware
      description: What hardware are you using?
      options:
        - CPU
        - GPU with CUDA
        - Metal with MLX
      default: 0
    validations:
      required: true
  - type: textarea
    id: description
    attributes:
      label: Description
      description: Provide a detailed description of your bug.
      placeholder: 
      value: 
    validations:
      required: true
  - type: textarea
    id: more_info
    attributes:
      label: Extra information
      description: Please provide any other relevant information, such as log extracts, code etc.
      placeholder: 
      value: 
    validations:
      required: true
  - type: textarea
    id: env
    attributes:
      label: Environment
      description: Please fill in the environment information requested in the template below.
      placeholder: 
      value: |
          Fill in the following information on your system.
          - Operating system version:

          If the backend impacted is PyTorch:
          - Python version:
          - PyTorch version:
          - CUDA version (run `python -c 'import torch;  print(torch.version.cuda)'`):
          - GPU model and memory:
            
          If the backend is MLX:
          - Mac model:
    validations:
      required: true


================================================
FILE: ISSUE_TEMPLATE/question.yml
================================================
name: Question
description: You have a question about Moshi/Mimi or this codebase.
labels: ["question", "triage"]
body:
  - type: markdown
    attributes:
      value: |
        Please first check the [FAQ](https://github.com/kyutai-labs/moshi/blob/main/FAQ.md).
  - type: checkboxes
    id: terms
    attributes:
      label: Due diligence
      description: Have you searched the existing issues / FAQ / Google / asked ChatGPT?
      options:
        - label: I have done my due diligence in trying to find the answer myself.
          required: true

  - type: dropdown
    id: backend
    attributes:
      label: Topic
      description: What is your question about?
      options:
        - The paper
        - The PyTorch implementation
        - The MLX implementation
        - The Rust implementation
        - Other / All
      default: 0
    validations:
      required: true
  - type: textarea
    id: question
    attributes:
      label: Question
      description: What is your question?
      placeholder: Your question. Please make sure this is directly related to our codebase. We will not provide support for installing PyTorch, CUDA, Rust etc.
      value: 
    validations:
      required: true


================================================
FILE: LICENSE-APACHE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: LICENSE-MIT
================================================
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.


================================================
FILE: LICENSE.md
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.

================================================
FILE: PULL_REQUEST_TEMPLATE.md
================================================
## Checklist

- [ ] Read CONTRIBUTING.md, and accept the CLA by including the provided snippet. We will not accept PRs without this.
- [ ] Run pre-commit hook.
- [ ] If you changed Rust code, run `cargo check`, `cargo clippy`, `cargo test`.

## PR Description

<!-- Description for the PR -->


================================================
FILE: README.md
================================================
# M👁️shiVis: Teaching Speech Models to Converse about Images

![CI checks](https://github.com/kyutai-labs/moshivis/actions/workflows/checks.yml/badge.svg)

[[Preprint]][moshi-vision-arxiv] [[Demo]][talk-to-moshivis] [[Models on Hugging Face]](https://huggingface.co/collections/kyutai/)


MoshiVis is a **Vision Speech Model** (VSM) that builds directly on the speech-text foundation model [Moshi][moshi-arxiv], augmenting it with the ability to freely discuss images while maintaining its natural conversation style and low latency. In total, MoshiVis adds $\sim$ 206M adapter parameters on top of the 7B Moshi model and a pretrained, frozen 400M PaliGemma2 vision encoder.

This repository currently contains inference code to run your own MoshiVis server, with three different backends served through a web UI frontend. We also plan to release training/finetuning code in the future.
For more information about our speech codec Mimi and speech model Moshi, please visit the original [Moshi repo][moshi-github].
For more technical details on MoshiVis, see our [blog post][blog] and [preprint][moshi-vision-arxiv].

[Talk to MoshiVis][talk-to-moshivis] now on our live demo!

<p align="center">
<img src="assets/MoshiVisSchema.png" style="padding: 5px; background-color: black" alt="Schema representing the structure of MoshiVis. "
width="650px"></p>


To inject visual information into Moshi's stream of *speech tokens*, we extend the core transformer with a **cross-attention mechanism**. To maintain Moshi's **low latency** and reduce memory usage, the cross-attention projection weights are shared **across layers**.
Moreover, to ensure that Moshi's original conversational abilities are not lost in the process, the cross-attention modules feature a gating mechanism that lets the model modulate the visual input stream at will.
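
To make this concrete, here is a minimal, illustrative PyTorch sketch of per-layer gated cross-attention over projections shared across layers. All class and variable names are ours, and details such as normalization, the exact form of the gate, and streaming support are simplified; see `kyuteye_pt/kyuteye/modules/cross_attention.py` for the actual implementation.

```python
# Illustrative sketch only -- not the repository's implementation.
import torch
import torch.nn as nn
import torch.nn.functional as F


class SharedCrossAttention(nn.Module):
    """Q/K/V/output projections instantiated once and reused by all layers."""

    def __init__(self, dim: int, n_heads: int) -> None:
        super().__init__()
        assert dim % n_heads == 0
        self.n_heads = n_heads
        self.q_proj = nn.Linear(dim, dim, bias=False)
        self.k_proj = nn.Linear(dim, dim, bias=False)
        self.v_proj = nn.Linear(dim, dim, bias=False)
        self.out_proj = nn.Linear(dim, dim, bias=False)

    def forward(self, speech: torch.Tensor, image: torch.Tensor) -> torch.Tensor:
        b, t, c = speech.shape
        h = self.n_heads

        def split(x: torch.Tensor) -> torch.Tensor:
            # (B, S, C) -> (B, H, S, C // H)
            return x.view(b, x.shape[1], h, c // h).transpose(1, 2)

        q = split(self.q_proj(speech))  # queries come from the speech stream
        k = split(self.k_proj(image))   # keys/values come from the image tokens
        v = split(self.v_proj(image))
        attn = F.scaled_dot_product_attention(q, k, v)
        return self.out_proj(attn.transpose(1, 2).reshape(b, t, c))


class GatedCrossAttentionLayer(nn.Module):
    """Per-layer gate over the shared cross-attention block."""

    def __init__(self, shared: SharedCrossAttention, dim: int) -> None:
        super().__init__()
        self.shared = shared
        # The gate can drive the visual contribution towards zero, preserving
        # the base model's conversational behaviour when the image is
        # irrelevant to the dialogue.
        self.gate = nn.Sequential(nn.Linear(dim, dim), nn.Sigmoid())

    def forward(self, speech: torch.Tensor, image: torch.Tensor) -> torch.Tensor:
        return speech + self.gate(speech) * self.shared(speech, image)
```

Sharing one set of projection weights across the decoder layers is what keeps the adapter budget small: only the lightweight per-layer gates grow with depth.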



For more details on MoshiVis, including our training pipeline, synthetic data generation pipeline, and ablation experiments on the gating mechanism, see our [preprint][moshi-vision-arxiv].


## Model Release
We release MoshikaVis, based on the original Moshika (*female voice*) checkpoints from Moshi's open-source release. For the image embedding part, we rely on publicly available off-the-shelf image-text encoders: the checkpoints we release use the frozen weights of a vision encoder from the [PaliGemma2](https://arxiv.org/abs/2412.03555) family, specifically the weights provided on [Hugging Face](https://huggingface.co/google/paligemma2-3b-pt-448). Note that for convenience, each MoshiVis checkpoint contains the full model: i.e., the vision adaptation module weights are bundled together with the weights of Mimi (the speech codec), the Helium text tokenizer, the image encoder, and the base Moshi model.

For each model, we release several variants compatible with three different backends and quantization formats. Further instructions for each backend can be found below.

| Backend | Moshi**ka** |
| ------- | ----------- |
| [PyTorch](#pytorch-backend) |  [BF16](https://huggingface.co/kyutai/moshika-vis-pytorch-bf16)  |
| [Rust](#rust-backend) |  [BF16](https://huggingface.co/kyutai/moshika-vis-candle-bf16) [Q8_0](https://huggingface.co/kyutai/moshika-vis-candle-q8)  |
| [MLX](#mlx-backend) |  [BF16](https://huggingface.co/kyutai/moshika-vis-mlx-bf16)   |



All model weights (*excluding the bundled vision encoder*) are released under the CC-BY 4.0 license; the bundled vision encoder (*PaliGemma2's vision encoder*) is released under the [Gemma license](https://ai.google.dev/gemma/terms).


## Organisation of the Repository

For the **frontend**, we recommend using the provided web UI, as it allows for additional echo cancellation that helps the overall model quality. To obtain the client, you can either **(i)** build it yourself from the sources in [`client`](client/) as [described here](#webui), or **(ii)** download the pre-built static version we provide:

```bash
# Download prebuilt client sources
# option 1: using uv dependency manager
uv run scripts/get_static_client.py

# OR option 2: with pip
pip install fire rich huggingface_hub
python scripts/get_static_client.py
```

Most commands below will serve this UI by default using the `https` protocol (see more info [here](#http-vs-https)). To connect via `https`, you will need to generate SSL certificates first, as follows:

```bash
# Generate the SSL certificates in the root directory
openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout key.pem -out cert.pem
```

We provide three different **backends** for the MoshiVis inference stack in this repo. While we hope that the present codebase will work on Windows, we do not provide official support for it.
- A [PyTorch](#pytorch-backend) version in the [`kyuteye_pt`](kyuteye_pt) directory.
- A [Rust](#rust-backend) version (as used in the online demo) in the [`kyuteye_rs`](kyuteye_rs/) directory.
- An [MLX](#mlx-backend) version (tested on a MacBook Pro M3) in the [`kyuteye_mlx`](kyuteye_mlx/) directory.



For the PyTorch and MLX backends, we recommend using [uv](https://docs.astral.sh/uv/) to set up and run the code,
as it will manage all dependencies for you transparently.

`uv` is provided as a lightweight binary and can be installed with:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
```



### PyTorch Backend
 > Note: At the moment, we do not support quantization
 > for the PyTorch version, so you will need a GPU with a significant amount of memory ($\sim$ 24GB).

You can start the MoshiVis PyTorch server with the following command and then access the web UI at [https://localhost:8088](https://localhost:8088):

```bash
cd kyuteye_pt
uv run server configs/moshika-vis.yaml --port 8088
```

Note that if your GPU is on a remote machine, you may need to forward the remote port 8088 to your local machine using ssh's `-L` flag (e.g. `ssh -L 8088:localhost:8088 user@remote-host`, where `user@remote-host` is a placeholder for your GPU machine), then connect to [https://localhost:8088](https://localhost:8088) as mentioned previously.


### Rust Backend
> For the Rust backend, you will need a recent version of the [Rust toolchain](https://rustup.rs/).
> To compile with GPU support, you will need a valid [CUDA](https://developer.nvidia.com/cuda-toolkit) installation, in particular with `nvcc`.

In order to run the Rust inference server, use the following command:

```bash
cd kyuteye_rs
pip install pkg-config
cargo run --features cuda --bin moshi-backend -r -- --config configs/config-moshika-vis.json standalone --vis
```

When using macOS, you can replace `--features cuda` with `--features metal`.

Alternatively, you can use `config-moshika-vis-q8.json` rather than `config-moshika-vis.json` to run the
quantized q8 model. You can also change some of the server options (e.g., the starting port) directly in the JSON file.

Once the server has printed 'standalone worker listening', the model is ready.
By default, the Rust server will be accessible at [https://localhost:8088](https://localhost:8088).



### MLX Backend

We provide an MLX model checkpoint in `bfloat16`, as well as quantized checkpoints
using `q4` and `q8`.

To start the MoshiVis MLX backend, run the following commands:

```bash
cd kyuteye_mlx
# In bfloat16 - weights will be downloaded from HF
uv run server

# In q4
uv run server -q 4

# In q8
uv run server -q 8
```

You can then access the web UI at [http://localhost:8008](http://localhost:8008).

Note that unlike with the other backends, not all settings available in the web UI are propagated to the MLX backend. Instead, you can configure some options directly via the command line, e.g. `--text-temperature`.

### Frontends

#### WebUI

We recommend using the WebUI frontend as explained [here](#organisation-of-the-repository). If you want to build the sources yourself, follow these steps (further installation and build instructions can be found in the `client` directory):

**via NPM.**
```bash
cd client
npm install
npm run build
```

**via Docker.**  If you have `docker` installed, you can also build the client via

```bash
docker buildx bake client
```

After building the sources, the static dir for the web UI can be found in the
`client/dist` directory and will be used as the default by the different backends.

#### Rust Command Line

Alternatively, we also provide a command line interface for the Rust backend:


```bash
cd kyuteye_rs;
cargo run --bin moshi-cli -r -- tui --host localhost
```


## Troubleshooting

### http vs https
By default, the web UI server starts with the `https` protocol rather than `http`: accessing a server other than localhost via `http` may cause issues with using the microphone in the web UI (some browsers only allow microphone access over `https`).

To use an `https` connection, you will first need to setup SSL certificates:

```bash
# Generate the SSL certificates in the root directory
# Requires the `openssl` command-line tool
openssl req -x509 -nodes -days 365 -newkey rsa:2048 -keyout key.pem -out cert.pem
```

Note that if you want to use an `http` connection instead, you can:
  * For the PyTorch backend, add the flag `--ssl False`
  * For the MLX backend, `http` is the default and `https` can be used with `--ssl certdir` where `certdir` is the directory that contains the certificates.


Note that when using `https`, you may get warnings from the browser about the site being unsafe.
In Chrome, for instance, you can bypass these by selecting "Details" or "Advanced", then "Visit this unsafe
site" or "Proceed to localhost (unsafe)".


## License

The present code is provided under the MIT license for the Python parts, and under the Apache license for the Rust backend.
The web client code is provided under the MIT license.

The model weights (*excluding the vision encoder*) are released under the CC-BY 4.0 license; the bundled vision encoder is covered by the [Gemma license](https://ai.google.dev/gemma/terms), as noted above.

All images displayed in the web UI are obtained under the free Unsplash license. For the precise list of image urls and authors, please refer to [this file](client/public/assets/images/demo/attribution.txt).



## Datasets
We also release two data-related artifacts to accompany MoshiVis:
  * In the `ssvd` directory, we include code and instructions to reproduce the synthetic visual dialogue datasets described in Section 3.3 and Appendix E of our preprint.
  * For evaluation purposes, we also release [`Babillage`](https://huggingface.co/datasets/kyutai/Babillage) on Hugging Face, which contains spoken versions of three common VLM benchmarks (COCO-Captions 2014, OCR-VQA, and VQAv2) for probing the model's visual understanding in audio form.
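
As a hedged sketch, the benchmark can be pulled with the Hugging Face `datasets` library; only the dataset id `kyutai/Babillage` comes from the link above, and any configuration or split names are assumptions to be checked against the dataset card:

```python
# Sketch only: the dataset id is given above, but available
# configurations/splits are assumptions -- consult the dataset card
# before relying on specific names.
from datasets import load_dataset

babillage = load_dataset("kyutai/Babillage")  # may require a config argument
print(babillage)
```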

## Citation

If you use MoshiVis in your research, please cite our work:

```
@article{kyutai2025moshivis,
  author = {Amélie Royer and Moritz Böhle and Gabriel de Marmiesse and
  Laurent Mazaré and Alexandre Défossez and Neil Zeghidour and Patrick Pérez},
  year = {2025},
  title = {Vision-Speech Models: Teaching Speech Models to Converse about Images},
  journal = {ArXiv},
  url = {https://arxiv.org/abs/2503.15633}
}

@techreport{kyutai2024moshi,
      title={Moshi: a speech-text foundation model for real-time dialogue},
      author={Alexandre Défossez and Laurent Mazaré and Manu Orsini and
      Amélie Royer and Patrick Pérez and Hervé Jégou and Edouard Grave and Neil Zeghidour},
      year={2024},
      eprint={2410.00037},
      archivePrefix={arXiv},
      primaryClass={eess.AS},
      url={https://arxiv.org/abs/2410.00037},
}
```

[blog]: https://kyutai.org/moshivis
[moshi-vision-arxiv]: https://arxiv.org/abs/2503.15633
[moshi-arxiv]: https://arxiv.org/abs/2410.00037
[moshi-github]: https://github.com/kyutai-labs/moshi/tree/main?tab=readme-ov-file#models
[talk-to-moshivis]: https://vis.moshi.chat


================================================
FILE: client/.eslinrc.json
================================================
{
  "env": {
    "browser": true,
    "es2021": true
  },
  "extends": [
    "plugin:react/recommended",
    "standard-with-typescript",
    "plugin:import/typescript",
    "plugin:prettier/recommended"
  ],
  "parser": "@typescript-eslint/parser",
  "overrides": [],
  "parserOptions": {
    "ecmaVersion": "latest",
    "sourceType": "module",
    "project": "./tsconfig.json"
  },
  "plugins": ["react", "prettier"],
  "rules": {
    "@typescript-eslint/triple-slash-reference": "off"
  }
}


================================================
FILE: client/.nvmrc
================================================
v20.12.2


================================================
FILE: client/.prettierignore
================================================
dist/*

================================================
FILE: client/.prettierrc.json
================================================
{
  "arrowParens": "avoid",
  "singleQuote": false,
  "trailingComma": "all",
  "tabWidth": 2,
  "useTabs": false,
  "semi": true,
  "printWidth": 80,
  "plugins": ["prettier-plugin-tailwindcss"]
}


================================================
FILE: client/Dockerfile
================================================
FROM node:20 AS builder

WORKDIR /app

COPY . /app

RUN npm install

RUN npx vite build

FROM scratch AS build_result

COPY --from=builder /app/dist /


================================================
FILE: client/LICENSE
================================================
Permission is hereby granted, free of charge, to any
person obtaining a copy of this software and associated
documentation files (the "Software"), to deal in the
Software without restriction, including without
limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software
is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice
shall be included in all copies or substantial portions
of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.


================================================
FILE: client/README.md
================================================
# moshi-client

Frontend for the demo.

## Quickstart

To start developing, you will need a basic environment with NodeJS, for instance:
```bash
cd client
micromamba create -n node22 python=3.10
micromamba activate node22
micromamba install nodejs=22.11
# install
npm install
```
Alternatively, you can use [NVM](https://github.com/nvm-sh/nvm) to manage your Node version and make sure you're on the one recommended for this project. If you do so, run `nvm use`.

To run the client in dev mode, use:
```bash
# typically will start on port 5173
npm run dev
```

When you're satisfied, build the client (into the `dist` directory); it will be served as a
static directory by the different backends:
```bash
npm run build
```

If Docker is available, you can skip all the previous steps and just run

```bash
docker buildx bake
```
from the root of this repository. It will output the static sources for the website in `client/dist`.

### License

The present code is provided under the MIT license.


================================================
FILE: client/index.html
================================================
<!doctype html>
<html lang="en" class=" bg-black" data-theme="dark">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <link rel="icon" type="image/png" sizes="32x32" href="/assets/favicon-32x32.png">
    <link rel="icon" type="image/png" sizes="16x16" href="/assets/favicon-16x16.png">
    <title>moshi.chat</title>
  </head>
  <body class=" bg-black font-mono font-thin">
    <div id="root"></div>
    <script type="module" src="/src/app.tsx"></script>
  </body>
</html>


================================================
FILE: client/package.json
================================================
{
  "name": "kyutai-client",
  "private": true,
  "version": "0.0.0",
  "type": "module",
  "scripts": {
    "dev": "vite",
    "build": "tsc && vite build",
    "lint": "eslint",
    "lint:fix": "eslint --fix",
    "prettier": "prettier --write .",
    "preview": "vite preview"
  },
  "devDependencies": {
    "@eslint/js": "^9.3.0",
    "@types/react": "^18.3.1",
    "@types/react-dom": "^18.3.0",
    "@types/ws": "^8.5.10",
    "autoprefixer": "^10.4.19",
    "daisyui": "^4.12.2",
    "eslint": "^8.57.0",
    "eslint-config-prettier": "^9.1.0",
    "eslint-plugin-prettier": "^5.1.3",
    "eslint-plugin-react": "^7.34.1",
    "globals": "^15.2.0",
    "postcss": "^8.4.38",
    "prettier": "^3.2.5",
    "prettier-eslint": "^16.3.0",
    "prettier-plugin-tailwindcss": "^0.5.14",
    "tailwindcss": "^3.4.3",
    "typescript": "^5.2.2",
    "typescript-eslint": "^7.9.0",
    "vite": "^6.2.1",
    "vite-plugin-top-level-await": "^1.4.1"
  },
  "dependencies": {
    "eruda": "^3.0.1",
    "opus-recorder": "^8.0.5",
    "react": "^18.3.1",
    "react-dom": "^18.3.1",
    "react-router-dom": "^6.23.1",
    "webm-duration-fix": "^1.0.4",
    "ws": "^8.16.0",
    "zod": "^3.23.8"
  }
}


================================================
FILE: client/postcss.config.js
================================================
export default {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
};


================================================
FILE: client/public/assets/images/demo/attribution.txt
================================================
image1.jpg  https://unsplash.com/photos/seven-brushes-and-water-color-palette-TTwwVG4Isjw   Crystal de Passillé-Chabot
image2.jpg  https://unsplash.com/photos/a-bunch-of-stuffed-animals-that-are-on-a-shelf-uTDvpjnF2nw    Hoyoun Lee
image3.jpg  https://unsplash.com/photos/an-orange-and-white-clownfish-in-an-aquarium-8iHpPG7Vk9Y    James Lee
image4.jpg  https://unsplash.com/photos/red-dragon-action-figure-on-table-X-A-LJVAhzk   Clint Bustrillos
image5.jpg  https://unsplash.com/photos/panda-bear-sitting-on-bamboo-sticks-surrounded-with-trees-NsNRu6dfRds   Ying Wu
image6.jpg  https://unsplash.com/photos/carousel-with-string-lights-OEBeLcrzlaw cmophoto.net
image7.jpg  https://unsplash.com/photos/flight-of-pigeons-flying-above-grass-field-near-eiffel-tower-in-paris-m45uW4f9YQg   Stijn te Strake
image8.jpg   https://unsplash.com/photos/gray-typewriter-and-macbook-1F4MukO0UNg Glenn Carstens-Peters
image9.jpg  https://unsplash.com/photos/a-couple-of-sandwiches-sitting-on-top-of-a-cutting-board-smv9xho-dnE    Deepthi Clicks
image10.jpg  https://unsplash.com/photos/man-in-white-hat-and-black-shirt-painting-nkVa4ylaWG0   Federico Scarionati
image11.jpg https://unsplash.com/photos/a-statue-of-a-gnome-next-to-a-light-pole-UNT9ExjTgZE    Lionel Mermoz
image12.jpg https://unsplash.com/photos/brown-concrete-building-on-green-grass-field-during-daytime-mDIMJzdu5D0 Celine Chamiot-Poncet
image13.jpg https://unsplash.com/photos/astronaut-in-spacesuit-floating-in-space-Yj1M5riCKk4    NASA
image14.jpg https://unsplash.com/photos/marble-toy-lot-near-yellow-drawstring-pouch-1kZzV02D2hM Crissy Jarvis
image15.jpg https://unsplash.com/photos/person-holding-black-frying-pan-APDMfLHZiRA Kevin McCutcheon
image16.jpg https://unsplash.com/photos/selective-focus-photo-of-four-green-humming-birds-with-red-flowers-5TU1htuOUn4  James wainscoat
image17.jpg https://unsplash.com/photos/orange-and-white-tabby-cat-sitting-on-brown-wooden-table-in-kitchen-room-w2DsS-ZAP4U    Paul Hanoka
image18.jpg https://unsplash.com/photos/lantern-on-the-street-at-nighttime--F3wMFrZ7z0  Denys Nevozhai
image19.jpg https://unsplash.com/photos/baked-breads-in-rack-ZnPNZpjzi0M    Dan Gold
image20.jpg https://unsplash.com/photos/lemonades-on-tray-JB5YCqOXV1o   Rod Long

================================================
FILE: client/src/app.tsx
================================================
import ReactDOM from "react-dom/client";
import {
  createBrowserRouter,
  RouterProvider,
} from "react-router-dom";
import "./index.css";
// @ts-expect-error - Worker is not recognized by the TS compiler
import { DecoderWorker } from "./decoder/decoderWorker";
import { Queue } from "./pages/Queue/Queue";

const router = createBrowserRouter([
  {
    path: "/",
    element: <Queue />,
  },
]);

ReactDOM.createRoot(document.getElementById("root") as HTMLElement).render(
    <RouterProvider router={router}/>
);


================================================
FILE: client/src/audio-processor.ts
================================================
// @ts-nocheck
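// Helpers converting between sample counts and milliseconds, based on the AudioWorklet global sampleRate.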
function asMs(samples) {
  return (samples * 1000 / sampleRate).toFixed(1);
}

function asSamples(milli) {
  return Math.round(milli * sampleRate / 1000);
}

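// AudioWorkletProcessor that buffers decoded audio frames from the main thread and plays
// them back through an adaptive jitter buffer: the buffer grows after underruns, and old
// packets are dropped when too much latency accumulates.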
class MoshiProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    console.log("Moshi processor lives", currentFrame, sampleRate);
    console.log(currentTime);

    // Buffer length definitions
    let frameSize = asSamples(80);
    // initialBufferSamples: we wait to have at least that many samples before starting to play
    this.initialBufferSamples = 2 * frameSize;
    // once we have enough samples, we further wait that long before starting to play.
    // This allows to have buffer lengths that are not a multiple of frameSize.
    this.partialBufferSamples = asSamples(80);
    // If the buffer length goes over that many, we will drop the oldest packets until
    // we reach back initialBufferSamples + partialBufferSamples.
    this.maxBufferSamples = asSamples(80);
    // increments
    this.partialBufferIncrement = asSamples(40);
    this.maxPartialWithIncrements = asSamples(240);
    this.maxBufferSamplesIncrement = asSamples(40);
    this.maxMaxBufferWithIncrements = asSamples(240);

    // State and metrics
    this.initState();

    this.port.onmessage = (event) => {
      if (event.data.type == "reset") {
        console.log("Reset audio processor state.");
        this.initState();
        return;
      }
      let frame = event.data.frame;
      this.frames.push(frame);
      if (this.currentSamples() >= this.initialBufferSamples && !this.started) {
        this.start();
      }
      if (this.pidx < 20) {
        console.log(this.timestamp(), "Got packet", this.pidx++, asMs(this.currentSamples()), asMs(frame.length));
      }
      if (this.currentSamples() >= this.totalMaxBufferSamples()) {
        console.log(this.timestamp(), "Dropping packets", asMs(this.currentSamples()), asMs(this.totalMaxBufferSamples()));
        let target = this.initialBufferSamples + this.partialBufferSamples;
        while (this.currentSamples() > target) {
          let first = this.frames[0];
          let to_remove = this.currentSamples() - target;
          to_remove = Math.min(first.length - this.offsetInFirstBuffer, to_remove);
          this.offsetInFirstBuffer += to_remove;
          this.timeInStream += to_remove / sampleRate;
          if (this.offsetInFirstBuffer == first.length) {
            this.frames.shift();
            this.offsetInFirstBuffer = 0;
          }
        }
        console.log(this.timestamp(), "Packet dropped", asMs(this.currentSamples()));
        this.maxBufferSamples += this.maxBufferSamplesIncrement;
        this.maxBufferSamples = Math.min(this.maxMaxBufferWithIncrements, this.maxBufferSamples);
        console.log("Increased maxBuffer to", asMs(this.maxBufferSamples));
      }
      this.port.postMessage({
        totalAudioPlayed: this.totalAudioPlayed,
        actualAudioPlayed: this.actualAudioPlayed,
        delay: event.data.micDuration - this.timeInStream,
        minDelay: this.minDelay,
        maxDelay: this.maxDelay,
      });
    };
  }

  initState() {
    this.frames = new Array();
    this.offsetInFirstBuffer = 0;
    this.firstOut = false;
    this.remainingPartialBufferSamples = 0;
    this.timeInStream = 0.;
    this.resetStart();

    // Metrics
    this.totalAudioPlayed = 0.;
    this.actualAudioPlayed = 0.;
    this.maxDelay = 0.;
    this.minDelay = 2000.;
    // Debug
    this.pidx = 0;

    // For now let's reset the buffer params.
    this.partialBufferSamples = asSamples(80);
    this.maxBufferSamples = asSamples(80);
  }

  totalMaxBufferSamples() {
    return this.maxBufferSamples + this.partialBufferSamples + this.initialBufferSamples;
  }

  timestamp() {
    return Date.now() % 1000;
  }

  currentSamples() {
    let samples = 0;
    for (let k = 0; k < this.frames.length; k++) {
      samples += this.frames[k].length
    }
    samples -= this.offsetInFirstBuffer;
    return samples;
  }

  resetStart() {
    this.started = false;
  }

  start() {
    this.started = true;
    this.remainingPartialBufferSamples = this.partialBufferSamples;
    this.firstOut = true;
  }

  canPlay() {
    return this.started && this.frames.length > 0 && this.remainingPartialBufferSamples <= 0;
  }

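  // Runs on the audio rendering thread once per render quantum (128 frames);
  // returning true keeps the processor alive.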
  process(inputs, outputs, parameters) {
    let delay = this.currentSamples() / sampleRate;
    if (this.canPlay()) {
      this.maxDelay = Math.max(this.maxDelay, delay);
      this.minDelay = Math.min(this.minDelay, delay);
    }
    const output = outputs[0][0];
    if (!this.canPlay()) {
      if (this.actualAudioPlayed > 0) {
        this.totalAudioPlayed += output.length / sampleRate;
      }
      this.remainingPartialBufferSamples -= output.length;
      return true;
    }
    if (this.firstOut) {
      console.log(this.timestamp(), "Audio resumed", asMs(this.currentSamples()), this.remainingPartialBufferSamples);
    }
    let out_idx = 0;
    while (out_idx < output.length && this.frames.length) {
      let first = this.frames[0];
      let to_copy = Math.min(first.length - this.offsetInFirstBuffer, output.length - out_idx);
      output.set(first.subarray(this.offsetInFirstBuffer, this.offsetInFirstBuffer + to_copy), out_idx);
      this.offsetInFirstBuffer += to_copy;
      out_idx += to_copy;
      if (this.offsetInFirstBuffer == first.length) {
        this.offsetInFirstBuffer = 0;
        this.frames.shift();
      }
    }
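    // Fade in linearly over the first copied buffer to avoid an audible click when playback starts or resumes.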
    if (this.firstOut) {
      this.firstOut = false;
      for (let i = 0; i < out_idx; i++) {
        output[i] *= i / out_idx;
      }
    }
    if (out_idx < output.length) {
      console.log(this.timestamp(), "Missed some audio", output.length - out_idx);
      this.partialBufferSamples += this.partialBufferIncrement;
      this.partialBufferSamples = Math.min(this.partialBufferSamples, this.maxPartialWithIncrements);
      console.log("Increased partial buffer to", asMs(this.partialBufferSamples));
      // We ran out of a buffer, let's revert to the started state to replenish it.
      this.resetStart();
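      // Fade out the samples we did copy to avoid a click at the underrun boundary.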
      for (let i = 0; i < out_idx; i++) {
        output[i] *= (out_idx - i) / out_idx;
      }
    }
    this.totalAudioPlayed += output.length / sampleRate;
    this.actualAudioPlayed += out_idx / sampleRate;
    this.timeInStream += out_idx / sampleRate;
    return true;
  }
}
registerProcessor("moshi-processor", MoshiProcessor);


================================================
FILE: client/src/components/Button/Button.tsx
================================================
import { FC } from "react";

type ButtonProps = React.ButtonHTMLAttributes<HTMLButtonElement>;
export const Button: FC<ButtonProps> = ({ children, className, ...props }) => {
  return (
    <button
      className={`border-2 disabled:bg-gray-100 border-white bg-black p-2 text-white hover:bg-gray-800 active:bg-gray-700  ${className ?? ""}`}
      {...props}
    >
      {children}
    </button>
  );
};

================================================
FILE: client/src/components/ImageGallery/ImageGallery.tsx
================================================

import { useState, ChangeEvent } from "react";

import { Button } from "../Button/Button";

// Natural images
import img1 from "/assets/images/demo/image1.jpg";
import img2 from "/assets/images/demo/image2.jpg";
import img3 from "/assets/images/demo/image3.jpg";
import img4 from "/assets/images/demo/image4.jpg";
import img5 from "/assets/images/demo/image5.jpg";
import img6 from "/assets/images/demo/image6.jpg";
import img7 from "/assets/images/demo/image7.jpg";
import img8 from "/assets/images/demo/image8.jpg";
import img9 from "/assets/images/demo/image9.jpg";
import img10 from "/assets/images/demo/image10.jpg";
import img11 from "/assets/images/demo/image11.jpg";
import img12 from "/assets/images/demo/image12.jpg";
import img13 from "/assets/images/demo/image13.jpg";
import img14 from "/assets/images/demo/image14.jpg";
import img15 from "/assets/images/demo/image15.jpg";
import img16 from "/assets/images/demo/image16.jpg";
import img17 from "/assets/images/demo/image17.jpg";
import img18 from "/assets/images/demo/image18.jpg";
import img19 from "/assets/images/demo/image19.jpg";
import img20 from "/assets/images/demo/image20.jpg";

const images = [
    img1,
    img2,
    img3,
    img4,
    img5,
    img6,
    img7,
    img8,
    img9,
    img10,
    img11,
    img12,
    img13,
    img14,
    img15,
    img16,
    img17,
    img18,
    img19,
    img20,
]

const images_order: number[] = [];
for (let i = 0; i < images.length; i++) {
    images_order.push(i);
}

type ImageGalleryProps = React.InputHTMLAttributes<HTMLInputElement> & {
    // Properties for the ImageGallery
    paramsSetter: Function;
    clickAction: Function;
    size: number;
    numImages: number;
}


type ImageItemProps = React.InputHTMLAttributes<HTMLInputElement> & {
    // Properties for a single item in the ImageGallery
    // Two actions:
    // paramsSetter sets the chosen image url into the model params
    // clickAction then starts the conversation
    paramsSetter: Function;
    clickAction: Function;
    size: number;
    imageUrl: string;
}


function ImageSelect(props: ImageItemProps) {
    // Represents a single image in the gallery
    const [isHover, setIsHover] = useState(false);

    const handleMouseEnter = () => {
        setIsHover(true);
    };
    const handleMouseLeave = () => {
        setIsHover(false);
    };
    let bordercolor = isHover ? "#f7a319" : "black";
    let bgalpha = isHover ? 0.05 : 0.6;
    let textalpha = isHover ? 1.0 : 0.0
    let label = isHover ? "Select" : "X";
    let style = {
        width: props.size,
        height: props.size,
        background: `url(${props.imageUrl})`,
        backgroundSize: "100% 100%",
        border: `3px solid ${bordercolor}`,
        margin: "2px",
        padding: "0px",
        color: `rgba(255, 255, 255, ${textalpha})`,
        boxShadow: `inset 0 0 0 1000px rgba(0,0,0,${bgalpha})`,
        textShadow: `2px 2px 2px rgba(0, 0, 0, ${textalpha})`
    };
    return (
        <button style={style} onMouseEnter={handleMouseEnter} onMouseLeave={handleMouseLeave}
            // we do not save the image URL if it is selected from the UI, otherwise it is a bit messy
            onClick={async () => { await props.paramsSetter(props.imageUrl); sessionStorage.removeItem("imageUrl"); props.clickAction() }
            } > {label}</button >
    );
}


// Unbiased Fisher–Yates shuffle (sorting with a random comparator yields a biased shuffle).
const shuffle = (array: number[]) => {
    for (let i = array.length - 1; i > 0; i--) {
        const j = Math.floor(Math.random() * (i + 1));
        [array[i], array[j]] = [array[j], array[i]];
    }
    return array;
};




export const ImageGallery = (props: ImageGalleryProps) => {
    const [ordering, SetOrdering] = useState(images_order);
    const [preview, setPreview] = useState<string | null>(sessionStorage.getItem("imageUrl"));


    const handleFileChange = (e: ChangeEvent<HTMLInputElement>, isCapture: boolean) => {
        if (e.target.files && e.target.files[0]) {
            const file = e.target.files[0];
            const url = URL.createObjectURL(file);
            setPreview(url);
            props.paramsSetter(url);
            // only save the image URL when it's an uploaded file
            // doesn't really seem to work with one-shot photo otherwise
            if (!isCapture) {
                sessionStorage.setItem("imageUrl", url);
            }
        }
    };

    const resetFile = () => {
        setPreview(null);
        props.paramsSetter(undefined);
        sessionStorage.removeItem("imageUrl");
    };

    function handleShuffle() {
        SetOrdering(shuffle([...ordering]));
    }

    // Image Gallery widget (random subset)
    const steps = [];
    for (let i = 0; i < props.numImages; i++) {
        steps.push(<ImageSelect
            key={"natural_" + ordering[i]}
            imageUrl={images[ordering[i]]} {...props}></ImageSelect >);
    }

    return (
        <div className="presentation">
            <div className="mt-0 flex flex-grow justify-center items-center flex-col presentation mb-8">
                {preview && <img src={preview} alt="Preview" style={{ width: "200px", marginTop: "20px", marginBottom: "10px" }} />}
                <div className="flex-row">
                    {preview && <Button className="mr-3" onClick={async () => await props.clickAction()}>Connect</Button>}
                    {preview && <Button className="ml-3" onClick={resetFile}>X</Button>}
                </div>
            </div>
            <div className="flex justify-center items-center m-0 p-0" style={{ marginRight: "12%", marginLeft: "12%" }}>
                {!preview && <form style={{ display: "block", width: "50%", marginBottom: 0 }}>
                    <label htmlFor="selectimg" className='m-0 border-2 disabled:bg-gray-100 border-white bg-black p-2 text-white hover:bg-gray-800 active:bg-gray-700'>Upload Image</label>
                    <input id="selectimg" style={{ visibility: "hidden" }} type="file" accept="image/*" onChange={(e) => handleFileChange(e, false)} />
                </form>}
                {!preview && <form style={{ display: "block", width: "10%", marginBottom: 0 }}>
                    <label htmlFor="selectimgphoto" className='m-0 border-2 disabled:bg-gray-100 border-white bg-black p-2 text-white hover:bg-gray-800 active:bg-gray-700'>📷</label>
                    <input id="selectimgphoto" style={{ visibility: "hidden" }} type="file" accept="image/*" capture="environment" onChange={(e) => handleFileChange(e, true)} />
                </form>}
                {!preview && <span style={{ display: "flex", flex: 1 }}></span>}
                {!preview && <button
                    className="border-0 disabled:text-white-100 border-white bg-black m-0 pb-7 hover:text-purple-300 active:bg-gray-700 text-4xl"
                    onClick={handleShuffle}
                    style={{ display: "flex" }}>
                    ⟳
                </button>}
            </div >
            <div className="imageGallery" >{steps}</div>
        </div >)
        ;
};

================================================
FILE: client/src/components/Input/Input.tsx
================================================
type InputProps = React.InputHTMLAttributes<HTMLInputElement> & {
  error?: string;
}

export const Input = ({className, error, ...props}:InputProps) => {
  return (
    <div className="pb-8 relative mt-8">
      <input
        {...props}
        className={`border-2 disabled:bg-gray-100 border-white bg-black p-2 outline-none text-white hover:bg-gray-800 focus:bg-gray-800 p-2 ${className ?? ""}`}
      />
      {error && <p className=" absolute text-red-800">{error}</p>}
    </div>
  );
}

================================================
FILE: client/src/decoder/decoderWorker.ts
================================================
export const DecoderWorker = new Worker(
  new URL("/assets/decoderWorker.min.js", import.meta.url),
);


================================================
FILE: client/src/env.ts
================================================
type ENV = {
  VITE_QUEUE_API_PATH: string;
  VITE_ENV: 'development' | 'production';
};

const parseEnv = (): ENV => {
  const VITE_QUEUE_API_PATH = import.meta.env.VITE_QUEUE_API_PATH;
  
  if (!VITE_QUEUE_API_PATH) {
    throw new Error("VITE_QUEUE_API_PATH is not defined");
  }

  return {
    VITE_QUEUE_API_PATH,
    VITE_ENV: import.meta.env.DEV ? 'development' : 'production',
  };
};

export const env = parseEnv();


================================================
FILE: client/src/index.css
================================================
@tailwind base;
@tailwind components;
@tailwind utilities;

@layer utilities {

  /* Hide scrollbar for Chrome, Safari and Opera */
  .no-scrollbar::-webkit-scrollbar {
    display: none;
  }

  /* Hide scrollbar for IE, Edge and Firefox */
  .no-scrollbar {
    -ms-overflow-style: none;
    /* IE and Edge */
    scrollbar-width: none;
    /* Firefox */
  }

  .scrollbar::-webkit-scrollbar {
    width: 10px;
  }

  .scrollbar::-webkit-scrollbar-track {
    background: transparent;
  }

  .scrollbar::-webkit-scrollbar-thumb {
    background: white;
    border: 3px solid #f6f7ed;
  }
}

.settingsbutton#changed:before {
  content: "C";
  width: 13px;
  height: 13px;
  line-height: 18px;
  text-align: center;
  display: block;
  border-radius: 50%;
  background: #54e8b3;
  border: 1px solid #FFF;
  box-shadow: 0 1px 3px rgba(0, 0, 0, 0.4);
  color: #FFF;
  position: absolute;
  top: -7px;
  right: -7px;
}

.main-grid {
  display: grid;
  grid-template-columns: 1fr;
  grid-template-rows: min-content 1fr 1fr;
  gap: 30px;
  grid-auto-flow: column;
  grid-template-areas:
    "controls"
    "player"
    "player-text";

  @media screen and (min-width: 768px) {
    grid-template-columns: 2fr 2.5fr;
    grid-template-rows: min-content min-content min-content 1fr;
    gap: 30px 30px;
    grid-auto-flow: column;
    align-items: center;
    justify-items: center;
    grid-template-areas:
      "controls controls"
      "player player-stats"
      "player player-text"
      "player player-text";
  }
}

.presentation {
  max-width: 450px;
}

.presentation>p {
  padding-top: 10px;
}


.gallery {
  max-width: 450px;
}

.cute-words {
  color: #54e8b3;
}

.vis-words {
  color: #f7a319;
}

.explain-links {
  color: #BCFCE5;
}


.controls {
  grid-area: controls;
}

.player {
  grid-area: player;
  grid-template-areas:
    "server-audio"
    "user-audio"
    "user-image"
    "download-links";
  display: grid;
  grid-template-columns: 1fr 1fr;
  grid-template-rows: 3fr;
  justify-items: stretch;
  row-gap: 30px;
  /* margin:auto; */
}

.user-image {
  grid-area: user-image;
  grid-column: 1 / -1;
  grid-row: 1;
  height: 200px
}

.user-image img {
  height: 100%;
  width: auto;
  margin: auto
}

.server-audio {
  grid-area: server-audio;
  grid-column: 1;
  grid-row: 2;
}

.user-audio {
  grid-area: user-audio;
  grid-column: 2;
  grid-row: 2;
}

.download-links {
  grid-area: download-links;
  grid-column: 1/-1;
  grid-row: 3;
  color: #54e8b3;
  height: 10%;
}

.player-stats {
  grid-area: player-stats;
  width: 100%;
  height: 100%;
}

.commands {
  grid-area: commands;
  width: 100%;
  height: 100%;
}

.player-text {
  grid-area: player-text;
  width: 100%;
  height: 100%;
  overflow: scroll;
}

================================================
FILE: client/src/modules.d.ts
================================================
declare module "opus-recorder";


================================================
FILE: client/src/pages/Conversation/Conversation.tsx
================================================
import { FC, MutableRefObject, useCallback, useEffect, useMemo, useRef, useState } from "react";
import { useSocket } from "./hooks/useSocket";
import { SocketContext } from "./SocketContext";
import { ServerAudio } from "./components/ServerAudio/ServerAudio";
import { UserAudio } from "./components/UserAudio/UserAudio";
import { Button } from "../../components/Button/Button";
import { ServerAudioStats } from "./components/ServerAudio/ServerAudioStats";
import { AudioStats } from "./hooks/useServerAudio";
import { TextDisplay } from "./components/TextDisplay/TextDisplay";
import { MediaContext } from "./MediaContext";
import { ServerInfo } from "./components/ServerInfo/ServerInfo";
import { ModelParamsValues, useModelParams } from "./hooks/useModelParams";
import { ModelParams } from "./components/ModelParams/ModelParams";
import fixWebmDuration from "webm-duration-fix";
import canvasLogo from "./canvas-logo.png";
import { getMimeType, getExtension } from "./getMimeType";

type ConversationProps = {
  workerAddr: string;
  workerAuthId?: string;
  sessionAuthId?: string;
  sessionId?: number;
  email?: string;
  audioContext: MutableRefObject<AudioContext>;
  worklet: MutableRefObject<AudioWorkletNode>;
  onConversationEnd?: () => void;
  isBypass?: boolean;
} & Partial<ModelParamsValues>;


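// Build the websocket URL for the worker, forwarding auth, sampling and image
// parameters as query-string arguments.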
const buildURL = ({
  workerAddr,
  params,
  workerAuthId,
  email,
  textSeed,
  audioSeed,
}: {
  workerAddr: string;
  params: ModelParamsValues;
  workerAuthId?: string;
  email?: string;
  textSeed: number;
  audioSeed: number;
}) => {
  if (workerAddr == "same" || workerAddr == "") {
    workerAddr = window.location.hostname + ":" + window.location.port;
    console.log("Overriding workerAddr to", workerAddr);
  }
  const wsProtocol = (window.location.protocol === 'https:') ? 'wss' : 'ws';
  const url = new URL(`${wsProtocol}://${workerAddr}/api/chat`);
  if (workerAuthId) {
    url.searchParams.append("worker_auth_id", workerAuthId);
  }
  if (email) {
    url.searchParams.append("email", email);
  }
  url.searchParams.append("text_temperature", params.textTemperature.toString());
  url.searchParams.append("text_topk", params.textTopk.toString());
  url.searchParams.append("audio_temperature", params.audioTemperature.toString());
  url.searchParams.append("audio_topk", params.audioTopk.toString());
  url.searchParams.append("pad_mult", params.padMult.toString());
  url.searchParams.append("text_seed", textSeed.toString());
  url.searchParams.append("audio_seed", audioSeed.toString());
  url.searchParams.append("repetition_penalty_context", params.repetitionPenaltyContext.toString());
  url.searchParams.append("repetition_penalty", params.repetitionPenalty.toString());
  // Add image params if given
  if (params.imageUrl != undefined) {
    url.searchParams.append("image_resolution", params.imageResolution.toString());
    url.searchParams.append("center_crop", params.centerCrop.toString());
    url.searchParams.append("xa_start", params.gateDelay.toString());
    url.searchParams.append("text_temperature_gating_influence", params.gateInfluence.toString());
  }
  return url.toString();
};


export const Conversation: FC<ConversationProps> = ({
  workerAddr,
  workerAuthId,
  audioContext,
  worklet,
  sessionAuthId,
  sessionId,
  onConversationEnd,
  isBypass = false,
  email,
  ...params
}) => {
  const getAudioStats = useRef<() => AudioStats>(() => ({
    playedAudioDuration: 0,
    missedAudioDuration: 0,
    totalAudioMessages: 0,
    delay: 0,
    minPlaybackDelay: 0,
    maxPlaybackDelay: 0,
  }));
  const isRecording = useRef<boolean>(false);
  const videoChunks = useRef<Blob[]>([]);
  const audioChunks = useRef<Blob[]>([]);

  const audioStreamDestination = useRef<MediaStreamAudioDestinationNode>(audioContext.current.createMediaStreamDestination());
  const mediaRecorder = useRef<MediaRecorder | null>(null);
  const audioRecorder = useRef<MediaRecorder>(new MediaRecorder(audioStreamDestination.current.stream, { mimeType: getMimeType("audio"), audioBitsPerSecond: 128000 }));
  const [videoURL, setVideoURL] = useState<string>("");
  const [audioURL, setAudioURL] = useState<string>("");
  const [userRating, setUserRating] = useState<number>(0);
  const [userRatingHovered, setUserRatingHovered] = useState<number>(0);
  const [baseBlobName, setBaseBlobName] = useState<string>("moshi");
  const [isOver, setIsOver] = useState(false);
  const modelParams = useModelParams(params);
  const micDuration = useRef<number>(0);
  const actualAudioPlayed = useRef<number>(0);
  const textContainerRef = useRef<HTMLDivElement>(null);
  const textSeed = useMemo(() => Math.round(1000000 * Math.random()), []);
  const audioSeed = useMemo(() => Math.round(1000000 * Math.random()), []);
  const canvasRef = useRef<HTMLCanvasElement>(null);
  const logoRef = useRef<HTMLImageElement>(null);
  const [isLogoLoaded, setIsLogoLoaded] = useState(false);

  const WSURL = buildURL({
    workerAddr,
    params: modelParams,
    workerAuthId,
    email: email,
    textSeed: textSeed,
    audioSeed: audioSeed,
  });



  const onDisconnect = useCallback(() => {
    setIsOver(true);
    console.log("on disconnect!");
    stopRecording();
  }, [setIsOver]);

  const { isConnected, sendMessage, socket, start, stop } = useSocket({
    // onMessage,
    uri: WSURL,
    onDisconnect,
    imageUrl: params.imageUrl,
  });
  useEffect(() => {
    audioRecorder.current.ondataavailable = (e) => {
      audioChunks.current.push(e.data);
    };
    audioRecorder.current.onstop = async () => {
      let blob: Blob;
      const mimeType = getMimeType("audio");
      if (mimeType.includes("webm")) {
        blob = await fixWebmDuration(new Blob(audioChunks.current, { type: mimeType }));
      } else {
        blob = new Blob(audioChunks.current, { type: mimeType });
      }
      setAudioURL(URL.createObjectURL(blob));
      audioChunks.current = [];
      console.log("Audio Recording and encoding finished");
    };
  }, [mediaRecorder, audioRecorder, setVideoURL, setAudioURL, videoChunks, audioChunks]);


  const RatingButton = (props: { rating: number }) => {
    const [isHover, setIsHover] = useState(false);
    const handleMouseEnter = () => {
      setUserRatingHovered(props.rating);
      setIsHover(true);
    };
    const handleMouseLeave = () => {
      setUserRatingHovered(0);
      setIsHover(false);
    };
    let style = {
      color: (isHover || userRating >= props.rating || userRatingHovered >= props.rating) ? `#f7a319` : '#333333',
    };
    return (
      <button
        onMouseEnter={handleMouseEnter} onMouseLeave={handleMouseLeave}
        style={style} className="flex b-0 text-6xl"
        disabled={isOver}
        onClick={async () => {
          setUserRating(props.rating); sendMessage({
            type: "user_rating",
            data: props.rating,
          })
        }}
      >
        ★
      </button>
    );
  };
  useEffect(() => {
    start();
    return () => {
      stop();
    };
  }, [start, workerAuthId]);

  useEffect(() => {

    if (!canvasRef) {
      console.log("No canvas ref");
      return;
    }
    if (!logoRef) {
      console.log("No logo ref");
      return;
    }
    if (!isLogoLoaded) {
      console.log("Logo not loaded");
      return;
    }
    if (!canvasRef.current) {
      console.log("No canvas");
      return;
    }
    if (!logoRef.current) {
      console.log("No logo");
      return;
    }

    const ctx = canvasRef.current.getContext("2d");
    if (ctx) {
      ctx.drawImage(logoRef.current, 20, 250, 320, 98);
      ctx.lineWidth = 1;
      ctx.strokeStyle = "white";
      ctx.strokeRect(5, 5, 370, 370);
    }
  }, [canvasRef, logoRef, isLogoLoaded]);

  const startRecording = useCallback(() => {
    if (isRecording.current) {
      return;
    }
    console.log(Date.now() % 1000, "Starting recording");
    if (canvasRef.current) {
      // Note: Attaching a track from this stream to the existing MediaRecorder
      // rather than creating a new MediaRecorder for the canvas stream
      // doesn't work on Safari as it just ends the recording immediately.
      // It works on Chrome though and is much cleaner.
      console.log("Adding canvas to stream");
      const captureStream = canvasRef.current.captureStream(30);
      captureStream.addTrack(audioStreamDestination.current.stream.getAudioTracks()[0]);
      mediaRecorder.current = new MediaRecorder(captureStream, { mimeType: getMimeType("video"), videoBitsPerSecond: 1000000 });
      mediaRecorder.current.ondataavailable = (e) => {
        console.log("Video data available");
        videoChunks.current.push(e.data);
      };
      mediaRecorder.current.onstop = async () => {
        let blob: Blob;
        const mimeType = getMimeType("video");
        if (mimeType.includes("webm")) {
          blob = await fixWebmDuration(new Blob(videoChunks.current, { type: mimeType }));
        } else {
          blob = new Blob(videoChunks.current, { type: mimeType });
        }
        setVideoURL(URL.createObjectURL(blob));
        videoChunks.current = [];
        console.log("Video Recording and encoding finished");
      };
    }
    worklet.current?.connect(audioStreamDestination.current);
    // videoStream.current.addTrack(audioStreamDestination.current.stream.getAudioTracks()[0]);

    setVideoURL("");
    setAudioURL("");
    mediaRecorder.current?.start();
    audioRecorder.current.start();
    isRecording.current = true;
  }, [isRecording, setVideoURL, setAudioURL, worklet, audioStreamDestination, mediaRecorder, audioRecorder, canvasRef]);

  const stopRecording = useCallback(() => {
    console.log("Stopping recording");
    console.log("isRecording", isRecording)
    if (!isRecording.current) {
      return;
    }
    worklet.current?.disconnect(audioStreamDestination.current);
    audioRecorder.current.stop();
    mediaRecorder.current?.stop();
    isRecording.current = false;
  }, [isRecording, worklet, audioStreamDestination, mediaRecorder, audioRecorder]);



  return (
    <SocketContext.Provider
      value={{
        isConnected,
        sendMessage,
        socket,
      }}
    >
      <MediaContext.Provider value={
        {
          startRecording,
          stopRecording,
          audioContext,
          worklet,
          audioStreamDestination,
          micDuration,
          actualAudioPlayed,
        }
      }>
        <div>
          <div className="main-grid h-screen max-h-screen w-screen p-4 max-w-96 md:max-w-screen-lg m-auto">
            <div className="controls text-center flex justify-center items-center gap-2">
              {isOver && !isBypass && (
                <Button
                  onClick={() => {
                    // Reload the page to reset the conversation on iOS
                    const isIOS = /iPad|iPhone|iPod/.test(navigator.userAgent)
                    if (onConversationEnd && !isIOS) {
                      onConversationEnd();
                      return;
                    }
                    sessionStorage.setItem("textTemperature", modelParams.textTemperature.toString());
                    sessionStorage.setItem("textTopk", modelParams.textTopk.toString());
                    sessionStorage.setItem("audioTemperature", modelParams.audioTemperature.toString());
                    sessionStorage.setItem("audioTopk", modelParams.audioTopk.toString());
                    sessionStorage.setItem("padMult", modelParams.padMult.toString());
                    sessionStorage.setItem("repetitionPenalty", modelParams.repetitionPenalty.toString());
                    sessionStorage.setItem("repetitionPenaltyContext", modelParams.repetitionPenaltyContext.toString());
                    sessionStorage.setItem("imageResolution", modelParams.imageResolution.toString());
                    sessionStorage.setItem("gateDelay", modelParams.gateDelay.toString());
                    sessionStorage.setItem("gateInfluence", modelParams.gateInfluence.toString());
                    sessionStorage.setItem("displayColor", modelParams.displayColor.toString());
                    sessionStorage.setItem("centerCrop", modelParams.centerCrop.toString());
                    document.location.reload();
                  }}
                >
                  Start Over
                </Button>
              )
              }
              {
                (!isOver || isBypass) && (
                  <Button
                    onClick={() => {
                      audioContext.current.resume();
                      isConnected ? stop() : start();
                    }}
                  >
                    {!isConnected ? "Connect" : "Disconnect"}
                  </Button>
                )
              }
              <div className={`h-4 w-4 rounded-full ${isConnected ? 'bg-green-700' : 'bg-red-700'}`} />
            </div>
            <div className="relative player h-full max-h-full w-full justify-stretch gap-3 border-2 border-white md:p-12">
              <div className="user-image">
                <img src={params.imageUrl} style={{ border: "1px solid white" }}></img>
              </div>
              <ServerAudio
                imageUrl={params.imageUrl}
                copyCanvasRef={canvasRef}
                setGetAudioStats={(callback: () => AudioStats) =>
                  (getAudioStats.current = callback)
                }
              />
              <UserAudio copyCanvasRef={canvasRef} />
              <div className="pt-8 text-sm flex justify-center items-center flex-col download-links"
                style={{
                  minHeight: 80,
                  margin: -10,
                  padding: 0,
                }}>
                <div className='text-xs text-white'> Feel free to rate the interaction before ending the session:</div>
                <div className="flex flex-row">
                  <RatingButton rating={1} />
                  <RatingButton rating={2} />
                  <RatingButton rating={3} />
                  <RatingButton rating={4} />
                  <RatingButton rating={5} />
                </div>
                {audioURL && <div><a href={audioURL} download={`${baseBlobName}_audio.${getExtension("audio")}`} className="pt-2 text-center block">Download audio</a></div>}
                {videoURL && <div><a href={videoURL} download={`${baseBlobName}_video.${getExtension("video")}`} className="pt-2 text-center">Download video</a></div>}
                {videoURL && getExtension("video") === "webm" && <div><a href="https://restream.io/tools/webm-to-mp4-converter" target="_blank" rel="noreferrer" className="explain-links pt-2 text-center italic block">How to convert to mp4</a></div>}
              </div>
            </div>
            <div className="scrollbar player-text border-2 border-white " ref={textContainerRef}>
              <TextDisplay containerRef={textContainerRef} displayColor={params.displayColor} />
            </div>
            <div className="player-stats hidden md:block">
              <ServerAudioStats getAudioStats={getAudioStats} />
            </div>
          </div>
          <div className="max-w-96 md:max-w-screen-lg p-4 m-auto text-center">
            <ServerInfo setFileName={(x: string) => setBaseBlobName(x)} />
            {!workerAuthId && <ModelParams {...modelParams} isConnected={isConnected} />}
          </div>
          <canvas height={380} width={380} className="hidden" ref={canvasRef} />
          <img src={canvasLogo} ref={logoRef} className="hidden" onLoad={() => {
            console.log("Logo loaded");
            setIsLogoLoaded(true);
          }} />
        </div>
      </MediaContext.Provider>
    </SocketContext.Provider >
  );
};


================================================
FILE: client/src/pages/Conversation/MediaContext.ts
================================================
import { MutableRefObject, createContext, useContext } from "react";
type MediaContextType = {
  startRecording: () => void;
  stopRecording: () => void;
  audioContext: MutableRefObject<AudioContext>;
  audioStreamDestination: MutableRefObject<MediaStreamAudioDestinationNode>;
  worklet: MutableRefObject<AudioWorkletNode>;
  micDuration: MutableRefObject<number>;
  actualAudioPlayed: MutableRefObject<number>;
};

export const MediaContext = createContext<MediaContextType | null>(null);

export const useMediaContext = () => {
  const context = useContext(MediaContext);
  if (!context) {
    throw new Error(
      "useMediaContext must be used within a MediaContextProvider",
    );
  }

  return context;
};

================================================
FILE: client/src/pages/Conversation/SocketContext.ts
================================================
import { createContext, useContext } from "react";
import { WSMessage } from "../../protocol/types";

type SocketContextType = {
  isConnected: boolean;
  socket: WebSocket | null;
  sendMessage: (message: WSMessage) => void;
};

export const SocketContext = createContext<SocketContextType>({
  isConnected: false,
  socket: null,
  sendMessage: () => {},
});

export const useSocketContext = () => {
  return useContext(SocketContext);
};


================================================
FILE: client/src/pages/Conversation/components/AudioVisualizer/AudioVisualizer.tsx
================================================
import { FC, useCallback, useEffect, useRef } from "react";

type AudioVisualizerProps = {
  analyser: AnalyserNode | null;
};

export const AudioVisualizer: FC<AudioVisualizerProps> = ({ analyser }) => {
  const requestRef = useRef<number | null>(null);
  const canvasRef = useRef<HTMLCanvasElement>(null);

  const visualizeData = useCallback(() => {
    requestRef.current = window.requestAnimationFrame(() => visualizeData());
    if (!canvasRef.current) {
      console.log("Canvas not found");
      return;
    }
    const audioData = new Uint8Array(140);
    analyser?.getByteFrequencyData(audioData);
    const bar_width = 3;
    let start = 0;
    const ctx = canvasRef.current.getContext("2d");
    if (!ctx) {
      console.log("Canvas context not found");
      return;
    }
    ctx.clearRect(0, 0, canvasRef.current.width, canvasRef.current.height);
    for (let i = 0; i < audioData.length; i++) {
      start = i * 4;
      let gradient = ctx.createLinearGradient(
        0,
        0,
        canvasRef.current.width,
        canvasRef.current.height,
      );
      gradient.addColorStop(0.2, "#2392f5");
      gradient.addColorStop(0.5, "#fe0095");
      gradient.addColorStop(1.0, "purple");
      ctx.fillStyle = gradient;
      ctx.fillRect(
        start,
        canvasRef.current.height,
        bar_width,
        (-audioData[i] * 100) / 255,
      );
    }
  }, [analyser]);

  const resetCanvas = useCallback(() => {
    if (!canvasRef.current) {
      return;
    }
    const ctx = canvasRef.current.getContext("2d");
    if (!ctx) {
      return;
    }
    ctx.clearRect(0, 0, canvasRef.current.width, canvasRef.current.height);
  }, []);

  useEffect(() => {
    if (!analyser) {
      return;
    }
    visualizeData();
    return () => {
      if (requestRef.current) {
        console.log("Canceling animation frame");
        cancelAnimationFrame(requestRef.current);
      }
    };
  }, [visualizeData, analyser, resetCanvas]);

  return <canvas ref={canvasRef} width={250} height={100} />;
};


================================================
FILE: client/src/pages/Conversation/components/AudioVisualizer/ClientVisualizer.tsx
================================================
import { FC, RefObject, useCallback, useEffect, useRef, useState } from "react";
import { clamp } from "../../hooks/audioUtils";

type AudioVisualizerProps = {
  analyser: AnalyserNode | null;
  parent: RefObject<HTMLElement>;
  copyCanvasRef: RefObject<HTMLCanvasElement>;
};

const MAX_INTENSITY = 255;

const COLORS = [
  "#197556",
  "#299e77",
  "#32b89b",
  "#31d4b8",
  "#14d9d5",
  "#41eff2",
  "#7ff3f5",
  "#789bf5",
  "#eb94eb",
  "#e63280",
  "#c41862",
];

export const ClientVisualizer: FC<AudioVisualizerProps> = ({ analyser, parent, copyCanvasRef }) => {
  const [canvasWidth, setCanvasWidth] = useState(parent.current ? Math.min(parent.current.clientWidth, parent.current.clientHeight) : 0);
  const requestRef = useRef<number | null>(null);
  const canvasRef = useRef<HTMLCanvasElement>(null);

  const drawBars = useCallback(
    (
      ctx: CanvasRenderingContext2D,
      x: number,
      y: number,
      volume: number,
      height: number,
      width: number,
      gap: number,
    ) => {
      const barHeight = height / 10 - gap;
      for (let i = 1; i <= 10; i++) {
        const barY = y + height + gap + Math.min(1, width / 30) - (i * barHeight + i * gap);
        ctx.fillStyle = COLORS[i - 1];
        ctx.strokeStyle = "white";
        ctx.lineWidth = Math.min(1, height / 100);
        if (i <= volume) {
          ctx.fillRect(x, barY, width, barHeight);
        }
        ctx.strokeRect(x, barY, width, barHeight);
      }
    },
    [],
  );

  const draw = useCallback((ctx: CanvasRenderingContext2D, audioData: Uint8Array, x: number, y: number, width: number, height: number) => {
    const stereoGap = Math.floor(width / 30);
    const barGap = Math.floor(height / 30);
    const padding = Math.floor(width / 30);
    const maxBarHeight = Math.floor(height - padding * 2);
    const maxBarWidth = Math.floor(
      width / 2.5 - stereoGap - padding * 2,
    );

    const centerX = x + width / 2;
    const averageIntensity = Math.sqrt(
      audioData.reduce((acc, curr) => acc + curr * curr, 0) / audioData.length,
    );
    const intensity = clamp(
      averageIntensity * 1.4,
      averageIntensity,
      MAX_INTENSITY,
    );
    const volume = Math.floor((intensity * 10) / MAX_INTENSITY);
    ctx.fillStyle = "rgba(0, 0, 0, 0)";
    ctx.fillRect(x, y, width, height);
    drawBars(
      ctx,
      centerX - maxBarWidth - stereoGap / 2,
      y,
      volume,
      maxBarHeight,
      maxBarWidth,
      barGap,
    );
    drawBars(
      ctx,
      centerX + stereoGap / 2,
      y,
      volume,
      maxBarHeight,
      maxBarWidth,
      barGap,
    );
  }, [analyser, drawBars]);

  const visualizeData = useCallback(() => {
    const width = parent.current ? Math.min(parent.current.clientWidth, parent.current.clientHeight) : 0
    if (width !== canvasWidth) {
      console.log("Setting canvas width");
      setCanvasWidth(width);
    }
    requestRef.current = window.requestAnimationFrame(() => visualizeData());
    if (!canvasRef.current) {
      console.log("Canvas not found");
      return;
    }
    const audioData = new Uint8Array(140);
    analyser?.getByteFrequencyData(audioData);

    const ctx = canvasRef.current.getContext("2d");
    if (!ctx) {
      console.log("Canvas context not found");
      return;
    }
    ctx.clearRect(0, 0, canvasRef.current.width, canvasRef.current.height);
    draw(ctx, audioData, 0, 0, width, width);
    if (copyCanvasRef?.current) {
      const copyCtx = copyCanvasRef.current.getContext("2d");
      if (copyCtx) {
        const x = 240;
        const y = 140;
        const width = 140 / 1.25; // slightly scaled down version
        const height = 180 / 1.25; // slightly scaled down version
        copyCtx.clearRect(x, y, width, height);
        draw(copyCtx, audioData, x, y, width, height);
      }
    }
  }, [analyser, canvasWidth, drawBars, parent, copyCanvasRef, draw]);

  useEffect(() => {
    visualizeData();
    return () => {
      if (requestRef.current) {
        console.log("Canceling animation frame");
        cancelAnimationFrame(requestRef.current);
      }
    };
  }, [visualizeData, analyser]);
  return (
    <canvas
      ref={canvasRef}
      className="max-h-full max-w-full"
      width={canvasWidth}
      height={canvasWidth}
    />
  );
};


================================================
FILE: client/src/pages/Conversation/components/AudioVisualizer/ServerVisualizer.tsx
================================================
import { FC, RefObject, useCallback, useEffect, useRef, useState } from "react";
import { clamp } from "../../hooks/audioUtils";
import { useSocketContext } from "../../SocketContext";

type AudioVisualizerProps = {
  analyser: AnalyserNode | null;
  parent: RefObject<HTMLElement>;
  imageUrl: string | undefined;
  copyCanvasRef?: RefObject<HTMLCanvasElement>;
};

const MAX_INTENSITY = 255;

export const ServerVisualizer: FC<AudioVisualizerProps> = ({ analyser, parent, imageUrl, copyCanvasRef }) => {
  const [canvasWidth, setCanvasWidth] = useState(parent.current ? Math.min(parent.current.clientWidth, parent.current.clientHeight) : 0);
  const requestRef = useRef<number | null>(null);
  const canvasRef = useRef<HTMLCanvasElement>(null);

  const { isConnected } = useSocketContext();


  const draw = useCallback((width: number, centerX: number, centerY: number, audioData: Uint8Array, ctx: CanvasRenderingContext2D) => {
    const maxCircleWidth = Math.floor(width * 0.95);
    const averageIntensity = Math.sqrt(
      audioData.reduce((acc, curr) => acc + curr * curr, 0) / audioData.length,
    );
    const intensity = clamp(
      averageIntensity * 1.4,
      averageIntensity,
      MAX_INTENSITY,
    );
    const relIntensity = intensity / MAX_INTENSITY;
    const radius = ((isConnected ? 0.3 + 0.7 * relIntensity : relIntensity) * maxCircleWidth) / 2;
    // Draw a circle with radius based on intensity

    ctx.clearRect(centerX - width / 2, centerY - width / 2, width, width);
    ctx.fillStyle = 'rgba(0, 0, 0, 0)';
    ctx.fillRect(centerX - width / 2, centerY - width / 2, width, width);
    ctx.beginPath();
    //ctx.fillStyle = "#39e3a7";
    ctx.fillStyle = 'rgba(57, 227, 167, 0.5)';
    ctx.arc(centerX, centerY, radius, 0, 2 * Math.PI);
    ctx.fill();
    ctx.closePath();

    // Draw an inner circle if we are connected.
    if (isConnected) {
      ctx.beginPath();
      ctx.arc(centerX, centerY, maxCircleWidth / 6, 0, 2 * Math.PI);
      // ctx.fillStyle = "#BCFCE5";
      ctx.fillStyle = 'rgba(188, 252, 229, 0.5)';
      ctx.fill();
      ctx.closePath();
    }

    //Draw a circle with max radius
    ctx.beginPath();
    ctx.arc(centerX, centerY, maxCircleWidth / 2, 0, 2 * Math.PI);
    ctx.strokeStyle = "white";
    ctx.lineWidth = (width / 50 < 3) ? 3 : width / 50;
    ctx.stroke();
    ctx.fillStyle = 'rgba(0, 0, 0, 0.6)';
    ctx.fill()
    ctx.closePath();
  }, [isConnected]);

  const visualizeData = useCallback(() => {
    const width = parent.current ? Math.min(parent.current.clientWidth, parent.current.clientHeight) : 0;
    if (width !== canvasWidth) {
      console.log("Setting canvas width");
      setCanvasWidth(width);
    }
    requestRef.current = window.requestAnimationFrame(() => visualizeData());
    if (!canvasRef.current) {
      console.log("Canvas not found");
      return;
    }
    const ctx = canvasRef.current.getContext("2d");
    const audioData = new Uint8Array(140);
    analyser?.getByteFrequencyData(audioData);
    if (!ctx) {
      console.log("Canvas context not found");
      return;
    }
    const centerX = width / 2;
    const centerY = width / 2;
    draw(width, centerX, centerY, audioData, ctx);
    if (copyCanvasRef?.current) {
      const copyCtx = copyCanvasRef.current.getContext("2d");
      if (copyCtx) {
        draw(100, 295, 70, audioData, copyCtx);
        if (imageUrl) {
          const img = new Image()
          img.src = imageUrl;
          img.onload = function () {
            copyCtx.drawImage(img, 25, 25, 200, 200);
            copyCtx.strokeStyle = 'white';
            copyCtx.rect(25, 25, 200, 200);
            copyCtx.stroke();
          };
        }
      }
    }
  }, [analyser, isConnected, canvasWidth, parent, copyCanvasRef]);


  useEffect(() => {
    if (!analyser) {
      return;
    }
    analyser.smoothingTimeConstant = 0.95;
    visualizeData();
    return () => {
      if (requestRef.current) {
        console.log("Canceling animation frame");
        cancelAnimationFrame(requestRef.current);
      }
    };
  }, [visualizeData, analyser]);

  return (
    <canvas
      className="max-h-full max-w-full"
      ref={canvasRef}
      width={canvasWidth}
      height={canvasWidth}
    />
  );
};


================================================
FILE: client/src/pages/Conversation/components/Controls/Controls.tsx
================================================
import {
  controlBOSMessage,
  controlEOSMessage,
} from "../../../../protocol/testMessages";
import { useSocketContext } from "../../SocketContext";
import { Button } from "../../../../components/Button/Button";

export const Controls = () => {
  const { sendMessage } = useSocketContext();

  const sendControlBOS = () => {
    sendMessage(controlBOSMessage);
  };

  const sendControlEOS = () => {
    sendMessage(controlEOSMessage);
  };
  return (
    <div className="flex w-full justify-between gap-3">
      <Button className="flex-grow" onClick={sendControlEOS}>
        eos
      </Button>
      <Button className="flex-grow" onClick={sendControlBOS}>
        bos
      </Button>
    </div>
  );
};


================================================
FILE: client/src/pages/Conversation/components/ModelParams/ModelParams.tsx
================================================
import { FC, RefObject } from "react";
import { useModelParams } from "../../hooks/useModelParams";
import { Button } from "../../../../components/Button/Button";

type ModelParamsProps = {
  isConnected: boolean;
  modal?: RefObject<HTMLDialogElement>,
} & ReturnType<typeof useModelParams>;
export const ModelParams: FC<ModelParamsProps> = ({
  textTemperature,
  textTopk,
  audioTemperature,
  audioTopk,
  padMult,
  repetitionPenalty,
  repetitionPenaltyContext,
  imageResolution,
  gateDelay,
  gateInfluence,
  displayColor,
  centerCrop,
  setTextTemperature,
  setTextTopk,
  setAudioTemperature,
  setAudioTopk,
  setPadMult,
  setRepetitionPenalty,
  setRepetitionPenaltyContext,
  setImageResolution,
  setGateDelay,
  setGateInfluence,
  setDisplayColor,
  setCenterCrop,
  resetParams,
  isConnected,
  modal,
}) => {
  return (
    <div className=" p-2 mt-6 self-center flex flex-col text-white items-center text-center">
      {!isConnected && <span className="text-xs italic mb-3">Hover over each element to display a helpful tooltip</span>}
      <table>
        <tbody>
          <tr title="Sampling temperature for Moshi's text tokens ('inner monologue')">
            <td>Text temperature:</td>
            <td className="w-12 text-center">{textTemperature}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="text-temperature" name="text-temperature" step="0.01" min="0.2" max="1.2" value={textTemperature} onChange={e => setTextTemperature(parseFloat(e.target.value))} /></td>
          </tr>
          <tr>
            <td title="Sampling top-k for Moshi's text tokens ('inner monologue')">Text topk:</td>
            <td className="w-12 text-center">{textTopk}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="text-topk" name="text-topk" step="1" min="10" max="500" value={textTopk} onChange={e => setTextTopk(parseInt(e.target.value))} /></td>
          </tr>
          <tr title="Sampling temperature for Moshi's audio tokens">
            <td>Audio temperature:</td>
            <td className="w-12 text-center">{audioTemperature}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="audio-temperature" name="audio-temperature" step="0.01" min="0.2" max="1.2" value={audioTemperature} onChange={e => setAudioTemperature(parseFloat(e.target.value))} /></td>
          </tr>
          <tr title="Sampling top-k for Moshi's audio tokens">
            <td>Audio topk:</td>
            <td className="w-12 text-center">{audioTopk}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="audio-topk" name="audio-topk" step="1" min="10" max="500" value={audioTopk} onChange={e => setAudioTopk(parseInt(e.target.value))} /></td>
          </tr>
          <tr title="Up/Down weight the text padding token (lower values make Moshi more reactive)">
            <td>Padding multiplier:</td>
            <td className="w-12 text-center">{padMult}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="audio-pad-mult" name="audio-pad-mult" step="0.05" min="-4" max="4" value={padMult} onChange={e => setPadMult(parseFloat(e.target.value))} /></td>
          </tr>
          <tr title="Up/Down weight repeated tokens (higher values enforce fewer repetitions)">
            <td>Repeat penalty:</td>
            <td className="w-12 text-center">{repetitionPenalty}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="repetition-penalty" name="repetition-penalty" step="0.01" min="1" max="2" value={repetitionPenalty} onChange={e => setRepetitionPenalty(parseFloat(e.target.value))} /></td>
          </tr>
          <tr title="Which horizon to consider for the repeat penalty">
            <td>Repeat penalty last N:</td>
            <td className="w-12 text-center">{repetitionPenaltyContext}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="repetition-penalty-context" name="repetition-penalty-context" step="1" min="0" max="200" value={repetitionPenaltyContext} onChange={e => setRepetitionPenaltyContext(parseFloat(e.target.value))} /></td>
          </tr>
          <tr title="Input image resolution in pixels (the largest side will be resized to the given size)">
            <td>Image max-side (px):</td>
            <td className="w-12 text-center">{imageResolution}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="image-resolution" name="image-resolution" step="16" min="64" max="512" value={imageResolution} onChange={e => setImageResolution(parseFloat(e.target.value))} /></td>
          </tr>
          <tr title="Whether to center crop the image to square or keep its original aspect ratio">
            <td>Center Crop:</td>
            <td className="w-12 text-center">{centerCrop ? '✔️' : '✖️'}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="center-crop" name="center-crop" step="1" min="0" max="1" value={centerCrop ? 1 : 0} onChange={e => setCenterCrop((parseFloat(e.target.value) == 1) ? true : false)} /></td>
          </tr>
          <tr title="Add a temporal delay of X tokens before activating the gate">
            <td>Gating Delay:</td>
            <td className="w-12 text-center">{gateDelay}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="gating-delay" name="gating-delay" step="1" min="0" max="32" value={gateDelay} onChange={e => setGateDelay(parseFloat(e.target.value))} /></td>
          </tr>
          <tr title="Whether to display MoshiVis's gates' outputs via the text color (orange indicates more image relevance; green, more general knowledge)">
            <td>Display Gating:</td>
            <td className="w-12 text-center">{displayColor ? '✔️' : '✖️'}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="display-color" name="display-color" step="1" min="0" max="1" value={displayColor ? 1 : 0} onChange={e => setDisplayColor((parseFloat(e.target.value) == 1) ? true : false)} /></td>
          </tr>
          <tr title="Whether and how much to rescale the text temperature based on MoshiVis's gates' outputs (higher value = text temperature will be lowered when the gates are active, i.e. when the tokens are image relevant)">
            <td>Temperature Gating:</td>
            <td className="w-12 text-center">{gateInfluence}</td>
            <td className="p-2"><input className="range align-middle" disabled={isConnected} type="range" id="gate-influence" name="gate-influence" step="0.01" min="0.0" max="0.99" value={gateInfluence} onChange={e => setGateInfluence(parseFloat(e.target.value))} /></td>
          </tr>
        </tbody>
      </table>
      <div>
        {!isConnected && <Button onClick={resetParams} className="mt-6 mr-4">Reset</Button>}
        {!isConnected && <Button onClick={() => modal?.current?.close()} className="mt-6 ml-4">Ok</Button>}
      </div>
    </div >
  )
};


================================================
FILE: client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx
================================================
import { FC, useRef } from "react";
import { AudioStats, useServerAudio } from "../../hooks/useServerAudio";
import { ServerVisualizer } from "../AudioVisualizer/ServerVisualizer";

type ServerAudioProps = {
  setGetAudioStats: (getAudioStats: () => AudioStats) => void;
  imageUrl: string | undefined;
  copyCanvasRef?: React.RefObject<HTMLCanvasElement>;
};
export const ServerAudio: FC<ServerAudioProps> = ({ setGetAudioStats, imageUrl, copyCanvasRef }) => {
  const { analyser, hasCriticalDelay, setHasCriticalDelay } = useServerAudio({
    setGetAudioStats,
  });
  const containerRef = useRef<HTMLDivElement>(null);
  return (
    <>
      {hasCriticalDelay && (
        <div className="fixed left-0 top-0 flex w-screen justify-between bg-red-500 p-2 text-center text-white">
          <p>A connection issue was detected; you have been reconnected.</p>
          <button
            onClick={async () => {
              setHasCriticalDelay(false);
            }}
            className="bg-white p-1 text-black"
          >
            Dismiss
          </button>
        </div>
      )}
      <div className="server-audio aspect-square" ref={containerRef}>
        <ServerVisualizer analyser={analyser.current} parent={containerRef} imageUrl={imageUrl} copyCanvasRef={copyCanvasRef} />
      </div>
    </>
  );
};


================================================
FILE: client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx
================================================
import { useState, useEffect, useRef } from "react";

type ServerAudioStatsProps = {
  getAudioStats: React.MutableRefObject<
    () => {
      playedAudioDuration: number;
      missedAudioDuration: number;
      totalAudioMessages: number;
      delay: number;
      minPlaybackDelay: number;
      maxPlaybackDelay: number;
    }
  >;
};

export const ServerAudioStats = ({ getAudioStats }: ServerAudioStatsProps) => {
  const [audioStats, setAudioStats] = useState(getAudioStats.current());

  const movingAverageSum = useRef<number>(0);
  const movingAverageCount = useRef<number>(0);
  const movingBeta = 0.85;

  const convertMinSecs = (totalSecs: number) => {
    // Format a duration in seconds as mm:ss.cc
    const mins = Math.floor(totalSecs / 60).toString();
    const secs = (Math.floor(totalSecs) % 60).toString().padStart(2, "0");
    const cents = Math.floor(100 * (totalSecs - Math.floor(totalSecs)))
      .toString()
      .padStart(2, "0");
    return `${mins}:${secs}.${cents}`;
  };

  useEffect(() => {
    const interval = setInterval(() => {
      const newAudioStats = getAudioStats.current();
      setAudioStats(newAudioStats);
      movingAverageCount.current *= movingBeta;
      movingAverageCount.current += 1 - movingBeta;
      movingAverageSum.current *= movingBeta;
      movingAverageSum.current += (1 - movingBeta) * newAudioStats.delay;

    }, 141);
    return () => {
      clearInterval(interval);
    };
  }, []);

  return (
    <div className="w-full border-2 border-white p-2 text-white ">
      <h2 className="text-md pb-2">Server Audio Stats</h2>
      <table>
        <tbody>
          <tr>
            <td className="text-md pr-2">Audio played: </td>
            <td>{convertMinSecs(audioStats.playedAudioDuration)}</td>
          </tr>
          <tr>
            <td className="text-md pr-2">Missed audio: </td>
            <td>{convertMinSecs(audioStats.missedAudioDuration)}</td>
          </tr>
          <tr>
            <td className="text-md pr-2">Latency: </td>
            <td>{movingAverageCount.current > 0 ? (movingAverageSum.current / movingAverageCount.current).toFixed(3) : "—"}</td>
          </tr>
          <tr>
            <td className="text-md pr-2">Min/Max buffer: </td>
            <td>{audioStats.minPlaybackDelay.toFixed(3)} / {audioStats.maxPlaybackDelay.toFixed(3)}</td>
          </tr>
        </tbody>
      </table>
    </div>
  );
};
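
The latency readout above is a bias-corrected exponential moving average: the weighted sum and the weighted count decay by the same beta, so their ratio stays unbiased even for the first few samples (after n updates the count equals 1 - beta^n). A standalone sketch of the same arithmetic, with illustrative names:

function makeDebiasedEma(beta = 0.85) {
  let sum = 0;
  let count = 0;
  return (x: number) => {
    sum = beta * sum + (1 - beta) * x; // decayed sum of samples
    count = beta * count + (1 - beta); // equals 1 - beta^n after n updates
    return sum / count;                // debiased average of recent samples
  };
}
// const ema = makeDebiasedEma(); ema(0.1); ema(0.2); // ≈ mean of recent delays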


================================================
FILE: client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx
================================================
import { useEffect } from "react";
import { useServerInfo } from "../../hooks/useServerInfo";

function pretty_format(num: number): number {
  return Math.round((num + Number.EPSILON) * 100) / 100;
}

export const ServerInfo = (props: { setFileName: (name: string) => void }) => {
  const { serverInfo } = useServerInfo();
  // Notify the parent from an effect: calling a parent state setter during
  // render is a React anti-pattern and triggers warnings.
  useEffect(() => {
    if (serverInfo) {
      props.setFileName(serverInfo.base_filename);
    }
  }, [serverInfo]);
  if (!serverInfo) {
    return null;
  }
  return (
    <div className="p-2 pt-4 self-center flex flex-col text-white border-2 border-white break-words">
      The server is running with the following configuration:
      <div>Image resolution: {serverInfo.image_resolution} px</div>
      <div>Text temperature: {pretty_format(serverInfo.text_temperature)}</div>
      <div>Text topk: {serverInfo.text_topk}</div>
      <div>Temperature gating: {pretty_format(serverInfo.text_temperature_gating_influence)}</div>
      <div>Audio temperature: {pretty_format(serverInfo.audio_temperature)}</div>
      <div>Audio topk: {serverInfo.audio_topk}</div>
      <div>Pad mult: {serverInfo.pad_mult}</div>
      <div>Repeat penalty last N: {serverInfo.repetition_penalty_context}</div>
      <div>Repeat penalty: {serverInfo.repetition_penalty}</div>
      <div>LM model file: {serverInfo.lm_model_file}</div>
      <div>Instance name: {serverInfo.instance_name}</div>
    </div>
  );
};


================================================
FILE: client/src/pages/Conversation/components/TextDisplay/TextDisplay.tsx
================================================
import { FC, useEffect, useRef } from "react";
import { useServerText } from "../../hooks/useServerText";

type TextDisplayProps = {
  containerRef: React.RefObject<HTMLDivElement>;
  displayColor: boolean | undefined;
};

// Palette 2: Purple to Green Moshi
// sns.diverging_palette(288, 145, s=90, l=72, n=11).as_hex()
// Palette 2: Green to orange Moshi
// sns.diverging_palette(145, 40, s=90, l=72, n=11).as_hex()
const textDisplayColors = [
  '#38c886', '#5bd09a', '#80d9af',
  '#a4e2c4', '#c8ead9', '#f2f1f1',
  '#f4e0cb', '#f5cea6', '#f5bd81',
  '#f6ac5b', '#f79b37']

function clamp_color(v: number) {
  // Clamp to a valid palette index; the last valid index is length - 1,
  // not length, which would read past the end of the array.
  return v <= 0
    ? 0
    : v >= textDisplayColors.length
      ? textDisplayColors.length - 1
      : v;
}

export const TextDisplay: FC<TextDisplayProps> = ({
  containerRef, displayColor
}) => {
  const { text, textColor } = useServerText();
  const currentIndex = text.length - 1;
  const prevScrollTop = useRef(0);

  useEffect(() => {
    if (containerRef.current) {
      prevScrollTop.current = containerRef.current.scrollTop;
      containerRef.current.scroll({
        top: containerRef.current.scrollHeight,
        behavior: "smooth",
      });
    }
  }, [text]);
  if (displayColor && (textColor.length == text.length)) {
    return (
      <div className="h-full w-full max-w-full max-h-full  p-2 text-white">
        {text.map((t, i) => (
          <span
            key={i}
            className={`${i === currentIndex ? "font-bold" : "font-normal"}`}
            style={{
              color: `${textDisplayColors[clamp_color(textColor[i])]}`
            }}
          >
            {t}
          </span>
        ))
        }
      </div >
    );
  }
  else {
    return (
      <div className="h-full w-full max-w-full max-h-full  p-2 text-white">
        {text.map((t, i) => (
          <span
            key={i}
            className={`${i === currentIndex ? "font-bold" : "font-normal"}`}
          >
            {t}
          </span>
        ))}
      </div>
    );
  }
};
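
Each token's color is an index into the 11-entry green-to-orange palette above, with low indices signalling general knowledge and high indices image relevance; clamp_color keeps server-sent gate values inside the valid 0..10 range. Illustrative values only:

// Out-of-range gate values are clamped to the first or last palette entry.
[-2, 0, 5, 10, 99].map(clamp_color); // -> [0, 0, 5, 10, 10]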


================================================
FILE: client/src/pages/Conversation/components/TextDisplay/TextDisplayStats.tsx
================================================
import { FC } from "react";

type TextDisplayStatsProps = {
  totalTextMessages: number;
};
export const TextDisplayStats: FC<TextDisplayStatsProps> = ({
  totalTextMessages,
}) => {
  return (
    <div className="w-60 flex-shrink-0">
      <h2 className="text-center text-lg">Text Display Stats</h2>
      <div>
        <div className="flex justify-evenly">
          <p className="text-md">Total messages:</p>
          <p>{totalTextMessages}</p>
        </div>
      </div>
    </div>
  );
};


================================================
FILE: client/src/pages/Conversation/components/UserAudio/UserAudio.tsx
================================================
import { FC, useCallback, useEffect, useRef, useState } from "react";
import { useSocketContext } from "../../SocketContext";
import { useUserAudio } from "../../hooks/useUserAudio";
import { ClientVisualizer } from "../AudioVisualizer/ClientVisualizer";

type UserAudioProps = {
  copyCanvasRef: React.RefObject<HTMLCanvasElement>;
};
export const UserAudio: FC<UserAudioProps> = ({ copyCanvasRef }) => {
  const [analyser, setAnalyser] = useState<AnalyserNode | null>(null);
  const { sendMessage, isConnected } = useSocketContext();
  const containerRef = useRef<HTMLDivElement>(null);
  const onRecordingStart = useCallback(() => {
    console.log("Recording started");
  }, []);

  const onRecordingStop = useCallback(() => {
    console.log("Recording stopped");
  }, []);

  const onRecordingChunk = useCallback(
    (chunk: Uint8Array) => {
      if (!isConnected) {
        return;
      }
      sendMessage({
        type: "audio",
        data: chunk,
      });
    },
    [sendMessage, isConnected],
  );

  const { startRecordingUser, stopRecording } = useUserAudio({
    constraints: {
      audio: {
        echoCancellation: true,
        noiseSuppression: true,
        autoGainControl: true,
        channelCount: 1,
      },
      video: false,
    },
    onDataChunk: onRecordingChunk,
    onRecordingStart,
    onRecordingStop,
  });

  useEffect(() => {
    let res: Awaited<ReturnType<typeof startRecordingUser>>;
    if (isConnected) {
      startRecordingUser().then(result => {
        if (result) {
          res = result;
          setAnalyser(result.analyser);
        }
      });
    }
    return () => {
      console.log("Stop recording called from somewhere else.");
      stopRecording();
      res?.source?.disconnect();
    };
  }, [startRecordingUser, stopRecording, isConnected]);

  return (
    <div className="user-audio aspect-square" ref={containerRef}>
      <ClientVisualizer analyser={analyser} parent={containerRef} copyCanvasRef={copyCanvasRef} />
    </div>
  );
};


================================================
FILE: client/src/pages/Conversation/components/UserAudio/UserAudioStats.tsx
================================================
import { FC } from "react";

type UserAudioStatsProps = {
  sentMessagesCount: number;
};

export const UserAudioStats: FC<UserAudioStatsProps> = ({
  sentMessagesCount,
}) => {
  return (
    <div>
      <h2 className="text-center text-lg">User Audio Stats</h2>
      <div>
        <div className="flex justify-between">
          <p className="text-md">Total messages:</p>
          <p>{sentMessagesCount}</p>
        </div>
      </div>
    </div>
  );
};


================================================
FILE: client/src/pages/Conversation/getMimeType.ts
================================================
export const mimeTypeCheck = () => {
  const types = [
    "audio/ogg",
    "audio/wav",
    "audio/webm;codecs=opus",
    "audio/webm;codecs=pcm",
    "audio/webm;codecs=pcm_s16le",
    "audio/webm;codecs=pcm_f32le",
    "audio/mp3",
    "audio/aac",
    "audio/mp4",
    "audio/webm",
    "audio/mpeg",
    "video/mp4",
    "video/webm;codecs=vp9",
    "video/webm;codecs=vp8",
    "video/webm",
  ];
  for (const mime of types) {
    console.log(mime, MediaRecorder.isTypeSupported(mime));
  }
};

const getVideoMimeType = () => {
  if (!MediaRecorder.isTypeSupported) {
    return "video/mp4";
  }
  if (MediaRecorder.isTypeSupported("video/webm")) {
    return "video/webm";
  }
  if (MediaRecorder.isTypeSupported("video/mp4")) {
    return "video/mp4";
  }
  console.log("No supported video mime type found");
  return "";
};

const getAudioMimeType = () => {
  if (!MediaRecorder.isTypeSupported) {
    return "audio/mp4";
  }
  if (MediaRecorder.isTypeSupported("audio/webm")) {
    return "audio/webm";
  }
  if (MediaRecorder.isTypeSupported("audio/mpeg")) {
    return "audio/mpeg";
  }
  if (MediaRecorder.isTypeSupported("audio/mp4")) {
    return "audio/mp4";
  }
  console.log("No supported audio mime type found");
  return "";
};

export const getMimeType = (type: "audio" | "video") => {
  if (type === "audio") {
    return getAudioMimeType();
  }
  return getVideoMimeType();
};

export const getExtension = (type: "audio" | "video") => {
  if (getMimeType(type).includes("mp4")) {
    return "mp4";
  }
  if (getMimeType(type).includes("mpeg")) {
    return "mp3";
  }
  return "webm";
};
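
getMimeType and getExtension exist to configure MediaRecorder and to name recordings consistently. A hedged usage sketch under that assumption (the three-second recording and file name are illustrative, not repo code):

async function recordSketch() {
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  const mimeType = getMimeType("audio"); // e.g. "audio/webm" on Chrome
  const recorder = new MediaRecorder(stream, mimeType ? { mimeType } : undefined);
  const chunks: Blob[] = [];
  recorder.ondataavailable = (e) => chunks.push(e.data);
  recorder.onstop = () => {
    const blob = new Blob(chunks, { type: mimeType });
    console.log(`recording.${getExtension("audio")}`, blob.size, "bytes");
  };
  recorder.start();
  setTimeout(() => recorder.stop(), 3000); // stop after three seconds
}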

================================================
FILE: client/src/pages/Conversation/hooks/audioUtils.ts
================================================
export const clamp = (value: number, min: number, max: number) => {
  return Math.min(Math.max(value, min), max);
};


================================================
FILE: client/src/pages/Conversation/hooks/useModelParams.ts
================================================
import { useCallback, useState } from "react";

export const DEFAULT_TEXT_TEMPERATURE = 0.45;
export const DEFAULT_TEXT_TOPK = 25;
export const DEFAULT_AUDIO_TEMPERATURE = 0.7;
export const DEFAULT_AUDIO_TOPK = 250;
export const DEFAULT_PAD_MULT = 0;
export const DEFAULT_REPETITION_PENALTY_CONTEXT = 64;
export const DEFAULT_REPETITION_PENALTY = 1.15;
export const DEFAULT_IMAGE_RESOLUTION = 448;
export const DEFAULT_IMAGE_URL = undefined;
export const DEFAULT_GATE_DELAY = 16;
export const DEFAULT_GATE_INFLUENCE = 0.0;
export const DEFAULT_DISPLAY_COLOR = true;
export const DEFAULT_CENTER_CROP = false;

export type ModelParamsValues = {
  textTemperature: number;
  textTopk: number;
  audioTemperature: number;
  audioTopk: number;
  padMult: number;
  repetitionPenaltyContext: number,
  repetitionPenalty: number,
  imageResolution: number,
  imageUrl: string | undefined,
  gateDelay: number,
  gateInfluence: number,
  displayColor: boolean,
  centerCrop: boolean,
};

export function importantSettingsHaveChanged(params: ModelParamsValues): boolean {
  return (params.textTemperature != DEFAULT_TEXT_TEMPERATURE) ||
    (params.textTopk != DEFAULT_TEXT_TOPK) ||
    (params.audioTemperature != DEFAULT_AUDIO_TEMPERATURE) ||
    (params.audioTopk != DEFAULT_AUDIO_TOPK) ||
    (params.padMult != DEFAULT_PAD_MULT) ||
    (params.repetitionPenalty != DEFAULT_REPETITION_PENALTY) ||
    (params.repetitionPenaltyContext != DEFAULT_REPETITION_PENALTY_CONTEXT) ||
    (params.imageResolution != DEFAULT_IMAGE_RESOLUTION) ||
    (params.gateDelay != DEFAULT_GATE_DELAY) ||
    (params.gateInfluence != DEFAULT_GATE_INFLUENCE) ||
    (params.centerCrop != DEFAULT_CENTER_CROP);
}

type useModelParamsArgs = Partial<ModelParamsValues>;

export const useModelParams = (params?: useModelParamsArgs) => {

  // `??` rather than `||`: stored values of 0 (a valid gateDelay or
  // repetitionPenaltyContext) must not silently fall back to the defaults.
  const [textTemperature, setTextTemperatureBase] = useState(params?.textTemperature ?? DEFAULT_TEXT_TEMPERATURE);
  const [textTopk, setTextTopkBase] = useState(params?.textTopk ?? DEFAULT_TEXT_TOPK);
  const [audioTemperature, setAudioTemperatureBase] = useState(params?.audioTemperature ?? DEFAULT_AUDIO_TEMPERATURE);
  const [audioTopk, setAudioTopkBase] = useState(params?.audioTopk ?? DEFAULT_AUDIO_TOPK);
  const [padMult, setPadMultBase] = useState(params?.padMult ?? DEFAULT_PAD_MULT);
  const [repetitionPenalty, setRepetitionPenaltyBase] = useState(params?.repetitionPenalty ?? DEFAULT_REPETITION_PENALTY);
  const [repetitionPenaltyContext, setRepetitionPenaltyContextBase] = useState(params?.repetitionPenaltyContext ?? DEFAULT_REPETITION_PENALTY_CONTEXT);
  const [imageResolution, setImageResolutionBase] = useState(params?.imageResolution ?? DEFAULT_IMAGE_RESOLUTION);
  const [imageUrl, setImageUrlBase] = useState(params?.imageUrl ?? DEFAULT_IMAGE_URL);
  const [gateDelay, setGateDelayBase] = useState(params?.gateDelay ?? DEFAULT_GATE_DELAY);
  const [gateInfluence, setGateInfluenceBase] = useState(params?.gateInfluence ?? DEFAULT_GATE_INFLUENCE);
  const [displayColor, setDisplayColorBase] = useState<boolean>(params?.displayColor ?? DEFAULT_DISPLAY_COLOR);
  const [centerCrop, setCenterCropBase] = useState<boolean>(params?.centerCrop ?? DEFAULT_CENTER_CROP);

  const resetParams = useCallback(() => {
    setTextTemperatureBase(DEFAULT_TEXT_TEMPERATURE);
    setTextTopkBase(DEFAULT_TEXT_TOPK);
    setAudioTemperatureBase(DEFAULT_AUDIO_TEMPERATURE);
    setAudioTopkBase(DEFAULT_AUDIO_TOPK);
    setPadMultBase(DEFAULT_PAD_MULT);
    setRepetitionPenaltyBase(DEFAULT_REPETITION_PENALTY);
    setRepetitionPenaltyContextBase(DEFAULT_REPETITION_PENALTY_CONTEXT);
    setImageResolutionBase(DEFAULT_IMAGE_RESOLUTION);
    setImageUrlBase(DEFAULT_IMAGE_URL);
    setGateDelayBase(DEFAULT_GATE_DELAY);
    setGateInfluenceBase(DEFAULT_GATE_INFLUENCE);
    setDisplayColorBase(DEFAULT_DISPLAY_COLOR);
    setCenterCropBase(DEFAULT_CENTER_CROP);
  }, [
    setTextTemperatureBase,
    setTextTopkBase,
    setAudioTemperatureBase,
    setAudioTopkBase,
    setPadMultBase,
    setRepetitionPenaltyBase,
    setRepetitionPenaltyContextBase,
    setImageResolutionBase,
    setImageUrlBase,
    setDisplayColorBase,
    setCenterCropBase,
  ]);

  const setTextTemperature = useCallback((value: number) => {
    if (value <= 1.2 && value >= 0.2) {
      setTextTemperatureBase(value);
    }
  }, []);
  const setTextTopk = useCallback((value: number) => {
    if (value <= 500 && value >= 10) {
      setTextTopkBase(value);
    }
  }, []);
  const setAudioTemperature = useCallback((value: number) => {
    if (value <= 1.2 && value >= 0.2) {
      setAudioTemperatureBase(value);
    }
  }, []);
  const setAudioTopk = useCallback((value: number) => {
    if (value <= 500 && value >= 10) {
      setAudioTopkBase(value);
    }
  }, []);
  const setPadMult = useCallback((value: number) => {
    if (value <= 4 && value >= -4) {
      setPadMultBase(value);
    }
  }, []);
  const setRepetitionPenalty = useCallback((value: number) => {
    if (value <= 2.0 && value >= 1.0) {
      setRepetitionPenaltyBase(value);
    }
  }, []);
  const setRepetitionPenaltyContext = useCallback((value: number) => {
    if (value <= 200 && value >= 0) {
      setRepetitionPenaltyContextBase(value);
    }
  }, []);
  const setImageResolution = useCallback((value: number) => {
    // Keep in sync with the slider range in ModelParams.tsx (min 64, max 512);
    // the previous lower bound of 160 silently rejected valid slider values.
    if (value <= 512 && value >= 64) {
      setImageResolutionBase(value);
    }
  }, []);
  const setImageUrl = useCallback((value: string | undefined) => {
    setImageUrlBase(value);
  }, []);
  const setGateDelay = useCallback((value: number) => {
    if (value <= 32 && value >= 0) {
      setGateDelayBase(value);
    }
  }, []);
  const setGateInfluence = useCallback((value: number) => {
    if (value <= 1.0 && value >= 0.0) {
      setGateInfluenceBase(value);
    }
  }, []);
  const setDisplayColor = useCallback((value: boolean) => {
    setDisplayColorBase(value);
  }, []);
  const setCenterCrop = useCallback((value: boolean) => {
    setCenterCropBase(value);
  }, []);
  return {
    textTemperature,
    textTopk,
    audioTemperature,
    audioTopk,
    padMult,
    repetitionPenalty,
    repetitionPenaltyContext,
    imageResolution,
    imageUrl,
    gateDelay,
    gateInfluence,
    displayColor,
    centerCrop,
    setTextTemperature,
    setTextTopk,
    setAudioTemperature,
    setAudioTopk,
    setPadMult,
    setRepetitionPenalty,
    setRepetitionPenaltyContext,
    setImageUrl,
    setImageResolution,
    setGateDelay,
    setGateInfluence,
    setDisplayColor,
    setCenterCrop,
    resetParams,
  }
}
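
A minimal consumer of the hook above; the component name is illustrative. Spreading the hook's full return value into ModelParams, as Queue.tsx does further down, is the intended usage; this sketch just shows one parameter in isolation:

// import { useModelParams } from "./useModelParams";
const TemperatureKnob = () => {
  const params = useModelParams();
  return (
    <input
      type="range" min="0.2" max="1.2" step="0.01"
      value={params.textTemperature}
      // the validated setter silently ignores values outside [0.2, 1.2]
      onChange={(e) => params.setTextTemperature(parseFloat(e.target.value))}
    />
  );
};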

================================================
FILE: client/src/pages/Conversation/hooks/useServerAudio.ts
================================================
import { useCallback, useEffect, useRef, useState } from "react";
import { useSocketContext } from "../SocketContext";
import { decodeMessage } from "../../../protocol/encoder";
import { useMediaContext } from "../MediaContext";
import { DecoderWorker } from "../../../decoder/decoderWorker";

export type AudioStats = {
  playedAudioDuration: number;
  missedAudioDuration: number;
  totalAudioMessages: number;
  delay: number;
  minPlaybackDelay: number;
  maxPlaybackDelay: number;
};

type useServerAudioArgs = {
  setGetAudioStats?: (getAudioStats: () => AudioStats) => void;
};

type WorkletStats = {
  totalAudioPlayed: number;
  actualAudioPlayed: number;
  delay: number;
  minDelay: number;
  maxDelay: number;
};

export const useServerAudio = ({setGetAudioStats}: useServerAudioArgs) => {
  const { socket  } = useSocketContext();
  const {startRecording, stopRecording, audioContext, worklet, micDuration, actualAudioPlayed } =
    useMediaContext();
  const analyser = useRef(audioContext.current.createAnalyser());
  // Connecting on every render is harmless: the Web Audio API ignores
  // duplicate connections between the same pair of nodes.
  worklet.current.connect(analyser.current);
  const startTime = useRef<number | null>(null);
  const decoderWorker = useRef(DecoderWorker);
  const [hasCriticalDelay, setHasCriticalDelay] = useState(false);
  const totalAudioMessages = useRef(0);
  const receivedDuration = useRef(0);
  const workletStats = useRef<WorkletStats>({
    totalAudioPlayed: 0,
    actualAudioPlayed: 0,
    delay: 0,
    minDelay: 0,
    maxDelay: 0,});

  const onDecode = useCallback(
    async (data: Float32Array) => {
      receivedDuration.current += data.length / audioContext.current.sampleRate;
      worklet.current.port.postMessage({frame: data, type: "audio", micDuration: micDuration.current});
    },
    [],
  );

  const onWorkletMessage = useCallback(
    (event: MessageEvent<WorkletStats>) => {
      workletStats.current = event.data;
      actualAudioPlayed.current = workletStats.current.actualAudioPlayed;
    },
    [],
  );
  worklet.current.port.onmessage = onWorkletMessage;

  const getAudioStats = useCallback(() => {
    return {
      playedAudioDuration: workletStats.current.actualAudioPlayed,
      delay: workletStats.current.delay,
      minPlaybackDelay: workletStats.current.minDelay,
      maxPlaybackDelay: workletStats.current.maxDelay,
      missedAudioDuration: workletStats.current.totalAudioPlayed - workletStats.current.actualAudioPlayed,
      totalAudioMessages: totalAudioMessages.current,
    };
  }, []);

  const onWorkerMessage = useCallback(
    (e: MessageEvent<any>) => {
      if (!e.data) {
        return;
      }
      onDecode(e.data[0]);
    },
    [onDecode],
  );

  // Debug counter: the memoized callback below closes over this binding, so it
  // persists across renders and only the first five messages are logged.
  let midx = 0;
  const decodeAudio = useCallback((data: Uint8Array) => {
    if (midx < 5) {
      console.log(Date.now() % 1000, "Got NETWORK message", micDuration.current - workletStats.current.actualAudioPlayed, midx++);
    }
    decoderWorker.current.postMessage(
      {
        command: "decode",
        pages: data,
      },
      [data.buffer],
    );
  }, []);

  const onSocketMessage = useCallback(
    (e: MessageEvent) => {
      const dataArray = new Uint8Array(e.data);
      const message = decodeMessage(dataArray);
      if (message.type === "audio") {
        decodeAudio(message.data);
        // For stats purposes, for now.
        totalAudioMessages.current++;
      }
    },
    [decodeAudio],
  );

  useEffect(() => {
    const currentSocket = socket;
    if (!currentSocket) {
      return;
    }
    worklet.current.port.postMessage({type: "reset"});
    console.log(Date.now() % 1000, "Should start in a bit");
    startRecording();
    currentSocket.addEventListener("message", onSocketMessage);
    totalAudioMessages.current = 0;
    return () => {
      console.log("Stop recording called in unknown function.")
      stopRecording();
      startTime.current = null;
      currentSocket.removeEventListener("message", onSocketMessage);
    };
  }, [socket]);

  useEffect(() => {
    if (setGetAudioStats) {
      console.log("Setting getAudioStats");
      setGetAudioStats(getAudioStats);
    }
  }, [setGetAudioStats, getAudioStats]);

  useEffect(() => {
    decoderWorker.current.onmessage = onWorkerMessage;
    // 960 = 24000 / 12.5 / 2
    // The /2 is a bit optional, but won't hurt for recording the mic, and for
    // the decoding it might help getting some decoded audio out asap.
    decoderWorker.current.postMessage({
      command: "init",
      bufferLength: 960 * audioContext.current.sampleRate / 24000,
      decoderSampleRate: 24000,
      outputBufferSampleRate: audioContext.current.sampleRate,
      resampleQuality: 0,
    });

    return () => {
      console.log("Terminating worker");
    };
  }, [onWorkerMessage]);

  return {
    decodeAudio,
    analyser,
    getAudioStats,
    hasCriticalDelay,
    setHasCriticalDelay,
  };
};
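
The decoder's bufferLength of 960 comes from Moshi's frame rate: 12.5 frames per second at 24 kHz gives 24000 / 12.5 = 1920 samples per frame, and halving it lets decoded audio surface sooner, per the comment above. A sketch of that arithmetic, with assumed constant names:

const MOSHI_SAMPLE_RATE = 24000; // Hz, the model's native rate
const FRAME_RATE = 12.5;         // Moshi frames per second
const samplesPerFrame = MOSHI_SAMPLE_RATE / FRAME_RATE; // 1920
const halfFrame = samplesPerFrame / 2;                  // 960, lower latency
// Rescaled to the browser's AudioContext rate, as in the "init" message:
const bufferLength = (outputRate: number) => (halfFrame * outputRate) / MOSHI_SAMPLE_RATE;
console.log(bufferLength(48000)); // 1920 output samples per decoded buffer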


================================================
FILE: client/src/pages/Conversation/hooks/useServerInfo.ts
================================================
import { useCallback, useEffect, useState } from "react";
import { useSocketContext } from "../SocketContext";
import { decodeMessage } from "../../../protocol/encoder";
import { z } from "zod";

const ServersInfoSchema = z.object({
  text_temperature: z.number(),
  text_topk: z.number(),
  text_temperature_gating_influence: z.number(),
  audio_temperature: z.number(),
  audio_topk: z.number(),
  pad_mult: z.number(),
  repetition_penalty_context: z.number(),
  repetition_penalty: z.number(),
  image_resolution: z.number(),
  lm_model_file: z.string(),
  instance_name: z.string(),
  base_filename: z.string(),
  build_info: z.object({
    build_timestamp: z.string(),
    build_date: z.string(),
    git_branch: z.string(),
    git_timestamp: z.string(),
    git_date: z.string(),
    git_hash: z.string(),
    git_describe: z.string(),
    rustc_host_triple: z.string(),
    rustc_version: z.string(),
    cargo_target_triple: z.string(),
  }),
});

const parseInfo = (infos: any) => {
  const serverInfo = ServersInfoSchema.safeParse(infos);
  if (!serverInfo.success) {
    console.error(serverInfo.error);
    return null;
  }
  return serverInfo.data;
};

type ServerInfo = {
  text_temperature: number;
  text_topk: number;
  text_temperature_gating_influence: number;
  audio_temperature: number;
  audio_topk: number;
  pad_mult: number;
  repetition_penalty_context: number;
  repetition_penalty: number;
  image_resolution: number;
  lm_model_file: string;
  instance_name: string;
  base_filename: string;
  build_info: {
    build_timestamp: string;
    build_date: string;
    git_branch: string;
    git_timestamp: string;
    git_date: string;
    git_hash: string;
    git_describe: string;
    rustc_host_triple: string;
    rustc_version: string;
    cargo_target_triple: string;
  };
}

export const useServerInfo = () => {
  const [serverInfo, setServerInfo] = useState<ServerInfo | null>(null);
  const { socket } = useSocketContext();

  const onSocketMessage = useCallback((e: MessageEvent) => {
    const dataArray = new Uint8Array(e.data);
    const message = decodeMessage(dataArray);
    if (message.type === "metadata") {
      const infos = parseInfo(message.data);
      if (infos) {
        setServerInfo(infos);
        console.log("received metadata", infos);
      }
    }
  }, [setServerInfo]);

  useEffect(() => {
    const currentSocket = socket;
    if (!currentSocket) {
      return;
    }
    setServerInfo(null);
    currentSocket.addEventListener("message", onSocketMessage);
    return () => {
      currentSocket.removeEventListener("message", onSocketMessage);
    };
  }, [socket]);

  return { serverInfo };
};
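
The hand-written ServerInfo type above mirrors ServersInfoSchema field for field. Should the two ever drift, zod can derive the type from the schema directly; a behavior-identical one-line alternative:

// Keeps the TypeScript type in lockstep with the runtime validator.
type ServerInfo = z.infer<typeof ServersInfoSchema>;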


================================================
FILE: client/src/pages/Conversation/hooks/useServerText.ts
================================================
import { useCallback, useEffect, useState } from "react";
import { useSocketContext } from "../SocketContext";
import { decodeMessage } from "../../../protocol/encoder";

export const useServerText = () => {
  const [text, setText] = useState<string[]>([]);
  const [textColor, setTextColor] = useState<number[]>([]);
  const [totalTextMessages, setTotalTextMessages] = useState(0);
  const { socket } = useSocketContext();

  const onSocketMessage = useCallback((e: MessageEvent) => {
    const dataArray = new Uint8Array(e.data);
    const message = decodeMessage(dataArray);
    if (message.type === "text") {
      setText(text => [...text, message.data]);
      setTotalTextMessages(count => count + 1);
    } else if (message.type === "coloredtext") {
      setText(text => [...text, message.data]);
      setTextColor(textColor => [...textColor, message.color]);
      setTotalTextMessages(count => count + 1);
    }
  }, []);

  useEffect(() => {
    const currentSocket = socket;
    if (!currentSocket) {
      return;
    }
    setText([]);
    currentSocket.addEventListener("message", onSocketMessage);
    return () => {
      currentSocket.removeEventListener("message", onSocketMessage);
    };
  }, [socket]);

  return { text, textColor, totalTextMessages };
};


================================================
FILE: client/src/pages/Conversation/hooks/useSocket.ts
================================================
import { useState, useEffect, useCallback, useRef } from "react";
import { WSMessage } from "../../../protocol/types";
import { decodeMessage, encodeMessage } from "../../../protocol/encoder";

export const useSocket = ({
  onMessage,
  uri,
  onDisconnect: onDisconnectProp,
  imageUrl,
}: {
  onMessage?: (message: WSMessage) => void;
  uri: string;
  onDisconnect?: () => void;
  imageUrl?: string;
}) => {
  const lastMessageTime = useRef<null | number>(null);
  const [isConnected, setIsConnected] = useState(false);
  const [imageSent, setImageSent] = useState(false);
  const [onConnectDone, setOnConnectDone] = useState(false);
  const [socket, setSocket] = useState<WebSocket | null>(null);
  const sendMessage = useCallback(
    (message: WSMessage) => {
      if (!socket) {
        console.log("socket not present");
        return false;
      }
      // audio message with no connection
      if (message.type == "audio" && !isConnected) {
        console.log("isConnected false on audio message, please wait for handshake");
        return false;
      }
      // otherwise send message
      socket.send(encodeMessage(message));
      return true;
    },
    [isConnected, socket],
  );
  useEffect(() => {
    async function sendImage() {
      console.log("image send", imageSent);
      console.log("image url", imageUrl);
      if (imageUrl && !imageSent) {
        const imageBytes = await fetchImageBytes(imageUrl);
        const sent = sendMessage({
          type: "image",
          data: imageBytes,
        });
        if (sent) {
          console.log("Image sent");
          setImageSent(true);
        }
      }
    }

    sendImage();
  }, [socket, onConnectDone, imageUrl, imageSent]);

  const onConnect = useCallback(() => {
    console.log("connected, now waiting for handshake.");
    setOnConnectDone(true);
  }, []);

  const onDisconnect = useCallback(() => {
    console.log("disconnected");
    if (onDisconnectProp) {
      onDisconnectProp();
    }
    setIsConnected(false);
  }, [onDisconnectProp]);

  const onMessageEvent = useCallback(
    (eventData: MessageEvent) => {
      lastMessageTime.current = Date.now();
      const dataArray = new Uint8Array(eventData.data);
      const message = decodeMessage(dataArray);
      if (message.type == "handshake") {
        console.log("Handshake received, let's rocknroll.");
        setIsConnected(true);
      }
      if (!onMessage) {
        return;
      }
      onMessage(message);
    },
    [onMessage, setIsConnected],
  );

  const start = useCallback(() => {
    const ws = new WebSocket(uri);
    ws.binaryType = "arraybuffer";
    ws.addEventListener("open", onConnect);
    ws.addEventListener("close", onDisconnect);
    ws.addEventListener("message", onMessageEvent);
    setSocket(ws);
    console.log("Socket created", ws);
    lastMessageTime.current = Date.now();
  }, [uri, onMessage, onDisconnectProp]);

  const stop = useCallback(() => {
    setIsConnected(false);
    if (onDisconnectProp) {
      onDisconnectProp();
    }
    socket?.close();
    setSocket(null);
  }, [socket]);

  useEffect(() => {
    if (!isConnected) {
      return;
    }
    const intervalId = setInterval(() => {
      if (lastMessageTime.current && Date.now() - lastMessageTime.current > 10000) {
        console.log("closing socket due to inactivity", socket);
        socket?.close();
        onDisconnect();
        clearInterval(intervalId);
      }
    }, 500);

    return () => {
      lastMessageTime.current = null;
      clearInterval(intervalId);
    };
  }, [isConnected, socket]);

  return {
    isConnected,
    socket,
    sendMessage,
    start,
    stop,
  };
};

async function fetchImageBytes(imageUrl: string) {
  const response = await fetch(imageUrl);

  if (!response.ok) {
    throw new Error(`Failed to fetch image: ${response.statusText}`);
  }
  const arrayBuffer = await response.arrayBuffer();
  return new Uint8Array(arrayBuffer);
}
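
A hedged sketch of a component driving the hook above; the endpoint and handler are illustrative. Note that, per sendMessage, audio messages are dropped until the server's handshake flips isConnected:

// import { useEffect } from "react"; import { useSocket } from "./useSocket";
const SocketProbe = () => {
  const { isConnected, start, stop } = useSocket({
    uri: "wss://example.invalid/api/chat", // illustrative endpoint
    onMessage: (msg) => console.log("got", msg.type),
  });
  useEffect(() => {
    start();             // open the socket on mount
    return () => stop(); // close it on unmount
  }, []);
  return <p>connected: {String(isConnected)}</p>;
};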

================================================
FILE: client/src/pages/Conversation/hooks/useUserAudio.ts
================================================
import { useCallback, useRef, useState } from "react";
import Recorder from "opus-recorder";
import encoderPath from "opus-recorder/dist/encoderWorker.min.js?url";
import { useMediaContext } from "../MediaContext";

export enum UserMediaStatuses {
  IDLE = "IDLE",
  READY = "READY",
  WAITING_FOR_PERMISSION = "WAITING_FOR_PERMISSION",
  ERROR = "ERROR",
  RECORDING = "RECORDING",
  STOPPED = "STOPPED",
  STOPPING = "STOPPING",
}

type useUserAudioArgs = {
  constraints: MediaStreamConstraints;
  onDataChunk?: (chunk: Uint8Array) => void;
  onRecordingStart?: () => void;
  onRecordingStop?: () => void;
};

export const useUserAudio = ({
  constraints,
  onDataChunk,
  onRecordingStart = () => {},
  onRecordingStop = () => {},
}: useUserAudioArgs) => {
  const { audioStreamDestination, audioContext, micDuration } = useMediaContext();
  const [error, setError] = useState<string | null>(null);
  const [status, setStatus] = useState<UserMediaStatuses>(
    UserMediaStatuses.IDLE,
  );

  //TODO: Fix any type for recorder
  const recorder = useRef<any>(null);

  const getMediaStream = useCallback(async () => {
    setStatus(UserMediaStatuses.WAITING_FOR_PERMISSION);
    try {
      const stream =
        await window.navigator.mediaDevices.getUserMedia(constraints);
      setStatus(UserMediaStatuses.IDLE);
      return stream;
    } catch (error: any) {
      console.error(error);
      setError(error.name);
      setStatus(UserMediaStatuses.ERROR);
      return null;
    }
  }, [constraints, setStatus]);

  const startRecordingUser = useCallback(async () => {
    console.log(Date.now() % 1000, "Starting recording in user audio");
    const mediaStream = await getMediaStream();
    if (mediaStream) {
      const analyser = audioContext.current.createAnalyser();
      const source = audioContext.current.createMediaStreamSource(mediaStream);
      source.connect(analyser);
      source.connect(audioStreamDestination.current);

      // For buffer length: 960 = 24000 / 12.5 / 2.
      // The /2 is a bit optional, but won't hurt for recording the mic.
      // Note that bufferLength actually has no impact for mono audio; only
      // frameSize and maxFramesPerPage seem to have any.
      const recorderOptions = {
        mediaTrackConstraints: constraints,
        encoderPath,
        bufferLength: Math.round(960 * audioContext.current.sampleRate / 24000),
        encoderFrameSize: 20,
        encoderSampleRate: 24000,
        maxFramesPerPage: 2,
        numberOfChannels: 1,
        recordingGain: 1,
        resampleQuality: 3,
        encoderComplexity: 0,
        encoderApplication: 2049,
        streamPages: true,
      };
      let chunk_idx = 0;
      let lastpos = 0;
      recorder.current = new Recorder(recorderOptions);
      recorder.current.ondataavailable = (data: Uint8Array) => {
        // Opus always works internally at 48 kHz, so that is the proper value to divide by here.
        micDuration.current = recorder.current.encodedSamplePosition / 48000;
        if (chunk_idx < 5) {
          console.log(Date.now() % 1000, "Mic Data chunk", chunk_idx++, (recorder.current.encodedSamplePosition - lastpos) / 48000, micDuration.current);
          lastpos = recorder.current.encodedSamplePosition;
        }
        if (onDataChunk) {
          onDataChunk(data);
        }
      };
      recorder.current.onstart = () => {
        setStatus(UserMediaStatuses.RECORDING);
        onRecordingStart();
      };
      recorder.current.onstop = () => {
        setStatus(UserMediaStatuses.STOPPED);
        source.disconnect();
        onRecordingStop();

        recorder.current = null;
      };

      if (recorder.current) {
        // setTimeout(() => {recorder.current.start(); setStatus(UserMediaStatuses.RECORDING);}, 1500);
        recorder.current.start();
      }

      return {
        analyser,
        mediaStream,
        source,
      };
    }
    return {
      analyser: null,
      mediaStream: null,
      source: null,
    };
  }, [setStatus, onDataChunk, onRecordingStart, onRecordingStop]);

  const stopRecording = useCallback(() => {
    setStatus(UserMediaStatuses.STOPPING);
    if (recorder.current) {
      recorder.current.stop();
    }
  }, [setStatus]);

  return {
    status,
    error,
    startRecordingUser,
    stopRecording,
  };
};
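
With encoderFrameSize 20 (ms) and maxFramesPerPage 2, the recorder above emits one Ogg page roughly every 40 ms, i.e. about 25 socket messages per second; and since Opus runs internally at 48 kHz, encodedSamplePosition / 48000 is the elapsed mic time. The numbers worked through, with illustrative names:

const frameMs = 20;                     // encoderFrameSize
const framesPerPage = 2;                // maxFramesPerPage
const pageMs = frameMs * framesPerPage; // 40 ms of audio per data chunk
const chunksPerSecond = 1000 / pageMs;  // 25 messages/s over the socket
// Duration bookkeeping, as in ondataavailable above:
const micSeconds = (encodedSamplePosition: number) => encodedSamplePosition / 48000;
console.log(chunksPerSecond, micSeconds(96000)); // 25, 2 (seconds)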


================================================
FILE: client/src/pages/Queue/Queue.tsx
================================================
import moshiProcessorUrl from "../../audio-processor.ts?worker&url";
import { FC, useEffect, useMemo, useState, useCallback, useRef, MutableRefObject } from "react";
import eruda from "eruda";
import { useSearchParams } from "react-router-dom";
import { Conversation } from "../Conversation/Conversation";
import { Button } from "../../components/Button/Button";
import { ImageGallery } from "../../components/ImageGallery/ImageGallery";
import { useModelParams, importantSettingsHaveChanged } from "../Conversation/hooks/useModelParams";
import { ModelParams } from "../Conversation/components/ModelParams/ModelParams";
import { env } from "../../env";
import { useUserEmail } from "./hooks/useUserEmail";
import { Input } from "../../components/Input/Input";
import { getAPIClient } from "./api/client";

type Status = "connecting" | "in_queue" | "has_credentials" | "error" | "no_queue" | "idle" | "bypass";


function getFloatFromStorage(val: string | null) {
  return (val == null) ? undefined : parseFloat(val);
}

function getIntFromStorage(val: string | null) {
  return (val == null) ? undefined : parseInt(val);
}

function getBoolFromStorage(val: string | null) {
  return (val == 'true') ? true : ((val == 'false') ? false : undefined);
}

export const Queue: FC = () => {
  const [searchParams] = useSearchParams();
  let queueId = searchParams.get("queue_id");
  if (!queueId) {
    queueId = 'talktomoshi';
  }
  const [sessionId, setSessionId] = useState<number | null>(null);
  const [sessionAuthId, setSessionAuthId] = useState<string | null>(null);
  const [workerAddr, setWorkerAddr] = useState<string | null>(null);
  const [workerAuthId, setWorkerAuthId] = useState<string | null>(null);
  const [currentPosition, setCurrentPosition] = useState<string | null>(null);
  const [error, setError] = useState<string | null>(null);
  const overrideWorkerAddr = searchParams.get("worker_addr");
  const [hasMicrophoneAccess, setHasMicrophoneAccess] = useState<boolean>(false);
  const [showMicrophoneAccessMessage, setShowMicrophoneAccessMessage] = useState<boolean>(false);
  const [shouldConnect, setShouldConnect] = useState<boolean>(false);
  let default_image_url = sessionStorage.getItem("imageUrl");
  const modelParams = useModelParams({
    textTemperature: getFloatFromStorage(sessionStorage.getItem("textTemperature")),
    textTopk: getIntFromStorage(sessionStorage.getItem("textTopk")),
    audioTemperature: getFloatFromStorage(sessionStorage.getItem("audioTemperature")),
    audioTopk: getIntFromStorage(sessionStorage.getItem("audioTopk")),
    padMult: getFloatFromStorage(sessionStorage.getItem("padMult")),
    repetitionPenalty: getFloatFromStorage(sessionStorage.getItem("repetitionPenalty")),
    repetitionPenaltyContext: getIntFromStorage(sessionStorage.getItem("repetitionPenaltyContext")),
    imageResolution: getIntFromStorage(sessionStorage.getItem("imageResolution")),
    gateDelay: getIntFromStorage(sessionStorage.getItem("gateDelay")),
    gateInfluence: getFloatFromStorage(sessionStorage.getItem("gateInfluence")),
    displayColor: getBoolFromStorage(sessionStorage.getItem("displayColor")),
    centerCrop: getBoolFromStorage(sessionStorage.getItem("centerCrop")),
    imageUrl: (default_image_url == null) ? undefined : default_image_url
  });
  const modalRef = useRef<HTMLDialogElement>(null);

  let def_user_email = sessionStorage.getItem("userEmail");
  const { userEmail, setUserEmail, error: emailError, validate } = useUserEmail(!!overrideWorkerAddr, (def_user_email == null) ? '' : def_user_email);

  const audioContext = useRef<AudioContext | null>(null);
  const worklet = useRef<AudioWorkletNode | null>(null);
  // enable eruda in development
  useEffect(() => {
    if (env.VITE_ENV === "development") {
      eruda.init();
    }
    // The cleanup must be returned; without `return` it was a no-op
    // expression and eruda was never destroyed.
    return () => {
      if (env.VITE_ENV === "development") {
        eruda.destroy();
      }
    };
  }, []);

  const getMicrophoneAccess = useCallback(async () => {
    try {
      await window.navigator.mediaDevices.getUserMedia({ audio: true });
      setHasMicrophoneAccess(true);
      return true;
    } catch (e) {
      console.error(e);
      setShowMicrophoneAccessMessage(true);
      setHasMicrophoneAccess(false);
    }
    return false;
  }, [setHasMicrophoneAccess, setShowMicrophoneAccessMessage, setShouldConnect]);

  const startProcessor = useCallback(async () => {
    if (!audioContext.current) {
      audioContext.current = new AudioContext();
    }
    if (worklet.current) {
      return;
    }
    const ctx = audioContext.current;
    await ctx.resume();
    try {
      worklet.current = new AudioWorkletNode(ctx, 'moshi-processor');
    } catch (err) {
      await ctx.audioWorklet.addModule(moshiProcessorUrl);
      worklet.current = new AudioWorkletNode(ctx, 'moshi-processor');
    }
    worklet.current.connect(ctx.destination);
  }, [audioContext, worklet]);

  const onConnect = useCallback(async () => {
    if (!validate(userEmail)) {
      return;
    }
    await startProcessor();
    const hasAccess = await getMicrophoneAccess();
    if (hasAccess) {
      setShouldConnect(true);
    }
  }, [setShouldConnect, startProcessor, userEmail, getMicrophoneAccess, validate]);

  const status: Status = useMemo(() => {
    if (overrideWorkerAddr) {
      return "bypass";
    }
    if (!queueId) {
      return "no_queue";
    }
    if (error) {
      return "error";
    }
    if (!shouldConnect) {
      return "idle";
    }
    if (workerAddr && workerAuthId) {
      return "has_credentials";
    }
    if (!sessionId || !sessionAuthId) {
      return "connecting";
    }
    return "in_queue";
  }, [queueId, sessionId, sessionAuthId, workerAddr, workerAuthId, currentPosition, hasMicrophoneAccess, error, shouldConnect]);

  const client = useMemo(() => {
    return getAPIClient(env.VITE_QUEUE_API_PATH)
  }, [env.VITE_QUEUE_API_PATH]);

  useEffect(() => {
    if (!shouldConnect) {
      return;
    }
    if (status !== "connecting" || !queueId) {
      return;
    }
    client.addUser(queueId)
      .then(({ session_id, session_auth_id }) => {
        setSessionId(session_id);
        setSessionAuthId(session_auth_id);
        console.log("Added user to queue", session_id, session_auth_id);
      })
      .catch((e) => {
        setError(e.message);
        console.error(e);
      });
  }, [queueId, client, status, shouldConnect]);

  useEffect(() => {
    if (!sessionId || !sessionAuthId) {
      return;
    }
    if (status === "has_credentials") {
      return;
    }
    let isQuerying = false;
    let intervalId: number | null = null;
    const checkUser = () => {
      if (isQuerying) {
        return;
      }
      isQuerying = true;
      client.checkUser(sessionId, sessionAuthId)
        .then(({ worker_addr, worker_auth_id, current_position }) => {
          setCurrentPosition(current_position);
          if (worker_addr && worker_auth_id) {
            setWorkerAddr(worker_addr);
            setWorkerAuthId(worker_auth_id);
            if (intervalId !== null) {
              clearInterval(intervalId);
            }
          }
        })
        .catch((e) => {
          if (intervalId !== null) {
            clearInterval(intervalId);
          }
          setError(e.message);
          console.error(e);
        }).finally(() => {
          isQuerying = false;
        });
    }
    intervalId = setInterval(checkUser, 400);
    return () => {
      if (intervalId !== null) {
        clearInterval(intervalId);
      }
    };
  }, [sessionId, sessionAuthId, client, setCurrentPosition, setWorkerAddr, setWorkerAuthId, status, setError]);


  if (status === "bypass" && hasMicrophoneAccess && audioContext.current && worklet.current) {
    return (
      <Conversation
        workerAddr={overrideWorkerAddr ?? ""}
        audioContext={audioContext as MutableRefObject<AudioContext>}
        worklet={worklet as MutableRefObject<AudioWorkletNode>}
        {...modelParams}
      />
    );
  }

  if (status === "has_credentials" && workerAddr && audioContext.current && workerAuthId && sessionId && sessionAuthId && worklet?.current) {
    return (
      <Conversation
        email={userEmail}
        workerAddr={overrideWorkerAddr ?? workerAddr}
        workerAuthId={workerAuthId}
        audioContext={audioContext as MutableRefObject<AudioContext>}
        worklet={worklet as MutableRefObject<AudioWorkletNode>}
        sessionId={sessionId}
        sessionAuthId={sessionAuthId}
        onConversationEnd={() => {
          setWorkerAddr(null);
          setWorkerAuthId(null);
          setSessionId(null);
          setSessionAuthId(null);
          setShouldConnect(false);
        }}
        {...modelParams}
      />
    );
  }
  return (
    <div className="text-white text-center h-screen w-screen p-4 flex flex-col items-center ">
      <div>
        <h1 className="text-4xl" style={{ letterSpacing: "2px" }}>M👁️shiVis</h1>
        {/*
          To add more space to the top add padding to the top of the following div
          by changing the pt-4 class to pt-8 or pt-12. (see: https://tailwindcss.com/docs/padding)
          If you'd like to move this part to the bottom of the screen, change the class to pb-4 or pb-8 and move the following so it is contained by the last one in the page.
          Font size can be changed by changing the text-sm class to text-lg or text-xl. (see : https://tailwindcss.com/docs/font-size)
          As for the links you can use the one below as an example and add more by copying it and changing the href and text.
        */}
        <div className="pt-8 text-sm flex justify-center items-center flex-col mb-0">
          <div className="presentation text-left">
            <p><span className='vis-words'>MoshiVis</span> is an experimental multimodal conversational AI.
              Like <span className='cute-words'>Moshi</span>, MoshiVis can <span className='cute-words'>listen</span> to you and
              <span className='cute-words'> talk</span> at all times for maximum conversational flow. It is now augmented with <span className='vis-words'>visual</span> inputs.</p>
            <p>For instance, you can now ask Moshi to describe your favorite <span className='vis-words'>movie poster</span>,
              grill it on details about the <span className='cute-words'>plot</span>, go back for more
              details about the <span className='vis-words'>image</span>, or ask it to do some <span className='cute-words'>Pirate</span> role play.</p>
            <p>We strive to support all browsers, but Chrome works best. Conversations are limited to <span className='cute-words'>5 min</span>.</p>
            <p>Head to the <span className='vis-words'>Settings</span> to configure the image size and other parameters.</p>
            <p>For more information about this project, check out the <a href="https://kyutai.org/moshivis" target="_blank" className="link">MoshiVis project page</a>!</p>
            <p>Baked with &lt;3 @<a href="https://kyutai.org/" className='cute-words underline'>Kyutai</a>.</p>
          </div>
        </div>
      </div>
      <div className="text-sm mt-10 mb-0">
        <p>Add your <span className='cute-words'>email address</span> first, then feel free</p>
        <p> to upload your own <span className='vis-words'>image</span> or select one below.</p>
        <p>Uploaded images should be smaller than <span className='cute-words'>15 MB</span>.</p>
      </div>
      <div className="flex flex-grow justify-center items-center flex-col mb-0">
        {status == 'error' && <p className="text-center text-red-800 text-2xl">{error}</p>}
        {status == 'no_queue' && <p className="text-center">No queue id provided</p>}
        {(status === 'idle' || status === 'bypass') && (
          <>
            {showMicrophoneAccessMessage &&
              <p className="text-center">Please enable your microphone before proceeding</p>
            }
            <Input
              type="email"
              placeholder="Enter your email"
              value={userEmail}
              onChange={(e) => setUserEmail(e.target.value)}
              error={emailError ?? ""}
              onKeyDown={(e) => {
                if (e.key === "Enter") {
                  if (modelParams.imageUrl == null) {
                    modelParams.setImageUrl(`/assets/images/demo/image${Math.floor(1 + Math.random() * 19)}.jpg`)
                  }
                  onConnect();
                }
              }}
            />
            <Button className="settingsbutton absolute top-4 right-4"
              id={importantSettingsHaveChanged(modelParams) ? 'changed' : 'default'}
              title={importantSettingsHaveChanged(modelParams) ? 'Change or reset MoshiVis here (Your current settings are different from the defaults)' : 'Change or reset settings here'}
              onClick={() => modalRef.current?.showModal()}>Settings</Button>
            <dialog ref={modalRef} className="modal">
              <div className="modal-box border-2 border-white rounded-none flex justify-center bg-black">
                <ModelParams {...modelParams} isConnected={shouldConnect} modal={modalRef} />
              </div>
              <form method="dialog" className="modal-backdrop">
                <button>Close</button>
              </form>
            </dialog>
          </>
        )}
        {status === "connecting" && <p className="text-center">Connecting to queue...</p>}
        {status === "in_queue" && (
          <p className="text-center">
            You're in the queue!<br />
            {currentPosition && <span>Current position: <span className="text-green">{currentPosition}</span></span>}
          </p>)
        }
      </div>
      <div className="mt-0 flex flex-grow justify-center items-center flex-col presentation mb-8">
        <ImageGallery numImages={6} size={110} paramsSetter={modelParams.setImageUrl} clickAction={onConnect}></ImageGallery>
      </div>

      <div className="text-center flex justify-end items-center flex-col">
        <a target="_blank" href="https://kyutai.org/moshi-terms.pdf" className="text-center">Terms of Use</a>
        <a target="_blank" href="https://kyutai.org/moshi-privacy.pdf" className="text-center">Privacy Policy</a>
      </div>
    </div>
  )
};


================================================
FILE: client/src/pages/Queue/api/client.ts
================================================
import { APIError } from "./errors/api_error";
import { ResponseError } from "./errors/response_error";
import { validateAddUser, validateCheckUser } from "./validators";

export const getAPIClient = (url: string) => ({
  addUser: async (queueId: string) => {
    const encodedQueueId = encodeURIComponent(queueId);
    const response = await fetch(`${url}/add_user?queue_id=${encodedQueueId}`);
    if (!response.ok) {
      const errorText = await response.text();
      throw new APIError(errorText, response.status);
    }
    const json = await response.json();
    const result = validateAddUser(json);
    if (result.success) {
      return result.data;
    }
    console.error(result.error.message);
    throw new ResponseError("Failed to validate response");
  },
  checkUser: async (sessionId: number, sessionAuthId: string) => {
    const encodedSessionAuthId = encodeURIComponent(sessionAuthId);
    const encodedSessionId = encodeURIComponent(sessionId);
    const response = await fetch(`${url}/check_user?session_id=${encodedSessionId}&session_auth_id=${encodedSessionAuthId}`);
    if (!response.ok) {
      const errorText = await response.text();
      throw new APIError(errorText, response.status);
    }
    const json = await response.json();
    const result = validateCheckUser(json);
    if (result.success) {
      return result.data;
    }
    console.error(result.error.message);
    throw new ResponseError("Failed to validate response");
  },
  addFeedback: async ({
    workerAuthId,
    sessionId,
    sessionAuthId,
    feedback,
    timestamp,
    email,
  }: {
    workerAuthId: string;
    sessionId: number;
    sessionAuthId: string;
    feedback: 0 | 1;
    timestamp: number;
    email: string;
  }) => {
    const encodedWorkerAuthId = encodeURIComponent(workerAuthId);
    const encodedSessionAuthId = encodeURIComponent(sessionAuthId);
    const encodedSessionId = encodeURIComponent(sessionId);
    const encodedFeedback = encodeURIComponent(feedback);
    const encodedTimestamp = encodeURIComponent(timestamp);
    const encodedEmail = encodeURIComponent(email);
    const response = await fetch(`${url}/user_feedback?worker_auth_id=${encodedWorkerAuthId}&session_id=${encodedSessionId}&session_auth_id=${encodedSessionAuthId}&feedback=${encodedFeedback}&timestamp=${encodedTimestamp}&email=${encodedEmail}`);
    if (!response.ok) {
      const errorText = await response.text();
      throw new APIError(errorText, response.status);
    }
    return response.json();
  },
});
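
Illustrative usage (not part of the file above; the "/api" base URL and the one-second polling interval are assumptions): join the queue with addUser, then poll checkUser until a worker is assigned.

const client = getAPIClient("/api");

async function waitForWorker(queueId: string) {
  const { session_id, session_auth_id } = await client.addUser(queueId);
  for (;;) {
    const check = await client.checkUser(session_id, session_auth_id);
    if (check.status === "ready" && check.worker_addr) {
      return check; // carries worker_addr and worker_auth_id
    }
    // wait a second between polls
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
}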


================================================
FILE: client/src/pages/Queue/api/errors/api_error.ts
================================================
export class APIError extends Error {
  status:number;

  constructor(message:string, status:number) {
    super(message);
    this.status = status;
    this.name = "APIError";
  }
}


================================================
FILE: client/src/pages/Queue/api/errors/response_error.ts
================================================
export class ResponseError extends Error {
  constructor(message:string) {
    super(message);
    this.name = "ResponseError";
  }
}


================================================
FILE: client/src/pages/Queue/api/validators.ts
================================================
import { z } from "zod"

export const validateAddUser = (response: unknown) => {
  const AddUser = z.object({
    session_id: z.number(),
    session_auth_id: z.string(),
  });
  return AddUser.safeParse(response);
};

export const validateCheckUser = (response: unknown) => {
  const CheckUser = z.object({
    session_id: z.number(),
    // TODO: add more statuses
    status: z.enum(['wait', 'ready']),
    worker_auth_id: z.string().nullable(),
    worker_addr: z.string().nullable(),
    current_position: z.string(),
  });
  return CheckUser.safeParse(response);
};

================================================
FILE: client/src/pages/Queue/hooks/useUserEmail.ts
================================================
import { useCallback, useState } from "react";
import { z } from "zod";

const validateEmail = z.string().email();

export const useUserEmail = (isBypass: boolean, init_value: string) => {
  const [userEmail, setUserEmail] = useState<string>(init_value);
  const [error, setError] = useState<string | null>(null);

  const validate = useCallback((email: string) => {
    if (isBypass) {
      setError(null);
      return true;
    }
    const result = validateEmail.safeParse(email);
    if (result.success) {
      setError(null);
      sessionStorage.setItem("userEmail", email);
      return true;
    }
    setError('Invalid email address');
    return false;
  }, [isBypass]); // isBypass must be a dependency to avoid a stale closure
  return { userEmail, setUserEmail, error, validate };
}
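
A minimal usage sketch (illustrative only; must run inside a React function component such as Queue.tsx):

const { userEmail, setUserEmail, error, validate } = useUserEmail(false, "");
// later, e.g. on submit:
if (validate(userEmail)) {
  // the address is valid (or bypass is active) and has been saved to sessionStorage
}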


================================================
FILE: client/src/protocol/encoder.ts
================================================
import {
  CONTROL_MESSAGE,
  CONTROL_MESSAGES_MAP,
  MODELS_MAP,
  WSMessage,
  VERSIONS_MAP,
} from "./types";

export const encodeMessage = (message: WSMessage): Uint8Array => {
  switch (message.type) {
    case "handshake":
      return new Uint8Array([
        0x00,
        VERSIONS_MAP[message.version],
        MODELS_MAP[message.model],
      ]);
    case "audio":
      return new Uint8Array([0x01, ...message.data]);
    case "text":
      // Not used in practice
      return new Uint8Array([0x02, ...new TextEncoder().encode(message.data)]);
    case "control":
      // Not used in practice
      return new Uint8Array([0x03, CONTROL_MESSAGES_MAP[message.action]]);
    case "metadata":
      // Not used in practice
      return new Uint8Array([
        0x04,
        ...new TextEncoder().encode(JSON.stringify(message.data)),
      ]);
    case "error":
      // Not used in practice
      return new Uint8Array([0x05, ...new TextEncoder().encode(message.data)]);
    case "ping":
      // Not used in practice
      return new Uint8Array([0x06]);
    case "coloredtext":
      // Not used in practice
      return new Uint8Array([0x07, 0x05, ...new TextEncoder().encode(message.data)]);
    case "image":
      return new Uint8Array([0x08, ...message.data]);
    case "user_rating":
      return new Uint8Array([0x0A, message.data]);
  }
};

export const decodeMessage = (data: Uint8Array): WSMessage => {
  const type = data[0];
  const payload = data.slice(1);
  switch (type) {
    case 0x00: {
      return {
        type: "handshake",
        version: 0,
        model: 0,
      };
    }
    case 0x01:
      return {
        type: "audio",
        data: payload,
      };
    case 0x02:
      return {
        type: "text",
        data: new TextDecoder().decode(payload),
      };
    case 0x03: {
      const action = Object.keys(CONTROL_MESSAGES_MAP).find(
        key => CONTROL_MESSAGES_MAP[key as CONTROL_MESSAGE] === payload[0],
      ) as CONTROL_MESSAGE | undefined;

      //TODO: log this and don't throw
      if (!action) {
        throw new Error("Unknown control message");
      }
      return {
        type: "control",
        action,
      };
    }
    case 0x04:
      return {
        type: "metadata",
        data: JSON.parse(new TextDecoder().decode(payload)),
      }
    case 0x05:
      return {
        type: "error",
        data: new TextDecoder().decode(payload),
      }
    case 0x06:
      return {
        type: "ping",
      }
    case 0x07:
      return {
        type: "coloredtext",
        color: payload[0],
        data: new TextDecoder().decode(payload.slice(1)),
      };
    case 0x08:
      return {
        type: "image",
        data: payload,
      };
    // never used in practice
    case 0x0A:
      return {
        type: "user_rating",
        data: payload[0],
      };
    default: {
      console.error("Unknown message type:", type);
      throw new Error("Unknown message type");
    }
  }
};
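
A round-trip sketch of the wire format (illustrative; the payload bytes are dummy data): every frame is a one-byte opcode followed by the raw payload.

const frame = encodeMessage({ type: "audio", data: new Uint8Array([1, 2, 3]) });
// frame[0] === 0x01 is the audio opcode; the remaining bytes are the payload.
const decoded = decodeMessage(frame);
if (decoded.type === "audio") {
  console.log(decoded.data); // Uint8Array [1, 2, 3]
}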


================================================
FILE: client/src/protocol/testMessages.ts
================================================
import { WSMessage } from "./types";

export const handshakeMessage: WSMessage = {
  type: "handshake",
  version: 0,
  model: 0,
};

export const audioMessage: WSMessage = {
  type: "audio",
  data: new Uint8Array(10),
};

export const textMessage: WSMessage = {
  type: "text",
  data: "Hello",
};

export const controlBOSMessage: WSMessage = {
  type: "control",
  action: "start",
};

export const controlEOSMessage: WSMessage = {
  type: "control",
  action: "endTurn",
};

export const metadataMessage: WSMessage = {
  type: "metadata",
  data: { key: "value" },
};


================================================
FILE: client/src/protocol/types.ts
================================================
export type MessageType =
  | "handshake"
  | "audio"
  | "text"
  | "coloredtext"
  | "control"
  | "metadata"
  | "error"
  | "ping"
  | "image"
  | "user_rating";

export const VERSIONS_MAP = {
  0: 0b00000000,
} as const;

export const MODELS_MAP = {
  0: 0b00000000,
} as const;

export type VERSION = keyof typeof VERSIONS_MAP;

export type MODEL = keyof typeof MODELS_MAP;

export type WSMessage =
  | {
    type: "handshake";
    version: VERSION;
    model: MODEL;
  }
  | {
    type: "user_rating";
    data: number;
  }
  | {
    type: "audio";
    data: Uint8Array;
  }
  | {
    type: "text";
    data: string;
  }
  | {
    type: "coloredtext";
    color: number;
    data: string;
  }
  | {
    type: "control";
    action: CONTROL_MESSAGE;
  }
  | {
    type: "metadata";
    data: unknown;
  }
  | {
    type: "error";
    data: string;
  }
  | {
    type: "ping";
  }
  | {
    type: "image";
    data: Uint8Array;
  }

export const CONTROL_MESSAGES_MAP = {
  start: 0b00000000,
  endTurn: 0b00000001,
  pause: 0b00000010,
  restart: 0b00000011,
} as const;

export type CONTROL_MESSAGE = keyof typeof CONTROL_MESSAGES_MAP;
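
For reference, the opcode assignments implied by protocol/encoder.ts (the first byte of every frame), gathered here as a comment since they are not listed in one place:

// 0x00 handshake    [version byte, model byte]
// 0x01 audio        [audio bytes]
// 0x02 text         [utf-8 bytes]
// 0x03 control      [control code byte]
// 0x04 metadata     [utf-8 JSON]
// 0x05 error        [utf-8 bytes]
// 0x06 ping         (no payload)
// 0x07 coloredtext  [color byte, utf-8 bytes]
// 0x08 image        [image bytes]
// 0x0A user_rating  [rating byte]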


================================================
FILE: client/tailwind.config.js
================================================
/** @type {import('tailwindcss').Config} */

export default {
  content: ["./src/**/*.{js,jsx,ts,tsx}", "./index.html"],
  theme: {
    extend: {},
  },
  plugins: [require('daisyui')],
};


================================================
FILE: client/tsconfig.json
================================================
{
  "compilerOptions": {
    "target": "ES2020",
    "useDefineForClassFields": true,
    "module": "ESNext",
    "lib": [
      "ES2020",
      "DOM",
      "DOM.Iterable"
    ],
    "skipLibCheck": true,
    "outDir": "dist",
    /* Bundler mode */
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": true,
    "jsx": "react-jsx",
    /* Linting */
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noFallthroughCasesInSwitch": true,
    "types": [
      "vite/client"
    ]
  },
  "include": [
    "src"
  ]
}

================================================
FILE: client/vite.config.ts
================================================
import { ProxyOptions, defineConfig, loadEnv } from "vite";
import topLevelAwait from "vite-plugin-top-level-await";

export default defineConfig(({ mode }) => {
  const env = loadEnv(mode, process.cwd());
  const proxyConf: Record<string, string | ProxyOptions> = env.VITE_QUEUE_API_URL ? {
    "/api": {
      target: env.VITE_QUEUE_API_URL,
      changeOrigin: true,
    },
  } : {};
  return {
    server: {
      host: "0.0.0.0",
      https: {
        cert: "./cert.pem",
        key: "./key.pem",
      },
      proxy: {
        ...proxyConf,
      }
    },
    plugins: [
      topLevelAwait({
        // The export name of top-level await promise for each chunk module
        promiseExportName: "__tla",
        // The function to generate import names of top-level await promise in each chunk module
        promiseImportName: i => `__tla_${i}`,
      }),
    ],
  };
});
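
Setup note (assumptions, not stated in the file above): the dev server expects TLS material at ./cert.pem and ./key.pem in client/, e.g. generated with a tool such as mkcert, and the /api proxy only activates when VITE_QUEUE_API_URL is defined, for instance via a .env.local file:

# hypothetical client/.env.local (host and port are placeholders)
VITE_QUEUE_API_URL=http://localhost:8000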


================================================
FILE: docker-bake.hcl
================================================
group "default" {
  targets = ["client"]
}

target "client" {
  context    = "./client"

  # Specify output type as a local directory
  output = [
    "type=local,dest=./client/dist"
  ]
}
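
# Typical invocation (an assumption; not documented in this file): from the
# repository root, `docker buildx bake client` builds the client target and,
# per the local output above, writes the static bundle to ./client/dist.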

================================================
FILE: kyuteye_mlx/.pylintrc
================================================
[MAIN]

# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no

# Clear in-memory caches upon conclusion of linting. Useful if running pylint
# in a server-like mode.
clear-cache-post-run=no

# Load and enable all available extensions. Use --list-extensions to see a list
# all available extensions.
#enable-all-extensions=

# In error mode, messages with a category besides ERROR or FATAL are
# suppressed, and no reports are done by default. Error mode is compatible with
# disabling specific errors.
#errors-only=

# Always return a 0 (non-error) status code, even if lint errors are found.
# This is primarily useful in continuous integration scripts.
#exit-zero=

# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code.
extension-pkg-allow-list=

# A comma-separated list of package or module names from where C extensions may
# be loaded. Extensions are loading into the active Python interpreter and may
# run arbitrary code. (This is an alternative name to extension-pkg-allow-list
# for backward compatibility.)
extension-pkg-whitelist=

# Return non-zero exit code if any of these messages/categories are detected,
# even if score is above --fail-under value. Syntax same as enable. Messages
# specified are enabled, while categories only check already-enabled messages.
fail-on=

# Specify a score threshold under which the program will exit with error.
fail-under=10

# Interpret the stdin as a python script, whose filename needs to be passed as
# the module_or_package argument.
#from-stdin=

# Files or directories to be skipped. They should be base names, not paths.
ignore=CVS

# Add files or directories matching the regular expressions patterns to the
# ignore-list. The regex matches against paths and can be in Posix or Windows
# format. Because '\\' represents the directory delimiter on Windows systems,
# it can't be used as an escape character.
ignore-paths=

# Files or directories matching the regular expression patterns are skipped.
# The regex matches against base names, not paths. The default value ignores
# Emacs file locks
ignore-patterns=^\.#

# List of module names for which member attributes should not be checked and
# will not be imported (useful for modules/projects where namespaces are
# manipulated during runtime and thus existing member attributes cannot be
# deduced by static analysis). It supports qualified module names, as well as
# Unix pattern matching.
ignored-modules=

# Python code to execute, usually for sys.path manipulation such as
# pygtk.require().
#init-hook=

# Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
# number of processors available to use, and will cap the count on Windows to
# avoid hangs.
jobs=1

# Control the amount of potential inferred values when inferring a single
# object. This can help the performance when dealing with large functions or
# complex, nested conditions.
limit-inference-results=100

# List of plugins (as comma separated values of python module names) to load,
# usually to register additional checkers.
load-plugins=

# Pickle collected data for later comparisons.
persistent=yes

# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.10

# Discover python modules and packages in the file system subtree.
recursive=no

# Add paths to the list of the source roots. Supports globbing patterns. The
# source root is an absolute path or a path relative to the current working
# directory used to determine a package namespace for modules located under the
# source root.
source-roots=

# When enabled, pylint would attempt to guess common misconfiguration and emit
# user-friendly hints instead of false-positive error messages.
suggestion-mode=yes

# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no

# In verbose mode, extra non-checker-related info will be displayed.
#verbose=


[BASIC]

# Naming style matching correct argument names.
argument-naming-style=snake_case

# Regular expression matching correct argument names. Overrides argument-
# naming-style. If left empty, argument names will be checked with the set
# naming style.
#argument-rgx=

# Naming style matching correct attribute names.
attr-naming-style=snake_case

# Regular expression matching correct attribute names. Overrides attr-naming-
# style. If left empty, attribute names will be checked with the set naming
# style.
#attr-rgx=

# Bad variable names which should always be refused, separated by a comma.
bad-names=foo,
          bar,
          baz,
          toto,
          tutu,
          tata

# Bad variable names regexes, separated by a comma. If names match any regex,
# they will always be refused
bad-names-rgxs=

# Naming style matching correct class attribute names.
class-attribute-naming-style=any

# Regular expression matching correct class attribute names. Overrides class-
# attribute-naming-style. If left empty, class attribute names will be checked
# with the set naming style.
#class-attribute-rgx=

# Naming style matching correct class constant names.
class-const-naming-style=UPPER_CASE

# Regular expression matching correct class constant names. Overrides class-
# const-naming-style. If left empty, class constant names will be checked with
# the set naming style.
#class-const-rgx=

# Naming style matching correct class names.
class-naming-style=PascalCase

# Regular expression matching correct class names. Overrides class-naming-
# style. If left empty, class names will be checked with the set naming style.
#class-rgx=

# Naming style matching correct constant names.
const-naming-style=UPPER_CASE

# Regular expression matching correct constant names. Overrides const-naming-
# style. If left empty, constant names will be checked with the set naming
# style.
#const-rgx=

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=-1

# Naming style matching correct function names.
function-naming-style=snake_case

# Regular expression matching correct function names. Overrides function-
# naming-style. If left empty, function names will be checked with the set
# naming style.
#function-rgx=

# Good variable names which should always be accepted, separated by a comma.
good-names=i,
           j,
           k,
           ex,
           Run,
           _

# Good variable names regexes, separated by a comma. If names match any regex,
# they will always be accepted
good-names-rgxs=

# Include a hint for the correct naming format with invalid-name.
include-naming-hint=no

# Naming style matching correct inline iteration names.
inlinevar-naming-style=any

# Regular expression matching correct inline iteration names. Overrides
# inlinevar-naming-style. If left empty, inline iteration names will be checked
# with the set naming style.
#inlinevar-rgx=

# Naming style matching correct method names.
method-naming-style=snake_case

# Regular expression matching correct method names. Overrides method-naming-
# style. If left empty, method names will be checked with the set naming style.
#method-rgx=

# Naming style matching correct module names.
module-naming-style=snake_case

# Regular expression matching correct module names. Overrides module-naming-
# style. If left empty, module names will be checked with the set naming style.
#module-rgx=

# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=^_

# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
# These decorators are taken in consideration only for invalid-name.
property-classes=abc.abstractproperty

# Regular expression matching correct type alias names. If left empty, type
# alias names will be checked with the set naming style.
#typealias-rgx=

# Regular expression matching correct type variable names. If left empty, type
# variable names will be checked with the set naming style.
#typevar-rgx=

# Naming style matching correct variable names.
variable-naming-style=snake_case

# Regular expression matching correct variable names. Overrides variable-
# naming-style. If left empty, variable names will be checked with the set
# naming style.
#variable-rgx=


[CLASSES]

# Warn about protected attribute access inside special methods
check-protected-access-in-special-methods=no

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
                      __new__,
                      setUp,
                      asyncSetUp,
                      __post_init__

# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,_fields,_replace,_source,_make,os._exit

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs


[DESIGN]

# List of regular expressions of class ancestor names to ignore when counting
# public methods (see R0903)
exclude-too-few-public-methods=

# List of qualified class names to ignore when counting class parents (see
# R0901)
ignored-parents=

# Maximum number of arguments for function / method.
max-args=5

# Maximum number of attributes for a class (see R0902).
max-attributes=7

# Maximum number of boolean expressions in an if statement (see R0916).
max-bool-expr=5

# Maximum number of branch for function / method body.
max-branches=12

# Maximum number of locals for function / method body.
max-locals=15

# Maximum number of parents for a class (see R0901).
max-parents=7

# Maximum number of public methods for a class (see R0904).
max-public-methods=20

# Maximum number of return / yield for function / method body.
max-returns=6

# Maximum number of statements in function / method body.
max-statements=50

# Minimum number of public methods for a class (see R0903).
min-public-methods=2


[EXCEPTIONS]

# Exceptions that will emit a warning when caught.
overgeneral-exceptions=builtins.BaseException,builtins.Exception


[FORMAT]

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=^\s*(# )?<?https?://\S+>?$

# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4

# String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
# tab).
indent-string='    '

# Maximum number of characters on a single line.
max-line-length=100

# Maximum number of lines in a module.
max-module-lines=1200

# Allow the body of a class to be on the same line as the declaration if body
# contains single statement.
single-line-class-stmt=no

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=no


[IMPORTS]

# List of modules that can be imported at any level, not just the top level
# one.
allow-any-import-level=

# Allow explicit reexports by alias from a package __init__.
allow-reexport-from-package=no

# Allow wildcard imports from modules that define __all__.
allow-wildcard-with-all=no

# Deprecated modules which should not be used, separated by a comma.
deprecated-modules=

# Output a graph (.gv or any supported image format) of external dependencies
# to the given file (report RP0402 must not be disabled).
ext-import-graph=

# Output a graph (.gv or any supported image format) of all (i.e. internal and
# external) dependencies to the given file (report RP0402 must not be
# disabled).
import-graph=

# Output a graph (.gv or any supported image format) of internal dependencies
# to the given file (report RP0402 must not be disabled).
int-import-graph=

# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant

# Couples of modules and preferred modules, separated by a comma.
preferred-modules=


[LOGGING]

# The type of string formatting that logging methods do. `old` means using %
# formatting, `new` is for `{}` formatting.
logging-format-style=old

# Logging modules to check that the string format arguments are in logging
# function parameter format.
logging-modules=logging


[MESSAGES CONTROL]

# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, CONTROL_FLOW, INFERENCE, INFERENCE_FAILURE,
# UNDEFINED.
confidence=HIGH,
           CONTROL_FLOW,
           INFERENCE,
           INFERENCE_FAILURE,
           UNDEFINED

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once). You can also use "--disable=all" to
# disable everything first and then re-enable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use "--disable=all --enable=classes
# --disable=W".
disable=raw-checker-failed,
        bad-inline-option,
        locally-disabled,
        file-ignored,
        suppressed-message,
        useless-suppression,
        deprecated-pragma,
        use-symbolic-message-instead,
        use-implicit-booleaness-not-comparison-to-string,
        use-implicit-booleaness-not-comparison-to-zero,
        too-many-locals,
        unspecified-encoding,
        too-many-arguments,
        too-many-instance-attributes,
        too-many-branches,
        too-many-statements,
        too-many-return-statements,
        too-many-public-methods,
        too-few-public-methods,
        use-dict-literal,
        unnecessary-lambda-assignment,
        too-many-function-args

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
enable=


[METHOD_ARGS]

# List of qualified names (i.e., library.method) which require a timeout
# parameter e.g. 'requests.api.get,requests.api.post'
timeout-methods=requests.api.delete,requests.api.get,requests.api.head,requests.api.options,requests.api.patch,requests.api.post,requests.api.put,requests.api.request


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=FIXME,
      XXX,
      TODO

# Regular expression of note tags to take in consideration.
notes-rgx=


[REFACTORING]

# Maximum number of nested blocks for function / method body
max-nested-blocks=5

# Complete name of functions that never returns. When checking for
# inconsistent-return-statements if a never returning function is called then
# it will be considered as an explicit return statement and no message will be
# printed.
never-returning-functions=sys.exit,argparse.parse_error


[REPORTS]

# Python expression which should return a score less than or equal to 10. You
# have access to the variables 'fatal', 'error', 'warning', 'refactor',
# 'convention', and 'info' which contain the number of messages in each
# category, as well as 'statement' which is the total number of statements
# analyzed. This score is used by the global evaluation report (RP0004).
evaluation=max(0, 0 if fatal else 10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10))
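
# Worked example (illustrative, not part of the stock configuration): with
# fatal=0, error=1, warning=2, refactor=0, convention=0 and statement=100,
# the score is max(0, 10.0 - ((5*1 + 2 + 0 + 0) / 100) * 10) = 9.3.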

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details.
msg-template=

# Set the output format. Available formats are: text, parseable, colorized,
# json2 (improved json format), json (old json format) and msvs (visual
# studio). You can also give a reporter class, e.g.
# mypackage.mymodule.MyReporterClass.
#output-format=

# Tells whether to display a full report or only the messages.
reports=no

# Activate the evaluation score.
score=yes


[SIMILARITIES]

# Comments are removed from the similarity computation
ignore-comments=yes

# Docstrings are removed from the similarity computation
ignore-docstrings=yes

# Imports are removed from the similarity computation
ignore-imports=yes

# Signatures are removed from the similarity computation
ignore-signatures=yes

# Minimum lines number of a similarity.
min-similarity-lines=12


[SPELLING]

# Limits count of emitted suggestions for spelling mistakes.
max-spelling-suggestions=4

# Spelling dictionary name. No available dictionaries : You need to install
# both the python package and the system dependency for enchant to work.
spelling-dict=

# List of comma separated words that should be considered directives if they
# appear at the beginning of a comment and should not be checked.
spelling-ignore-comment-directives=fmt: on,fmt: off,noqa:,noqa,nosec,isort:skip,mypy:

# List of comma separated words that should not be checked.
spelling-ignore-words=

# A path to a file that contains the private dictionary; one word per line.
spelling-private-dict-file=

# Tells whether to store unknown words to the private dictionary (see the
# --spelling-private-dict-file option) instead of raising a message.
spelling-store-unknown-words=no


[STRING]

# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=no

# This flag controls whether the implicit-str-concat should generate a warning
# on implicit string concatenation in sequences defined over several lines.
check-str-concat-over-line-jumps=no


[TYPECHECK]

# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=

# Tells whether to warn about missing members when the owner of the attribute
# is inferred to be None.
ignore-none=yes

# This flag controls whether pylint should warn about no-member and similar
# checks whenever an opaque object is returned when inferring. The inference
# can return multiple potential results while evaluating a Python object, but
# some branches might not be evaluated, which results in partial inference. In
# that case, it might be useful to still emit no-member and other checks for
# the rest of the inferred objects.
ignore-on-opaque-inference=yes

# List of symbolic message names to ignore for Mixin members.
ignored-checks-for-mixins=no-member,
                          not-async-context-manager,
                          not-context-manager,
                          attribute-defined-outside-init

# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local,argparse.Namespace

# Show a hint with possible names when a member name was not found. The aspect
# of finding the hint is based on edit distance.
missing-member-hint=yes

# The minimum edit distance a name should have in order to be considered a
# similar match for a missing member name.
missing-member-hint-distance=1

# The total number of similar names that should be taken in consideration when
# showing a hint for a missing member.
missing-member-max-choices=1

# Regex pattern to define which classes are considered mixins.
mixin-class-rgx=.*[Mm]ixin

# List of decorators that change the signature of a decorated function.
signature-mutators=


[VARIABLES]

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid defining new builtins when possible.
additional-builtins=

# Tells whether unused global variables should be treated as a violation.
allow-global-unused-variables=yes

# List of names allowed to shadow builtins
allowed-redefined-builtins=

# List of strings 
SYMBOL INDEX (859 symbols across 91 files)

FILE: client/src/audio-processor.ts
  function asMs (line 2) | function asMs(samples) {
  function asSamples (line 6) | function asSamples(mili) {
  class MoshiProcessor (line 10) | class MoshiProcessor extends AudioWorkletProcessor {
    method constructor (line 11) | constructor() {
    method initState (line 80) | initState() {
    method totalMaxBufferSamples (line 101) | totalMaxBufferSamples() {
    method timestamp (line 105) | timestamp() {
    method currentSamples (line 109) | currentSamples() {
    method resetStart (line 118) | resetStart() {
    method start (line 122) | start() {
    method canPlay (line 128) | canPlay() {
    method process (line 132) | process(inputs, outputs, parameters) {

FILE: client/src/components/Button/Button.tsx
  type ButtonProps (line 3) | type ButtonProps = React.ButtonHTMLAttributes<HTMLButtonElement>;

FILE: client/src/components/ImageGallery/ImageGallery.tsx
  type ImageGalleryProps (line 56) | type ImageGalleryProps = React.InputHTMLAttributes<HTMLInputElement> & {
  type ImageItemProps (line 65) | type ImageItemProps = React.InputHTMLAttributes<HTMLInputElement> & {
  function ImageSelect (line 77) | function ImageSelect(props: ImageItemProps) {
  function handleShuffle (line 144) | function handleShuffle() {

FILE: client/src/components/Input/Input.tsx
  type InputProps (line 1) | type InputProps = React.InputHTMLAttributes<HTMLInputElement> & {

FILE: client/src/env.ts
  type ENV (line 1) | type ENV = {

FILE: client/src/pages/Conversation/Conversation.tsx
  type ConversationProps (line 18) | type ConversationProps = {

FILE: client/src/pages/Conversation/MediaContext.ts
  type MediaContextType (line 2) | type MediaContextType = {

FILE: client/src/pages/Conversation/SocketContext.ts
  type SocketContextType (line 4) | type SocketContextType = {

FILE: client/src/pages/Conversation/components/AudioVisualizer/AudioVisualizer.tsx
  type AudioVisualizerProps (line 3) | type AudioVisualizerProps = {

FILE: client/src/pages/Conversation/components/AudioVisualizer/ClientVisualizer.tsx
  type AudioVisualizerProps (line 4) | type AudioVisualizerProps = {
  constant MAX_INTENSITY (line 10) | const MAX_INTENSITY = 255;
  constant COLORS (line 12) | const COLORS = [

FILE: client/src/pages/Conversation/components/AudioVisualizer/ServerVisualizer.tsx
  type AudioVisualizerProps (line 5) | type AudioVisualizerProps = {
  constant MAX_INTENSITY (line 12) | const MAX_INTENSITY = 255;

FILE: client/src/pages/Conversation/components/ModelParams/ModelParams.tsx
  type ModelParamsProps (line 5) | type ModelParamsProps = {

FILE: client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx
  type ServerAudioProps (line 5) | type ServerAudioProps = {

FILE: client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx
  type ServerAudioStatsProps (line 3) | type ServerAudioStatsProps = {

FILE: client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx
  function pretty_format (line 3) | function pretty_format(num: number): number {

FILE: client/src/pages/Conversation/components/TextDisplay/TextDisplay.tsx
  type TextDisplayProps (line 4) | type TextDisplayProps = {
  function clamp_color (line 19) | function clamp_color(v: number) {

FILE: client/src/pages/Conversation/components/TextDisplay/TextDisplayStats.tsx
  type TextDisplayStatsProps (line 3) | type TextDisplayStatsProps = {

FILE: client/src/pages/Conversation/components/UserAudio/UserAudio.tsx
  type UserAudioProps (line 6) | type UserAudioProps = {

FILE: client/src/pages/Conversation/components/UserAudio/UserAudioStats.tsx
  type UserAudioStatsProps (line 3) | type UserAudioStatsProps = {

FILE: client/src/pages/Conversation/hooks/useModelParams.ts
  constant DEFAULT_TEXT_TEMPERATURE (line 3) | const DEFAULT_TEXT_TEMPERATURE = 0.45;
  constant DEFAULT_TEXT_TOPK (line 4) | const DEFAULT_TEXT_TOPK = 25;
  constant DEFAULT_AUDIO_TEMPERATURE (line 5) | const DEFAULT_AUDIO_TEMPERATURE = 0.7;
  constant DEFAULT_AUDIO_TOPK (line 6) | const DEFAULT_AUDIO_TOPK = 250;
  constant DEFAULT_PAD_MULT (line 7) | const DEFAULT_PAD_MULT = 0;
  constant DEFAULT_REPETITION_PENALTY_CONTEXT (line 8) | const DEFAULT_REPETITION_PENALTY_CONTEXT = 64;
  constant DEFAULT_REPETITION_PENALTY (line 9) | const DEFAULT_REPETITION_PENALTY = 1.15;
  constant DEFAULT_IMAGE_RESOLUTION (line 10) | const DEFAULT_IMAGE_RESOLUTION = 448;
  constant DEFAULT_IMAGE_URL (line 11) | const DEFAULT_IMAGE_URL = undefined;
  constant DEFAULT_GATE_DELAY (line 12) | const DEFAULT_GATE_DELAY = 16;
  constant DEFAULT_GATE_INFLUENCE (line 13) | const DEFAULT_GATE_INFLUENCE = 0.0;
  constant DEFAULT_DISPLAY_COLOR (line 14) | const DEFAULT_DISPLAY_COLOR = true;
  constant DEFAULT_CENTER_CROP (line 15) | const DEFAULT_CENTER_CROP = false;
  type ModelParamsValues (line 17) | type ModelParamsValues = {
  function importantSettingsHaveChanged (line 33) | function importantSettingsHaveChanged(params: ModelParamsValues): boolean {
  type useModelParamsArgs (line 47) | type useModelParamsArgs = Partial<ModelParamsValues>;

FILE: client/src/pages/Conversation/hooks/useServerAudio.ts
  type AudioStats (line 7) | type AudioStats = {
  type useServerAudioArgs (line 16) | type useServerAudioArgs = {
  type WorkletStats (line 20) | type WorkletStats = {

FILE: client/src/pages/Conversation/hooks/useServerInfo.ts
  type ServerInfo (line 42) | type ServerInfo = {

FILE: client/src/pages/Conversation/hooks/useSocket.ts
  function sendImage (line 39) | async function sendImage() {
  function fetchImageBytes (line 136) | async function fetchImageBytes(imageUrl: string) {

FILE: client/src/pages/Conversation/hooks/useUserAudio.ts
  type UserMediaStatuses (line 6) | enum UserMediaStatuses {
  type useUserAudioArgs (line 16) | type useUserAudioArgs = {

FILE: client/src/pages/Queue/Queue.tsx
  type Status (line 15) | type Status = "connecting" | "in_queue" | "has_credentials" | "error" | ...
  function getFloatFromStorage (line 18) | function getFloatFromStorage(val: string | null) {
  function getIntFromStorage (line 22) | function getIntFromStorage(val: string | null) {
  function getBoolFromStage (line 26) | function getBoolFromStage(val: string | null) {

FILE: client/src/pages/Queue/api/errors/api_error.ts
  class APIError (line 1) | class APIError extends Error {
    method constructor (line 4) | constructor(message:string, status:number) {

FILE: client/src/pages/Queue/api/errors/response_error.ts
  class ResponseError (line 1) | class ResponseError extends Error {
    method constructor (line 2) | constructor(message:string) {

FILE: client/src/protocol/types.ts
  type MessageType (line 1) | type MessageType =
  constant VERSIONS_MAP (line 9) | const VERSIONS_MAP = {
  constant MODELS_MAP (line 13) | const MODELS_MAP = {
  type VERSION (line 17) | type VERSION = keyof typeof VERSIONS_MAP;
  type MODEL (line 19) | type MODEL = keyof typeof MODELS_MAP;
  type WSMessage (line 21) | type WSMessage =
  constant CONTROL_MESSAGES_MAP (line 64) | const CONTROL_MESSAGES_MAP = {
  type CONTROL_MESSAGE (line 71) | type CONTROL_MESSAGE = keyof typeof CONTROL_MESSAGES_MAP;

FILE: kyuteye_mlx/kyuteye_mlx/benchmark.py
  function main (line 9) | def main():

FILE: kyuteye_mlx/kyuteye_mlx/local_web.py
  class ModelInput (line 46) | class ModelInput(Enum):
  class ModelOutput (line 51) | class ModelOutput(Enum):
  class ServerMediaInput (line 58) | class ServerMediaInput(Enum):
  function colorize (line 67) | def colorize(text: str, color: str) -> str:
  function log (line 73) | def log(level: str, msg: str) -> None:
  function hf_hub_download (line 85) | def hf_hub_download(repo: str | None, path: str) -> str:
  function full_warmup (line 91) | def full_warmup(
  function get_model_file (line 119) | def get_model_file(args) -> str:
  function get_tokenizer (line 131) | def get_tokenizer(args) -> sentencepiece.SentencePieceProcessor:
  function get_embedder (line 139) | def get_embedder(args) -> SiglipWrapper | PixtralWrapper:
  function get_model (line 173) | def get_model(args, load_weights: bool = True) -> models.LmGen:
  function model_server (line 214) | def model_server(
  function handle_audio (line 266) | def handle_audio(
  function predict_text_and_audio (line 291) | def predict_text_and_audio(
  function web_server (line 305) | def web_server(
  function get_args_for_main (line 528) | def get_args_for_main() -> argparse.Namespace:
  function main (line 564) | def main() -> None:
  function sanity_check (line 593) | def sanity_check() -> None:

FILE: kyuteye_mlx/kyuteye_mlx/mlx_vlm/models/pixtral/vision.py
  class VisionConfig (line 9) | class VisionConfig:
    method from_dict (line 25) | def from_dict(cls, params):
  function position_ids_in_meshgrid (line 29) | def position_ids_in_meshgrid(patch_embeds_list: list[mx.array], max_widt...
  function generate_block_attention_mask (line 41) | def generate_block_attention_mask(patch_embeds_list: list[mx.array], ten...
  function rotate_half (line 59) | def rotate_half(x):
  function apply_rotary_pos_emb (line 65) | def apply_rotary_pos_emb(q, k, cos, sin, unsqueeze_dim=1):
  class Attention (line 73) | class Attention(nn.Module):
    method __init__ (line 74) | def __init__(
    method __call__ (line 110) | def __call__(self, queries, keys, values, position_embeddings, mask=No...
  class MLP (line 138) | class MLP(nn.Module):
    method __init__ (line 139) | def __init__(self, config: VisionConfig):
    method __call__ (line 147) | def __call__(self, x) -> mx.array:
  class EncoderLayer (line 151) | class EncoderLayer(nn.Module):
    method __init__ (line 152) | def __init__(self, config: VisionConfig):
    method __call__ (line 160) | def __call__(
  class Encoder (line 174) | class Encoder(nn.Module):
    method __init__ (line 175) | def __init__(self, config: VisionConfig):
  class PixtralRotaryEmbedding (line 180) | class PixtralRotaryEmbedding:
    method __init__ (line 181) | def __init__(self, config):
    method __call__ (line 202) | def __call__(self, x, position_ids):
  class PixtralVisionModel (line 210) | class PixtralVisionModel(nn.Module):
    method __init__ (line 211) | def __init__(self, config: VisionConfig):
    method __call__ (line 225) | def __call__(

FILE: kyuteye_mlx/kyuteye_mlx/mlx_vlm/models/siglip/vision.py
  class VisionConfig (line 10) | class VisionConfig:
    method from_dict (line 23) | def from_dict(cls, params):
  function check_array_shape (line 27) | def check_array_shape(arr):
  class Attention (line 43) | class Attention(nn.Module):
    method __init__ (line 44) | def __init__(
    method __call__ (line 78) | def __call__(self, x, mask=None):
  class MLP (line 95) | class MLP(nn.Module):
    method __init__ (line 96) | def __init__(self, config: VisionConfig) -> None:
    method __call__ (line 102) | def __call__(self, x: mx.array) -> mx.array:
  class EncoderLayer (line 109) | class EncoderLayer(nn.Module):
    method __init__ (line 110) | def __init__(self, config: VisionConfig) -> None:
    method __call__ (line 118) | def __call__(self, x: mx.array, mask: mx.array | None = None) -> mx.ar...
  class Encoder (line 125) | class Encoder(nn.Module):
    method __init__ (line 126) | def __init__(self, config: VisionConfig) -> None:
    method __call__ (line 130) | def __call__(
  class VisionEmbeddings (line 148) | class VisionEmbeddings(nn.Module):
    method __init__ (line 149) | def __init__(self, config: VisionConfig) -> None:
    method __call__ (line 167) | def __call__(self, x: mx.array) -> mx.array:
  class SigLipVisionModel (line 176) | class SigLipVisionModel(nn.Module):
    method __init__ (line 177) | def __init__(self, config: VisionConfig):
    method __call__ (line 183) | def __call__(
  class VisionModel (line 197) | class VisionModel(nn.Module):
    method __init__ (line 198) | def __init__(self, config: VisionConfig) -> None:
    method __call__ (line 206) | def __call__(self, x: mx.array, output_hidden_states: bool | None = No...
    method sanitize (line 209) | def sanitize(self, weights):

FILE: kyuteye_mlx/kyuteye_mlx/models/generate.py
  class LmGen (line 15) | class LmGen:
    method __init__ (line 16) | def __init__(
    method zero_token (line 42) | def zero_token(self) -> int:
    method ungenerated_token (line 48) | def ungenerated_token(self) -> int:
    method nb_input_tokens (line 56) | def nb_input_tokens(self) -> int:
    method step (line 60) | def step(
    method last_audio_tokens (line 105) | def last_audio_tokens(self) -> Int32[mx.array, "1 {self.nb_input_token...
    method reset (line 116) | def reset(self) -> None:

FILE: kyuteye_mlx/kyuteye_mlx/models/lm.py
  class DepFormerConfig (line 20) | class DepFormerConfig:
  class LmConfig (line 26) | class LmConfig:
    method audio_eos_token (line 36) | def audio_eos_token(self) -> int:
    method audio_padding_token (line 40) | def audio_padding_token(self) -> int:
  class DepFormerSlice (line 44) | class DepFormerSlice(nn.Module):
    method __init__ (line 45) | def __init__(
    method __call__ (line 60) | def __call__(self, _: mx.array) -> mx.array:
  class DepFormer (line 64) | class DepFormer(nn.Module):
    method __init__ (line 65) | def __init__(self, cfg: LmConfig):
    method __call__ (line 79) | def __call__(self, _: mx.array) -> mx.array:
    method sample (line 82) | def sample(
  class Lm (line 111) | class Lm(nn.Module):
    method __init__ (line 112) | def __init__(self, cfg: LmConfig):
    method __call__ (line 139) | def __call__(
    method sample (line 150) | def sample(
    method warmup (line 180) | def warmup(self) -> None:
    method reset_all_caches (line 195) | def reset_all_caches(self) -> None:
  function config1b_202412 (line 204) | def config1b_202412() -> LmConfig:
  function config1b_202412_16rvq (line 265) | def config1b_202412_16rvq() -> LmConfig:
  function config_v0_1 (line 326) | def config_v0_1() -> LmConfig:
  function config_siglip (line 392) | def config_siglip() -> LmConfig:
  function config_pixtral (line 398) | def config_pixtral() -> LmConfig:
  function config_helium_1_preview_2b (line 404) | def config_helium_1_preview_2b() -> LmConfig:

FILE: kyuteye_mlx/kyuteye_mlx/models/pixtral.py
  class PixtralWrapper (line 10) | class PixtralWrapper(mlx.nn.Module):
    method __init__ (line 13) | def __init__(self) -> None:
    method __call__ (line 33) | def __call__(self, x: mx.array) -> mx.array:
    method warmup (line 48) | def warmup(self) -> None:

FILE: kyuteye_mlx/kyuteye_mlx/models/siglip.py
  class SiglipWrapper (line 11) | class SiglipWrapper(mlx.nn.Module):
    method __init__ (line 14) | def __init__(self) -> None:
    method __call__ (line 21) | def __call__(self, x: mx.array) -> mx.array:
    method warmup (line 35) | def warmup(self) -> None:

FILE: kyuteye_mlx/kyuteye_mlx/modules/config.py
  class TransformerConfig (line 6) | class TransformerConfig:
    method head_dim (line 34) | def head_dim(self) -> int:

FILE: kyuteye_mlx/kyuteye_mlx/modules/cross_attention.py
  class SharedModuleType (line 17) | class SharedModuleType(type):
    method __call__ (line 22) | def __call__(cls, *args: Any, **kwargs: Any) -> Any:
  class CrossAttention (line 28) | class CrossAttention(nn.Module):
    method __init__ (line 29) | def __init__(self, cfg: TransformerConfig):
    method __call__ (line 38) | def __call__(
  class SharedCrossAttention (line 65) | class SharedCrossAttention(CrossAttention, metaclass=SharedModuleType):
  class XAGate (line 71) | class XAGate(nn.Module):
    method __init__ (line 72) | def __init__(
    method __call__ (line 98) | def __call__(
  class GatedCrossAttention (line 104) | class GatedCrossAttention(nn.Module):
    method __init__ (line 105) | def __init__(self, cfg: TransformerConfig) -> None:
    method __call__ (line 113) | def __call__(
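
SharedCrossAttention reuses one set of cross-attention weights across all transformer layers, and SharedModuleType is the metaclass that makes this cheap: its __call__ returns the same instance on every "construction". A minimal, self-contained sketch of the pattern (not the repo's exact implementation):

```python
from typing import Any

class SharedModuleType(type):
    """Metaclass turning a class into a per-class singleton: the first call
    builds the instance, every later call returns the same object."""

    _instances: dict = {}

    def __call__(cls, *args: Any, **kwargs: Any) -> Any:
        if cls not in cls._instances:
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]

class CrossAttention:
    def __init__(self, cfg: object) -> None:
        self.cfg = cfg

class SharedCrossAttention(CrossAttention, metaclass=SharedModuleType):
    pass

# Every layer that "creates" a SharedCrossAttention gets the same module,
# so the cross-attention weights are shared across the whole stack.
a, b = SharedCrossAttention(cfg=None), SharedCrossAttention(cfg=None)
assert a is b
```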

FILE: kyuteye_mlx/kyuteye_mlx/modules/kv_cache.py
  class XACache (line 13) | class XACache:
    method __init__ (line 14) | def __init__(self) -> None:
    method set (line 19) | def set(self, k: mx.array, v: mx.array) -> None:
    method reset (line 26) | def reset(self) -> None:
    method state (line 32) | def state(self) -> tuple[mx.array | None, mx.array | None]:
  class KVCache (line 36) | class KVCache:
    method __init__ (line 37) | def __init__(self, head_dim: int | tuple[int, int], n_kv_heads: int) -...
    method update_and_fetch (line 50) | def update_and_fetch(self, keys: mx.array, values: mx.array) -> tuple[...
    method reset (line 75) | def reset(self) -> None:
    method state (line 79) | def state(self) -> tuple[mx.array | None, mx.array | None]:
  class RotatingKVCache (line 83) | class RotatingKVCache:
    method __init__ (line 84) | def __init__(
    method _trim (line 107) | def _trim(self, trim_size: int, v: mx.array, append: mx.array | None =...
    method update_and_fetch (line 117) | def update_and_fetch(self, keys: mx.array, values: mx.array) -> tuple[...
    method reset (line 176) | def reset(self) -> None:
    method state (line 181) | def state(self) -> tuple[mx.array | None, mx.array | None]:
  class BaseModelArgs (line 186) | class BaseModelArgs:
    method from_dict (line 188) | def from_dict(cls, params: dict[str, Any]):
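
Both cache classes share the update_and_fetch(keys, values) contract: append the new keys/values along the time axis and hand the full cache back to attention; RotatingKVCache additionally trims to a maximum context via _trim. A simplified append-only sketch of that contract, with numpy standing in for mx.array and the sequence axis assumed to be axis 2:

```python
import numpy as np

class SimpleKVCache:
    """Minimal append-only cache illustrating the update_and_fetch contract."""

    def __init__(self) -> None:
        self.keys = None
        self.values = None

    def update_and_fetch(self, keys: np.ndarray, values: np.ndarray):
        # Append along the sequence axis (assumed axis 2 for a
        # (batch, heads, seq, head_dim) layout) and return the whole cache.
        if self.keys is None:
            self.keys, self.values = keys, values
        else:
            self.keys = np.concatenate([self.keys, keys], axis=2)
            self.values = np.concatenate([self.values, values], axis=2)
        return self.keys, self.values

    def reset(self) -> None:
        self.keys = self.values = None

    def state(self):
        return self.keys, self.values
```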

FILE: kyuteye_mlx/kyuteye_mlx/modules/transformer.py
  class Attention (line 15) | class Attention(nn.Module):
    method __init__ (line 16) | def __init__(self, cfg: TransformerConfig) -> None:
    method __call__ (line 29) | def __call__(
  class MlpGating (line 59) | class MlpGating(nn.Module):
    method __init__ (line 60) | def __init__(self, cfg: TransformerConfig) -> None:
    method __call__ (line 70) | def __call__(
  class MlpNoGating (line 79) | class MlpNoGating(nn.Module):
    method __init__ (line 80) | def __init__(self, cfg: TransformerConfig) -> None:
    method __call__ (line 86) | def __call__(self, xs: mx.array) -> mx.array:
  class TransformerLayer (line 90) | class TransformerLayer(nn.Module):
    method __init__ (line 91) | def __init__(self, cfg: TransformerConfig) -> None:
    method __call__ (line 123) | def __call__(
  class ImagePrefix (line 140) | class ImagePrefix(nn.Module):
    method __init__ (line 141) | def __init__(self, cfg: TransformerConfig) -> None:
    method __call__ (line 147) | def __call__(
  class Transformer (line 155) | class Transformer(nn.Module):
    method __init__ (line 156) | def __init__(self, cfg: TransformerConfig, with_img_prefix: bool = Fal...
    method __call__ (line 164) | def __call__(
    method make_cache (line 179) | def make_cache(self) -> list[KVCache]:
    method make_rot_cache (line 183) | def make_rot_cache(self) -> list[RotatingKVCache]:

FILE: kyuteye_mlx/kyuteye_mlx/quantize.py
  function quantize (line 28) | def quantize(
  function main (line 79) | def main():

FILE: kyuteye_mlx/kyuteye_mlx/utils/loading.py
  function repeat_shared_weights (line 4) | def repeat_shared_weights(weights: dict[str, mx.array], num_layers: int)...
  function remove_shared_weights (line 18) | def remove_shared_weights(weights: dict[str, mx.array], num_layers: int)...
  function split_embedder_weights (line 25) | def split_embedder_weights(

FILE: kyuteye_mlx/kyuteye_mlx/utils/profiling.py
  function profile (line 11) | def profile(x: Callable) -> Callable:

FILE: kyuteye_mlx/kyuteye_mlx/utils/sampling.py
  function top_p_sampling (line 12) | def top_p_sampling(
  function categorical_sampling (line 48) | def categorical_sampling(logits: BFloat16[mx.array, "batch vocab"], temp...
  class Sampler (line 53) | class Sampler:
    method __call__ (line 57) | def __call__(
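
top_p_sampling is nucleus sampling: keep the smallest set of most-likely tokens whose cumulative probability reaches p, renormalize within that set, and sample. A self-contained numpy sketch of the algorithm (the repo's version works on batched mx.array logits):

```python
import numpy as np

def top_p_sample(logits: np.ndarray, top_p: float, temp: float = 1.0,
                 rng: np.random.Generator | None = None) -> int:
    """Nucleus sampling over a 1-D logits vector."""
    rng = rng or np.random.default_rng()
    scaled = logits / temp
    probs = np.exp(scaled - np.max(scaled))        # stable softmax
    probs /= probs.sum()
    order = np.argsort(probs)[::-1]                # most likely token first
    cum = np.cumsum(probs[order])
    cutoff = int(np.searchsorted(cum, top_p)) + 1  # smallest prefix with mass >= top_p
    keep = order[:cutoff]
    kept = probs[keep] / probs[keep].sum()         # renormalize within the nucleus
    return int(rng.choice(keep, p=kept))
```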

FILE: kyuteye_mlx/tests/test_siglip.py
  function convert_weights_for_mlx (line 9) | def convert_weights_for_mlx(weights: dict[str, torch.Tensor]) -> dict[st...
  function test_siglip_weights_conversion (line 22) | def test_siglip_weights_conversion() -> None:

FILE: kyuteye_pt/kyuteye/config/enums.py
  class ImageEncoder (line 8) | class ImageEncoder(Enum):
    method out_dims (line 26) | def out_dims(self) -> int:
    method to_rust (line 48) | def to_rust(self) -> str:

FILE: kyuteye_pt/kyuteye/config/kyuteye_config.py
  class KyuteyeConfig (line 23) | class KyuteyeConfig:
    method __init__ (line 30) | def __init__(self, **kwargs: Any):
    method __getattribute__ (line 88) | def __getattribute__(self, name: str) -> Any:
    method __setattr__ (line 94) | def __setattr__(self, name: str, value: Any) -> None:
    method moshi_constructor_kwargs (line 102) | def moshi_constructor_kwargs(self) -> Dict[str, Any]:
    method from_yml (line 111) | def from_yml(cls, path: Path | str) -> "KyuteyeConfig":
    method to_yml (line 115) | def to_yml(self, path: Optional[Path | str] = None) -> None:
    method print (line 140) | def print(self, flat: bool = False, only: Optional[Sequence[str]] = No...
    method to_dict (line 163) | def to_dict(self, flat: bool = True) -> Dict[str, Any]:
  function __load_yaml__ (line 173) | def __load_yaml__(path: Path | str) -> Dict:
  function __save_yaml__ (line 194) | def __save_yaml__(config: Dict, path: Path | str) -> None:
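
KyuteyeConfig round-trips through YAML via from_yml/to_yml, and to_dict(flat=True)/print(flat=True) flatten the nested sub-configs for inspection. A usage sketch — the config path is hypothetical and the import path is inferred from the file layout:

```python
from pathlib import Path
from kyuteye.config.kyuteye_config import KyuteyeConfig  # path inferred from layout

cfg = KyuteyeConfig.from_yml("configs/moshivis.yml")  # hypothetical config file
cfg.print(flat=True)                                  # pretty-print flattened keys
flat = cfg.to_dict(flat=True)                         # flat dict, handy for logging
cfg.to_yml(Path("/tmp/moshivis_copy.yml"))            # write the config back out
```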

FILE: kyuteye_pt/kyuteye/config/subconfigs.py
  function __is_nonstring_iterable__ (line 11) | def __is_nonstring_iterable__(arg: Any) -> bool:
  class LMConfig (line 16) | class LMConfig:
    method help (line 32) | def help(field_name: str) -> str:
  class ImageEncoderConfig (line 47) | class ImageEncoderConfig:
    method __post_init__ (line 62) | def __post_init__(self) -> None:
    method help (line 73) | def help(field_name: str) -> str:
  class MoshiConfig (line 86) | class MoshiConfig:
    method help (line 119) | def help(field_name: str) -> str:
  class FusionConfig (line 134) | class FusionConfig:
    method help (line 157) | def help(field_name: str) -> str:
    method __post_init__ (line 205) | def __post_init__(self) -> None:
    method crossattention_kwargs (line 262) | def crossattention_kwargs(self) -> Dict[str, Any]:

FILE: kyuteye_pt/kyuteye/models/helium.py
  class Helium (line 12) | class Helium(torch.nn.Module):
    method __init__ (line 39) | def __init__(
    method forward (line 111) | def forward(

FILE: kyuteye_pt/kyuteye/models/hf_model_configs.py
  class HeliumConfig (line 9) | class HeliumConfig(PretrainedConfig):
    method __init__ (line 14) | def __init__(
  class MoshiVisConfig (line 80) | class MoshiVisConfig(HeliumConfig):
    method __init__ (line 85) | def __init__(
    method total_audio_loss_weight (line 163) | def total_audio_loss_weight(self) -> float:
    method audio_semantic_loss_weight (line 178) | def audio_semantic_loss_weight(self) -> float:
    method audio_acoustic_loss_weight (line 187) | def audio_acoustic_loss_weight(self) -> float:
    method audio_other_semantic_loss_weight (line 196) | def audio_other_semantic_loss_weight(self) -> float:
    method audio_other_acoustic_loss_weight (line 201) | def audio_other_acoustic_loss_weight(self) -> float:
    method sparsity_loss_weight (line 206) | def sparsity_loss_weight(self) -> float:

FILE: kyuteye_pt/kyuteye/models/image_projection.py
  class ImageProjection (line 19) | class ImageProjection(torch.nn.Module):
    method __init__ (line 30) | def __init__(
    method from_config (line 75) | def from_config(
    method init_proj_module (line 106) | def init_proj_module(self, num_tokens: int) -> Optional[torch.nn.Module]:
    method encoder_out_dim (line 117) | def encoder_out_dim(self) -> int:
    method to_tensor_and_normalize (line 122) | def to_tensor_and_normalize(self) -> Callable:
    method init_norm_module (line 126) | def init_norm_module(self, norm_type: Optional[str]) -> Optional[torch...
    method forward (line 134) | def forward(self, x: torch.Tensor | List[torch.Tensor]) -> Dict[str, t...
    method encode (line 156) | def encode(
    method project_extra (line 176) | def project_extra(self, logits: torch.Tensor) -> torch.Tensor:
    method project_xa (line 184) | def project_xa(self, logits: torch.Tensor) -> torch.Tensor:

FILE: kyuteye_pt/kyuteye/models/loaders.py
  function get_moshi_vis (line 16) | def get_moshi_vis(

FILE: kyuteye_pt/kyuteye/models/moshivis.py
  class MoshiVis (line 15) | class MoshiVis(StreamingModule):
    method __init__ (line 23) | def __init__(
    method cross_attention (line 147) | def cross_attention(self) -> bool:
    method num_audio_codebooks_in (line 152) | def num_audio_codebooks_in(self) -> int:
    method num_audio_codebooks_out (line 157) | def num_audio_codebooks_out(self) -> int:
    method num_codebooks (line 162) | def num_codebooks(self) -> int:
    method initial_audio_token_id (line 167) | def initial_audio_token_id(self) -> int:
    method initial_text_token_id (line 172) | def initial_text_token_id(self) -> int:
    method audio_offset (line 178) | def audio_offset(self) -> int:
    method forward_text (line 182) | def forward_text(
    method forward_depformer (line 229) | def forward_depformer(
    method device (line 270) | def device(self) -> torch.device:
    method get_initial_token (line 274) | def get_initial_token(self) -> torch.Tensor:
  class MoshiVisGen (line 298) | class MoshiVisGen(StreamingModule):
    method __init__ (line 301) | def __init__(
    method update_gen_kwargs (line 329) | def update_gen_kwargs(
    method model_dim (line 343) | def model_dim(self) -> int:
    method num_audio_codebooks_out (line 348) | def num_audio_codebooks_out(self) -> int:
    method from_config (line 353) | def from_config(
    method precompte_ca_kv (line 380) | def precompte_ca_kv(
    method step (line 399) | def step(
    method depformer_step (line 513) | def depformer_step(

FILE: kyuteye_pt/kyuteye/modules/attention.py
  class KVCache (line 18) | class KVCache:
    method __init__ (line 35) | def __init__(
    method clone (line 71) | def clone(self) -> "KVCache":
    method current_start (line 87) | def current_start(self) -> int:
    method __maybe_increase_capacity__ (line 91) | def __maybe_increase_capacity__(self, required_capacity: int) -> None:
    method complete (line 120) | def complete(
  class MultiheadAttention (line 140) | class MultiheadAttention(StreamingModule):
    method __init__ (line 161) | def __init__(
    method _complete_kv (line 219) | def _complete_kv(
    method forward (line 244) | def forward(

FILE: kyuteye_pt/kyuteye/modules/cross_attention.py
  class SharedModuleType (line 14) | class SharedModuleType(type):
    method __call__ (line 19) | def __call__(cls, *args: Any, **kwargs: Any) -> Any:
  class XAGate (line 25) | class XAGate(torch.nn.Module):
    method __init__ (line 28) | def __init__(
    method forward (line 61) | def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
  class SharedXaGate (line 70) | class SharedXaGate(XAGate, metaclass=SharedModuleType):
  class CrossAttention (line 76) | class CrossAttention(MultiheadAttention):
    method __init__ (line 79) | def __init__(self, *args: Any, **kwargs: Any) -> None:
  class SharedCrossAttention (line 84) | class SharedCrossAttention(CrossAttention, metaclass=SharedModuleType):
  class GatedCrossAttention (line 90) | class GatedCrossAttention(StreamingModule):
    method __init__ (line 93) | def __init__(
    method get_xa_scope (line 166) | def get_xa_scope(
    method is_active (line 248) | def is_active(self, image_tokens_mask: Optional[torch.Tensor] = None) ...
    method forward (line 289) | def forward(

FILE: kyuteye_pt/kyuteye/modules/image_encoder.py
  class TrimmedFlexiViTWrapper (line 26) | class TrimmedFlexiViTWrapper(torch.nn.Module):
    method __init__ (line 29) | def __init__(
    method forward (line 36) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  function load_paligemma_vision_encoder (line 43) | def load_paligemma_vision_encoder(
  class PixtralOutput (line 75) | class PixtralOutput:
  class PixtralWrapper (line 82) | class PixtralWrapper(torch.nn.Module):
    method __init__ (line 85) | def __init__(
    method __get_num_output_tokens__ (line 100) | def __get_num_output_tokens__(self, x: List[torch.Tensor]) -> List[int]:
    method split_and_pad_output (line 109) | def split_and_pad_output(
    method forward (line 131) | def forward(self, x: List[torch.Tensor] | torch.Tensor) -> PixtralOutput:
  function get_img_normalize (line 145) | def get_img_normalize(
  function load_image_encoder (line 160) | def load_image_encoder(

FILE: kyuteye_pt/kyuteye/modules/image_transforms.py
  function get_minimal_transforms (line 21) | def get_minimal_transforms(
  class Normalize (line 55) | class Normalize:
    method __init__ (line 59) | def __init__(self, mean: Sequence[float], std: Sequence[float]) -> None:
    method __call__ (line 73) | def __call__(
    method to_pil_transform (line 80) | def to_pil_transform(self, mode: str = "RGB") -> T.Transform:
  class UnitNormalize (line 91) | class UnitNormalize(Normalize):
    method __init__ (line 94) | def __init__(self) -> None:
  class CLIPNormalize (line 101) | class CLIPNormalize(Normalize):
    method __init__ (line 104) | def __init__(self) -> None:
  class SigLIPNormalize (line 111) | class SigLIPNormalize(Normalize):
    method __init__ (line 114) | def __init__(self) -> None:
  class PixtralNormalize (line 118) | class PixtralNormalize:
    method __init__ (line 123) | def __init__(self) -> None:
    method __call__ (line 128) | def __call__(

FILE: kyuteye_pt/kyuteye/modules/streaming_utils.py
  class StreamingModule (line 16) | class StreamingModule(torch.nn.Module):
    method __init__ (line 19) | def __init__(self) -> None:
    method empty_streaming_state (line 25) | def empty_streaming_state(self) -> bool:
    method has_streaming_attribute (line 29) | def has_streaming_attribute(self, key: str) -> bool:
    method add_streaming_attribute (line 33) | def add_streaming_attribute(
    method get_streaming_attribute (line 39) | def get_streaming_attribute(self, key: str, default: Any = None) -> Any:
    method is_streaming (line 44) | def is_streaming(self) -> bool:
    method get_streaming_info_as_int (line 48) | def get_streaming_info_as_int(self, attr_name: str, default: int = 0) ...
    method streaming_offset (line 61) | def streaming_offset(self) -> int:
    method streaming_offset (line 66) | def streaming_offset(self, value: int | torch.Tensor) -> None:
    method _apply_named_streaming (line 73) | def _apply_named_streaming(self, fn: Callable) -> None:
    method _set_streaming (line 78) | def _set_streaming(self, streaming: bool) -> None:
    method streaming (line 85) | def streaming(self) -> Iterator:
    method streaming_forever (line 94) | def streaming_forever(self, batch_size: Optional[int] = None) -> None:
    method reset_streaming (line 99) | def reset_streaming(self) -> None:
    method get_streaming_state (line 107) | def get_streaming_state(self) -> State:
    method set_streaming_state (line 120) | def set_streaming_state(self, state: State) -> None:
    method flush (line 139) | def flush(self, x: Optional[torch.Tensor] = None) -> Optional["Streami...
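
StreamingModule.streaming is a context manager that flips every streaming submodule into stateful mode for the duration of a session, with reset_streaming/streaming_forever covering manual control. A usage sketch; model and frames are hypothetical stand-ins for a concrete module and its per-step inputs:

```python
# Sketch of the streaming lifecycle against the surface listed above.
with model.streaming():            # switch all streaming submodules to stateful mode
    for frame in frames:
        out = model(frame)         # each call can read and extend the cached past

# Outside the context the session is over; long-lived servers can instead call
# streaming_forever() and later reset_streaming() to clear state explicitly.
```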

FILE: kyuteye_pt/kyuteye/modules/transformer.py
  class TransformerLayer (line 22) | class TransformerLayer(StreamingModule):
    method __init__ (line 47) | def __init__(
    method _ff_block (line 147) | def _ff_block(self, x: torch.Tensor) -> torch.Tensor:
    method _maybe_cross_attend (line 165) | def _maybe_cross_attend(
    method _self_attend (line 184) | def _self_attend(
    method forward (line 197) | def forward(
  class Transformer (line 241) | class Transformer(StreamingModule):
    method __init__ (line 266) | def __init__(
    method set_context (line 314) | def set_context(self, context: Optional[int] = None) -> None:
    method forward (line 320) | def forward(

FILE: kyuteye_pt/kyuteye/modules/utils.py
  function multi_linear (line 18) | def multi_linear(
  function get_activation (line 43) | def get_activation(
  function gating_forward_kernel (line 68) | def gating_forward_kernel(
  class ActivationGating (line 83) | class ActivationGating(torch.nn.Module):
    method __init__ (line 92) | def __init__(
    method forward (line 115) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class NoGating (line 122) | class NoGating(torch.nn.Module):
    method __init__ (line 127) | def __init__(
    method forward (line 144) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  function make_ffn (line 149) | def make_ffn(
  class LayerNormF32 (line 192) | class LayerNormF32(torch.nn.LayerNorm):
    method forward (line 195) | def forward(
  function _rms_norm (line 204) | def _rms_norm(
  class RMSNorm (line 226) | class RMSNorm(torch.nn.Module):
    method __init__ (line 233) | def __init__(
    method forward (line 249) | def forward(self, x: torch.Tensor) -> torch.Tensor:
  class NormalizationLayer (line 256) | class NormalizationLayer(Enum):
    method create_norm_fn (line 266) | def create_norm_fn(self, dim: int, **kwargs: Any) -> torch.nn.Module:
  class ClampedEmbedding (line 299) | class ClampedEmbedding(torch.nn.Embedding):
    method __init__ (line 309) | def __init__(
    method forward (line 319) | def forward(  # pylint: disable=arguments-renamed
  function create_sin_embedding (line 332) | def create_sin_embedding(
  function apply_rope (line 360) | def apply_rope(
  class RotaryEmbedding (line 411) | class RotaryEmbedding(torch.nn.Module):
    method __init__ (line 417) | def __init__(self, max_period: float = 10000.0) -> None:
    method forward (line 421) | def forward(
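
_rms_norm and RMSNorm implement the standard formula: scale x by a learned weight divided by the root mean square of its last dimension, sqrt(mean(x²) + eps), with the reduction commonly done in float32 for stability (as LayerNormF32 does for layer norm). A minimal torch sketch of the formula:

```python
import torch

def rms_norm(x: torch.Tensor, alpha: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    """RMS normalization over the last dimension, reduced in float32."""
    dtype = x.dtype
    x = x.to(torch.float32)
    var = x.pow(2).mean(dim=-1, keepdim=True)     # mean of squares, no mean-centering
    y = x * torch.rsqrt(var + eps) * alpha.to(torch.float32)
    return y.to(dtype)

x = torch.randn(2, 4, 8)
alpha = torch.ones(8)                             # learned scale, initialized to 1
y = rms_norm(x, alpha)
```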

FILE: kyuteye_pt/kyuteye/server.py
  function colorize (line 37) | def colorize(text: str, color: str) -> str:
  function make_log (line 44) | def make_log(level: str, msg: str) -> str:
  function log (line 57) | def log(level: str, msg: str) -> None:
  function seed_all (line 62) | def seed_all(seed: int) -> None:
  class ServerState (line 75) | class ServerState:
    method __init__ (line 88) | def __init__(
    method warmup (line 117) | def warmup(self) -> None:
    method handle_chat (line 136) | async def handle_chat(self, request: Any) -> Any:
    method extract_image (line 287) | async def extract_image(self, ws: web.WebSocketResponse) -> None:
  function start_server (line 314) | def start_server(
  function sanity_check (line 435) | def sanity_check() -> None:
  function main (line 439) | def main() -> None:

FILE: kyuteye_pt/kyuteye/utils/dist_utils.py
  function is_main (line 10) | def is_main() -> bool:
  function print_main (line 21) | def print_main(*args: Any, rich: bool = False, **kwargs: Any) -> None:

FILE: kyuteye_pt/kyuteye/utils/logging_utils.py
  function flatten_nested_dict (line 10) | def flatten_nested_dict(d: Dict) -> Dict:
  function get_git_revision_hash (line 21) | def get_git_revision_hash(verbose: bool = True) -> Tuple[str, str]:
  function pretty_json (line 38) | def pretty_json(config_dict: dict) -> str:
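
flatten_nested_dict collapses nested config dictionaries into a single level so they can be logged as one flat key/value table. One plausible implementation with dot-joined keys (the separator choice is an assumption, not read from the file):

```python
from typing import Any, Dict

def flatten_nested_dict(d: Dict, parent_key: str = "", sep: str = ".") -> Dict[str, Any]:
    """Flatten {"a": {"b": 1}} into {"a.b": 1}; non-dict values pass through."""
    out: Dict[str, Any] = {}
    for key, value in d.items():
        full_key = f"{parent_key}{sep}{key}" if parent_key else str(key)
        if isinstance(value, dict):
            out.update(flatten_nested_dict(value, full_key, sep))
        else:
            out[full_key] = value
    return out

assert flatten_nested_dict({"moshi": {"dim": 4096}, "lr": 1e-4}) == {
    "moshi.dim": 4096, "lr": 1e-4,
}
```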

FILE: kyuteye_pt/kyuteye/utils/struct_utils.py
  class FrozenEnumMeta (line 9) | class FrozenEnumMeta(EnumMeta):
    method __new__ (line 12) | def __new__(mcs, name: str, bases: Any, classdict: Any) -> type:
    method __setattr__ (line 18) | def __setattr__(cls, name: str, value: Any) -> None:
    method __delattr__ (line 28) | def __delattr__(cls, name: str) -> None:
  class FrozenEnum (line 39) | class FrozenEnum(Enum, metaclass=FrozenEnumMeta):
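
FrozenEnumMeta rejects attribute writes and deletes on enum classes once they are fully constructed, which is why it overrides __new__ as well as __setattr__/__delattr__: mutation has to stay legal while EnumMeta is still building the class. A minimal sketch of the pattern:

```python
from enum import Enum, EnumMeta
from typing import Any

class FrozenEnumMeta(EnumMeta):
    """EnumMeta that rejects mutation once the class is fully built."""

    def __new__(mcs, name: str, bases: Any, classdict: Any) -> type:
        cls = super().__new__(mcs, name, bases, classdict)
        cls._frozen = True  # from here on, writes and deletes are rejected
        return cls

    def __setattr__(cls, name: str, value: Any) -> None:
        # Check the class's own __dict__ so subclasses stay writable
        # during their own construction.
        if cls.__dict__.get("_frozen", False):
            raise AttributeError(f"cannot set {name!r} on frozen enum {cls.__name__}")
        super().__setattr__(name, value)

    def __delattr__(cls, name: str) -> None:
        if cls.__dict__.get("_frozen", False):
            raise AttributeError(f"cannot delete {name!r} from frozen enum {cls.__name__}")
        super().__delattr__(name)

class FrozenEnum(Enum, metaclass=FrozenEnumMeta):
    pass
```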

FILE: kyuteye_pt/tests/hello.py
  function write_weights_for_analysis (line 11) | def write_weights_for_analysis(model: torch.nn.Module):
  function test_weights_conversion_moshi (line 29) | def test_weights_conversion_moshi():

FILE: kyuteye_rs/moshi-backend/build.rs
  function main (line 4) | pub fn main() -> Result<()> {

FILE: kyuteye_rs/moshi-backend/src/audio.rs
  type Sample (line 8) | pub trait Sample {
    method to_i16 (line 9) | fn to_i16(&self) -> i16;
    method to_i16 (line 13) | fn to_i16(&self) -> i16 {
    method to_i16 (line 19) | fn to_i16(&self) -> i16 {
    method to_i16 (line 25) | fn to_i16(&self) -> i16 {
  function write_pcm_as_wav (line 30) | pub fn write_pcm_as_wav<W: Write, S: Sample>(
  function conv (line 63) | fn conv<T>(samples: &mut Vec<f32>, data: std::borrow::Cow<symphonia::cor...
  function pcm_decode (line 73) | pub(crate) fn pcm_decode<P: AsRef<std::path::Path>>(path: P) -> anyhow::...
  function resample (line 117) | pub(crate) fn resample(pcm_in: &[f32], sr_in: usize, sr_out: usize) -> a...
  function write_opus_header (line 145) | pub(crate) fn write_opus_header<W: std::io::Write>(w: &mut W) -> std::io...
  function write_opus_tags (line 159) | pub(crate) fn write_opus_tags<W: std::io::Write>(w: &mut W) -> std::io::...

FILE: kyuteye_rs/moshi-backend/src/build.rs
  function main (line 8) | pub fn main() -> Result<()> {

FILE: kyuteye_rs/moshi-backend/src/image_embedder.rs
  function load_image (line 10) | fn load_image(
  type ImageEncoder (line 78) | pub enum ImageEncoder {
  type ImageEncoderModel (line 88) | pub enum ImageEncoderModel {
  function init_output_proj (line 94) | fn init_output_proj(in_dims: usize, out_dims: usize, vb: VarBuilder) -> ...
  type ImageEmbedder (line 104) | pub struct ImageEmbedder {
    method new (line 124) | pub fn new(
    method output_proj (line 238) | pub fn output_proj(&self, img_features: Tensor, dev: &Device) -> Resul...
    method embed (line 255) | pub fn embed(
    method embed_from_tensor (line 328) | pub fn embed_from_tensor(&self, img: Tensor, dev: &Device) -> Result<C...

FILE: kyuteye_rs/moshi-backend/src/main.rs
  type Args (line 18) | struct Args {
  type StandaloneArgs (line 33) | struct StandaloneArgs {
  type Command (line 63) | enum Command {
  type NoDelayAcceptor (line 69) | pub struct NoDelayAcceptor;
    type Stream (line 72) | type Stream = tokio::net::TcpStream;
    type Service (line 73) | type Service = S;
    type Future (line 74) | type Future =
    method accept (line 77) | fn accept(&self, stream: tokio::net::TcpStream, service: S) -> Self::F...
  function tracing_init (line 86) | fn tracing_init(
  function main (line 115) | async fn main() -> Result<()> {

FILE: kyuteye_rs/moshi-backend/src/standalone.rs
  type Config (line 14) | pub struct Config {
    method load (line 27) | pub fn load<P: AsRef<std::path::Path>>(p: P) -> Result<Self> {
    method cert_file (line 44) | pub fn cert_file(&self, name: &str) -> Result<std::path::PathBuf> {
  function device (line 54) | pub(crate) fn device(cpu: bool) -> Result<candle::Device> {
  function new (line 68) | pub fn new(args: &StandaloneArgs, config: &stream_both::Config) -> Resul...
  function handle_socket (line 179) | async fn handle_socket(socket: ws::WebSocket, sm: stream_both::Streaming...
  function stream_handler (line 185) | pub async fn stream_handler(
  function download_from_hub (line 196) | pub async fn download_from_hub(config: &mut stream_both::Config) -> Resu...
  function run (line 243) | pub async fn run(args: &StandaloneArgs, config: &Config) -> Result<()> {

FILE: kyuteye_rs/moshi-backend/src/stream_both.rs
  type ForceSessionConfig (line 16) | pub struct ForceSessionConfig {
  type Config (line 28) | pub struct Config {
    method requires_model_download (line 57) | pub fn requires_model_download(&self) -> bool {
  function default_false (line 51) | fn default_false() -> bool {
  type AppState (line 68) | pub type AppState = Arc<AppStateInner>;
  type AppStateInner (line 69) | pub struct AppStateInner {
    method text (line 79) | fn text(
  type SessionConfigReq (line 117) | pub struct SessionConfigReq {
    method into_session_config (line 168) | fn into_session_config(self, force_cfg: Option<&ForceSessionConfig>) -...
  type SessionConfig (line 136) | pub struct SessionConfig {
  type SessionSummary (line 155) | struct SessionSummary<'a> {
  type MetaData (line 224) | pub struct MetaData {
  type StreamOut (line 242) | pub enum StreamOut {
  constant OPUS_ENCODER_FRAME_SIZE (line 252) | const OPUS_ENCODER_FRAME_SIZE: usize = 960;
  type MsgType (line 255) | pub enum MsgType {
    method from_u8 (line 269) | pub fn from_u8(v: u8) -> Result<Self> {
    method to_u8 (line 286) | pub fn to_u8(self) -> u8 {
  type ModelInput (line 302) | pub enum ModelInput {
  type MsgSender (line 308) | pub struct MsgSender {
    method new (line 319) | fn new(sender: SplitSink<ws::WebSocket, ws::Message>) -> Result<Self> {
    method send_colored_text (line 344) | async fn send_colored_text(&mut self, text: String, intensity: f32) ->...
    method send_ready (line 357) | async fn send_ready(&mut self) -> Result<()> {
    method send_metadata (line 367) | async fn send_metadata(&mut self, md: Box<MetaData>) -> Result<()> {
    method send_pcm (line 375) | async fn send_pcm(&mut self, pcm: Vec<f32>) -> Result<()> {
  type StreamingModel (line 415) | pub struct StreamingModel {
    method run_with_state (line 423) | fn run_with_state(
    method new (line 545) | pub fn new(state: &AppState, session_config: SessionConfigReq) -> Self {
    method run (line 560) | pub fn run(
  type Handle (line 697) | type Handle = tokio::task::JoinHandle<Result<()>>;
  function spawn_recv_loops (line 699) | fn spawn_recv_loops(
  function sender_loop (line 788) | async fn sender_loop(
  function handle_socket (line 814) | pub async fn handle_socket(

FILE: kyuteye_rs/moshi-backend/src/utils.rs
  type BuildInfo (line 2) | pub struct BuildInfo {
    method new (line 16) | pub fn new() -> BuildInfo {
  type WrapJson (line 32) | pub struct WrapJson<T>(pub anyhow::Result<T>);
  function into_response (line 35) | fn into_response(self) -> axum::response::Response {
  function replace_env_vars (line 50) | pub fn replace_env_vars(input: &str) -> String {
  type WrapBincode (line 59) | pub struct WrapBincode<T>(pub anyhow::Result<T>);
  function into_response (line 62) | fn into_response(self) -> axum::response::Response {
  function default_static_dir (line 77) | pub fn default_static_dir() -> String {
  type AxumError (line 81) | pub struct AxumError(anyhow::Error);
    method into_response (line 84) | fn into_response(self) -> axum::response::Response {
    method from (line 96) | fn from(value: E) -> Self {
  type AxumResult (line 101) | pub type AxumResult<R> = std::result::Result<R, AxumError>;

FILE: kyuteye_rs/moshi-core/src/conv.rs
  type Norm (line 11) | pub enum Norm {
  type PadMode (line 18) | pub enum PadMode {
  function conv1d_weight_norm (line 27) | fn conv1d_weight_norm(
  type NormConv1d (line 52) | pub struct NormConv1d {
    method new (line 60) | pub fn new(
  method forward (line 102) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type NormConvTranspose1d (line 113) | pub struct NormConvTranspose1d {
    method new (line 125) | pub fn new(
  method forward (line 188) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  function get_extra_padding_for_conv1d (line 211) | fn get_extra_padding_for_conv1d(
  function pad1d (line 224) | fn pad1d(xs: &Tensor, pad_l: usize, pad_r: usize, mode: PadMode) -> Resu...
  function unpad1d (line 232) | fn unpad1d(xs: &Tensor, unpad_l: usize, unpad_r: usize) -> Result<Tensor> {
  type StreamableConv1d (line 241) | pub struct StreamableConv1d {
    method new (line 253) | pub fn new(
  method forward (line 289) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 316) | fn reset_state(&mut self) {
  method step (line 321) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type StreamableConvTranspose1d (line 360) | pub struct StreamableConvTranspose1d {
    method new (line 370) | pub fn new(
  method forward (line 403) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 421) | fn reset_state(&mut self) {
  method step (line 425) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type ConvDownsample1d (line 461) | pub struct ConvDownsample1d {
    method new (line 466) | pub fn new(
  method forward (line 494) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 500) | fn reset_state(&mut self) {
  method step (line 504) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type ConvTrUpsample1d (line 510) | pub struct ConvTrUpsample1d {
    method new (line 515) | pub fn new(
  method forward (line 541) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 547) | fn reset_state(&mut self) {
  method step (line 551) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  function run_conv1d (line 561) | fn run_conv1d(
  function run_conv_tr1d (line 612) | fn run_conv_tr1d(
  function conv1d (line 655) | fn conv1d() -> Result<()> {
  function conv_tr1d (line 669) | fn conv_tr1d() -> Result<()> {

FILE: kyuteye_rs/moshi-core/src/dynamic_logits_processor.rs
  type GateInfluencedLogitsProcessor (line 5) | pub struct GateInfluencedLogitsProcessor {
    method from_sampling (line 12) | pub fn from_sampling(seed: u64, sampling: Sampling) -> Self {
    method from_sampling_with_scale (line 21) | pub fn from_sampling_with_scale(
    method new (line 34) | pub fn new(seed: u64, temperature: Option<f64>, top_p: Option<f64>) ->...
    method sample_argmax (line 46) | fn sample_argmax(&mut self, logits: Tensor) -> Result<u32> {
    method sample_multinomial (line 57) | fn sample_multinomial(&mut self, prs: &Vec<f32>) -> Result<u32> {
    method sample_topp (line 66) | fn sample_topp(&mut self, prs: &mut Vec<f32>, top_p: f32) -> Result<u3...
    method sample_topk (line 86) | fn sample_topk(&mut self, prs: &mut Vec<f32>, top_k: usize) -> Result<...
    method sample_topk_topp (line 101) | fn sample_topk_topp(&mut self, prs: &mut Vec<f32>, top_k: usize, top_p...
    method sample (line 119) | pub fn sample(&mut self, logits: &Tensor, gate_weight: f64) -> Result<...
    method sample_f (line 123) | pub fn sample_f(

FILE: kyuteye_rs/moshi-core/src/lib.rs
  type NormType (line 21) | pub enum NormType {

FILE: kyuteye_rs/moshi-core/src/lm.rs
  type DepFormerConfig (line 22) | pub struct DepFormerConfig {
  type Config (line 28) | pub struct Config {
    method v0_1 (line 42) | pub fn v0_1() -> Self {
    method v0_1_vision (line 103) | pub fn v0_1_vision() -> Self {
    method v0_1_vision_streaming (line 168) | pub fn v0_1_vision_streaming(num_slices: usize) -> Self {
    method v0_1_streaming (line 178) | pub fn v0_1_streaming(num_slices: usize) -> Self {
  type DepFormerSlice (line 190) | struct DepFormerSlice {
    method new (line 200) | fn new(
  type DepFormer (line 222) | pub struct DepFormer {
    method new (line 227) | pub fn new(
    method sample (line 256) | pub fn sample(
    method sample_cfg (line 308) | pub fn sample_cfg(
  type LmModel (line 359) | pub struct LmModel {
    method new (line 372) | pub fn new(cfg: &Config, vb: MaybeQuantizedVarBuilder) -> Result<Self> {
    method reset_state (line 421) | pub fn reset_state(&mut self) {
    method in_audio_codebooks (line 426) | pub fn in_audio_codebooks(&self) -> usize {
    method audio_pad_token (line 430) | pub fn audio_pad_token(&self) -> u32 {
    method text_start_token (line 434) | pub fn text_start_token(&self) -> u32 {
    method generated_audio_codebooks (line 438) | pub fn generated_audio_codebooks(&self) -> usize {
    method is_quantized (line 442) | pub fn is_quantized(&self) -> bool {
    method device (line 449) | pub fn device(&self) -> &Device {
    method forward (line 453) | pub fn forward(
    method maybe_precompute_ca_kv (line 500) | pub fn maybe_precompute_ca_kv(&self, ca_src: Option<CaSrc>) -> Result<...
    method forward_ca (line 508) | pub fn forward_ca(
    method forward_with_gate_weight (line 518) | pub fn forward_with_gate_weight(
    method depformer_sample (line 572) | pub fn depformer_sample(
  function load_lm_model (line 590) | pub fn load_lm_model<P: AsRef<std::path::Path>>(
  function load (line 614) | pub fn load<P: AsRef<std::path::Path>>(
  function load_streaming (line 623) | pub fn load_streaming<P: AsRef<std::path::Path>>(
  function load_streaming_both_ways (line 632) | pub fn load_streaming_both_ways<P: AsRef<std::path::Path>>(
  function load_vision (line 641) | pub fn load_vision<P: AsRef<std::path::Path>>(
  type ForcedAudioTokens (line 655) | pub struct ForcedAudioTokens {
    method new (line 662) | pub fn new(acoustic_delay: usize, audio_pad_token: u32, stream_codeboo...
    method forced_tokens (line 676) | pub fn forced_tokens(&self, step_idx: usize) -> &[Option<u32>] {

FILE: kyuteye_rs/moshi-core/src/lm_generate.rs
  constant UNGENERATED (line 8) | const UNGENERATED: u32 = u32::MAX;
  type Config (line 11) | pub struct Config {
    method v0_1 (line 22) | pub fn v0_1() -> Self {
    method audio_pad_token (line 34) | pub fn audio_pad_token(&self) -> u32 {
    method audio_codebooks (line 38) | pub fn audio_codebooks(&self) -> usize {
  type State (line 43) | pub struct State {
    method new (line 55) | pub fn new(
    method audio_codebooks (line 81) | pub fn audio_codebooks(&self) -> usize {
    method audio_pad_token (line 85) | pub fn audio_pad_token(&self) -> u32 {
    method step_gen_no_text (line 89) | pub fn step_gen_no_text(&mut self, force_text_token: Option<u32>) -> c...
    method step_gen (line 93) | pub fn step_gen(&mut self, prev_text_token: u32) -> candle::Result<u32> {
    method step_text_prompt (line 97) | pub fn step_text_prompt(&mut self, id: u32) -> candle::Result<u32> {
    method step_audio_prompt_ (line 101) | pub fn step_audio_prompt_(
    method step_audio_prompt (line 122) | pub fn step_audio_prompt(&mut self, codes: &[u32]) -> candle::Result<u...
    method step_audio_prompt_with_text (line 126) | pub fn step_audio_prompt_with_text(&mut self, codes: &[u32], text: u32...
    method last_audio_tokens (line 130) | pub fn last_audio_tokens(&self) -> Option<Vec<u32>> {
    method audio_tokens (line 147) | pub fn audio_tokens(&self) -> Vec<Vec<u32>> {
    method step (line 154) | fn step(

FILE: kyuteye_rs/moshi-core/src/lm_generate_multistream.rs
  constant UNGENERATED (line 11) | pub const UNGENERATED: u32 = u32::MAX;
  type Config (line 14) | pub struct Config {
    method v0_1 (line 25) | pub fn v0_1() -> Self {
    method v0_1_two_ways (line 37) | pub fn v0_1_two_ways() -> Self {
    method v0_1_one_way (line 49) | pub fn v0_1_one_way() -> Self {
    method audio_pad_token (line 61) | pub fn audio_pad_token(&self) -> u32 {
    method total_audio_codebooks (line 65) | pub fn total_audio_codebooks(&self) -> usize {
  type State (line 70) | pub struct State {
    method new (line 88) | pub fn new(
    method step_idx (line 124) | pub fn step_idx(&self) -> usize {
    method audio_pad_token (line 128) | fn audio_pad_token(&self) -> u32 {
    method config (line 132) | pub fn config(&self) -> &Config {
    method user_rating (line 136) | pub fn user_rating(&self) -> u32 {
    method set_user_rating (line 139) | pub fn set_user_rating(&mut self, grade: u32) {
    method apply_repetition_penalty (line 143) | fn apply_repetition_penalty(&self, logits: Tensor) -> candle::Result<T...
    method step_ (line 188) | pub fn step_(
    method step (line 284) | pub fn step(
    method step_with_gate_weight (line 300) | pub fn step_with_gate_weight(
    method audio_tokens (line 317) | pub fn audio_tokens(&self, include_all: bool) -> &[Vec<u32>] {
    method gate_weights (line 325) | pub fn gate_weights(&self, include_all: bool) -> &[f32] {
    method text_tokens (line 334) | pub fn text_tokens(&self, include_all: bool) -> &[u32] {
    method last_audio_tokens (line 343) | pub fn last_audio_tokens(&self) -> Option<Vec<u32>> {
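
State::apply_repetition_penalty rescales the logits of recently emitted tokens before sampling. The widely used CTRL-style scheme divides positive logits by the penalty and multiplies negative ones; a Python sketch of that scheme (the Rust implementation's exact window and formula may differ):

```python
import numpy as np

def apply_repetition_penalty(logits: np.ndarray, recent_tokens,
                             penalty: float = 1.3) -> np.ndarray:
    """CTRL-style repetition penalty over a 1-D logits vector."""
    out = logits.astype(np.float32, copy=True)
    for tok in set(int(t) for t in recent_tokens):
        # Penalized tokens become less likely whatever the sign of their logit.
        out[tok] = out[tok] / penalty if out[tok] > 0 else out[tok] * penalty
    return out
```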

FILE: kyuteye_rs/moshi-core/src/mimi.rs
  type ResampleMethod (line 11) | pub enum ResampleMethod {
  type Config (line 17) | pub struct Config {
    method v0_1 (line 32) | pub fn v0_1(num_codebooks: Option<usize>) -> Self {
  type Mimi (line 93) | pub struct Mimi {
    method new (line 105) | pub fn new(cfg: Config, vb: VarBuilder) -> Result<Self> {
    method config (line 161) | pub fn config(&self) -> &Config {
    method encode_pre_quantize (line 165) | pub fn encode_pre_quantize(&mut self, xs: &Tensor) -> Result<Tensor> {
    method encode (line 173) | pub fn encode(&mut self, xs: &Tensor) -> Result<Tensor> {
    method encode_step (line 183) | pub fn encode_step(&mut self, xs: &StreamTensor) -> Result<StreamTenso...
    method decode (line 196) | pub fn decode(&mut self, codes: &Tensor) -> Result<Tensor> {
    method decode_step (line 205) | pub fn decode_step(&mut self, codes: &StreamTensor) -> Result<StreamTe...
    method reset_state (line 215) | pub fn reset_state(&mut self) {
  function load (line 224) | pub fn load(model_file: &str, num_codebooks: Option<usize>, dev: &Device...

FILE: kyuteye_rs/moshi-core/src/nn.rs
  type MaybeQuantizedWeight (line 9) | pub enum MaybeQuantizedWeight {
    method to_tensor (line 16) | fn to_tensor(&self, dev: &Device) -> Result<Tensor> {
  function matmul_dtype (line 24) | pub fn matmul_dtype(device: &candle::Device) -> DType {
  type MaybeQuantizedVarBuilder (line 34) | pub enum MaybeQuantizedVarBuilder<'a> {
  function pp (line 41) | pub fn pp<S: ToString>(&self, s: S) -> Self {
  function get (line 48) | pub fn get<S: Into<Shape>>(&self, s: S, path: &str) -> Result<MaybeQuant...
  function get_as_tensor (line 56) | pub fn get_as_tensor<S: Into<Shape>>(&self, s: S, path: &str) -> Result<...
  function get_unquantized (line 64) | pub fn get_unquantized<S: Into<Shape>>(&self, s: S, path: &str) -> Resul...
  function contains_key (line 71) | pub fn contains_key(&self, name: &str) -> bool {
  function device (line 78) | pub fn device(&self) -> &Device {
  function dtype (line 85) | pub fn dtype(&self) -> DType {
  type MaybeQuantizedLinear (line 94) | pub enum MaybeQuantizedLinear {
  method forward (line 100) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type MaybeQuantizedEmbedding (line 109) | pub enum MaybeQuantizedEmbedding {
    method new (line 115) | pub fn new(in_vocab_size: usize, dim: usize, vb: MaybeQuantizedVarBuil...
    method embeddings (line 127) | pub fn embeddings(&self) -> &Tensor {
    method hidden_size (line 134) | pub fn hidden_size(&self) -> Result<usize> {
  method forward (line 144) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  function linear (line 152) | pub fn linear(
  function linear_from (line 173) | pub fn linear_from(

FILE: kyuteye_rs/moshi-core/src/quantization.rs
  type CodebookEncode (line 8) | struct CodebookEncode;
    method name (line 11) | fn name(&self) -> &'static str {
    method cpu_fwd (line 15) | fn cpu_fwd(
  type EuclideanCodebook (line 73) | pub struct EuclideanCodebook {
    method new (line 86) | pub fn new(dim: usize, codebook_size: usize, vb: VarBuilder) -> Result...
    method encode_very_slow (line 109) | pub fn encode_very_slow(&self, xs: &Tensor) -> Result<Tensor> {
    method encode_slow (line 125) | pub fn encode_slow(&self, xs: &Tensor) -> Result<Tensor> {
    method encode (line 136) | pub fn encode(&self, xs: &Tensor) -> Result<Tensor> {
    method decode (line 146) | pub fn decode(&self, indexes: &Tensor) -> Result<Tensor> {
  type VectorQuantization (line 160) | pub struct VectorQuantization {
    method new (line 167) | pub fn new(
    method encode (line 189) | pub fn encode(&self, xs: &Tensor) -> Result<Tensor> {
    method decode (line 194) | pub fn decode(&self, codes: &Tensor) -> Result<Tensor> {
  type ResidualVectorQuantization (line 205) | pub struct ResidualVectorQuantization {
    method new (line 210) | pub fn new(
    method encode (line 226) | pub fn encode(&self, xs: &Tensor) -> Result<Tensor> {
    method decode (line 238) | pub fn decode(&self, xs: &Tensor) -> Result<Tensor> {
  type ResidualVectorQuantizer (line 260) | pub struct ResidualVectorQuantizer {
    method new (line 267) | pub fn new(
    method encode (line 318) | pub fn encode(&self, xs: &Tensor) -> Result<Tensor> {
    method decode (line 323) | pub fn decode(&self, codes: &Tensor) -> Result<Tensor> {
  type SplitResidualVectorQuantizer (line 337) | pub struct SplitResidualVectorQuantizer {
    method new (line 346) | pub fn new(
    method encode (line 383) | pub fn encode(&self, xs: &Tensor) -> Result<Tensor> {
    method decode (line 397) | pub fn decode(&self, codes: &Tensor) -> Result<Tensor> {

FILE: kyuteye_rs/moshi-core/src/seanet.rs
  type Config (line 12) | pub struct Config {
  type SeaNetResnetBlock (line 34) | pub struct SeaNetResnetBlock {
    method new (line 44) | pub fn new(
  method forward (line 109) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 123) | fn reset_state(&mut self) {
  method step (line 133) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type EncoderLayer (line 147) | struct EncoderLayer {
  type SeaNetEncoder (line 153) | pub struct SeaNetEncoder {
    method new (line 162) | pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
  method forward (line 267) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 281) | fn reset_state(&mut self) {
  method step (line 290) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type DecoderLayer (line 304) | struct DecoderLayer {
  type SeaNetDecoder (line 310) | pub struct SeaNetDecoder {
    method new (line 320) | pub fn new(cfg: &Config, vb: VarBuilder) -> Result<Self> {
  method forward (line 423) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  method reset_state (line 442) | fn reset_state(&mut self) {
  method step (line 451) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {

FILE: kyuteye_rs/moshi-core/src/streaming.rs
  type Dim (line 7) | pub trait Dim: candle::shape::Dim + Copy {}
  type StreamTensor (line 11) | pub struct StreamTensor(Option<Tensor>);
    method fmt (line 14) | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    method from (line 23) | fn from(value: Option<Tensor>) -> Self {
    method from (line 29) | fn from(value: Tensor) -> Self {
    method from (line 35) | fn from(_value: ()) -> Self {
    method empty (line 41) | pub fn empty() -> Self {
    method from_tensor (line 45) | pub fn from_tensor(tensor: Tensor) -> Self {
    method shape (line 49) | pub fn shape(&self) -> Option<&candle::Shape> {
    method cat2 (line 53) | pub fn cat2<D: Dim>(&self, rhs: &Self, dim: D) -> Result<Self> {
    method seq_len (line 65) | pub fn seq_len<D: Dim>(&self, dim: D) -> Result<usize> {
    method reset (line 72) | pub fn reset(&mut self) {
    method narrow (line 76) | pub fn narrow<D: Dim>(&self, dim: D, offset: usize, len: usize) -> Res...
    method split (line 94) | pub fn split<D: Dim>(&self, dim: D, lhs_len: usize) -> Result<(Self, S...
    method as_option (line 116) | pub fn as_option(&self) -> Option<&Tensor> {
    method apply (line 120) | pub fn apply<M: candle::Module>(&self, m: &M) -> Result<Self> {
  type StreamingModule (line 128) | pub trait StreamingModule {
    method step (line 130) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor>;
    method reset_state (line 131) | fn reset_state(&mut self);
    method reset_state (line 200) | fn reset_state(&mut self) {}
    method step (line 202) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type BinOp (line 135) | pub enum BinOp {
  type StreamingBinOp (line 143) | pub struct StreamingBinOp {
    method new (line 151) | pub fn new(op: BinOp, dim: candle::D) -> Self {
    method reset_state (line 160) | pub fn reset_state(&mut self) {
    method forward (line 165) | pub fn forward(&self, lhs: &Tensor, rhs: &Tensor) -> Result<Tensor> {
    method step (line 174) | pub fn step(&mut self, lhs: &StreamTensor, rhs: &StreamTensor) -> Resu...
  type Map (line 197) | pub struct Map<T: candle::Module>(T);

FILE: kyuteye_rs/moshi-core/src/transformer.rs
  type Config (line 17) | pub struct Config {
  type PositionalEmbedding (line 43) | pub enum PositionalEmbedding {
  type CrossAttentionGating (line 50) | pub enum CrossAttentionGating {
  type CaSrc (line 62) | pub enum CaSrc {
  type LayerScale (line 72) | pub struct LayerScale {
    method new (line 77) | pub fn new(d_model: usize, _init: f64, vb: MaybeQuantizedVarBuilder) -...
  method forward (line 84) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type XaGate (line 90) | pub enum XaGate {
    method new (line 110) | pub fn new(cfg: &Config, vb: MaybeQuantizedVarBuilder) -> Result<Self> {
    method forward_with_gate_weight (line 166) | pub fn forward_with_gate_weight(&self, xs: &Tensor) -> Result<(Tensor,...
  method forward (line 195) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type StreamingMultiheadCrossAttention (line 202) | pub struct StreamingMultiheadCrossAttention {
    method new (line 217) | pub fn new(
    method is_quantized (line 301) | pub fn is_quantized(&self) -> bool {
    method compute_kv (line 308) | pub fn compute_kv(&self, ca_src: &CaSrc) -> Result<(Tensor, Tensor)> {
    method forward_with_gate_weight (line 335) | pub fn forward_with_gate_weight(
    method forward (line 383) | pub fn forward(&self, xs: &Tensor, ca_src: &CaSrc, mask: Option<&Tenso...
  type RotaryEmbedding (line 390) | pub struct RotaryEmbedding {
    method new (line 397) | pub fn new(dim: usize, max_seq_len: usize, theta: f32, dev: &Device) -...
    method apply_rotary_emb (line 415) | pub fn apply_rotary_emb(&self, qk: &Tensor, seqlen_offset: usize) -> R...
  function get_causal_mask (line 425) | pub(crate) fn get_causal_mask(
  function flash_attn (line 441) | fn flash_attn(
  function flash_attn (line 452) | fn flash_attn(_: &Tensor, _: &Tensor, _: &Tensor, _: f32, _: bool) -> Re...
  type StreamingMultiheadAttention (line 457) | pub struct StreamingMultiheadAttention {
    method new (line 474) | pub fn new(
    method is_quantized (line 511) | pub fn is_quantized(&self) -> bool {
    method forward (line 518) | pub fn forward(&mut self, xs: &Tensor, mask: Option<&Tensor>) -> Resul...
    method reset_kv_cache (line 598) | pub fn reset_kv_cache(&mut self) {
    method set_kv_cache (line 602) | pub fn set_kv_cache(&mut self, kv_cache: candle_nn::kv_cache::KvCache) {
  type Mlp (line 608) | pub enum Mlp {
    method new (line 622) | pub fn new(cfg: &Config, vb: MaybeQuantizedVarBuilder) -> Result<Self> {
  method forward (line 650) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type RmsNorm (line 669) | pub struct RmsNorm {
    method new (line 675) | pub fn new(d_model: usize, eps: f32, vb: MaybeQuantizedVarBuilder) -> ...
  method forward (line 684) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type LayerNorm (line 690) | pub struct LayerNorm {
    method new (line 695) | pub fn new(d_model: usize, eps: f32, vb: MaybeQuantizedVarBuilder) -> ...
  method forward (line 709) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type Norm (line 715) | pub enum Norm {
    method new (line 721) | pub fn new(d_model: usize, cfg: &Config, vb: MaybeQuantizedVarBuilder)...
    method new_shortcut (line 726) | pub fn new_shortcut(
  method forward (line 746) | fn forward(&self, xs: &Tensor) -> Result<Tensor> {
  type StreamingTransformerLayer (line 755) | pub struct StreamingTransformerLayer {
    method new (line 768) | pub fn new(
    method forward_with_gate_weight (line 826) | pub fn forward_with_gate_weight(
    method forward (line 860) | pub fn forward(
    method reset_kv_cache (line 870) | pub fn reset_kv_cache(&mut self) {
    method set_kv_cache (line 874) | pub fn set_kv_cache(&mut self, kv_cache: candle_nn::kv_cache::KvCache) {
  type StreamingTransformer (line 880) | pub struct StreamingTransformer {
    method new (line 890) | pub fn new(cfg: &Config, vb: MaybeQuantizedVarBuilder) -> Result<Self> {
    method forward (line 921) | pub fn forward(&mut self, xs: &Tensor) -> Result<Tensor> {
    method forward_ca (line 925) | pub fn forward_ca(&mut self, xs: &Tensor, ca_src: Option<&CaSrc>) -> R...
    method forward_with_gate_weight (line 930) | pub fn forward_with_gate_weight(
    method maybe_precompute_ca_kv (line 986) | pub fn maybe_precompute_ca_kv(&self, ca_src: Option<CaSrc>) -> Result<...
    method copy_state (line 1007) | pub fn copy_state(&mut self, from: &Self) -> Result<()> {
  method reset_state (line 1020) | fn reset_state(&mut self) {
  method step (line 1024) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {
  type ProjectedTransformer (line 1033) | pub struct ProjectedTransformer {
    method new (line 1043) | pub fn new(
    method forward (line 1076) | pub fn forward(&mut self, xs: &Tensor) -> Result<Vec<Tensor>> {
  method reset_state (line 1100) | fn reset_state(&mut self) {
  method step (line 1104) | fn step(&mut self, xs: &StreamTensor) -> Result<StreamTensor> {

FILE: scripts/convert_ckpt_utils.py
  function remove_other_output_codebooks (line 31) | def remove_other_output_codebooks(
  class Launcher (line 52) | class Launcher:
    method rust_to_pt (line 54) | def rust_to_pt(self, safetensors_file: str, out_file: Optional[str] = ...
    method pt_to_mlx (line 126) | def pt_to_mlx(self, safetensors_file: str, out_file: Optional[str] = N...

FILE: scripts/get_static_client.py
  function get (line 18) | def get() -> None:

FILE: ssvd/generate.py
  function get_pipeline (line 41) | def get_pipeline(
  function get_captions (line 60) | def get_captions(
  class Launcher (line 94) | class Launcher:
    method __get_db_file__ (line 98) | def __get_db_file__(
    method __get_table_name__ (line 105) | def __get_table_name__(
    method __get_annot_file__ (line 112) | def __get_annot_file__(
    method watch (line 125) | def watch(
    method run (line 161) | def run(

FILE: ssvd/multiturn_instruct.py
  function get_base_setting (line 9) | def get_base_setting() -> Tuple[str, str, str, str]:
  function get_location_setting (line 55) | def get_location_setting() -> Tuple[str, str, str, str]:
  function get_num_setting (line 70) | def get_num_setting() -> Tuple[str, str, str, str]:
  function get_property_setting (line 85) | def get_property_setting() -> Tuple[str, str, str, str]:
  function get_lead_short_setting (line 103) | def get_lead_short_setting() -> Tuple[str, str, str, str]:
  function get_lead_long_setting (line 122) | def get_lead_long_setting() -> Tuple[str, str, str, str]:
  function get_comb_start_setting (line 165) | def get_comb_start_setting() -> Tuple[str, str, str, str]:
  function get_tns_setting (line 251) | def get_tns_setting() -> Tuple[str, str, str, str]:
  function get_tbs_setting (line 292) | def get_tbs_setting() -> Tuple[str, str, str, str]:
  class MTCInstruct (line 335) | class MTCInstruct(Enum):
    method get_method (line 347) | def get_method(self, convo_len: int = -1) -> Callable:

FILE: ssvd/multiturn_prompting.py
  function list_to_prompt (line 21) | def list_to_prompt(
  function postprocess_mtc (line 95) | def postprocess_mtc(
  class ConvoIter (line 162) | class ConvoIter:
    method __init__ (line 165) | def __init__(
    method add_to_convos (line 180) | def add_to_convos(self, uid: str, answer: str) -> None:
    method make_iter (line 187) | def make_iter(self, captions: Sequence[str], img_ids: Sequence[str]) -...
  function run_multiturn_pipeline (line 220) | def run_multiturn_pipeline(

FILE: ssvd/utils.py
  function preprocess_pixelprose_captions (line 18) | def preprocess_pixelprose_captions(caption: str) -> Dict[str, str]:
  function maybe_shorten_caption (line 38) | def maybe_shorten_caption(caption: str, max_cap_len: int = 1500) -> str:
  function compile_pattern (line 57) | def compile_pattern(s: str) -> Pattern:
  function get_replace_pattern (line 63) | def get_replace_pattern() -> Pattern:
  function get_strings_for_logging (line 73) | def get_strings_for_logging(
  function sanitize_line (line 103) | def sanitize_line(s: str) -> str:
  function postprocess_synth_annot (line 115) | def postprocess_synth_annot(

Condensed preview — 170 files, each showing path, character count, and a content snippet (810K chars of structured content in total).
[
  {
    "path": ".dockerignore",
    "chars": 68,
    "preview": "**/target/\n**/node_modules/\n**/dist\nssvd/synthetic_visual_dialogues/"
  },
  {
    "path": ".gitattributes",
    "chars": 42,
    "preview": "*.wav filter=lfs diff=lfs merge=lfs -text\n"
  },
  {
    "path": ".github/actions/rust_build/action.yml",
    "chars": 821,
    "preview": "name: rust_build\ndescription: 'Setup rust env'\ninputs:\n  os:\n    default: ubuntu-latest\n  toolchain:\n    default: stable"
  },
  {
    "path": ".github/requirements_github_actions.txt",
    "chars": 973,
    "preview": "# Main setup\n# old version: transformers 4.43.3 and accelerate 0.33.0\n# new version (for pixtrla): transformers 4.46.0 a"
  },
  {
    "path": ".github/workflows/checks.yml",
    "chars": 2007,
    "preview": "name: Checks\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    types: [opened, synchronize, reopened, ready_fo"
  },
  {
    "path": ".github/workflows/rust-ci.yml",
    "chars": 1133,
    "preview": "on:\n  push:\n    branches: [ main ]\n  pull_request:\n    branches: [ main, refacto ]\n\nname: Rust CI\n\njobs:\n  check:\n    na"
  },
  {
    "path": ".gitignore",
    "chars": 267,
    "preview": "~*\n__pycache__\n*.pt\n*.pth\n*.ipynb*\n*.egg-info\n*.jsonl\nnohup.out\n.idea/*\nclient/node_modules\nclient/dist\ntarget/\n*.safete"
  },
  {
    "path": "CONTRIBUTING.md",
    "chars": 3176,
    "preview": "# Contributing to MoshiVis\n\n## Pull Requests\n\nMoshiVis is the implementation of a research paper.\nTherefore, we do not p"
  },
  {
    "path": "ISSUE_TEMPLATE/bug.yml",
    "chars": 2046,
    "preview": "name: Bug Report\ndescription: You found a bug.\nlabels: [\"bug\", \"triage\"]\nbody:\n  - type: dropdown\n    id: backend\n    at"
  },
  {
    "path": "ISSUE_TEMPLATE/question.yml",
    "chars": 1215,
    "preview": "name: Question\ndescription: You have a question about Moshi/Mimi, this codebase.\nlabels: [\"question\", \"triage\"]\nbody:\n  "
  },
  {
    "path": "LICENSE-APACHE",
    "chars": 11357,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "LICENSE-MIT",
    "chars": 1023,
    "preview": "Permission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentati"
  },
  {
    "path": "LICENSE.md",
    "chars": 11356,
    "preview": "                                 Apache License\n                           Version 2.0, January 2004\n                   "
  },
  {
    "path": "PULL_REQUEST_TEMPLATE.md",
    "chars": 292,
    "preview": "## Checklist\n\n- [ ] Read CONTRIBUTING.md, and accept the CLA by including the provided snippet. We will not accept PR wi"
  },
  {
    "path": "README.md",
    "chars": 11720,
    "preview": "# M👁️shiVis: Teaching Speech Models to Converse about Images\n\n![CI checks](https://github.com/kyutai-labs/moshivis/actio"
  },
  {
    "path": "client/.eslinrc.json",
    "chars": 494,
    "preview": "{\n  \"env\": {\n    \"browser\": true,\n    \"es2021\": true\n  },\n  \"extends\": [\n    \"plugin:react/recommended\",\n    \"standard-w"
  },
  {
    "path": "client/.nvmrc",
    "chars": 9,
    "preview": "v20.12.2\n"
  },
  {
    "path": "client/.prettierignore",
    "chars": 6,
    "preview": "dist/*"
  },
  {
    "path": "client/.prettierrc.json",
    "chars": 198,
    "preview": "{\n  \"arrowParens\": \"avoid\",\n  \"singleQuote\": false,\n  \"trailingComma\": \"all\",\n  \"tabWidth\": 2,\n  \"useTabs\": false,\n  \"se"
  },
  {
    "path": "client/Dockerfile",
    "chars": 151,
    "preview": "FROM node:20 AS builder\n\nWORKDIR /app\n\nCOPY . /app\n\nRUN npm install\n\nRUN npx vite build\n\nFROM scratch AS build_result\n\nC"
  },
  {
    "path": "client/LICENSE",
    "chars": 1023,
    "preview": "Permission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentati"
  },
  {
    "path": "client/README.md",
    "chars": 994,
    "preview": "# moshi-client\n\nFrontend for the demo.\n\n## Quickstart\n\nTo start developping, you will need a basic environment with Node"
  },
  {
    "path": "client/index.html",
    "chars": 538,
    "preview": "<!doctype html>\n<html lang=\"en\" class=\" bg-black\" data-theme=\"dark\">\n  <head>\n    <meta charset=\"UTF-8\" />\n    <meta nam"
  },
  {
    "path": "client/package.json",
    "chars": 1196,
    "preview": "{\n  \"name\": \"kyutai-client\",\n  \"private\": true,\n  \"version\": \"0.0.0\",\n  \"type\": \"module\",\n  \"scripts\": {\n    \"dev\": \"vit"
  },
  {
    "path": "client/postcss.config.js",
    "chars": 81,
    "preview": "export default {\n  plugins: {\n    tailwindcss: {},\n    autoprefixer: {},\n  },\n};\n"
  },
  {
    "path": "client/public/assets/images/demo/attribution.txt",
    "chars": 2247,
    "preview": "image1.jpg  https://unsplash.com/photos/seven-brushes-and-water-color-palette-TTwwVG4Isjw   Crystal de Passillé-Chabot\ni"
  },
  {
    "path": "client/src/app.tsx",
    "chars": 516,
    "preview": "import ReactDOM from \"react-dom/client\";\nimport {\n  createBrowserRouter,\n  RouterProvider,\n} from \"react-router-dom\";\nim"
  },
  {
    "path": "client/src/audio-processor.ts",
    "chars": 6555,
    "preview": "// @ts-nocheck\nfunction asMs(samples) {\n  return (samples * 1000 / sampleRate).toFixed(1);\n}\n\nfunction asSamples(mili) {"
  },
  {
    "path": "client/src/components/Button/Button.tsx",
    "chars": 403,
    "preview": "import { FC } from \"react\";\n\ntype ButtonProps = React.ButtonHTMLAttributes<HTMLButtonElement>;\nexport const Button: FC<B"
  },
  {
    "path": "client/src/components/ImageGallery/ImageGallery.tsx",
    "chars": 6883,
    "preview": "\nimport { useState, ChangeEvent } from \"react\";\n\nimport { Button } from \"../Button/Button\";\n\n// Natural images\nimport im"
  },
  {
    "path": "client/src/components/Input/Input.tsx",
    "chars": 493,
    "preview": "type InputProps = React.InputHTMLAttributes<HTMLInputElement> & {\n  error?: string;\n}\n\nexport const Input = ({className,"
  },
  {
    "path": "client/src/decoder/decoderWorker.ts",
    "chars": 104,
    "preview": "export const DecoderWorker = new Worker(\n  new URL(\"/assets/decoderWorker.min.js\", import.meta.url),\n);\n"
  },
  {
    "path": "client/src/env.ts",
    "chars": 426,
    "preview": "type ENV = {\n  VITE_QUEUE_API_PATH: string;\n  VITE_ENV: 'development' | 'production';\n};\n\nconst parseEnv = (): ENV => {\n"
  },
  {
    "path": "client/src/index.css",
    "chars": 2725,
    "preview": "@tailwind base;\n@tailwind components;\n@tailwind utilities;\n\n@layer utilities {\n\n  /* Hide scrollbar for Chrome, Safari a"
  },
  {
    "path": "client/src/modules.d.ts",
    "chars": 32,
    "preview": "declare module \"opus-recorder\";\n"
  },
  {
    "path": "client/src/pages/Conversation/Conversation.tsx",
    "chars": 15794,
    "preview": "import { FC, MutableRefObject, useCallback, useEffect, useMemo, useRef, useState } from \"react\";\nimport { useSocket } fr"
  },
  {
    "path": "client/src/pages/Conversation/MediaContext.ts",
    "chars": 715,
    "preview": "import { MutableRefObject, createContext, useContext } from \"react\";\ntype MediaContextType = {\n  startRecording: () => v"
  },
  {
    "path": "client/src/pages/Conversation/SocketContext.ts",
    "chars": 441,
    "preview": "import { createContext, useContext } from \"react\";\nimport { WSMessage } from \"../../protocol/types\";\n\ntype SocketContext"
  },
  {
    "path": "client/src/pages/Conversation/components/AudioVisualizer/AudioVisualizer.tsx",
    "chars": 2032,
    "preview": "import { FC, useCallback, useEffect, useRef } from \"react\";\n\ntype AudioVisualizerProps = {\n  analyser: AnalyserNode | nu"
  },
  {
    "path": "client/src/pages/Conversation/components/AudioVisualizer/ClientVisualizer.tsx",
    "chars": 4296,
    "preview": "import { FC, RefObject, useCallback, useEffect, useRef, useState } from \"react\";\nimport { clamp } from \"../../hooks/audi"
  },
  {
    "path": "client/src/pages/Conversation/components/AudioVisualizer/ServerVisualizer.tsx",
    "chars": 4247,
    "preview": "import { FC, RefObject, useCallback, useEffect, useRef, useState } from \"react\";\nimport { clamp } from \"../../hooks/audi"
  },
  {
    "path": "client/src/pages/Conversation/components/Controls/Controls.tsx",
    "chars": 709,
    "preview": "import {\n  controlBOSMessage,\n  controlEOSMessage,\n} from \"../../../../protocol/testMessages\";\nimport { useSocketContext"
  },
  {
    "path": "client/src/pages/Conversation/components/ModelParams/ModelParams.tsx",
    "chars": 7361,
    "preview": "import { FC, RefObject } from \"react\";\nimport { useModelParams } from \"../../hooks/useModelParams\";\nimport { Button } fr"
  },
  {
    "path": "client/src/pages/Conversation/components/ServerAudio/ServerAudio.tsx",
    "chars": 1323,
    "preview": "import { FC, useRef } from \"react\";\nimport { AudioStats, useServerAudio } from \"../../hooks/useServerAudio\";\nimport { Se"
  },
  {
    "path": "client/src/pages/Conversation/components/ServerAudio/ServerAudioStats.tsx",
    "chars": 2457,
    "preview": "import { useState, useEffect, useRef } from \"react\";\n\ntype ServerAudioStatsProps = {\n  getAudioStats: React.MutableRefOb"
  },
  {
    "path": "client/src/pages/Conversation/components/ServerInfo/ServerInfo.tsx",
    "chars": 1299,
    "preview": "import { useServerInfo } from \"../../hooks/useServerInfo\";\n\nfunction pretty_format(num: number): number {\n  return Math."
  },
  {
    "path": "client/src/pages/Conversation/components/TextDisplay/TextDisplay.tsx",
    "chars": 2003,
    "preview": "import { FC, useEffect, useRef } from \"react\";\nimport { useServerText } from \"../../hooks/useServerText\";\n\ntype TextDisp"
  },
  {
    "path": "client/src/pages/Conversation/components/TextDisplay/TextDisplayStats.tsx",
    "chars": 496,
    "preview": "import { FC } from \"react\";\n\ntype TextDisplayStatsProps = {\n  totalTextMessages: number;\n};\nexport const TextDisplayStat"
  },
  {
    "path": "client/src/pages/Conversation/components/UserAudio/UserAudio.tsx",
    "chars": 2015,
    "preview": "import { FC, useCallback, useEffect, useRef, useState } from \"react\";\nimport { useSocketContext } from \"../../SocketCont"
  },
  {
    "path": "client/src/pages/Conversation/components/UserAudio/UserAudioStats.tsx",
    "chars": 459,
    "preview": "import { FC } from \"react\";\n\ntype UserAudioStatsProps = {\n  sentMessagesCount: number;\n};\n\nexport const UserAudioStats: "
  },
  {
    "path": "client/src/pages/Conversation/getMimeType.ts",
    "chars": 1603,
    "preview": "export const mimeTypeCheck = () => {\n  const types = [\n    \"audio/ogg\",\n    \"audio/wav\",\n    \"audio/webm;codecs=opus\",\n "
  },
  {
    "path": "client/src/pages/Conversation/hooks/audioUtils.ts",
    "chars": 117,
    "preview": "export const clamp = (value: number, min: number, max: number) => {\n  return Math.min(Math.max(value, min), max);\n};\n"
  },
  {
    "path": "client/src/pages/Conversation/hooks/useModelParams.ts",
    "chars": 6605,
    "preview": "import { useCallback, useState } from \"react\";\n\nexport const DEFAULT_TEXT_TEMPERATURE = 0.45;\nexport const DEFAULT_TEXT_"
  },
  {
    "path": "client/src/pages/Conversation/hooks/useServerAudio.ts",
    "chars": 4840,
    "preview": "import { useCallback, useEffect, useRef, useState } from \"react\";\nimport { useSocketContext } from \"../SocketContext\";\ni"
  },
  {
    "path": "client/src/pages/Conversation/hooks/useServerInfo.ts",
    "chars": 2667,
    "preview": "import { useCallback, useEffect, useState } from \"react\";\nimport { useSocketContext } from \"../SocketContext\";\nimport { "
  },
  {
    "path": "client/src/pages/Conversation/hooks/useServerText.ts",
    "chars": 1280,
    "preview": "import { useCallback, useEffect, useState } from \"react\";\nimport { useSocketContext } from \"../SocketContext\";\nimport { "
  },
  {
    "path": "client/src/pages/Conversation/hooks/useSocket.ts",
    "chars": 3980,
    "preview": "import { useState, useEffect, useCallback, useRef } from \"react\";\nimport { WSMessage } from \"../../../protocol/types\";\ni"
  },
  {
    "path": "client/src/pages/Conversation/hooks/useUserAudio.ts",
    "chars": 4329,
    "preview": "import { useCallback, useRef, useState } from \"react\";\nimport Recorder from \"opus-recorder\";\nimport encoderPath from \"op"
  },
  {
    "path": "client/src/pages/Queue/Queue.tsx",
    "chars": 14290,
    "preview": "import moshiProcessorUrl from \"../../audio-processor.ts?worker&url\";\nimport { FC, useEffect, useMemo, useState, useCallb"
  },
  {
    "path": "client/src/pages/Queue/api/client.ts",
    "chars": 2521,
    "preview": "import { APIError } from \"./errors/api_error\";\nimport { ResponseError } from \"./errors/response_error\";\nimport { validat"
  },
  {
    "path": "client/src/pages/Queue/api/errors/api_error.ts",
    "chars": 183,
    "preview": "export class APIError extends Error {\n  status:number;\n\n  constructor(message:string, status:number) {\n    super(message"
  },
  {
    "path": "client/src/pages/Queue/api/errors/response_error.ts",
    "chars": 134,
    "preview": "export class ResponseError extends Error {\n  constructor(message:string) {\n    super(message);\n    this.name = \"Response"
  },
  {
    "path": "client/src/pages/Queue/api/validators.ts",
    "chars": 570,
    "preview": "import { z } from \"zod\"\n\nexport const validateAddUser = (response: unknown) => {\n  const AddUser = z.object({\n    sessio"
  },
  {
    "path": "client/src/pages/Queue/hooks/useUserEmail.ts",
    "chars": 751,
    "preview": "import { useCallback, useState } from \"react\";\nimport { z } from \"zod\";\n\nconst validateEmail = z.string().email();\n\nexpo"
  },
  {
    "path": "client/src/protocol/encoder.ts",
    "chars": 2949,
    "preview": "import {\n  CONTROL_MESSAGE,\n  CONTROL_MESSAGES_MAP,\n  MODELS_MAP,\n  WSMessage,\n  VERSIONS_MAP,\n} from \"./types\";\n\nexport"
  },
  {
    "path": "client/src/protocol/testMessages.ts",
    "chars": 572,
    "preview": "import { WSMessage } from \"./types\";\n\nexport const handshakeMessage: WSMessage = {\n  type: \"handshake\",\n  version: 0,\n  "
  },
  {
    "path": "client/src/protocol/types.ts",
    "chars": 1089,
    "preview": "export type MessageType =\n  | \"handshake\"\n  | \"audio\"\n  | \"text\"\n  | \"coloredtext\"\n  | \"control\"\n  | \"metadata\";\n\nexport"
  },
  {
    "path": "client/tailwind.config.js",
    "chars": 189,
    "preview": "/** @type {import('tailwindcss').Config} */\n\nexport default {\n  content: [\"./src/**/*.{js,jsx,ts,tsx}\", \"./index.html\"],"
  },
  {
    "path": "client/tsconfig.json",
    "chars": 644,
    "preview": "{\n  \"compilerOptions\": {\n    \"target\": \"ES2020\",\n    \"useDefineForClassFields\": true,\n    \"module\": \"ESNext\",\n    \"lib\":"
  },
  {
    "path": "client/vite.config.ts",
    "chars": 883,
    "preview": "import { ProxyOptions, defineConfig, loadEnv } from \"vite\";\nimport topLevelAwait from \"vite-plugin-top-level-await\";\n\nex"
  },
  {
    "path": "docker-bake.hcl",
    "chars": 188,
    "preview": "group \"default\" {\n  targets = [\"client\"]\n}\n\ntarget \"client\" {\n  context    = \"./client\"\n\n  # Specify output type as a lo"
  },
  {
    "path": "kyuteye_mlx/.pylintrc",
    "chars": 21744,
    "preview": "[MAIN]\n\n# Analyse import fallback blocks. This can be used to support both Python 2 and\n# 3 compatible code, which means"
  },
  {
    "path": "kyuteye_mlx/LICENSE",
    "chars": 1023,
    "preview": "Permission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentati"
  },
  {
    "path": "kyuteye_mlx/MANIFEST.in",
    "chars": 96,
    "preview": "include LICENSE*\ninclude *.md\ninclude *.cfg\ninclude requirements.txt\ninclude moshi_mlx/py.typed\n"
  },
  {
    "path": "kyuteye_mlx/README.md",
    "chars": 1961,
    "preview": "# MoshiVis - MLX\n\nSee the [top-level README.md][main_repo] for more information on MoshiVis.\n\nThis is the MLX implementa"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/__init__.py",
    "chars": 1007,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/benchmark.py",
    "chars": 741,
    "preview": "import time\n\nimport mlx.core as mx\nimport numpy as np\n\nfrom .local_web import get_args_for_main, get_model, predict_text"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/local_web.py",
    "chars": 21222,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/mlx_vlm/LICENSE",
    "chars": 1064,
    "preview": "MIT License\n\nCopyright © 2023 Apple Inc.\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\no"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/mlx_vlm/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/mlx_vlm/models/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/mlx_vlm/models/pixtral/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/mlx_vlm/models/pixtral/vision.py",
    "chars": 8874,
    "preview": "import inspect\nfrom dataclasses import dataclass\n\nimport mlx.core as mx\nimport mlx.nn as nn\n\n\n@dataclass\nclass VisionCon"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/mlx_vlm/models/siglip/vision.py",
    "chars": 7648,
    "preview": "import inspect\nfrom dataclasses import dataclass\n\nimport mlx.core as mx\nimport mlx.nn as nn\nimport numpy as np\n\n\n@datacl"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/models/__init__.py",
    "chars": 467,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/models/generate.py",
    "chars": 4931,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/models/lm.py",
    "chars": 13153,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/models/pixtral.py",
    "chars": 1835,
    "preview": "import json\n\nimport mlx\nimport mlx.core as mx\nimport mlx.nn\n\nfrom ..mlx_vlm.models.pixtral.vision import PixtralVisionMo"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/models/siglip.py",
    "chars": 1026,
    "preview": "import json\nimport os\n\nimport mlx\nimport mlx.core as mx\nimport mlx.nn\n\nfrom ..mlx_vlm.models.siglip.vision import Vision"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/modules/__init__.py",
    "chars": 328,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/modules/config.py",
    "chars": 766,
    "preview": "from dataclasses import dataclass\nfrom typing import Literal\n\n\n@dataclass\nclass TransformerConfig:\n    d_model: int\n    "
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/modules/cross_attention.py",
    "chars": 4277,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/modules/kv_cache.py",
    "chars": 7073,
    "preview": "# Most of the code below comes from:\n# https://github.com/ml-explore/mlx-examples/blob/6c2369e4b97f49fb5906ec46033497b39"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/modules/transformer.py",
    "chars": 7366,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/py.typed",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/quantize.py",
    "chars": 2480,
    "preview": "# /// script\n# requires-python = \">=3.10\"\n# dependencies = [\n#     \"fire\",\n#     \"mlx==0.18.1\",\n#     \"safetensors >= 0."
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/utils/__init__.py",
    "chars": 228,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/utils/loading.py",
    "chars": 1301,
    "preview": "import mlx.core as mx\n\n\ndef repeat_shared_weights(weights: dict[str, mx.array], num_layers: int) -> dict[str, mx.array]:"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/utils/profiling.py",
    "chars": 255,
    "preview": "from typing import Callable\n\nimport line_profiler\n\nPROFILING_ENABLED = False\nprofile: line_profiler.LineProfiler | Calla"
  },
  {
    "path": "kyuteye_mlx/kyuteye_mlx/utils/sampling.py",
    "chars": 2564,
    "preview": "# Taken from https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/sample_utils.py\n# Copyright © 2023-2024 Ap"
  },
  {
    "path": "kyuteye_mlx/pixtral-12b-8bit.config",
    "chars": 288,
    "preview": "{\"model_type\": \"pixtral\", \"num_hidden_layers\": 24, \"hidden_size\": 1024, \"head_dim\": 64, \"intermediate_size\": 4096, \"num_"
  },
  {
    "path": "kyuteye_mlx/pyproject.toml",
    "chars": 1577,
    "preview": "[project]\nname = \"kyuteye_mlx\"\nrequires-python = \">= 3.10,<3.13\"\ndescription = \"Kyutai with an 'eye', but running on mac"
  },
  {
    "path": "kyuteye_mlx/siglip448.config",
    "chars": 242,
    "preview": "{\"model_type\": \"siglip_vision_model\", \"num_hidden_layers\": 27, \"hidden_size\": 1152, \"intermediate_size\": 4304, \"num_atte"
  },
  {
    "path": "kyuteye_mlx/tests/test_siglip.py",
    "chars": 1658,
    "preview": "import mlx.core as mx\nimport numpy as np\nimport torch\nfrom transformers import AutoModelForImageTextToText, AutoProcesso"
  },
  {
    "path": "kyuteye_pt/.pylintrc",
    "chars": 21744,
    "preview": "[MAIN]\n\n# Analyse import fallback blocks. This can be used to support both Python 2 and\n# 3 compatible code, which means"
  },
  {
    "path": "kyuteye_pt/LICENSE.md",
    "chars": 1022,
    "preview": "Permission is hereby granted, free of charge, to any\nperson obtaining a copy of this software and associated\ndocumentati"
  },
  {
    "path": "kyuteye_pt/README.md",
    "chars": 1127,
    "preview": "# MoshiVis - PyTorch\nSee the [top-level README.md][main_repo] for more information on MoshiVis. \nThis is the PyTorch imp"
  },
  {
    "path": "kyuteye_pt/configs/moshika-vis.yaml",
    "chars": 541,
    "preview": "add_boi_eoi: false\nalign_img_and_speech_tokens_dim: true\nencoder_name: siglip_gemma2_448\nhf_repo: 'kyutai/moshika-vis-py"
  },
  {
    "path": "kyuteye_pt/kyuteye/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_pt/kyuteye/config/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_pt/kyuteye/config/enums.py",
    "chars": 2269,
    "preview": "\"\"\"Knowledge base of useful fixed values used across the codebase\"\"\"\n\nfrom enum import Enum, unique\nfrom typing import L"
  },
  {
    "path": "kyuteye_pt/kyuteye/config/kyuteye_config.py",
    "chars": 7273,
    "preview": "\"\"\"Main configuration object used to configure the model and training pipeline\"\"\"\n\nimport os\nfrom copy import deepcopy\nf"
  },
  {
    "path": "kyuteye_pt/kyuteye/config/subconfigs.py",
    "chars": 10704,
    "preview": "\"\"\"Modular configs for configuring a Kyuteye model training run\"\"\"\n\nfrom collections.abc import Iterable\nfrom dataclasse"
  },
  {
    "path": "kyuteye_pt/kyuteye/models/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_pt/kyuteye/models/docker-bake.hcl",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_pt/kyuteye/models/helium.py",
    "chars": 5978,
    "preview": "# pylint: disable=redefined-outer-name, pointless-string-statement\n\"\"\"Port of Helium from Jax to Pytorch and then HF.\nTh"
  },
  {
    "path": "kyuteye_pt/kyuteye/models/hf_model_configs.py",
    "chars": 7745,
    "preview": "# pylint: disable=protected-access\n\"\"\"Configuration for HF-compliant models\"\"\"\n\nfrom typing import Any, Literal, Optiona"
  },
  {
    "path": "kyuteye_pt/kyuteye/models/image_projection.py",
    "chars": 7048,
    "preview": "\"\"\"Image encoders  (CLIP, SigLIP)\"\"\"\n\nfrom typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple\n\nimpo"
  },
  {
    "path": "kyuteye_pt/kyuteye/models/loaders.py",
    "chars": 1459,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/models/moshivis.py",
    "chars": 20586,
    "preview": "\"\"\"Moshi the little AI\"\"\"\n\nfrom functools import partial\nfrom typing import Any, Dict, List, Literal, Optional, Tuple\n\ni"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/attention.py",
    "chars": 13818,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/cross_attention.py",
    "chars": 14203,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/image_encoder.py",
    "chars": 6346,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/image_transforms.py",
    "chars": 5244,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/streaming_utils.py",
    "chars": 5535,
    "preview": "# pylint: disable=protected-access\n# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/transformer.py",
    "chars": 12931,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/modules/utils.py",
    "chars": 14386,
    "preview": "# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed under the license found in the\n# LICENSE fil"
  },
  {
    "path": "kyuteye_pt/kyuteye/server.py",
    "chars": 16478,
    "preview": "# pylint: disable=protected-access,no-member\n# Copyright (c) Kyutai, all rights reserved.\n# This source code is licensed"
  },
  {
    "path": "kyuteye_pt/kyuteye/utils/__init__.py",
    "chars": 0,
    "preview": ""
  },
  {
    "path": "kyuteye_pt/kyuteye/utils/dist_utils.py",
    "chars": 747,
    "preview": "\"\"\"Some utils for distributed training\"\"\"\n\nimport os\nfrom typing import Any\n\nimport torch.distributed as dist\nfrom rich "
  },
  {
    "path": "kyuteye_pt/kyuteye/utils/logging_utils.py",
    "chars": 1377,
    "preview": "\"\"\"Some utils for experiment tracking and logging\"\"\"\n\nimport json\nimport subprocess\nfrom typing import Dict, Tuple\n\nimpo"
  },
  {
    "path": "kyuteye_pt/kyuteye/utils/struct_utils.py",
    "chars": 1594,
    "preview": "\"\"\"Useful structure and simple class definition\n\nFrozenEnum are used to hold global configs shared across multiple files"
  },
  {
    "path": "kyuteye_pt/pyproject.toml",
    "chars": 1241,
    "preview": "[build-system]\nrequires = [\"setuptools>=42\", \"wheel\"]\nbuild-backend = \"setuptools.build_meta\"\n\n[project]\nname = \"kyuteye"
  },
  {
    "path": "kyuteye_pt/tests/hello.py",
    "chars": 1111,
    "preview": "from transformers import AutoProcessor, AutoModelForImageTextToText\nimport numpy as np\nimport torch\nimport torch\nfrom mo"
  },
  {
    "path": "kyuteye_rs/Cargo.toml",
    "chars": 2137,
    "preview": "[workspace]\nmembers = [\n    \"moshi-core\",\n    \"moshi-backend\",\n]\nresolver = \"2\"\n\n[workspace.dependencies]\nanyhow = \"1\"\na"
  },
  {
    "path": "kyuteye_rs/configs/config-moshika-vis-q8.json",
    "chars": 874,
    "preview": "{\n    \"instance_name\": \"foo\",\n    \"hf_repo\": \"kyutai/moshika-vis-candle-q8\",\n    \"lm_model_file\": \"$HOME/tmp/model.q8_0."
  },
  {
    "path": "kyuteye_rs/configs/config-moshika-vis.json",
    "chars": 878,
    "preview": "{\n    \"instance_name\": \"foo\",\n    \"hf_repo\": \"kyutai/moshika-vis-candle-bf16\",\n    \"lm_model_file\": \"$HOME/tmp/model.saf"
  },
  {
    "path": "kyuteye_rs/moshi-backend/Cargo.toml",
    "chars": 1759,
    "preview": "[package]\nname = \"moshi-backend\"\nversion = \"0.1.0\"\nedition = \"2021\"\n\n# See more keys and their definitions at https://do"
  },
  {
    "path": "kyuteye_rs/moshi-backend/build.rs",
    "chars": 390,
    "preview": "use anyhow::Result;\nuse vergen::EmitBuilder;\n\npub fn main() -> Result<()> {\n    // NOTE: This will output everything, an"
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/audio.rs",
    "chars": 6143,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/build.rs",
    "chars": 505,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/image_embedder.rs",
    "chars": 13236,
    "preview": "use anyhow::Result;\nuse candle::{Device, Tensor};\nuse candle_nn::{linear, Linear, VarBuilder};\nuse candle_transformers::"
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/main.rs",
    "chars": 5021,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/metrics.rs",
    "chars": 446,
    "preview": "use lazy_static::lazy_static;\nuse prometheus::Histogram;\nuse prometheus::{histogram_opts, register_histogram};\n\npub mod "
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/standalone.rs",
    "chars": 11347,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/stream_both.rs",
    "chars": 32719,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-backend/src/utils.rs",
    "chars": 3274,
    "preview": "#[derive(Debug, PartialEq, Clone, serde::Deserialize, serde::Serialize)]\npub struct BuildInfo {\n    build_timestamp: Str"
  },
  {
    "path": "kyuteye_rs/moshi-core/Cargo.toml",
    "chars": 822,
    "preview": "[package]\nname = \"moshi\"\nversion = \"0.1.0\"\nedition = \"2021\"\n\n[dependencies]\ncandle = { workspace = true }\ncandle-nn = { "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/conv.rs",
    "chars": 21188,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/dynamic_logits_processor.rs",
    "chars": 6526,
    "preview": "use candle::{Context, DType, Error, Result, Tensor};\nuse candle_transformers::generation::Sampling;\nuse rand::{distribut"
  },
  {
    "path": "kyuteye_rs/moshi-core/src/lib.rs",
    "chars": 557,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/lm.rs",
    "chars": 22933,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/lm_generate.rs",
    "chars": 7381,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/lm_generate_multistream.rs",
    "chars": 11958,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/mimi.rs",
    "chars": 7766,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/nn.rs",
    "chars": 5828,
    "preview": "use candle::quantized::QTensor;\nuse candle::{DType, Device, Module, Result, Shape, Tensor};\nuse candle_transformers::qua"
  },
  {
    "path": "kyuteye_rs/moshi-core/src/quantization.rs",
    "chars": 13259,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/seanet.rs",
    "chars": 14771,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/streaming.rs",
    "chars": 5943,
    "preview": "// Copyright (c) Kyutai, all rights reserved.\n// This source code is licensed under the license found in the\n// LICENSE "
  },
  {
    "path": "kyuteye_rs/moshi-core/src/transformer.rs",
    "chars": 39618,
    "preview": "// Implements various modules for transformers with support for both quantized and unquantized forwards\n// Main differen"
  },
  {
    "path": "scripts/convert_ckpt_utils.py",
    "chars": 10372,
    "preview": "# /// script\n# requires-python = \">=3.10\"\n# dependencies = [\n#     \"fire\",\n#     \"numpy\",\n#     \"rich\",\n#     \"safetenso"
  },
  {
    "path": "scripts/get_static_client.py",
    "chars": 874,
    "preview": "# /// script\n# requires-python = \">=3.10\"\n# dependencies = [\n#     \"fire\",\n#     \"huggingface-hub\",\n#     \"rich\",\n# ]\n# "
  },
  {
    "path": "ssvd/README.md",
    "chars": 5573,
    "preview": "# Synthetic visual dialogues pipeline\n\nThis directory contains the data generation pipeline for synthetic visual dialogu"
  },
  {
    "path": "ssvd/__init__.py",
    "chars": 56,
    "preview": "\"\"\"Scripts for generating synthetic visual dialogues\"\"\"\n"
  },
  {
    "path": "ssvd/generate.py",
    "chars": 14561,
    "preview": "# pylint: disable=C0413,C0411\n# /// script\n# requires-python = \">=3.10\"\n# dependencies = [\n#     \"datasets\",\n#     \"fire"
  },
  {
    "path": "ssvd/multiturn_instruct.py",
    "chars": 16497,
    "preview": "# pylint: disable=line-too-long\n\"\"\"Main instruct prompts for different roles in Multi-Turn Coversation (dialogues)\"\"\"\n\ni"
  },
  {
    "path": "ssvd/multiturn_prompting.py",
    "chars": 11351,
    "preview": "\"\"\"Main pipeline for generating dialogues\"\"\"\n\nimport json\nfrom copy import copy\nfrom random import random\nfrom typing im"
  },
  {
    "path": "ssvd/utils.py",
    "chars": 4797,
    "preview": "\"\"\"Extra utils for annotations scripts, main for post-processing\"\"\"\n\nimport re\nfrom functools import lru_cache\nfrom typi"
  }
]

// ... and 1 more file (download the full file for its content)

About this extraction

This page contains the full source code of the kyutai-labs/moshivis GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 170 files (751.6 KB), approximately 189.7k tokens, and a symbol index with 859 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
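Each entry in the file manifest above is a small JSON object with three fields: "path" (the file's location in the repository), "chars" (its full size in characters), and "preview" (the truncated first lines of its content). As a minimal sketch of working with that structure, assuming the JSON array has been copied out of this page into a file named manifest.json (a hypothetical name, not something the extraction itself provides), the listing can be loaded and queried in Python like so:

import json

# Load the per-file manifest. "manifest.json" is a hypothetical filename:
# the extraction ships as a single .txt, so the JSON array would first
# need to be copied out into its own file.
with open("manifest.json", encoding="utf-8") as fh:
    entries = json.load(fh)

# Each entry has the shape {"path": str, "chars": int, "preview": str}.
total_chars = sum(entry["chars"] for entry in entries)
print(f"{len(entries)} files, {total_chars} characters in total")

# Example query: the five largest Rust sources in the extraction.
rust_files = [entry for entry in entries if entry["path"].endswith(".rs")]
for entry in sorted(rust_files, key=lambda e: e["chars"], reverse=True)[:5]:
    print(f'{entry["chars"]:>8}  {entry["path"]}')

Note that "chars" counts the whole file while "preview" is truncated, so the previews alone cannot reconstruct the repository; the downloadable .txt contains the complete file contents.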

Extracted by GitExtract, a free GitHub-repo-to-text converter for AI. Built by Nikandr Surkov.
